wyldecat and github-actions[bot] committed
Commit 33929c0 · unverified · 1 parent: ae32572

Refactor pipeline to async generator pattern (#16)


* Refactor muon.py into modules with async generator pipeline

* Add MoE expert weight support with EP+FSDP tests

* Add built binary [skip-build]

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. CLAUDE.md +108 -0
  2. README.md +6 -0
  3. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  4. build/torch210-cxx11-cu126-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} +1 -1
  5. build/torch210-cxx11-cu126-x86_64-linux/adamw.py +154 -0
  6. build/torch210-cxx11-cu126-x86_64-linux/async_utils.py +77 -0
  7. build/torch210-cxx11-cu126-x86_64-linux/core.py +116 -0
  8. build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py +174 -115
  9. build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py +0 -7
  10. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +3 -1
  11. build/torch210-cxx11-cu126-x86_64-linux/muon.py +196 -870
  12. build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py +50 -0
  13. build/torch210-cxx11-cu126-x86_64-linux/pipeline.py +390 -0
  14. build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py +129 -0
  15. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  16. build/torch210-cxx11-cu128-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} +1 -1
  17. build/torch210-cxx11-cu128-x86_64-linux/adamw.py +154 -0
  18. build/torch210-cxx11-cu128-x86_64-linux/async_utils.py +77 -0
  19. build/torch210-cxx11-cu128-x86_64-linux/core.py +116 -0
  20. build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py +174 -115
  21. build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py +0 -7
  22. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +3 -1
  23. build/torch210-cxx11-cu128-x86_64-linux/muon.py +196 -870
  24. build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py +50 -0
  25. build/torch210-cxx11-cu128-x86_64-linux/pipeline.py +390 -0
  26. build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py +129 -0
  27. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  28. build/torch210-cxx11-cu130-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} +1 -1
  29. build/torch210-cxx11-cu130-x86_64-linux/adamw.py +154 -0
  30. build/torch210-cxx11-cu130-x86_64-linux/async_utils.py +77 -0
  31. build/torch210-cxx11-cu130-x86_64-linux/core.py +116 -0
  32. build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py +174 -115
  33. build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py +0 -7
  34. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +3 -1
  35. build/torch210-cxx11-cu130-x86_64-linux/muon.py +196 -870
  36. build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py +50 -0
  37. build/torch210-cxx11-cu130-x86_64-linux/pipeline.py +390 -0
  38. build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py +129 -0
  39. build/torch210-cxx11-rocm70-x86_64-linux/_ops.py +3 -3
  40. build/torch210-cxx11-rocm70-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} +1 -1
  41. build/torch210-cxx11-rocm70-x86_64-linux/adamw.py +154 -0
  42. build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py +77 -0
  43. build/torch210-cxx11-rocm70-x86_64-linux/core.py +116 -0
  44. build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py +174 -115
  45. build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py +0 -7
  46. build/torch210-cxx11-rocm70-x86_64-linux/metadata.json +3 -1
  47. build/torch210-cxx11-rocm70-x86_64-linux/muon.py +196 -870
  48. build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py +50 -0
  49. build/torch210-cxx11-rocm70-x86_64-linux/pipeline.py +390 -0
  50. build/torch210-cxx11-rocm70-x86_64-linux/qk_clip.py +129 -0
CLAUDE.md ADDED
@@ -0,0 +1,108 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

Optimizer is a PyTorch package implementing the **Muon optimizer** with support for N-D sharding parallelism for large-scale distributed training, based on the paper at https://arxiv.org/abs/2511.07464. It supports general N-D sharding configurations (from FSDP2 through hybrid setups such as 2 TP + 2 DP-Replicate + 2 DP-Shard).

## Commands

### Lint & Format

```bash
pre-commit run --all-files        # Run all pre-commit hooks
pre-commit run isort --all-files  # Run a specific hook (e.g., isort)
```

Hooks: yapf (Python formatter), isort (import sorter), typos (spell checker), clang-format (C++/CUDA), pymarkdown (Markdown linter), actionlint (GitHub Actions).

### Tests

Tests require **8 GPUs**, access to `Motif-Technologies/Motif-2.6B-4layer-random` on HuggingFace (`HF_TOKEN` env var), and PyTorch >= 2.8.0.

```bash
cd test && ./run_test.sh
# Equivalent to:
cd test && torchrun --nproc-per-node=8 --local-ranks-filter=0 -m pytest test_muon.py
```

Useful pytest flags: `--measure-perf` (timing/memory), `--do-profile` (profiling, requires `--measure-perf`), `--skip-verify` (skip the correctness check against the sequential implementation).

### Build

Uses the kernel-builder infrastructure (`build.toml`, `flake.nix`). Pre-built binaries for various PyTorch/CUDA/ROCm combinations are stored in `build/`.

### Commit Convention

**Always append `[skip-build]` to every commit message.** This prevents CI from triggering unnecessary build jobs on development branches.

## Architecture

### Source Layout

```
torch-ext/optimizer/
├── __init__.py                 # Public API: exports Muon
├── muon.py                     # Muon optimizer class (~430 lines)
├── newton_schulz.py            # Newton-Schulz iteration (~50 lines)
├── qk_clip.py                  # QK clipping for attention heads (~130 lines)
├── core.py                     # Shared state, helpers, param grouping (~110 lines)
├── pipeline.py                 # Async generator pipeline for parallel mode (~290 lines)
├── async_utils.py              # AsyncTask / AsyncRuntime scheduling (~75 lines)
├── adamw.py                    # Fused AdamW for non-Muon parameters (~160 lines)
├── matmul_transpose_triton.py  # Triton kernel for X @ X.T (~130 lines)
└── distributed/
    └── utils.py                # Shard mesh construction, DTensor slicing (~175 lines)
```

### Optimizer Modes

The `Muon` optimizer has three execution paths, selected per parameter based on its tensor type and mesh structure:

1. **Base mode** (`base()`) — Single-device / non-sharded tensors. Standard Muon with Newton-Schulz orthogonalization.
2. **Distributed mode** (`distributed_muon()`) — Gathers full tensors via all-gather, computes updates, redistributes. Used for small parameters or as a fallback.
3. **Parallel mode** (`parallel()`) — Pipelined all-to-all communication overlapped with compute. Uses an async generator pipeline scheduled by `run_pipeline()`. This is the main advanced feature.

### Parallel Mode Pipeline

The parallel pipeline is implemented as a single generator function `muon_chunk_pipeline()` in `pipeline.py`. Parameters are split into chunks, and each chunk flows through:

```
build bufs + async all2all_gather → yield → wait + Newton-Schulz compute + async all2all_scatter → yield → wait + update_param
```

The generator yields twice (after launching the async gather and the async scatter via `async_op=True`), allowing `run_pipeline()` to interleave multiple chunks for communication overlap. `work.wait()` completes each async operation after the yield.

`warmup_step` maps to `max_concurrent_tasks = warmup_step + 1` in `run_pipeline()`.
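The two-yield chunk flow described above can be sketched as a plain Python generator. This is a hedged illustration, not the real `pipeline.py` code: `all_to_all_async`, `_FakeWork`, and the doubling "compute" step are stand-ins for the actual distributed collectives and Newton-Schulz kernel.

```python
class _FakeWork:
    """Stand-in for the work handle returned by async torch.distributed ops."""

    def wait(self):
        pass


def all_to_all_async(buf):
    # Stand-in for an all-to-all launched with async_op=True.
    return _FakeWork()


def muon_chunk_pipeline(chunk):
    work = all_to_all_async(chunk)   # launch async gather
    yield                            # 1st yield: scheduler may advance other chunks
    work.wait()
    update = [g * 2 for g in chunk]  # stand-in for Newton-Schulz compute
    work = all_to_all_async(update)  # launch async scatter
    yield                            # 2nd yield
    work.wait()
    chunk[:] = update                # stand-in for update_param


chunk = [1.0, 2.0, 3.0]
stages = sum(1 for _ in muon_chunk_pipeline(chunk))
print(stages, chunk)  # 2 [2.0, 4.0, 6.0]
```

Exhausting the generator drives all three stages; the two yields are exactly the points where `run_pipeline()` can switch to another chunk.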
For detailed implementation documentation (pipeline internals, distributed utilities, QK clipping with strided sharding, etc.), see [`docs/implementation.md`](docs/implementation.md).

### Key Abstractions

- **`get_default_muon_param_groups(model, is_muon_func)`** (`core.py`) — Separates parameters into Muon-optimizable (2D+) and AdamW groups. Skips embeddings and output layers by default.
- **`_muon_state` dataclass** (`core.py`) — Per-parameter config: rank ownership (`worker_rank`), process group, precomputed shard indices (`rank_indices`, `rank_numels`), and optional QK clip state. Config-only; no transient pipeline state.
- **`muon_chunk_pipeline()` generator** (`pipeline.py`) — Processes one chunk through the full gather → compute → scatter → update pipeline. Uses `async_op=True` for non-blocking all-to-all and yields to allow chunk interleaving. All intermediate buffers are generator-local variables.
- **`run_pipeline()`** (`async_utils.py`) — Generator-based pipeline scheduling with bounded concurrency. Interleaves multiple chunk pipelines at yield points.
- **`construct_shard_mesh()` / `get_slices_of_dtensor()`** (`distributed/utils.py`) — Utilities for building shard meshes from DTensor placements and computing per-rank local slices. Handles both `Shard` and `_StridedShard` (PyTorch 2.10+).
- **Newton-Schulz iteration** (`newton_schulz.py`) — `_zeropower_via_newtonschulz5()`: 5 quintic iterations in bfloat16 with pre-optimized coefficients for gradient orthogonalization. Uses the Triton kernel `matmul_transpose_assign` for efficient X @ X.T.
- **QK Clipping** (`qk_clip.py`) — Optional dynamic clipping of attention head projections when QK logits exceed a threshold. Configured via `q_indices`, `k_indices`, `head_dim`, `threshold`.
- **Fused AdamW** (`adamw.py`) — Uses PyTorch's `torch._fused_adamw_` for non-Muon parameters, grouping tensors by device/dtype and DTensor placement.
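The Newton-Schulz step can be illustrated with a small NumPy sketch. This is a hedged illustration: the coefficients below are the widely used Muon values and are assumed to match this repo's "pre-optimized coefficients"; the real kernel runs in bfloat16 and computes X @ X.T with the Triton kernel.

```python
import numpy as np


def zeropower_via_newtonschulz5(G, steps=5):
    # Quintic Newton-Schulz iteration for approximate orthogonalization.
    a, b, c = 3.4445, -4.7750, 2.0315
    X = G / (np.linalg.norm(G) + 1e-7)  # Frobenius norm bounds the spectral norm
    tall = X.shape[0] > X.shape[1]
    if tall:
        X = X.T  # iterate on the wide orientation
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * (A @ A)
        X = a * X + B @ X
    return X.T if tall else X


rng = np.random.default_rng(0)
G = rng.standard_normal((4, 6))
U = zeropower_via_newtonschulz5(G)
# Singular values of U are pushed toward ~1 (approximate orthogonalization).
print(np.linalg.svd(U, compute_uv=False))
```

Five iterations do not converge singular values to exactly 1; they land in a band around 1, which is sufficient for the Muon update direction.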
### Dependency Graph

```
matmul_transpose_triton.py (leaf)

newton_schulz.py (leaf + triton)

core.py ──── qk_clip.py (leaf, distributed/utils)
│ │ │
│ pipeline.py ─── async_utils.py
│ │
│ adamw.py
│ │
muon.py (all above)

__init__.py
```
README.md CHANGED
@@ -45,7 +45,13 @@ optim = optimizer.Muon(
 )
 ```
 
+## Documentation
+
+- [Implementation Guide](./docs/implementation.md) — Detailed walkthrough of the internal architecture, parallel pipeline, distributed utilities, and QK clipping. Recommended for code reviewers and new contributors.
+- [PyTorch 2.10 TP Fix](./docs/pytorch-2.10-tp-fix.md) — Root cause analysis and fixes for `_StridedShard` compatibility with PyTorch 2.10+.
+
 ## Test
+
 - Check [test/README.md](./test/README.md) for how to run the tests.
 
 ## Pre-commit Hooks
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_06a260a_dirty
-ops = torch.ops._optimizer_06a260a_dirty
+from . import _optimizer_7aef62f_dirty
+ops = torch.ops._optimizer_7aef62f_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_06a260a_dirty::{op_name}"
+    return f"_optimizer_7aef62f_dirty::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5384da54f22f488e0646e09915b821b3235cb404b163a570aa377967f853e3cf
+oid sha256:f095be87ff6185010a3cff4175abbde0b2e50fe1e435dc1db4eaf5bf1f6199ca
 size 1940944
build/torch210-cxx11-cu126-x86_64-linux/adamw.py ADDED
@@ -0,0 +1,154 @@
```python
from collections import defaultdict
from typing import cast

import torch
from torch.distributed.tensor import DTensor


def fused_adamw(
    params: list[torch.Tensor],
    grads: list[torch.Tensor],
    exp_avgs: list[torch.Tensor],
    exp_avg_sqs: list[torch.Tensor],
    max_exp_avg_sqs: list[torch.Tensor],
    state_steps: list[torch.Tensor],
    amsgrad: bool,
    beta1: float,
    beta2: float,
    lr: float | torch.Tensor,
    weight_decay: float,
    eps: float,
    maximize: bool,
) -> None:
    if not params:
        return

    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
    # treating it as a scalar.
    lr_dict: dict | None = ({
        lr.device: lr
    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
         state_steps]  # type: ignore[list-item]
    )
    for (device, _), (
        (
            device_params_,
            device_grads_,
            device_exp_avgs_,
            device_exp_avg_sqs_,
            device_max_exp_avg_sqs,
            device_state_steps_,
        ),
        _,
    ) in grouped_tensors.items():
        device_params = cast(list[torch.Tensor], device_params_)
        device_grads = cast(list[torch.Tensor], device_grads_)
        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
        device_state_steps = cast(list[torch.Tensor], device_state_steps_)

        if lr_dict is not None and device not in lr_dict:
            lr_dict[device] = lr.to(
                device=device, non_blocking=True)  # type: ignore[union-attr]
            lr = lr_dict[device]
        torch._foreach_add_(device_state_steps, 1)
        func = torch._fused_adamw_
        func(
            device_params,
            device_grads,
            device_exp_avgs,
            device_exp_avg_sqs,
            device_max_exp_avg_sqs,  # type: ignore[arg-type]
            device_state_steps,
            amsgrad=amsgrad,
            lr=lr,  # type: ignore[arg-type]
            beta1=beta1,
            beta2=beta2,
            weight_decay=weight_decay,
            eps=eps,
            maximize=maximize,
        )


def step_adamw_params(optimizer_state, params, group):
    """Run fused AdamW on a list of parameters sharing the same placement.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        params: List of parameters to update.
        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
    """
    params_with_grads = []
    grads = []
    moment1 = []
    moment2 = []
    max_exp_avg_sqs = []
    state_steps = []
    lr = group["lr"]
    beta1, beta2 = group["adamw_betas"]
    eps = group["adamw_eps"]
    weight_decay = group["weight_decay"]

    for p in params:
        g = p.grad
        if g is None:
            continue
        state = optimizer_state[p]
        params_with_grads.append(p)
        grads.append(g)
        if "step" not in state:
            state["step"] = (torch.zeros((),
                                         dtype=torch.float32,
                                         device=p.device))
            state["moment1"] = torch.zeros_like(g)
            state["moment2"] = torch.zeros_like(g)
        moment1.append(state["moment1"])
        moment2.append(state["moment2"])
        if not isinstance(state["step"], torch.Tensor):
            step_tensor = torch.tensor(state["step"],
                                       dtype=torch.float32,
                                       device=p.device)
        else:
            step_tensor = state["step"]
        state_steps.append(step_tensor)

    fused_adamw(
        params_with_grads,
        grads,
        moment1,
        moment2,
        max_exp_avg_sqs,
        state_steps,
        amsgrad=False,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        maximize=False,
    )


def step_adamw(optimizer_state, group):
    """Dispatch AdamW step, grouping parameters by type and placement.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        group: Parameter group dict.
    """
    params = group["params"]

    # group params with its type and placement
    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
    for p in params:
        match p:
            case DTensor():
                placement_to_params[tuple([p.placements,
                                           p.device_mesh])].append(p)
            case torch.Tensor():
                placement_to_params[tuple([torch.Tensor, None])].append(p)

    for group_params in placement_to_params.values():
        step_adamw_params(optimizer_state, group_params, group)
```
build/torch210-cxx11-cu126-x86_64-linux/async_utils.py ADDED
@@ -0,0 +1,77 @@
```python
import logging
from typing import Generator

logger = logging.getLogger(__name__)


class _Task:
    """Internal: wraps a generator, advances one yield at a time."""

    def __init__(self, generator: Generator[None, None, None], index: int):
        self._generator = generator
        self._index = index
        self._steps_completed = 0
        self.step()  # run to first yield

    def step(self) -> bool:
        try:
            next(self._generator)
            self._steps_completed += 1
            logger.debug("pipeline[%d] completed stage %d", self._index,
                         self._steps_completed)
            return True
        except StopIteration:
            logger.debug("pipeline[%d] finished after %d stages", self._index,
                         self._steps_completed)
            return False

    def close(self):
        self._generator.close()


def run_pipeline(
    pipelines: Generator[Generator[None, None, None], None, None],
    max_concurrent: int,
) -> None:
    """Run generator-based pipelines with bounded concurrency.

    Each pipeline is a generator that yields at stage boundaries.
    The runtime interleaves pipelines so communication and computation
    overlap across chunks.
    """
    if max_concurrent <= 0:
        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")

    have_new = True
    task_index = 0
    previous_tasks: list[_Task] = []

    try:
        while have_new or previous_tasks:
            running_tasks: list[_Task] = []

            # Admit one new pipeline per iteration (staggered admission).
            # Admitting one at a time ensures that while chunk N does NS
            # compute on the default stream, chunk N+1's NCCL all-to-all
            # runs concurrently on the NCCL stream — creating real
            # communication/computation overlap on the GPU.
            if have_new and len(previous_tasks) < max_concurrent:
                try:
                    gen = next(pipelines)
                    task = _Task(gen, task_index)
                    task_index += 1
                    running_tasks.append(task)
                except StopIteration:
                    have_new = False

            # Advance every previously-yielded task by one step.
            for task in previous_tasks:
                if task.step():
                    running_tasks.append(task)

            previous_tasks = running_tasks
    except BaseException:
        # Clean up all in-flight generators to release GPU resources.
        for task in previous_tasks:
            task.close()
        raise
```
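The staggered admission can be traced with a self-contained toy. This minimally re-implements the scheduling loop (without `_Task` or logging) and uses illustrative stage names; note how chunk 1's "gather" is logged before chunk 0's "compute", which is the communication/computation overlap the scheduler creates:

```python
log = []


def chunk(i):
    log.append(f"gather{i}")   # launch async all-to-all gather
    yield
    log.append(f"compute{i}")  # wait + Newton-Schulz + launch scatter
    yield
    log.append(f"update{i}")   # wait + parameter update


def run_pipeline(pipelines, max_concurrent):
    # Minimal re-implementation of the scheduler: admit at most one new
    # generator per loop iteration, then advance every in-flight one.
    running, have_new = [], True
    while have_new or running:
        nxt = []
        if have_new and len(running) < max_concurrent:
            try:
                g = next(pipelines)
                next(g)          # run the new task to its first yield
                nxt.append(g)
            except StopIteration:
                have_new = False
        for g in running:
            try:
                next(g)
                nxt.append(g)
            except StopIteration:
                pass
        running = nxt


run_pipeline((chunk(i) for i in range(3)), max_concurrent=2)
print(log)
# gather1 appears before compute0: chunk 1's communication overlaps chunk 0's compute
```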
build/torch210-cxx11-cu126-x86_64-linux/core.py ADDED
@@ -0,0 +1,116 @@
```python
import math
from dataclasses import dataclass

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from torch.distributed.tensor import DTensor


@dataclass
class _muon_state:
    worker_rank: int
    process_group: ProcessGroup
    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
    rank_numels: dict[int, int]  # local_rank -> numel
    name: str
    qk_clip_state: torch.Tensor | None = None


def update_g(optimizer_state, p, g, group, momentum):
    """Apply momentum update to gradient.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        p: Parameter tensor.
        g: Gradient tensor.
        group: Parameter group dict.
        momentum: Momentum coefficient.

    Returns:
        Momentum-updated gradient tensor.
    """
    state = optimizer_state[p]
    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
    torch.add(g, buf, alpha=momentum, out=buf)
    if group["nesterov"]:
        g.add_(buf, alpha=momentum)
        return g
    return buf


def update_p(p, u, lr, adjusted_lr, weight_decay):
    """Apply weight decay and orthogonalized update to parameter.

    Args:
        p: Parameter (torch.nn.Parameter or DTensor).
        u: Orthogonalized update tensor.
        lr: Base learning rate.
        adjusted_lr: Size-adjusted learning rate.
        weight_decay: Weight decay coefficient.
    """
    if isinstance(p, torch.nn.Parameter):
        # apply weight decay
        p.data.mul_(1 - lr * weight_decay)
        # apply update
        p.data.add_(u, alpha=-adjusted_lr)
    else:
        p.mul_(1 - lr * weight_decay)
        p.add_(u, alpha=-adjusted_lr)


def adjust_lr_for_muon(lr, param_shape):
    """Scale learning rate based on parameter matrix dimensions.

    Args:
        lr: Base learning rate.
        param_shape: Shape of the parameter tensor.

    Returns:
        Adjusted learning rate.
    """
    A, B = param_shape[:2]
    # We adjust the learning rate and weight decay based on the size of the parameter matrix
    # as described in the paper
    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
    adjusted_lr = lr * adjusted_ratio
    return adjusted_lr


def default_is_muon(name, x, expert_keys=None):
    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
    if any(key in name for key in skip_keys):
        return False
    effective_ndim = x.ndim
    if expert_keys and any(key in name for key in expert_keys):
        effective_ndim -= 1
    return effective_ndim >= 2


def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
    if is_muon_func is None:
        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)

    muon_params, muon_names = [], []
    non_muon_params = []

    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue
        if is_muon_func(n, p):
            muon_params.append(p)
            muon_names.append(n)
        else:
            non_muon_params.append(p)

    return [
        {
            "params": muon_params,
            "names": muon_names,
            "use_muon": True,
        },
        {
            "params": non_muon_params,
            "use_muon": False,
        },
    ]
```
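The size-based scaling in `adjust_lr_for_muon` reduces to `lr * 0.2 * sqrt(max(A, B))`. A worked example (the 4096x1024 shape and base lr are illustrative values, not from this repo):

```python
import math


def adjust_lr_for_muon(lr, param_shape):
    # Same formula as above: scale by 0.2 * sqrt(max of the first two dims).
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))


# A 4096x1024 projection with base lr 0.02:
print(adjust_lr_for_muon(0.02, (4096, 1024)))  # ~0.256 (= 0.02 * 0.2 * 64)
```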
build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py CHANGED
@@ -7,22 +7,40 @@ from torch.distributed.tensor.placement_types import (Placement, Shard,
                                                       _StridedShard)
 
 
 def get_slices_of_dtensor(
     target: DTensor | torch.Tensor,
     local_rank: int,
     shard_mesh: DeviceMesh,
     shard_placements: tuple[Placement],
-) -> tuple[slice]:
     """
-    Get the slice of local tensor for a given rank from a tensor.
     Args:
-        target (DTensor | torch.Tensor): The target tensor.
-        rank (int): The local rank of the shard group.
-        shard_mesh (DeviceMesh): The shard mesh. It consists of global ranks.
         shard_placements (tuple[Placement]): The shard placements.
-    """
 
-    slices: list[slice] = [slice(0, dim_size) for dim_size in target.size()]
 
     # find the global rank of the local rank in the shard mesh
     rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
@@ -34,34 +52,75 @@ def get_slices_of_dtensor(
 
     assert len(rank_coords) == len(shard_placements)
 
     # Caution: Assuming replicate-to-shard of the shard mesh goes with
     # left-to-right sharding. This is ensured by the sorting logic of
     # construct_shard_mesh function.
-    for i, (rank_coord,
-            placement) in enumerate(zip(rank_coords, shard_placements)):
-        assert isinstance(placement, Shard)
 
-        num_ranks = shard_mesh.mesh.shape[i]
 
-        dim = placement.dim
-        dim_size = (slices[dim].stop - slices[dim].start)
 
-        if dim_size % num_ranks != 0:
             raise NotImplementedError(
-                f"Dimension size {dim_size} is not divisible "
-                f"by number of ranks {num_ranks} for shard "
-                f"placement on dim {dim}. (shape: {target.shape})")
-
-        shard_size = dim_size // num_ranks
-
-        start = slices[dim].start + rank_coord * shard_size
-        end = start + shard_size
-
-        assert start < end <= slices[dim].stop
-
-        slices[dim] = slice(start, end)
 
-    return tuple(slices)
 
 
 _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
@@ -71,105 +130,105 @@ _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
 
 def construct_shard_mesh(
     placements: tuple[Placement],
     mesh: DeviceMesh,
-) -> (DeviceMesh, ProcessGroup, tuple[Placement]):
-    """
-    Construct Shard Mesh and Placements for unsharding.
-    It removes Replicate placements and constructs a new Mesh and ProcessGroup.
-    """
-    my_rank = dist.get_rank()
 
-    assert mesh.mesh.device.type == 'cpu'
 
-    # Copy mesh to avoid modifying the original mesh
-    mesh = mesh.mesh.clone()
-
-    # 1. Sort placements. Replicate first, then Shard by dim ascending.
-
-    # For Shard, strided shard comes after regular shard on the same dim
-    # to preserve left-to-right order of replicate-to-shard.
-    # This is because that strided shard is using stride to represent
-    # more fine-grained sharding on the same dim.
-    # Please check the URL below for _StridedShard.
-    # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/distributed/tensor/placement_types.py#L366
-
-    def placement_sort_key(
-        placement_with_index: tuple[float, Placement]
-    ) -> tuple[int, float, int]:  # (dim, split factor, original index)
-        index, placement = placement_with_index
-        is_replicate = placement.is_replicate()
-        is_shard = placement.is_shard()
-        is_partial = placement.is_partial()
-
-        assert is_replicate or is_shard, f"Unsupported placement type: {type(placement)}"
-        assert not is_partial, "Partial placement is not supported."
-
-        if is_replicate:
-            return (-1.0, 0, index)
-        elif is_shard:
-            if isinstance(placement, _StridedShard):
-                return (placement.dim, 1 / placement.split_factor, index)
-            return (placement.dim, 0, index)
-        else:
-            raise TypeError(f"Unknown placement type: {type(placement)}")
 
-    placements_with_index: list[tuple[int,
-                                      Placement]] = list(enumerate(placements))
-    placements_with_index = sorted(placements_with_index,
-                                   key=placement_sort_key)
 
-    sorted_indices, sorted_placements = zip(*placements_with_index)
 
-    # 2. Permute mesh according to sorted placements.
-    sorted_mesh = mesh.permute(sorted_indices)
 
-    # 3. Collect list of shard meshes by removing replicate dims
-    # For example, (2, 3, 4, 4) with placements [R, R, S(0), S(1)]
-    # shard_meshes should be list with 2 * 3 = 6 shard meshes of shape (4, 4)
-    num_replicates = sum(1 for p in sorted_placements if p.is_replicate())
 
-    # merge replicate dims
-    # shard_meshes became a list of shard meshes with a length of replicate degree
-    if num_replicates > 0:
-        sorted_mesh = sorted_mesh.flatten(
-            0, num_replicates - 1) if num_replicates > 1 else sorted_mesh
         shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
     else:
         shard_meshes = [sorted_mesh]
-    shard_placements = sorted_placements[num_replicates:]
-
-    # assume all shard placements are different
     assert len(shard_placements) == len(set(shard_placements))
 
-    # 4. Construct ProcessGroups
-    # Caution: all groups should be created in the same order in all processes,
-    # even though each process only needs its own group.
-
-    # To use tensor as dict key, convert it to tuple
-    def tensor_to_tuple(t):
-        if isinstance(t, torch.Tensor):
-            t = t.tolist()
-        if isinstance(t, list):
-            return tuple(tensor_to_tuple(x) for x in t)
-        return t
-
-    my_shard_mesh_as_tuple = None
-    for shard_mesh in shard_meshes:
-        assert isinstance(shard_mesh, torch.Tensor)
-        shard_mesh_as_tuple = tensor_to_tuple(shard_mesh)
-
-        if (my_rank == shard_mesh).any().item():
-            assert my_shard_mesh_as_tuple is None
-            my_shard_mesh_as_tuple = shard_mesh_as_tuple
-
-        # update global cache
-        if shard_mesh_as_tuple not in _ranks_to_dist_cache:
-            shard_process_group = dist.new_group(shard_mesh.flatten().tolist())
-            _ranks_to_dist_cache[shard_mesh_as_tuple] = (
-                DeviceMesh(device_type="cuda", mesh=shard_mesh),
-                shard_process_group,
         )
 
-    my_shard_mesh, my_shard_process_group = _ranks_to_dist_cache[
-        my_shard_mesh_as_tuple]
-
-    return my_shard_mesh, my_shard_process_group, shard_placements
                                                       _StridedShard)
 
 
+def _is_shard(placement: Placement) -> bool:
+    """Check if a placement is a shard type (Shard or _StridedShard).
+
+    In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
+    ``placement.is_shard()`` returns False for _StridedShard. This helper
+    handles both old and new hierarchies.
+    """
+    return isinstance(placement, (Shard, _StridedShard))
+
+
 def get_slices_of_dtensor(
     target: DTensor | torch.Tensor,
     local_rank: int,
     shard_mesh: DeviceMesh,
     shard_placements: tuple[Placement],
+) -> tuple[slice | torch.Tensor, ...]:
     """
+    Get per-dimension indices for a given rank's shard of the target tensor.
+
+    Uses ``Shard.local_shard_size_and_offset`` and
+    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
+    both contiguous and strided (non-contiguous) sharding.
+
     Args:
+        target (DTensor | torch.Tensor): The target tensor (for its shape).
+        local_rank (int): The local rank within the shard group.
+        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
         shard_placements (tuple[Placement]): The shard placements.
 
+    Returns:
+        A tuple of indices (one per tensor dim). Each element is either:
+        - A ``slice`` (for contiguous or unsharded dims)
+        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
+    """
 
     # find the global rank of the local rank in the shard mesh
     rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
 
     assert len(rank_coords) == len(shard_placements)
 
+    # Track per-shard-dim indices.
+    # None means "not yet sharded on this dim".
+    dim_indices: dict[int, torch.Tensor] = {}
+
     # Caution: Assuming replicate-to-shard of the shard mesh goes with
     # left-to-right sharding. This is ensured by the sorting logic of
     # construct_shard_mesh function.
+    for mesh_dim_idx, (rank_coord, placement) in enumerate(
+            zip(rank_coords, shard_placements)):
+        assert _is_shard(placement)
 
+        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
+        shard_dim = placement.dim
 
+        # Current effective size on this dim (may already be sub-sharded)
+        if shard_dim in dim_indices:
+            curr_size = len(dim_indices[shard_dim])
+        else:
+            curr_size = target.size()[shard_dim]
 
+        if curr_size % num_chunks != 0:
             raise NotImplementedError(
+                f"Dimension size {curr_size} is not divisible "
+                f"by number of ranks {num_chunks} for shard "
+                f"placement on dim {shard_dim}. (shape: {target.shape})")
+
+        # Compute indices for this level of sharding
+        if isinstance(placement, _StridedShard):
+            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
+                placement,
+                curr_size,
+                num_chunks,
+                rank_coord,
+                return_first_offset=False)
+            new_indices = torch.tensor(offsets, dtype=torch.long)
+        else:
+            shard_size, offset = Shard.local_shard_size_and_offset(
+                curr_size, num_chunks, rank_coord)
+            new_indices = torch.arange(offset,
+                                       offset + shard_size,
+                                       dtype=torch.long)
+
+        # Compose with previous indices on this dim
+        if shard_dim in dim_indices:
+            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
+        else:
+            dim_indices[shard_dim] = new_indices
 
+    # Build result tuple
+    result: list[slice | torch.Tensor] = []
+    for d in range(len(target.size())):
+        if d not in dim_indices:
+            result.append(slice(None))
+        else:
+            indices = dim_indices[d]
+            # Convert contiguous indices to slice for efficiency
+            if len(indices) > 0:
+                start = indices[0].item()
+                expected = torch.arange(start,
+                                        start + len(indices),
+                                        dtype=torch.long)
+                if torch.equal(indices, expected):
+                    result.append(slice(start, start + len(indices)))
+                else:
+                    result.append(indices)
+            else:
+ result.append(slice(0, 0))
122
+
123
+ return tuple(result)
124
 
125
 
126
  _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
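For the common contiguous, evenly divisible case, the index composition above can be sketched in plain Python. The helper names here (`shard_slice`, `compose`) are illustrative only, not part of the module:

```python
def shard_slice(dim_size, num_chunks, chunk_idx):
    """Contiguous shard: chunk `chunk_idx` of `num_chunks` equal pieces."""
    assert dim_size % num_chunks == 0
    step = dim_size // num_chunks
    return slice(chunk_idx * step, (chunk_idx + 1) * step)

def compose(outer, inner):
    """Apply `inner` within the window already selected by `outer`."""
    return slice(outer.start + inner.start, outer.start + inner.stop)

# Dim of size 16: outer 2-way sharding picks rows 8..16 for coordinate 1,
# inner 2-way sharding then picks the first half of that window.
outer = shard_slice(16, 2, 1)   # slice(8, 16)
inner = shard_slice(8, 2, 0)    # slice(0, 4)
print(compose(outer, inner))    # slice(8, 12, None)
```

Strided (`_StridedShard`) levels cannot be composed this way, which is why the real function falls back to explicit index tensors for them.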
 
 def construct_shard_mesh(
     placements: tuple[Placement],
     mesh: DeviceMesh,
+) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
+    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
+
+    Given a DTensor's placements and device mesh, extracts the "shard group"
+    (the set of ranks that together hold all shards of the same replica)
+    and creates a ProcessGroup for all-to-all among them.
+
+    Steps:
+      1. Sort placements: Replicate first, then Shard by (dim, granularity).
+      2. Permute the mesh tensor to match the sorted order.
+      3. Collapse Replicate dims into a list of shard sub-meshes (one per
+         replica).
+      4. Create/retrieve a cached ProcessGroup for the current rank's
+         sub-mesh.
+
+    Example: 8 GPUs, mesh shape (2, 2, 2),
+    placements ``[Shard(0), Replicate, _StridedShard(0)]``::
+
+        Step 1  Sort: [Replicate, _StridedShard(0), Shard(0)]
+                Permutation: [1, 2, 0]
+
+        Step 2  Permute mesh dims by [1, 2, 0]:
+                Original:           Permuted:
+                [[[0,1],[2,3]],     [[[0,4],[1,5]],
+                 [[4,5],[6,7]]]      [[2,6],[3,7]]]
+
+        Step 3  Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
+                sub-mesh 0 = [[0,4],[1,5]]  (replica group 0)
+                sub-mesh 1 = [[2,6],[3,7]]  (replica group 1)
+                shard_placements = (_StridedShard(0), Shard(0))
+
+        Step 4  Rank 0 -> ProcessGroup([0,1,4,5])
+                Rank 2 -> ProcessGroup([2,3,6,7])
+
+    Returns:
+        ``(shard_mesh, process_group, shard_placements)``
+    """
+    my_rank = dist.get_rank()
+    assert mesh.mesh.device.type == 'cpu'
+
+    # -- Fast path: 1D all-shard mesh -> reuse existing PG. -----------------
+    # This avoids a non-collective dist.new_group() call, which would
+    # deadlock when only a subset of ranks call this function (e.g. expert
+    # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
+    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
+        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
+        if key not in _ranks_to_dist_cache:
+            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
+        return (*_ranks_to_dist_cache[key], tuple(placements))
+
+    mesh_tensor = mesh.mesh.clone()
+
+    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
+    # _StridedShard comes BEFORE regular Shard on the same dim so that
+    # get_slices_of_dtensor applies the outer sharding first, matching
+    # DTensor's left-to-right (outer-to-inner) composition order.
+    def _sort_key(item):
+        index, placement = item
+        assert not placement.is_partial(), "Partial placement not supported"
+        if placement.is_replicate():
+            return (-1, 0, index)
+        assert _is_shard(placement), f"Unsupported: {type(placement)}"
+        split = (-1 / placement.split_factor if isinstance(
+            placement, _StridedShard) else 0)
+        return (placement.dim, split, index)
+
+    indexed = sorted(enumerate(placements), key=_sort_key)
+    perm, sorted_placements = zip(*indexed)
+
+    # -- Step 2: Permute mesh to match sorted placement order. --------------
+    sorted_mesh = mesh_tensor.permute(perm)
+
+    # -- Step 3: Collapse replicate dims -> list of shard sub-meshes. -------
+    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] -> 6 sub-meshes of (4, 4)
+    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
+    if num_rep > 0:
+        if num_rep > 1:
+            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
         shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
     else:
         shard_meshes = [sorted_mesh]
+    shard_placements = sorted_placements[num_rep:]
 
     assert len(shard_placements) == len(set(shard_placements))
 
+    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
+    # All ranks must call dist.new_group in the same order, even though each
+    # rank only joins one group.
+    def _cache_key(t: torch.Tensor) -> tuple:
+        return (*t.shape, *t.flatten().tolist())
+
+    my_key = None
+    for sm in shard_meshes:
+        key = _cache_key(sm)
+        if (my_rank == sm).any().item():
+            assert my_key is None, "Rank appears in multiple shard groups"
+            my_key = key
+        if key not in _ranks_to_dist_cache:
+            pg = dist.new_group(sm.flatten().tolist())
+            _ranks_to_dist_cache[key] = (
+                DeviceMesh(device_type="cuda", mesh=sm),
+                pg,
+            )
 
+    return (*_ranks_to_dist_cache[my_key], shard_placements)
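The Step 1 sort key can be illustrated standalone. The placement classes below are minimal stand-ins for the real DTensor types, modeling only the fields the key reads; they are not the library's API:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class R:            # stand-in for Replicate
    pass

@dataclass(frozen=True)
class S:            # stand-in for Shard
    dim: int

@dataclass(frozen=True)
class SS:           # stand-in for _StridedShard
    dim: int
    split_factor: int

def sort_key(item):
    index, p = item
    if isinstance(p, R):
        return (-1, 0, index)            # Replicate sorts first
    # _StridedShard gets a negative tiebreaker, so it precedes Shard
    # on the same dim (outer-to-inner composition order)
    split = -1 / p.split_factor if isinstance(p, SS) else 0
    return (p.dim, split, index)

placements = [S(0), R(), SS(0, 2)]       # the docstring's example
indexed = sorted(enumerate(placements), key=sort_key)
perm = [i for i, _ in indexed]
print(perm)  # [1, 2, 0]
```

This reproduces the permutation `[1, 2, 0]` from the docstring example above.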
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -119,10 +119,3 @@ def matmul_transpose_assign(d_in, d_out):
     with torch.cuda.device(d_in.device.index):
         mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                          d_out.stride(0), d_out.stride(1))
-
-
-def matmul_transpose(d_in):
-    M, _ = d_in.shape
-    d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
-    matmul_transpose_assign(d_in, d_out)
-    return d_out
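The removed `matmul_transpose` wrapper simply allocated an `(M, M)` output and filled it with `d_in @ d_in.T`. A stdlib-only sketch of that product (list-of-lists in place of tensors):

```python
def matmul_transpose(x):
    """Return x @ x.T for a list-of-lists matrix."""
    return [[sum(a * b for a, b in zip(ri, rj)) for rj in x] for ri in x]

print(matmul_transpose([[1, 2], [3, 4]]))  # [[5, 11], [11, 25]]
```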
build/torch210-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1 +1,3 @@
-{"python-depends":[]}
+{
+  "python-depends": []
+}
build/torch210-cxx11-cu126-x86_64-linux/muon.py CHANGED
@@ -1,536 +1,121 @@
 import logging
-import math
 import types
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import Any, cast
 
 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor, Replicate
-from torch.distributed.tensor.placement_types import Placement
-
-from .distributed.utils import construct_shard_mesh, get_slices_of_dtensor
-from .matmul_transpose_triton import matmul_transpose_assign
 
 logger = logging.getLogger(__name__)
 
-COMM_DTYPE = torch.bfloat16
-DEFAULT_CHUNK_SIZE_RATIO = 4
-
-
-# This code snippet is a modified version adapted from the following GitHub
-# repository: https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton-Schulz iteration causes high variance in singular values.
-# Idea: give each iteration its own 3 coefficients and optimize them via
-# gradient descent.
-# matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
-@torch.no_grad()
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization
-    of G. We opt to use a quintic iteration whose coefficients are selected
-    to maximize the slope at zero. For the purpose of minimizing steps, it
-    turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all
-    the way to one everywhere on the interval. This iteration therefore does
-    not produce UV^T but rather something like US'V^T where S' is diagonal
-    with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    X = G  # no manual typecast
-
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    # Perform the NS iterations
-    for a, b, c in [
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    ]:
-        matmul_transpose_assign(X, buf1)
-        matmul_transpose_assign(buf1, buf2)
-        buf1.mul_(b).add_(buf2, alpha=c)
-        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X
-
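The quintic update acts independently on each singular value of the normalized input. Tracking one scalar singular value through the five coefficient triples shows how both tiny and near-1 values are pushed toward 1 (a pure-Python sketch of the math, not the tensor code above):

```python
COEFFS = [(4.0848, -6.8946, 2.9270),
          (3.9505, -6.3029, 2.6377),
          (3.7418, -5.5913, 2.3037),
          (2.8769, -3.1427, 1.2046),
          (2.8366, -3.0525, 1.2012)]

def ns_singular_value(s):
    """Apply the quintic update s -> a*s + b*s**3 + c*s**5 five times."""
    for a, b, c in COEFFS:
        s = a * s + b * s**3 + c * s**5
    return s

# Per the docstring, the result does not converge exactly to 1 but lands
# roughly in (0.5, 1.5), which is good enough for the optimizer.
for s0 in (0.05, 0.9):
    assert 0.5 < ns_singular_value(s0) < 1.5
```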
-
-@dataclass
-class _muon_state:
-    # TODO: use Optional
-    worker_rank: int
-    process_group: ProcessGroup
-    shard_mesh: DeviceMesh
-    shard_placements: tuple[Placement, ...]
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-    gathered_grad: torch.Tensor | None = None
-    scattered_u: DTensor | None = None
-    computed_u: torch.Tensor | None = None
-    gather_event: torch.cuda.Event | None = None
-    compute_event: torch.cuda.Event | None = None
-    scatter_event: torch.cuda.Event | None = None
-
-
-def numel_for_rank(
-    param: DTensor,
-    local_rank: int,
-    state: _muon_state,
-) -> int:
-    slices = get_slices_of_dtensor(
-        param,
-        local_rank,
-        state.shard_mesh,
-        state.shard_placements,
-    )
-
-    numel = 1
-    for s, dim in zip(slices, param.shape):
-        start, stop, step = s.indices(dim)
-        length = max(0, (stop - start + (step - 1)) // step)
-        numel *= length
-
-    return numel
-
-
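The slice-to-element-count arithmetic in the removed `numel_for_rank` stands alone; a stdlib-only sketch (the helper name is illustrative):

```python
def numel_from_slices(slices, shape):
    """Element count selected by per-dimension slices of `shape`."""
    numel = 1
    for s, dim in zip(slices, shape):
        # slice.indices(dim) resolves None/negative bounds against dim
        start, stop, step = s.indices(dim)
        numel *= max(0, (stop - start + (step - 1)) // step)
    return numel

# a rank owning rows 4..8 of a (16, 32) weight, all columns
print(numel_from_slices((slice(4, 8), slice(None)), (16, 32)))  # 128
```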
-@torch.no_grad()
-def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
-    """
-    Pre-allocate the gathered_grad buffer on compute_stream
-    before launching the all2all gather.
-    """
-    with torch.cuda.stream(compute_stream):
-        for p in params:
-            state = param_to_state[id(p)]
-            if rank == state.worker_rank:
-                state.gathered_grad = torch.empty(p.shape,
-                                                  dtype=COMM_DTYPE,
-                                                  device="cuda")
-            else:
-                state.gathered_grad = None
-
-    alloc_event = torch.cuda.Event()
-    alloc_event.record(compute_stream)
-    return alloc_event
-
-
-@torch.no_grad()
-def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
-                    alloc_event):
-    """
-    All2all-gathers shards so each owner rank reconstructs its full gradient.
-    """
-    with torch.cuda.stream(comm_stream):
-        process_group = param_to_state[id(params[0])].process_group
-        num_ranks = dist.get_world_size(group=process_group)
-
-        # Construct sending buffers
-        per_dst = [[] for _ in range(num_ranks)]
-        send_counts = [0] * num_ranks
-
-        for p in params:
-            state = param_to_state[id(p)]
-            dst = state.worker_rank
-            assert dst < num_ranks
-            shard_elems = numel_for_rank(p, rank, state)
-            g = p.grad
-            g = g.to_local().to(COMM_DTYPE).contiguous()
-            assert g.numel() == shard_elems
-            per_dst[dst].append(g.view(-1))
-            send_counts[dst] += shard_elems
-
-        assert any(
-            len(v) > 0 for v in per_dst
-        ), "At least one destination rank must receive a sharded tensor"
-        # list[list[Tensor]] -> list[Tensor]
-        per_dst = [t for dst in per_dst for t in dst]
-
-        send_buf = torch.cat(per_dst, dim=0)
-
-        owned_params = [
-            p for p in params if param_to_state[id(p)].worker_rank == rank
-        ]
-
-        # Compute receive sizes and allocate receiving buffers
-        recv_counts = [0] * num_ranks
-
-        for src in range(num_ranks):
-            total = 0
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                assert state.worker_rank == rank
-                total += numel_for_rank(p, src, state)
-            recv_counts[src] = total
-
-        recv_total = sum(recv_counts)
-        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-        # All2All
-        logger.debug(f"send_buf size: {send_buf.numel()}, "
-                     f"recv_buf size: {recv_buf.numel()}, "
-                     f"recv_counts: {recv_counts}, "
-                     f"send_counts: {send_counts}, "
-                     f"process_group: {str(process_group)}")
-        dist.all_to_all_single(
-            recv_buf,
-            send_buf,
-            output_split_sizes=recv_counts,
-            input_split_sizes=send_counts,
-            group=process_group,
-        )
-
-        # Reconstruct the gathered grad from the received buffer
-        #
-        # recv_buf (num ranks = 3)
-        #
-        #   From rank 0         From rank 1         From rank 2
-        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
-        #
-        # Outer loop:
-        #   rank 0 -> rank 1 -> rank 2
-        #
-        # Inner loop:
-        #   p1_n -> p2_n -> p3_n
-
-        comm_stream.wait_event(alloc_event)
-
-        off = 0
-        for src in range(num_ranks):
-            if recv_counts[src] == 0:
-                continue
-
-            block = recv_counts[src]
-            inner_off = 0
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                assert state.worker_rank == rank
-
-                # get the slice of the full dtensor corresponding to rank src.
-                slices = get_slices_of_dtensor(state.gathered_grad, src,
-                                               state.shard_mesh,
-                                               state.shard_placements)
-
-                dst = state.gathered_grad[slices]
-                assert dst._base is state.gathered_grad
-
-                n = dst.numel()
-                assert n > 0
-
-                sg = recv_buf.narrow(0, off + inner_off, n)
-                sg = sg.reshape_as(dst)
-                dst.copy_(sg)
-
-                inner_off += n
-            off += block
-
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank == rank:
-                state.gather_event = torch.cuda.Event()
-                state.gather_event.record(comm_stream)
-            else:
-                state.gathered_grad = None
-                state.gather_event = None
-            if none_grad:
-                p.grad = None
-
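The two nested offsets that walk `recv_buf` in the comment above can be sketched with plain lists (`split_recv_buf` is an illustrative name, not part of the module):

```python
def split_recv_buf(recv_buf, recv_counts, shard_numels):
    """shard_numels[src] lists per-parameter element counts received
    from rank src; recv_counts[src] is their sum (the outer block)."""
    out, off = [], 0
    for src, block in enumerate(recv_counts):
        shards, inner_off = [], 0
        for n in shard_numels[src]:
            shards.append(recv_buf[off + inner_off:off + inner_off + n])
            inner_off += n
        assert inner_off == block
        out.append(shards)
        off += block
    return out

buf = list(range(9))  # 3 ranks, 3 elements from each
print(split_recv_buf(buf, [3, 3, 3], [[1, 2], [3], [1, 2]]))
# [[[0], [1, 2]], [[3, 4, 5]], [[6], [7, 8]]]
```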
-@torch.no_grad()
-def _compute_u(p, state, steps, rank, compute_stream):
-    """
-    On worker_rank, compute the orthogonalized update using the
-    Newton-Schulz iteration.
-    """
-    with torch.cuda.stream(compute_stream):
-        if rank == state.worker_rank:
-            if state.gather_event is None:
-                raise RuntimeError("Gather event must be set before compute.")
-            compute_stream.wait_event(state.gather_event)
-            u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
-            state.gathered_grad = None
-            state.computed_u = u
-            state.compute_event = torch.cuda.Event()
-            state.compute_event.record()
-        else:
-            state.computed_u = None
-            state.compute_event = None
-
-
-@torch.no_grad()
-def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
-    """
-    Pre-allocate the scattered_u buffer on compute_stream
-    before launching the all2all scatter.
-    """
-    with torch.cuda.stream(compute_stream):
-        for p in params:
-            state = param_to_state[id(p)]
-            state.scattered_u = torch.empty_like(p.to_local(),
-                                                 dtype=COMM_DTYPE)
-
-    alloc_event = torch.cuda.Event()
-    alloc_event.record(compute_stream)
-    return alloc_event
-
-def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
-    """
-    All2all-scatters the computed updates back to all ranks.
-    """
-    with torch.cuda.stream(comm_stream):
-        process_group = param_to_state[id(params[0])].process_group
-        num_ranks = dist.get_world_size(group=process_group)
-        owned_params = [
-            p for p in params if param_to_state[id(p)].worker_rank == rank
-        ]
-
-        # Construct sending buffer
-        per_dst = [[] for _ in range(num_ranks)]
-        send_counts = [0] * num_ranks
-
-        if owned_params:
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                if state.compute_event is None:
-                    raise RuntimeError(
-                        "Compute event must be set before scatter.")
-                comm_stream.wait_event(state.compute_event)
-                state.gathered_grad = None
-
-                assert state.computed_u is not None
-
-                u_full = state.computed_u.to(COMM_DTYPE).contiguous()
-
-                offset = 0
-                for dst in range(num_ranks):
-                    # get the slice of the full tensor corresponding to
-                    # rank dst.
-                    slices = get_slices_of_dtensor(u_full, dst,
                                                    state.shard_mesh,
                                                    state.shard_placements)
-                    su = u_full[slices].flatten()
-
-                    n = su.numel()
-                    assert n > 0
-
-                    per_dst[dst].append(su)
-                    send_counts[dst] += n
-                    offset += n
-
-                assert offset == u_full.numel()
-
-        lengths = [len(v) for v in per_dst]
-        if all(l > 0 for l in lengths):
-            assert all(
-                l == lengths[0] for l in lengths
-            ), "All destination ranks must have the same number of sharded tensors"
-            # list[list[Tensor]] -> list[Tensor]
-            per_dst = [t for dst in per_dst for t in dst]
-            send_buf = torch.cat(per_dst, dim=0)
-        else:
-            # all_to_all requires participation from all ranks
-            # Even non-owner ranks must join the collective call
-            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-        # Compute receive sizes and allocate receiving buffers
-        recv_counts = [0] * num_ranks
-
-        for src in range(num_ranks):
-            total = 0
-            for p in params:
-                state = param_to_state[id(p)]
-                if state.worker_rank != src:
-                    continue
-                total += numel_for_rank(p, rank, state)
-            recv_counts[src] = total
-
-        recv_total = sum(recv_counts)
-        assert recv_total > 0
-        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-        # All2All
-        dist.all_to_all_single(
-            recv_buf,
-            send_buf,
-            output_split_sizes=recv_counts,
-            input_split_sizes=send_counts,
-            group=process_group,
-        )
-
-        # Copy to the pre-allocated scattered_u buffer from the received
-        # buffer
-        #
-        # recv_buf (num ranks = 3, local_rank = 0)
-        #
-        #   From rank 0         From rank 1   From rank 2
-        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
-        #
-        # Outer loop:
-        #   rank 0 -> rank 1 -> rank 2
-        #
-        # Inner loop:
-        #   src(0): p1_0 -> p2_0 -> p3_0
-        #   src(1): p4_0
-        #   src(2): p5_0 -> p6_0
-
-        comm_stream.wait_event(alloc_event)
-
-        off = 0
-        for src in range(num_ranks):
-            block = recv_counts[src]
-            if block == 0:
-                continue
-
-            inner_off = 0
-            for p in params:
-                state = param_to_state[id(p)]
-                if state.worker_rank != src:
-                    continue
-                n = numel_for_rank(p, rank, state)
-                assert n > 0
-
-                flat_local = recv_buf.narrow(0, off + inner_off,
-                                             n).view_as(p.to_local())
-                state.scattered_u.copy_(flat_local)
-
-                state.scatter_event = torch.cuda.Event()
-                state.scatter_event.record(comm_stream)
-                inner_off += n
-
-            assert inner_off == block
-            off += block
 
 
-def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
-                  compute_stream):
-    """
-    Update sharded parameter p with the scattered_u.
-    Only worker_rank frees computed_u.
-    """
-    with torch.cuda.stream(compute_stream):
-        if state.scatter_event is None:
-            raise RuntimeError("Scatter event must be set before update")
-        compute_stream.wait_event(state.scatter_event)
-        u_dtensor = DTensor.from_local(
-            state.scattered_u,
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-
-        state.scattered_u = u_dtensor
-
-        if rank == state.worker_rank:
-            # Free computed_u
-            state.computed_u = None
-
-        Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
-        state.scattered_u = None
-        u_dtensor = None
-
-        scales_full = Muon._compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            # Have to slice scales_full along dim 0
-            weight_slices = get_slices_of_dtensor(p, rank, state.shard_mesh,
-                                                  state.shard_placements)
-            ratio = p.shape[0] // scales_full.shape[0]
-            scales_slice = slice(
-                None if weight_slices[0].start is None else
-                weight_slices[0].start // ratio,
-                None if weight_slices[0].stop is None else
-                weight_slices[0].stop // ratio,
-                None,
-            )
-
-            scales_local = scales_full[scales_slice]
-            scales_local = DTensor.from_local(
-                scales_local,
-                placements=p.placements,
-                device_mesh=p.device_mesh,
-            )
-            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
-
- def default_is_muon(name, x):
463
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
464
- return x.ndim >= 2 and not any(key in name for key in skip_keys)
465
-
466
-
467
- def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
468
- muon_params, muon_names = [], []
469
- non_muon_params = []
470
-
471
- for n, p in model.named_parameters():
472
- if not p.requires_grad:
473
  continue
474
- if is_muon_func(n, p):
475
- muon_params.append(p)
476
- muon_names.append(n)
477
- else:
478
- non_muon_params.append(p)
479
-
480
- return [
481
- {
482
- "params": muon_params,
483
- "names": muon_names,
484
- "use_muon": True,
485
- },
486
- {
487
- "params": non_muon_params,
488
- "use_muon": False,
489
- },
490
- ]
491
-
492
-
493
- def parse_qk_layer(name: str) -> tuple[str | None, int]:
494
- """
495
- Parse a parameter name to check if it is a query/key projection layer
496
- ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
497
-
498
- Returns:
499
- (kind, layer_idx) or (None, -1) if not matched.
500
-
501
- Example:
502
- 'model.3.attn.wq.weight' -> ('wq', 3)
503
- 'model.5.attn.wk.weight' -> ('wk', 5)
504
- 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
505
- 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
506
- 'model.4.attn.v_proj.weight' -> (None, -1)
507
- """
508
- parts = name.split('.')
509
- if len(parts) < 3:
510
- return None, -1
511
-
512
- kind = parts[-2]
513
-
514
- layer_idx = -1
515
- for part in reversed(parts):
516
- if part.isdigit():
517
- layer_idx = int(part)
518
- break
519
 
520
- if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
521
- return kind, layer_idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- return None, -1
 
524
 
 
525
 
526
- @dataclass
527
- class QKClipInfo:
528
- """Per-parameter dynamic info computed from config + runtime logits."""
529
- kind: str | None # 'wq'/'q_proj' or 'wk'/'k_proj' or None
530
- indices: list[int] # which heads to consider for clipping
531
- head_dim: int # from config
532
- threshold: float # from config
533
- logit: torch.Tensor | None
534
 
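The removed `parse_qk_layer` is self-contained string logic and can be exercised directly; this sketch copies that logic, stdlib only:

```python
def parse_qk_layer(name):
    """Return (kind, layer_idx) for q/k projection weights, else (None, -1)."""
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]                 # module name right before 'weight'
    layer_idx = -1
    for part in reversed(parts):     # last all-digit component is the layer
        if part.isdigit():
            layer_idx = int(part)
            break
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1

print(parse_qk_layer('model.3.attn.wq.weight'))      # ('wq', 3)
print(parse_qk_layer('model.4.attn.v_proj.weight'))  # (None, -1)
```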
535
 
536
 class Muon(torch.optim.Optimizer):
@@ -554,7 +139,7 @@ class Muon(torch.optim.Optimizer):
         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
         weight_decay: The weight decay for Muon and AdamW.
-            Parameters that are {0, 1}-D or detected as the embed or lm_head will be optimized by AdamW as well.
         adamw_lr: The learning rate for the internal AdamW.
         adamw_betas: The betas for the internal AdamW.
         adamw_eps: The epsilon for the internal AdamW.
@@ -564,7 +149,7 @@ class Muon(torch.optim.Optimizer):
             - "q_indices" (list[int]): Indices of query heads to consider.
             - "k_indices" (list[int]): Indices of key heads to consider.
             - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
              this value will be scaled down.
            Default is:
            {
@@ -584,6 +169,13 @@ class Muon(torch.optim.Optimizer):
         use_distributed_muon: Use distributed muon by Liu et al. (2024).
             For testing purposes only.
         small_param_numel_threshold: Threshold for classifying parameters as
             small and falling back to distributed Muon.
     """
 
     def __init__(self,
@@ -597,16 +189,12 @@ class Muon(torch.optim.Optimizer):
                 adamw_eps=1e-8,
                 none_grad=True,
                 debug=False,
-                clip_config={
-                    "q_indices": [],
-                    "k_indices": [],
-                    "head_dim": 128,
-                    "threshold": 100
-                },
                 warmup_step=5,
                 chunk_size=-1,
                 use_distributed_muon=False,
-                small_param_numel_threshold=65536):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -630,16 +218,18 @@ class Muon(torch.optim.Optimizer):
 
         super().__init__(params, defaults)
 
-        self.rank = None
-
-        self.comm_stream = torch.cuda.Stream()
-        self.compute_stream = torch.cuda.Stream()
         self.debug = debug
-        self.clip_config = clip_config
         self.warmup_step = warmup_step
         self.chunk_size = chunk_size
         self.use_distributed_muon = use_distributed_muon
         self.small_param_numel_threshold = small_param_numel_threshold
 
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -649,20 +239,6 @@ class Muon(torch.optim.Optimizer):
 
         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
 
-    def adjust_lr_for_muon(self, lr, param_shape):
-        A, B = param_shape[:2]
-        # We adjust the learning rate and weight decay based on the size of
-        # the parameter matrix as described in the paper
-        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-        adjusted_lr = lr * adjusted_ratio
-        return adjusted_lr
-
-    def set_rank_once(self, rank):
-        if self.rank is None:
-            self.rank = rank
-        else:
-            assert self.rank == rank
-
     def get_shard_mesh(self, p):
         """
         Get the shard mesh for a parameter p on the given rank.
@@ -673,9 +249,6 @@ class Muon(torch.optim.Optimizer):
         shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
             p.placements, p.device_mesh)
 
-        # set rank with the local rank in the shard process group
-        self.set_rank_once(dist.get_rank(group=shard_pg))
-
         return shard_mesh, shard_pg, shard_placements
 
     def init_state_and_assign_params(self, names, params, group, qk_logits):
694
  total_flops += flops
695
 
696
  if self.debug:
697
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
698
- flush=True)
699
 
700
  paired = list(zip(names, params))
701
 
@@ -724,44 +297,54 @@ class Muon(torch.optim.Optimizer):
 
             worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
             round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
 
             param_to_state[id(p)] = _muon_state(
                 worker_rank=worker_rank,
                 process_group=shard_pg,
-                shard_mesh=shard_mesh,
-                shard_placements=shard_placements,
                 name=n,
                 qk_clip_state=qk_clip_state,
             )
 
         return param_to_state, ordered_params
 
-    def base(self, names, params, group, lr, weight_decay, momentum,
-             qk_logits):
-        # generate weight updates in distributed fashion
         for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-            assert g is not None
-
-            g = self._update_g(p, g, group, momentum)
 
             u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
                                              steps=group["ns_steps"])
 
-            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-            Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
 
-            scales_full = self._compute_scales(
                 p, qk_clip_state) if qk_clip_state is not None else None
             if scales_full is not None:
-                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
 
     def distributed_muon(
         self,
@@ -770,20 +353,15 @@ class Muon(torch.optim.Optimizer):
         group: dict[str, Any],
         lr: float,
         weight_decay: float,
-        momentum: float,
         qk_logits: list[torch.Tensor | DTensor] | None,
     ):
         """ Implementation of Distributed Muon by Liu et al. """
 
         for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-            assert g is not None
-
-            g = self._update_g(p, g, group, momentum)
 
             # Gather G
             if isinstance(p.data, DTensor):
@@ -796,16 +374,16 @@ class Muon(torch.optim.Optimizer):
             u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
                                                   steps=group["ns_steps"])
 
-            adjusted_lr = self.adjust_lr_for_muon(lr, p_full.shape)
-            Muon._update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
 
-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
 
-            scales_full = self._compute_scales(
                 p_full, qk_clip_state) if qk_clip_state is not None else None
 
             if scales_full is not None:
-                Muon._qk_clip(p_full, scales_full, qk_clip_state.head_dim)
 
             if isinstance(p.data, DTensor):
                 ndims = len(p.device_mesh.mesh.shape)
@@ -822,244 +400,53 @@ class Muon(torch.optim.Optimizer):
 
             p.copy_(p_sharded)
 
-    def _update_g(self, p, g, group, momentum):
-        # calc update
-        state = self.state[p]
-        buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
-        torch.add(g, buf, alpha=momentum, out=buf)
-        if group["nesterov"]:
-            g.add_(buf, alpha=momentum)
-            return g
-        return buf
-
-    @staticmethod
-    def _update_p(p, u, lr, adjusted_lr, weight_decay):
-        if isinstance(p, torch.nn.Parameter):
-            # apply weight decay
-            p.data.mul_(1 - lr * weight_decay)
-            # apply update
-            p.data.add_(u, alpha=-adjusted_lr)
-        else:
-            p.mul_(1 - lr * weight_decay)
-            p.add_(u, alpha=-adjusted_lr)
-
-    def get_qk_clip_info(self, n, qk_logits):
-        if self.clip_config is None:
-            return None
-
-        head_dim = self.clip_config.get('head_dim')
-        threshold = self.clip_config.get('threshold')
-        kind, layer_idx = parse_qk_layer(n)
-
-        logit, indices = None, []
-        if qk_logits is not None and kind is not None:
-            logit = qk_logits[layer_idx]
-            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
-            indices = self.clip_config.get(indices_key, []) or []
-
-        if isinstance(logit, DTensor):
-            # In TP settings, qk_logits may be a DTensor.
-            # We convert it to a full tensor here for simplicity.
-            logit = logit.full_tensor()
-
-        return QKClipInfo(
-            kind=kind,
-            indices=indices,
-            head_dim=head_dim,
-            threshold=threshold,
-            logit=logit,
-        )
-
-    @staticmethod
-    def _compute_scales(p, qk_clip_state):
-        kind = qk_clip_state.kind
-        indices = qk_clip_state.indices
-        head_dim = qk_clip_state.head_dim
-        threshold = qk_clip_state.threshold
-        logit = qk_clip_state.logit
-
-        H_global = p.shape[0] // head_dim
-        scales_full = torch.ones(H_global, device=p.data.device)
-        scaling = 0
-
-        for logit_idx, head_idx in enumerate(indices):
-            v_ele = float(logit[logit_idx])
-            if v_ele > threshold:
-                new_scale = math.sqrt(threshold / v_ele)
-                if new_scale < scales_full[head_idx]:
-                    scales_full[head_idx] = new_scale
-                    logger.info(
-                        f"[{kind}] Head {head_idx} exceeded threshold "
-                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) "
-                        f"-> applying scale={new_scale:.4f}")
-                    scaling += 1
-
-        return scales_full if scaling > 0 else None
-
-    @staticmethod
-    def _qk_clip(p, scales, head_dim):
-        if isinstance(p, torch.nn.Parameter):
-            W = p.data.view(-1, head_dim, p.data.shape[1])
-        else:
-            W = p.view(-1, head_dim, p.shape[1])
-        W.mul_(scales.view(-1, 1, 1))
-
-    def parallel(self, names, params, group, lr, weight_decay, momentum,
-                 qk_logits):
-        """
-        Perform a parallel optimization step using Muon.
-        """
-
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-
-            # Update g in the local rank
922
- g = self._update_g(
923
- p,
924
- g,
925
- group,
926
- momentum=momentum,
927
- )
928
- p.grad = g
929
 
930
  param_to_state, ordered_params = self.init_state_and_assign_params(
931
  names, params, group, qk_logits)
932
 
933
- assert self.rank is not None
934
-
935
- def enqueue_all2all_gather(start_idx, chunk_size):
936
- target_params = ordered_params[start_idx:start_idx + chunk_size]
937
- if target_params:
938
- alloc_event = _alloc_gathered_grad(target_params,
939
- param_to_state, self.rank,
940
- self.compute_stream)
941
- _all2all_gather(target_params, param_to_state, self.rank,
942
- self.comm_stream, group["none_grad"],
943
- alloc_event)
944
-
945
- def enqueue_computes(start_idx, chunk_size):
946
- for p in ordered_params[start_idx:start_idx + chunk_size]:
947
- state = param_to_state[id(p)]
948
- _compute_u(p, state, group["ns_steps"], self.rank,
949
- self.compute_stream)
950
-
951
- def enqueue_all2all_scatter(start_idx, chunk_size):
952
- target_params = ordered_params[start_idx:start_idx + chunk_size]
953
- if target_params:
954
- alloc_event = _alloc_scattered_u(target_params, param_to_state,
955
- self.rank,
956
- self.compute_stream)
957
- _all2all_scatter(target_params, param_to_state, self.rank,
958
- self.comm_stream, alloc_event)
959
-
960
- def enqueue_update_param(start_idx, chunk_size):
961
- for p in ordered_params[start_idx:start_idx + chunk_size]:
962
- state = param_to_state[id(p)]
963
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
964
- _update_param(p, state, lr, adjusted_lr, weight_decay,
965
- self.rank, self.compute_stream)
966
 
967
  if self.chunk_size == -1:
968
  shard_ranks = dist.get_world_size(param_to_state[id(
969
- params[0])].process_group)
970
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
971
  elif self.chunk_size > 0:
972
  chunk_size = self.chunk_size
973
  else:
974
  raise ValueError("chunk_size must be -1 or a positive integer.")
975
 
976
- # Wait grad update
977
- self.comm_stream.wait_stream(torch.cuda.current_stream())
978
-
979
- warmup_step = self.warmup_step
980
- for i in range(0, warmup_step):
981
- enqueue_all2all_gather(i * chunk_size, chunk_size)
982
- enqueue_computes(i * chunk_size, chunk_size)
983
-
984
- for i in range(0, len(params) + chunk_size - 1, chunk_size):
985
- enqueue_all2all_scatter(i, chunk_size)
986
- enqueue_all2all_gather(i + warmup_step * chunk_size, chunk_size)
987
- enqueue_update_param(i, chunk_size)
988
- enqueue_computes(i + warmup_step * chunk_size, chunk_size)
989
-
990
- # Wait the last update_param to finish
991
- torch.cuda.current_stream().wait_stream(self.compute_stream)
992
-
993
- @staticmethod
994
- def _fused_adamw(
995
- params: list[torch.Tensor],
996
- grads: list[torch.Tensor],
997
- exp_avgs: list[torch.Tensor],
998
- exp_avg_sqs: list[torch.Tensor],
999
- max_exp_avg_sqs: list[torch.Tensor],
1000
- state_steps: list[torch.Tensor],
1001
- amsgrad: bool,
1002
- beta1: float,
1003
- beta2: float,
1004
- lr: float | torch.Tensor,
1005
- weight_decay: float,
1006
- eps: float,
1007
- maximize: bool,
1008
- ) -> None:
1009
- if not params:
1010
- return
1011
 
1012
- # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
1013
- # treating it as a scalar.
1014
- lr_dict: DeviceDict | None = ({
1015
- lr.device: lr
1016
- } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
1017
- None)
1018
- grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
1019
- [
1020
- params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
1021
- state_steps
1022
- ] # type: ignore[list-item]
1023
- )
1024
- for (device, _), (
1025
- (
1026
- device_params_,
1027
- device_grads_,
1028
- device_exp_avgs_,
1029
- device_exp_avg_sqs_,
1030
- device_max_exp_avg_sqs,
1031
- device_state_steps_,
1032
- ),
1033
- _,
1034
- ) in grouped_tensors.items():
1035
- device_params = cast(list[torch.Tensor], device_params_)
1036
- device_grads = cast(list[torch.Tensor], device_grads_)
1037
- device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
1038
- device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
1039
- device_state_steps = cast(list[torch.Tensor], device_state_steps_)
1040
-
1041
- if lr_dict is not None and device not in lr_dict:
1042
- lr_dict[device] = lr.to(
1043
- device=device,
1044
- non_blocking=True) # type: ignore[union-attr]
1045
- lr = lr_dict[device]
1046
- torch._foreach_add_(device_state_steps, 1)
1047
- func = torch._fused_adamw_
1048
- func(
1049
- device_params,
1050
- device_grads,
1051
- device_exp_avgs,
1052
- device_exp_avg_sqs,
1053
- device_max_exp_avg_sqs, # type: ignore[arg-type]
1054
- device_state_steps,
1055
- amsgrad=amsgrad,
1056
- lr=lr, # type: ignore[arg-type]
1057
- beta1=beta1,
1058
- beta2=beta2,
1059
- weight_decay=weight_decay,
1060
- eps=eps,
1061
- maximize=maximize,
1062
- )
1063
 
1064
  def _step_muon(self, group, qk_logits=None):
1065
  params = group["params"]
@@ -1068,6 +455,18 @@ class Muon(torch.optim.Optimizer):
1068
  momentum = group["momentum"]
1069
  names = group["names"]
1070
 
1071
  param_dtensors = []
1072
  name_dtensors = []
1073
 
@@ -1083,7 +482,6 @@ class Muon(torch.optim.Optimizer):
1083
  group=group,
1084
  lr=lr,
1085
  weight_decay=weight_decay,
1086
- momentum=momentum,
1087
  qk_logits=qk_logits)
1088
  return
1089
 
@@ -1119,7 +517,6 @@ class Muon(torch.optim.Optimizer):
1119
  # and run parallel Muon on each group.
1120
 
1121
  placement_to_params = defaultdict(lambda: ([], []))
1122
- # type: dict[tuple[Placement, DeviceMesh], tuple[list[str], list[DTensor]]]
1123
 
1124
  assert len(dtensors) == len(names)
1125
  for p, n in zip(dtensors, names):
@@ -1141,7 +538,6 @@ class Muon(torch.optim.Optimizer):
1141
  group=group,
1142
  lr=lr,
1143
  weight_decay=weight_decay,
1144
- momentum=momentum,
1145
  qk_logits=qk_logits,
1146
  )
1147
 
@@ -1159,7 +555,6 @@ class Muon(torch.optim.Optimizer):
1159
  group,
1160
  lr=lr,
1161
  weight_decay=weight_decay,
1162
- momentum=momentum,
1163
  qk_logits=qk_logits,
1164
  )
1165
 
@@ -1170,78 +565,9 @@ class Muon(torch.optim.Optimizer):
1170
  group,
1171
  lr=lr,
1172
  weight_decay=weight_decay,
1173
- momentum=momentum,
1174
  qk_logits=qk_logits,
1175
  )
1176
 
1177
- def _step_adamw_params(self, params, group):
1178
- params_with_grads = []
1179
- grads = []
1180
- moment1 = []
1181
- moment2 = []
1182
- max_exp_avg_sqs = []
1183
- state_steps = []
1184
- lr = group["lr"]
1185
- beta1, beta2 = group["adamw_betas"]
1186
- eps = group["adamw_eps"]
1187
- weight_decay = group["weight_decay"]
1188
-
1189
- for p in params:
1190
- g = p.grad
1191
- if g is None:
1192
- continue
1193
- state = self.state[p]
1194
- params_with_grads.append(p)
1195
- grads.append(g)
1196
- if "step" not in state:
1197
- state["step"] = (torch.zeros((),
1198
- dtype=torch.float32,
1199
- device=p.device))
1200
- state["moment1"] = torch.zeros_like(g)
1201
- state["moment2"] = torch.zeros_like(g)
1202
- moment1.append(state["moment1"])
1203
- moment2.append(state["moment2"])
1204
- if not isinstance(state["step"], torch.Tensor):
1205
- step_tensor = torch.tensor(state["step"],
1206
- dtype=torch.float32,
1207
- device=p.device)
1208
- else:
1209
- step_tensor = state["step"]
1210
- state_steps.append(step_tensor)
1211
-
1212
- self._fused_adamw(
1213
- params_with_grads,
1214
- grads,
1215
- moment1,
1216
- moment2,
1217
- max_exp_avg_sqs,
1218
- state_steps,
1219
- amsgrad=False,
1220
- beta1=beta1,
1221
- beta2=beta2,
1222
- lr=lr,
1223
- weight_decay=weight_decay,
1224
- eps=eps,
1225
- maximize=False,
1226
- )
1227
-
1228
- def _step_adamw(self, group):
1229
- params = group["params"]
1230
-
1231
- # group params with it's type and placement
1232
- placement_to_params: dict[tuple[Placement | type,
1233
- DeviceMesh | None]] = defaultdict(list)
1234
- for p in params:
1235
- match p:
1236
- case DTensor():
1237
- placement_to_params[tuple([p.placements,
1238
- p.device_mesh])].append(p)
1239
- case torch.Tensor():
1240
- placement_to_params[tuple([torch.Tensor, None])].append(p)
1241
-
1242
- for params in placement_to_params.values():
1243
- self._step_adamw_params(params, group)
1244
-
1245
  @torch.no_grad
1246
  def step(self, closure=None, qk_logits=None):
1247
  """Perform a single optimization step.
@@ -1249,9 +575,9 @@ class Muon(torch.optim.Optimizer):
1249
  Args:
1250
  closure (Callable, optional): A closure that reevaluates the model
1251
  and returns the loss.
1252
- qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
1253
- to 1D tensors of shape (num_heads,), representing the maximum
1254
- QK logits across all tokens, computed as
1255
  (1 / sqrt(head_dim)) * (Q @ K^T).
1256
  """
1257
  loss = None
@@ -1263,6 +589,6 @@ class Muon(torch.optim.Optimizer):
1263
  if group["use_muon"]:
1264
  self._step_muon(group, qk_logits=qk_logits)
1265
  else:
1266
- self._step_adamw(group)
1267
 
1268
  return loss
 
1
  import logging
 
2
  import types
3
  from collections import defaultdict
4
+ from typing import Any
 
5
 
6
  import torch
7
  import torch.distributed as dist
8
+ from torch.distributed.tensor import DTensor, Replicate, Shard
9
+ from torch.profiler import record_function
10
+
11
+ from .adamw import step_adamw
12
+ from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon,
14
+ get_default_muon_param_groups, update_g, update_p)
15
+ from .distributed.utils import (_is_shard, construct_shard_mesh,
16
+ get_slices_of_dtensor)
17
+ from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
+ _zeropower_via_newtonschulz5)
19
+ from .pipeline import muon_chunk_pipeline
20
+ from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
23
 
24
 
25
+ def _expand_expert_params(names, params, expert_keys):
26
+ """Expand expert params by splitting on dim 0 (expert dimension).
 
27
 
28
+ Params whose name matches any key in ``expert_keys`` are treated as
29
+ expert-parallel tensors. Their outermost dimension is the expert
30
+ dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
31
+ ``nn.Parameter`` views so that in-place updates propagate back to
32
+ the original storage.
33
 
34
+ Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
35
+ if they are expert params, their key must be added to ``expert_keys``.
36
 
37
+ The grad must already be set on each expert param (e.g. after momentum).
38
 
39
+ For DTensor expert params, placements that shard on dim 0 (expert dim)
40
+ are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
41
+ preserved: each 2D slice is wrapped as a DTensor on the corresponding
42
+ submesh so the parallel pipeline handles the TP communication.
 
43
  """
44
+ expanded_names = []
45
+ expanded_params = []
46
+
47
+ for n, p in zip(names, params):
48
+ is_expert = expert_keys and any(key in n for key in expert_keys)
49
+ is_dtensor = isinstance(p.data, DTensor)
50
+
51
+ if not is_expert:
52
+ assert p.data.ndim <= 2, (
53
+ f"Param {n} has ndim={p.data.ndim} but does not match "
54
+ f"expert_keys={expert_keys}. If this is an expert param, "
55
+ f"add its key to expert_keys.")
56
+ expanded_names.append(n)
57
+ expanded_params.append(p)
58
  continue
59
 
60
+ g = p.grad
61
+ assert g is not None, (
62
+ f"Expert param {n} must have grad set before expansion")
63
+
64
+ tp_mesh = None
65
+ tp_placements_2d = None
66
+
67
+ if is_dtensor:
68
+ local_data = p.to_local()
69
+ local_grad = g.to_local() if isinstance(g, DTensor) else g
70
+
71
+ # Find non-dim-0 shard placements (e.g. TP sharding).
72
+ # After splitting on dim 0, Shard(k) becomes Shard(k-1).
73
+ tp_dim_indices = []
74
+ tp_placements_2d = []
75
+ for i, pl in enumerate(p.placements):
76
+ if _is_shard(pl) and pl.dim != 0:
77
+ tp_dim_indices.append(i)
78
+ tp_placements_2d.append(Shard(pl.dim - 1))
79
+
80
+ if tp_dim_indices:
81
+ tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
82
+ for i in tp_dim_indices)
83
+ if len(tp_dim_names) == 1:
84
+ tp_mesh = p.device_mesh[tp_dim_names[0]]
85
+ else:
86
+ tp_mesh = p.device_mesh[tp_dim_names]
87
+ else:
88
+ local_data = p.data
89
+ local_grad = g
90
+
91
+ # Expand: split dim 0, reshape each slice to 2D.
92
+ num_local_experts = local_data.shape[0]
93
+ for i in range(num_local_experts):
94
+ slice_data = local_data[i]
95
+ slice_grad = local_grad[i]
96
+
97
+ if tp_mesh is not None:
98
+ # Wrap as DTensor on TP submesh so the pipeline handles
99
+ # TP communication (gather/scatter across TP ranks).
100
+ dt_data = DTensor.from_local(slice_data,
101
+ device_mesh=tp_mesh,
102
+ placements=tp_placements_2d)
103
+ dt_grad = DTensor.from_local(slice_grad,
104
+ device_mesh=tp_mesh,
105
+ placements=tp_placements_2d)
106
+ expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
107
+ expert_param.grad = dt_grad
108
+ else:
109
+ expert_param = torch.nn.Parameter(slice_data,
110
+ requires_grad=False)
111
+ expert_param.grad = slice_grad
112
 
113
+ expanded_names.append(f"{n}[{i}]")
114
+ expanded_params.append(expert_param)
115
 
116
+ p.grad = None # allow expert grad storage to be freed after pipeline
117
 
118
+ return expanded_names, expanded_params
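The key property `_expand_expert_params` relies on is that indexing a tensor on dim 0 returns a view, so wrapping each 2D slice as an `nn.Parameter` lets in-place optimizer updates reach the original 3D storage. A minimal sketch of the non-DTensor path, with hypothetical shapes (E=2 experts, each a 4×3 matrix):

```python
import torch

# Hypothetical 3D expert weight: E=2 experts, each a (4, 3) matrix.
expert_weight = torch.nn.Parameter(torch.zeros(2, 4, 3))
expert_weight.grad = torch.ones(2, 4, 3)

# Split on dim 0 into per-expert 2D Parameter views. Indexing returns
# a view, so in-place updates propagate back to the 3D storage.
views = []
for i in range(expert_weight.shape[0]):
    p = torch.nn.Parameter(expert_weight.data[i], requires_grad=False)
    p.grad = expert_weight.grad[i]
    views.append(p)

# In-place update on expert 1 only (like update_p would do).
views[1].data.add_(views[1].grad, alpha=-0.1)
```

Expert 1's slice of the original tensor now holds the updated values, while expert 0 is untouched.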
119
 
120
 
121
  class Muon(torch.optim.Optimizer):
 
139
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
140
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
141
  weight_decay: The weight decay for Muon and AdamW.
142
+ Parameters that are 0- or 1-D, or detected as the embedding or lm_head, are optimized by AdamW instead.
143
  adamw_lr: The learning rate for the internal AdamW.
144
  adamw_betas: The betas for the internal AdamW.
145
  adamw_eps: The epsilon for the internal AdamW.
 
149
  - "q_indices" (list[int]): Indices of query heads to consider.
150
  - "k_indices" (list[int]): Indices of key heads to consider.
151
  - "head_dim" (int): Dimensionality of each attention head.
152
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
153
  this value will be scaled down.
154
  Default is:
155
  {
 
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
  small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
+ expert_keys: List of strings to identify expert-parallel parameters.
173
+ If any key appears in a parameter's name, its outermost
174
+ dimension is treated as the expert dimension and expanded
175
+ into per-expert 2D params for Muon. For example,
176
+ ``expert_keys=["experts"]`` matches any param whose name
177
+ contains "experts". 3D+ params not matched by any key
178
+ will raise an error.
179
  """
180
 
181
  def __init__(self,
 
189
  adamw_eps=1e-8,
190
  none_grad=True,
191
  debug=False,
192
+ clip_config=None,
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
+ small_param_numel_threshold=65536,
197
+ expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
 
218
 
219
  super().__init__(params, defaults)
220
 
221
  self.debug = debug
222
+ self.clip_config = clip_config if clip_config is not None else {
223
+ "q_indices": [],
224
+ "k_indices": [],
225
+ "head_dim": 128,
226
+ "threshold": 100,
227
+ }
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
  self.small_param_numel_threshold = small_param_numel_threshold
232
+ self.expert_keys = expert_keys
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
 
239
 
240
  return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
241
 
242
  def get_shard_mesh(self, p):
243
  """
244
  Get the shard mesh for a parameter p on the given rank.
 
249
  shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
250
  p.placements, p.device_mesh)
251
 
 
252
  return shard_mesh, shard_pg, shard_placements
253
 
254
  def init_state_and_assign_params(self, names, params, group, qk_logits):
 
267
  total_flops += flops
268
 
269
  if self.debug:
270
+ logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
271
+ total_flops / 1e12)
272
 
273
  paired = list(zip(names, params))
274
 
 
297
 
298
  worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
299
  round_robin = (round_robin + 1) % len(shard_mesh_flattened)
300
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
301
+
302
+ # Precompute per-rank indices and numels for all-to-all.
303
+ rank_indices: dict[int, tuple] = {}
304
+ rank_numels: dict[int, int] = {}
305
+ for r in range(num_ranks):
306
+ indices = get_slices_of_dtensor(p, r, shard_mesh,
307
+ shard_placements)
308
+ rank_indices[r] = indices
309
+ numel = 1
310
+ for idx, dim_size in zip(indices, p.shape):
311
+ if isinstance(idx, slice):
312
+ start, stop, step = idx.indices(dim_size)
313
+ numel *= max(0, (stop - start + (step - 1)) // step)
314
+ else:
315
+ numel *= len(idx)
316
+ rank_numels[r] = numel
317
 
318
  param_to_state[id(p)] = _muon_state(
319
  worker_rank=worker_rank,
320
  process_group=shard_pg,
321
+ rank_indices=rank_indices,
322
+ rank_numels=rank_numels,
323
  name=n,
324
  qk_clip_state=qk_clip_state,
325
  )
326
 
327
  return param_to_state, ordered_params
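The per-rank `numel` bookkeeping above counts how many elements a `slice` selects along one dimension via `slice.indices` and a ceiling division. As a standalone sketch of that formula:

```python
def slice_numel(idx: slice, dim_size: int) -> int:
    # Same ceiling-division formula used for rank_numels above:
    # number of elements the slice selects along a dim of dim_size.
    start, stop, step = idx.indices(dim_size)
    return max(0, (stop - start + (step - 1)) // step)

full = slice_numel(slice(None), 10)         # whole dimension
strided = slice_numel(slice(0, 10, 3), 10)  # selects 0, 3, 6, 9
empty = slice_numel(slice(4, 4), 10)        # empty range
```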
328
 
329
+ def base(self, names, params, group, lr, weight_decay, qk_logits):
330
+ # Momentum is already applied by _step_muon before this method.
 
331
  for n, p in zip(names, params):
332
  g = p.grad
333
  if g is None:
334
  continue
335
 
336
  u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
  steps=group["ns_steps"])
338
 
339
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
+ update_p(p, u, lr, adjusted_lr, weight_decay)
341
 
342
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
343
 
344
+ scales_full = compute_scales(
345
  p, qk_clip_state) if qk_clip_state is not None else None
346
  if scales_full is not None:
347
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
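As the helpers (formerly `_compute_scales`/`_qk_clip`, now in `qk_clip.py`) show, each head whose max QK logit exceeds the threshold gets a multiplier `sqrt(threshold / logit)`, applied to that head's rows of the weight. A self-contained sketch with hypothetical shapes (3 heads, head_dim=4):

```python
import math
import torch

def compute_head_scales(logits, threshold):
    # One scale per head: sqrt(threshold / logit) when the head's max
    # QK logit exceeds the threshold, 1.0 otherwise (no change).
    scales = torch.ones_like(logits)
    mask = logits > threshold
    scales[mask] = torch.sqrt(threshold / logits[mask])
    return scales

def qk_clip_weight(W, scales, head_dim):
    # W has shape (num_heads * head_dim, in_features); scale each
    # head's block of rows in place.
    W3 = W.view(-1, head_dim, W.shape[1])
    W3.mul_(scales.view(-1, 1, 1))
    return W

logits = torch.tensor([50.0, 200.0, 100.0])  # per-head max QK logits
scales = compute_head_scales(logits, threshold=100.0)
W = torch.ones(3 * 4, 8)                     # 3 heads, head_dim=4
qk_clip_weight(W, scales, head_dim=4)
```

Only head 1 exceeds the threshold, so only its four rows are scaled down by `sqrt(100 / 200)`.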
348
 
349
  def distributed_muon(
350
  self,
 
353
  group: dict[str, Any],
354
  lr: float,
355
  weight_decay: float,
 
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
  """ Implementation of Distributed Muon by Liu et al. """
359
 
360
+ # Momentum is already applied by _step_muon before this method.
361
  for n, p in zip(names, params):
362
  g = p.grad
363
  if g is None:
364
  continue
365
 
366
  # Gather G
367
  if isinstance(p.data, DTensor):
 
374
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
  steps=group["ns_steps"])
376
 
377
+ adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
+ update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
381
 
382
+ scales_full = compute_scales(
383
  p_full, qk_clip_state) if qk_clip_state is not None else None
384
 
385
  if scales_full is not None:
386
+ qk_clip(p_full, scales_full, qk_clip_state.head_dim)
387
 
388
  if isinstance(p.data, DTensor):
389
  ndims = len(p.device_mesh.mesh.shape)
 
400
 
401
  p.copy_(p_sharded)
402
 
403
+ def parallel(self, names, params, group, lr, weight_decay, qk_logits):
404
  """
405
  Perform a parallel optimization step using Muon.
 
406
 
407
+ Parameters are chunked and each chunk is processed by a
408
+ :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
409
+ interleaves multiple chunks so that communication and computation
410
+ overlap across chunks (the same overlap previously achieved by the
411
+ warmup + main-loop index scheduling).
412
+ """
413
 
414
+ # Momentum is already applied by _step_muon before this method.
415
 
416
  param_to_state, ordered_params = self.init_state_and_assign_params(
417
  names, params, group, qk_logits)
418
 
419
+ # Compute local rank for this group's shard process group.
420
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
421
+ rank = dist.get_rank(group=shard_pg)
422
 
423
  if self.chunk_size == -1:
424
  shard_ranks = dist.get_world_size(param_to_state[id(
425
+ ordered_params[0])].process_group)
426
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
  elif self.chunk_size > 0:
428
  chunk_size = self.chunk_size
429
  else:
430
  raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
+ def pipelines():
433
+ for start in range(0, len(ordered_params), chunk_size):
434
+ chunk = ordered_params[start:start + chunk_size]
435
+ if chunk:
436
+ yield muon_chunk_pipeline(
437
+ params=chunk,
438
+ param_to_state=param_to_state,
439
+ rank=rank,
440
+ ns_steps=group["ns_steps"],
441
+ lr=lr,
442
+ weight_decay=weight_decay,
443
+ none_grad=group["none_grad"],
444
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
 
446
+ with record_function("muon::barrier"):
447
+ dist.barrier()
448
+ with record_function("muon::pipeline"):
449
+ run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
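The interleaving that `run_pipeline` performs can be sketched with plain generators (an illustrative scheduler, not the actual `run_pipeline` from `async_utils`): up to `max_concurrent` chunk pipelines are kept in flight and advanced round-robin, one stage per turn, so one chunk's compute stage overlaps another chunk's communication stage.

```python
from collections import deque

def run_pipeline_sketch(pipelines, max_concurrent):
    # Round-robin scheduler: keep up to max_concurrent generators
    # active and advance each one stage per turn.
    pipelines = iter(pipelines)
    active = deque()
    log = []
    while True:
        while len(active) < max_concurrent:
            gen = next(pipelines, None)
            if gen is None:
                break
            active.append(gen)
        if not active:
            break
        gen = active.popleft()
        try:
            log.append(next(gen))  # run one stage of this chunk
            active.append(gen)
        except StopIteration:
            pass  # this chunk's pipeline is finished
    return log

def chunk_pipeline(idx):
    yield f"gather:{idx}"
    yield f"compute:{idx}"
    yield f"scatter:{idx}"

order = run_pipeline_sketch((chunk_pipeline(i) for i in range(3)),
                            max_concurrent=2)
```

With `max_concurrent=2`, chunk 1's gather is issued before chunk 0's compute runs, which is the overlap the warmup + main-loop index scheduling previously produced by hand.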
450
 
451
  def _step_muon(self, group, qk_logits=None):
452
  params = group["params"]
 
455
  momentum = group["momentum"]
456
  names = group["names"]
457
 
458
+ # Apply momentum to all params before routing/expansion.
459
+ with record_function("muon::momentum"):
460
+ for n, p in zip(names, params):
461
+ g = p.grad
462
+ if g is None:
463
+ continue
464
+ g = update_g(self.state, p, g, group, momentum)
465
+ p.grad = g
466
+
467
+ # Expand expert params by splitting on dim 0.
468
+ names, params = _expand_expert_params(names, params, self.expert_keys)
469
+
470
  param_dtensors = []
471
  name_dtensors = []
472
 
 
482
  group=group,
483
  lr=lr,
484
  weight_decay=weight_decay,
 
485
  qk_logits=qk_logits)
486
  return
487
 
 
517
  # and run parallel Muon on each group.
518
 
519
  placement_to_params = defaultdict(lambda: ([], []))
 
520
 
521
  assert len(dtensors) == len(names)
522
  for p, n in zip(dtensors, names):
 
538
  group=group,
539
  lr=lr,
540
  weight_decay=weight_decay,
 
541
  qk_logits=qk_logits,
542
  )
543
 
 
555
  group,
556
  lr=lr,
557
  weight_decay=weight_decay,
 
558
  qk_logits=qk_logits,
559
  )
560
 
 
565
  group,
566
  lr=lr,
567
  weight_decay=weight_decay,
 
568
  qk_logits=qk_logits,
569
  )
570
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
 
575
  Args:
576
  closure (Callable, optional): A closure that reevaluates the model
577
  and returns the loss.
578
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
579
+ to 1D tensors of shape (num_heads,), representing the maximum
580
+ QK logits across all tokens, computed as
581
  (1 / sqrt(head_dim)) * (Q @ K^T).
582
  """
583
  loss = None
 
589
  if group["use_muon"]:
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
592
+ step_adamw(self.state, group)
593
 
594
  return loss
build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+
3
+ from .matmul_transpose_triton import matmul_transpose_assign
4
+
5
+ COMM_DTYPE = torch.bfloat16
6
+ DEFAULT_CHUNK_SIZE_RATIO = 4
7
+
8
+
9
+ # This code snippet is a modified version adapted from the following GitHub repositories:
10
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+ # Muon's Newton–Schulz iteration causes high variance in singular values
12
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
13
+ @torch.no_grad()
14
+ # matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
15
+ def _zeropower_via_newtonschulz5(G, steps):
16
+ """
17
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
+ performance at all relative to UV^T, where USV^T = G is the SVD.
24
+ """
25
+ assert len(G.shape) == 2
26
+ assert G.dtype == COMM_DTYPE
27
+ X = G # no manual typecast
28
+
29
+ if G.size(0) > G.size(1):
30
+ X = X.T
31
+ # Ensure spectral norm is at most 1
32
+ X = X / (X.norm() + 1e-7)
33
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
+ # Perform the NS iterations
36
+ for a, b, c in [
37
+ (4.0848, -6.8946, 2.9270),
38
+ (3.9505, -6.3029, 2.6377),
39
+ (3.7418, -5.5913, 2.3037),
40
+ (2.8769, -3.1427, 1.2046),
41
+ (2.8366, -3.0525, 1.2012),
42
+ ]:
43
+ matmul_transpose_assign(X, buf1)
44
+ matmul_transpose_assign(buf1, buf2)
45
+ buf1.mul_(b).add_(buf2, alpha=c)
46
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
47
+
48
+ if G.size(0) > G.size(1):
49
+ X = X.T
50
+ return X
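For reference, the same iteration can be written without the Triton `matmul_transpose_assign` kernel (a float32 verification sketch, not a drop-in replacement): since `A = X @ X.T` is symmetric, the buffered `buf2` equals `A @ A`, and the fused update is `X <- a*X + (b*A + c*A@A) @ X`.

```python
import torch

@torch.no_grad()
def newtonschulz5_reference(G):
    # Plain-matmul version of the quintic Newton-Schulz iteration.
    assert G.ndim == 2
    X = G.float()
    if G.size(0) > G.size(1):
        X = X.T
    X = X / (X.norm() + 1e-7)  # ensure spectral norm is at most 1
    for a, b, c in [
        (4.0848, -6.8946, 2.9270),
        (3.9505, -6.3029, 2.6377),
        (3.7418, -5.5913, 2.3037),
        (2.8769, -3.1427, 1.2046),
        (2.8366, -3.0525, 1.2012),
    ]:
        A = X @ X.T            # symmetric, so buf2 == A @ A
        B = b * A + c * (A @ A)
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

torch.manual_seed(0)
G = torch.randn(16, 32)
U = newtonschulz5_reference(G)
svals = torch.linalg.svdvals(U)
```

As the docstring above notes, the result is not exactly `UV^T`: the singular values land near 1 but not on it, which is tolerable for the optimizer.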
build/torch210-cxx11-cu126-x86_64-linux/pipeline.py ADDED
@@ -0,0 +1,390 @@
1
+ import logging
2
+ from typing import Generator
3
+
4
+ import torch
5
+ import torch.distributed as dist
6
+ from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+from .core import _muon_state, adjust_lr_for_muon, update_p
+from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
+from .qk_clip import compute_scales
+
+logger = logging.getLogger(__name__)
+
+# ======================================================================
+# Stage helpers
+# ======================================================================
+
+
+def _launch_gather(
+    params: list[DTensor],
+    owned_params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    num_ranks: int,
+    process_group: dist.ProcessGroup,
+) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
+    """Allocate gather buffers, build send/recv, and launch async all-to-all.
+
+    Returns:
+        work: Async operation handle.
+        recv_buf: Flat receive buffer (needed by ``_complete_gather``).
+        gathered_grads: ``{id(p): empty_tensor}`` for owned params,
+            ``None`` for non-owned.
+        recv_counts: Per-source-rank element counts.
+    """
+    # Allocate gathered-grad buffers
+    gathered_grads: dict[int, torch.Tensor | None] = {}
+    for p in params:
+        state = param_to_state[id(p)]
+        if rank == state.worker_rank:
+            gathered_grads[id(p)] = torch.empty(p.shape,
+                                                dtype=COMM_DTYPE,
+                                                device="cuda")
+        else:
+            gathered_grads[id(p)] = None
+
+    # Build send buffer
+    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+    send_counts = [0] * num_ranks
+
+    for p in params:
+        state = param_to_state[id(p)]
+        dst = state.worker_rank
+        assert dst < num_ranks
+        shard_elems = state.rank_numels[rank]
+        g = p.grad
+        g = g.to_local().to(COMM_DTYPE).contiguous()
+        assert g.numel() == shard_elems
+        per_dst[dst].append(g.view(-1))
+        send_counts[dst] += shard_elems
+
+    assert any(
+        len(v) > 0 for v in
+        per_dst), "At least one destination rank must receive a sharded tensor"
+    per_dst_flat = [t for dst in per_dst for t in dst]
+    send_buf = torch.cat(per_dst_flat, dim=0)
+
+    # Build recv buffer
+    recv_counts = [0] * num_ranks
+    for src in range(num_ranks):
+        total = 0
+        for p in owned_params:
+            state = param_to_state[id(p)]
+            assert state.worker_rank == rank
+            total += state.rank_numels[src]
+        recv_counts[src] = total
+
+    recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
+
+    # Launch async all-to-all
+    logger.debug(f"send_buf size: {send_buf.numel()}, "
+                 f"recv_buf size: {recv_buf.numel()}, "
+                 f"recv_counts: {recv_counts}, "
+                 f"send_counts: {send_counts}, "
+                 f"process_group: {str(process_group)}")
+    work = dist.all_to_all_single(
+        recv_buf,
+        send_buf,
+        output_split_sizes=recv_counts,
+        input_split_sizes=send_counts,
+        group=process_group,
+        async_op=True,
+    )
+
+    return work, recv_buf, gathered_grads, recv_counts
+
+
+def _complete_gather(
+    recv_buf: torch.Tensor,
+    recv_counts: list[int],
+    owned_params: list[DTensor],
+    gathered_grads: dict[int, torch.Tensor | None],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+) -> None:
+    """Reconstruct gathered grads from the recv buffer (in-place)."""
+    off = 0
+    for src in range(len(recv_counts)):
+        if recv_counts[src] == 0:
+            continue
+
+        block = recv_counts[src]
+        inner_off = 0
+        for p in owned_params:
+            state = param_to_state[id(p)]
+            assert state.worker_rank == rank
+
+            indices = state.rank_indices[src]
+
+            shard_view = gathered_grads[id(p)][indices]
+            n = shard_view.numel()
+            assert n > 0
+
+            sg = recv_buf.narrow(0, off + inner_off, n)
+            sg = sg.reshape(shard_view.shape)
+            gathered_grads[id(p)][indices] = sg
+
+            inner_off += n
+        assert inner_off == block
+        off += block
+
+
+def _compute_ns(
+    owned_params: list[DTensor],
+    gathered_grads: dict[int, torch.Tensor | None],
+    ns_steps: int,
+) -> dict[int, torch.Tensor | None]:
+    """Run Newton-Schulz orthogonalization on owned parameters.
+
+    Returns:
+        computed_us: ``{id(p): orthogonalized_update}`` for owned params.
+    """
+    computed_us: dict[int, torch.Tensor | None] = {}
+    for p in owned_params:
+        u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
+        gathered_grads[id(p)] = None  # free gathered grad
+        computed_us[id(p)] = u
+    return computed_us
+
+
+def _launch_scatter(
+    params: list[DTensor],
+    owned_params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    num_ranks: int,
+    process_group: dist.ProcessGroup,
+    computed_us: dict[int, torch.Tensor | None],
+) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
+    """Allocate scatter buffers, build send/recv, and launch async all-to-all.
+
+    Returns:
+        work: Async operation handle.
+        recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
+        scattered_us: ``{id(p): empty_local_tensor}`` for all params.
+        recv_counts: Per-source-rank element counts.
+    """
+    # Allocate scattered-u buffers
+    scattered_us: dict[int, torch.Tensor] = {}
+    for p in params:
+        scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
+
+    # Build send buffer (from computed_us on owner ranks)
+    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+    send_counts = [0] * num_ranks
+
+    if owned_params:
+        for p in owned_params:
+            state = param_to_state[id(p)]
+
+            assert computed_us[id(p)] is not None
+            u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
+
+            total_sent = 0
+            for dst_rank in range(num_ranks):
+                indices = state.rank_indices[dst_rank]
+                su = u_full[indices].flatten()
+
+                n = su.numel()
+                assert n > 0
+
+                per_dst[dst_rank].append(su)
+                send_counts[dst_rank] += n
+                total_sent += n
+
+            assert total_sent == u_full.numel()
+
+    lengths = [len(v) for v in per_dst]
+    if all(l > 0 for l in lengths):
+        assert all(
+            l == lengths[0] for l in lengths
+        ), "All destination ranks must have the same number of sharded tensors"
+        per_dst_flat = [t for dst in per_dst for t in dst]
+        send_buf = torch.cat(per_dst_flat, dim=0)
+    else:
+        send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
+
+    # Build recv buffer
+    recv_counts = [0] * num_ranks
+    for src in range(num_ranks):
+        total = 0
+        for p in params:
+            state = param_to_state[id(p)]
+            if state.worker_rank != src:
+                continue
+            total += state.rank_numels[rank]
+        recv_counts[src] = total
+
+    recv_total = sum(recv_counts)
+    assert recv_total > 0
+    recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
+
+    # Launch async all-to-all
+    work = dist.all_to_all_single(
+        recv_buf,
+        send_buf,
+        output_split_sizes=recv_counts,
+        input_split_sizes=send_counts,
+        group=process_group,
+        async_op=True,
+    )
+
+    return work, recv_buf, scattered_us, recv_counts
+
+
+def _complete_scatter(
+    recv_buf: torch.Tensor,
+    recv_counts: list[int],
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    scattered_us: dict[int, torch.Tensor],
+) -> None:
+    """Copy recv buffer into scattered_us (in-place)."""
+    off = 0
+    for src in range(len(recv_counts)):
+        block = recv_counts[src]
+        if block == 0:
+            continue
+
+        inner_off = 0
+        for p in params:
+            state = param_to_state[id(p)]
+            if state.worker_rank != src:
+                continue
+            n = state.rank_numels[rank]
+            assert n > 0
+
+            flat_local = recv_buf.narrow(0, off + inner_off,
+                                         n).view_as(p.to_local())
+            scattered_us[id(p)].copy_(flat_local)
+
+            inner_off += n
+
+        assert inner_off == block
+        off += block
+
+
+def _update_params(
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    scattered_us: dict[int, torch.Tensor],
+    lr: float,
+    weight_decay: float,
+) -> None:
+    """Apply weight decay, Muon update, and optional QK clipping."""
+    for p in params:
+        state = param_to_state[id(p)]
+        u_dtensor = DTensor.from_local(
+            scattered_us[id(p)],
+            placements=p.placements,
+            device_mesh=p.device_mesh,
+        )
+
+        adjusted_lr = adjust_lr_for_muon(lr, p.shape)
+        update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
+
+        # QK clipping – applied directly on the local tensor to
+        # avoid DTensor sharding-propagation issues with _StridedShard.
+        scales_full = compute_scales(
+            p,
+            state.qk_clip_state) if state.qk_clip_state is not None else None
+        if scales_full is not None:
+            ratio = p.shape[0] // scales_full.shape[0]
+            idx0 = state.rank_indices[rank][0]
+            if isinstance(idx0, slice):
+                start = idx0.start or 0
+                idx0 = torch.arange(start,
+                                    idx0.stop,
+                                    device=scales_full.device)
+            row_scales = scales_full[idx0 // ratio]
+            p._local_tensor.mul_(row_scales.view(-1, 1))
+
+
+# ======================================================================
+# Main generator – thin orchestrator that wires stages together.
+# ======================================================================
+
+
+@torch.no_grad()
+def muon_chunk_pipeline(
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    ns_steps: int,
+    lr: float,
+    weight_decay: float,
+    none_grad: bool,
+) -> Generator[None, None, None]:
+    """Process one chunk of parameters through the full Muon pipeline.
+
+    Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
+
+    Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
+    that communication and computation overlap across chunks. Async
+    communication is launched via ``async_op=True`` and completed after
+    the yield with ``work.wait()``.
+
+    Overlap happens because :func:`run_pipeline` admits one new chunk
+    per iteration (staggered admission). While chunk *N* does NS
+    compute on the default CUDA stream, chunk *N+1*'s async all-to-all
+    runs concurrently on the NCCL stream — no separate ``comm_stream``
+    is required.
+
+    Yields exactly **2** times:
+
+    1. After launching async all-to-all gather.
+    2. After launching async all-to-all scatter.
+    """
+    process_group = param_to_state[id(params[0])].process_group
+    num_ranks = dist.get_world_size(group=process_group)
+    owned_params = [
+        p for p in params if param_to_state[id(p)].worker_rank == rank
+    ]
+
+    # Stages 1-2: launch async gather.
+    with record_function("muon::launch_gather"):
+        work, recv_buf, gathered_grads, recv_counts = _launch_gather(
+            params, owned_params, param_to_state, rank, num_ranks,
+            process_group)
+
+    if none_grad:
+        for p in params:
+            p.grad = None
+
+    yield  # --- YIELD 1: other chunks can launch their gather ---
+
+    with record_function("muon::wait_gather"):
+        work.wait()
+        _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
+                         param_to_state, rank)
+        del recv_buf
+
+    # Stage 3: Newton-Schulz orthogonalization.
+    with record_function("muon::newton_schulz"):
+        computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
+        gathered_grads.clear()
+
+    # Stages 4-5: launch async scatter.
+    with record_function("muon::launch_scatter"):
+        work, recv_buf, scattered_us, recv_counts = _launch_scatter(
+            params, owned_params, param_to_state, rank, num_ranks,
+            process_group, computed_us)
+        computed_us.clear()
+
+    yield  # --- YIELD 2: other chunks can launch their scatter ---
+
+    with record_function("muon::wait_scatter"):
+        work.wait()
+        _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
+                          scattered_us)
+        del recv_buf
+
+    # Stage 6: apply parameter updates.
+    with record_function("muon::update_params"):
+        _update_params(params, param_to_state, rank, scattered_us, lr,
+                       weight_decay)
+        scattered_us.clear()
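The correctness of the gather/scatter stages hinges on an invariant of `dist.all_to_all_single`: the element count rank *r* sends to rank *d* (`send_counts[d]` on *r*) must equal the count *d* expects from *r* (`recv_counts[r]` on *d*). A pure-Python sketch with hypothetical two-rank shard sizes shows how the bookkeeping above satisfies it:

```python
# Sketch of the send/recv count bookkeeping used by the gather stage.
# Hypothetical setup: 2 ranks; each parameter is owned by one worker rank
# and split into per-rank shards of known numel.
params = [
    {"id": "A", "worker_rank": 0, "rank_numels": {0: 6, 1: 6}},
    {"id": "B", "worker_rank": 1, "rank_numels": {0: 4, 1: 4}},
]
num_ranks = 2

def gather_send_counts(rank):
    # Each rank sends its local shard of every param to that param's owner.
    counts = [0] * num_ranks
    for p in params:
        counts[p["worker_rank"]] += p["rank_numels"][rank]
    return counts

def gather_recv_counts(rank):
    # The owner receives one shard per source rank for each param it owns.
    counts = [0] * num_ranks
    for src in range(num_ranks):
        for p in params:
            if p["worker_rank"] == rank:
                counts[src] += p["rank_numels"][src]
    return counts

send = [gather_send_counts(r) for r in range(num_ranks)]
recv = [gather_recv_counts(r) for r in range(num_ranks)]
# all_to_all_single invariant: what r sends to d is what d expects from r.
for r in range(num_ranks):
    for d in range(num_ranks):
        assert send[r][d] == recv[d][r]
print(send, recv)
```

The scatter stage uses the mirror image of the same invariant, with owners sending and every rank receiving its local shard.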
build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py ADDED
@@ -0,0 +1,129 @@
+import logging
+import math
+from dataclasses import dataclass
+
+import torch
+from torch.distributed.tensor import DTensor
+
+logger = logging.getLogger(__name__)
+
+
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+
+    Example:
+        'model.3.attn.wq.weight' -> ('wq', 3)
+        'model.5.attn.wk.weight' -> ('wk', 5)
+        'model.2.attn.q_proj.weight' -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight' -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight' -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+
+    kind = parts[-2]
+
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+
+    return None, -1
+
+
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: list[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: torch.Tensor | None
+
+
+def get_qk_clip_info(clip_config, n, qk_logits):
+    """Extract QK clipping info for a named parameter.
+
+    Args:
+        clip_config: QK clipping configuration dict (or None).
+        n: Parameter name string.
+        qk_logits: Dict mapping layer indices to logit tensors (or None).
+
+    Returns:
+        QKClipInfo instance with clipping configuration for this parameter.
+    """
+    if clip_config is None:
+        return None
+
+    head_dim = clip_config.get('head_dim')
+    threshold = clip_config.get('threshold')
+    kind, layer_idx = parse_qk_layer(n)
+
+    logit, indices = None, []
+    if qk_logits is not None and kind is not None:
+        logit = qk_logits[layer_idx]
+        indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+        indices = clip_config.get(indices_key, []) or []
+
+    if isinstance(logit, DTensor):
+        # In TP settings, qk_logits may be a DTensor.
+        # We convert it to a full tensor here for simplicity.
+        logit = logit.full_tensor()
+
+    return QKClipInfo(
+        kind=kind,
+        indices=indices,
+        head_dim=head_dim,
+        threshold=threshold,
+        logit=logit,
+    )
+
+
+def compute_scales(p, qk_clip_state):
+    """Compute per-head scaling factors for QK clipping.
+
+    Returns scales tensor if any head exceeds threshold, else None.
+    """
+    kind = qk_clip_state.kind
+    indices = qk_clip_state.indices
+    head_dim = qk_clip_state.head_dim
+    threshold = qk_clip_state.threshold
+    logit = qk_clip_state.logit
+
+    H_global = p.shape[0] // head_dim
+    scales_full = torch.ones(H_global, device=p.data.device)
+    scaling = 0
+
+    for logit_idx, head_idx in enumerate(indices):
+        v_ele = float(logit[logit_idx])
+        if v_ele > threshold:
+            new_scale = math.sqrt(threshold / v_ele)
+            if new_scale < scales_full[head_idx]:
+                scales_full[head_idx] = new_scale
+                logger.info(
+                    f"[{kind}] Head {head_idx} exceeded threshold "
+                    f"(value={v_ele:.4f}, threshold={threshold:.4f}) "
+                    f"-> applying scale={new_scale:.4f}")
+            scaling += 1
+
+    return scales_full if scaling > 0 else None
+
+
+def qk_clip(p, scales, head_dim):
+    """Apply per-head scaling to a Q/K projection weight matrix."""
+    if isinstance(p, torch.nn.Parameter):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    else:
+        W = p.view(-1, head_dim, p.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
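The name parsing and the clipping factor are both easy to sanity-check in isolation. Below is a standalone sketch of `parse_qk_layer` (same logic, no torch dependency), plus the scale arithmetic behind `compute_scales`: clipping by `sqrt(threshold / value)` on both the query and key projections rescales the offending q·k logit to exactly the threshold.

```python
import math

# Standalone copy of parse_qk_layer: the projection kind is the
# second-to-last dotted component; the layer index is the last numeric one.
def parse_qk_layer(name):
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]
    layer_idx = -1
    for part in reversed(parts):
        if part.isdigit():
            layer_idx = int(part)
            break
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1

print(parse_qk_layer('model.3.attn.wq.weight'))      # ('wq', 3)
print(parse_qk_layer('model.2.attn.q_proj.weight'))  # ('q_proj', 2)
print(parse_qk_layer('model.4.attn.v_proj.weight'))  # (None, -1)

# Scale arithmetic: with a hypothetical max logit of 150 and threshold 100,
# scaling q and k each by sqrt(threshold/value) brings the logit back to
# the threshold, since the logit is bilinear in q and k.
value, threshold = 150.0, 100.0
scale = math.sqrt(threshold / value)
assert abs((scale * scale) * value - threshold) < 1e-9
```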
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_06a260a_dirty
-ops = torch.ops._optimizer_06a260a_dirty
+from . import _optimizer_7aef62f_dirty
+ops = torch.ops._optimizer_7aef62f_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_06a260a_dirty::{op_name}"
+    return f"_optimizer_7aef62f_dirty::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:976df6a1ec3ec4c462dea18477b56dfb75bcff76f504d55b592ce417931597c0
+oid sha256:4919c48c77c6223dbf668f1461bcec175ef1bd6ea4cec8c2509de12ca7200a62
 size 2004144
build/torch210-cxx11-cu128-x86_64-linux/adamw.py ADDED
@@ -0,0 +1,154 @@
+from collections import defaultdict
+from typing import cast
+
+import torch
+from torch.distributed.tensor import DTensor
+
+
+def fused_adamw(
+    params: list[torch.Tensor],
+    grads: list[torch.Tensor],
+    exp_avgs: list[torch.Tensor],
+    exp_avg_sqs: list[torch.Tensor],
+    max_exp_avg_sqs: list[torch.Tensor],
+    state_steps: list[torch.Tensor],
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float | torch.Tensor,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+) -> None:
+    if not params:
+        return
+
+    # We only shuffle the lr around when it is a Tensor on a non-CPU device;
+    # otherwise we prefer treating it as a scalar.
+    lr_dict: dict | None = ({
+        lr.device: lr
+    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
+    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
+        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
+         state_steps]  # type: ignore[list-item]
+    )
+    for (device, _), (
+        (
+            device_params_,
+            device_grads_,
+            device_exp_avgs_,
+            device_exp_avg_sqs_,
+            device_max_exp_avg_sqs,
+            device_state_steps_,
+        ),
+        _,
+    ) in grouped_tensors.items():
+        device_params = cast(list[torch.Tensor], device_params_)
+        device_grads = cast(list[torch.Tensor], device_grads_)
+        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
+        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
+        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
+
+        if lr_dict is not None and device not in lr_dict:
+            lr_dict[device] = lr.to(
+                device=device, non_blocking=True)  # type: ignore[union-attr]
+            lr = lr_dict[device]
+        torch._foreach_add_(device_state_steps, 1)
+        func = torch._fused_adamw_
+        func(
+            device_params,
+            device_grads,
+            device_exp_avgs,
+            device_exp_avg_sqs,
+            device_max_exp_avg_sqs,  # type: ignore[arg-type]
+            device_state_steps,
+            amsgrad=amsgrad,
+            lr=lr,  # type: ignore[arg-type]
+            beta1=beta1,
+            beta2=beta2,
+            weight_decay=weight_decay,
+            eps=eps,
+            maximize=maximize,
+        )
+
+
+def step_adamw_params(optimizer_state, params, group):
+    """Run fused AdamW on a list of parameters sharing the same placement.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        params: List of parameters to update.
+        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
+    """
+    params_with_grads = []
+    grads = []
+    moment1 = []
+    moment2 = []
+    max_exp_avg_sqs = []
+    state_steps = []
+    lr = group["lr"]
+    beta1, beta2 = group["adamw_betas"]
+    eps = group["adamw_eps"]
+    weight_decay = group["weight_decay"]
+
+    for p in params:
+        g = p.grad
+        if g is None:
+            continue
+        state = optimizer_state[p]
+        params_with_grads.append(p)
+        grads.append(g)
+        if "step" not in state:
+            state["step"] = torch.zeros((),
+                                        dtype=torch.float32,
+                                        device=p.device)
+            state["moment1"] = torch.zeros_like(g)
+            state["moment2"] = torch.zeros_like(g)
+        moment1.append(state["moment1"])
+        moment2.append(state["moment2"])
+        if not isinstance(state["step"], torch.Tensor):
+            step_tensor = torch.tensor(state["step"],
+                                       dtype=torch.float32,
+                                       device=p.device)
+        else:
+            step_tensor = state["step"]
+        state_steps.append(step_tensor)
+
+    fused_adamw(
+        params_with_grads,
+        grads,
+        moment1,
+        moment2,
+        max_exp_avg_sqs,
+        state_steps,
+        amsgrad=False,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        eps=eps,
+        maximize=False,
+    )
+
+
+def step_adamw(optimizer_state, group):
+    """Dispatch AdamW step, grouping parameters by type and placement.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        group: Parameter group dict.
+    """
+    params = group["params"]
+
+    # Group params by type and placement.
+    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
+    for p in params:
+        match p:
+            case DTensor():
+                placement_to_params[tuple([p.placements,
+                                           p.device_mesh])].append(p)
+            case torch.Tensor():
+                placement_to_params[tuple([torch.Tensor, None])].append(p)
+
+    for group_params in placement_to_params.values():
+        step_adamw_params(optimizer_state, group_params, group)
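`torch._fused_adamw_` implements the standard decoupled-weight-decay AdamW update. As a reference for what one fused step computes per element, here is a scalar pure-Python sketch (not the fused kernel; the hyperparameter defaults are illustrative):

```python
import math

def adamw_step(p, g, m, v, step, lr=1e-3, beta1=0.9, beta2=0.999,
               eps=1e-8, weight_decay=0.01):
    # One scalar AdamW step with decoupled weight decay (Loshchilov & Hutter).
    step += 1
    p *= 1 - lr * weight_decay               # decoupled weight decay
    m = beta1 * m + (1 - beta1) * g          # first moment (exp_avg)
    v = beta2 * v + (1 - beta2) * g * g      # second moment (exp_avg_sq)
    m_hat = m / (1 - beta1 ** step)          # bias correction
    v_hat = v / (1 - beta2 ** step)
    p -= lr * m_hat / (math.sqrt(v_hat) + eps)
    return p, m, v, step

p, m, v, step = 1.0, 0.0, 0.0, 0
p, m, v, step = adamw_step(p, 0.5, m, v, step)
print(p)  # weight decay shrinks p slightly, then the update subtracts ~lr
```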
build/torch210-cxx11-cu128-x86_64-linux/async_utils.py ADDED
@@ -0,0 +1,77 @@
+import logging
+from typing import Generator
+
+logger = logging.getLogger(__name__)
+
+
+class _Task:
+    """Internal: wraps a generator, advances one yield at a time."""
+
+    def __init__(self, generator: Generator[None, None, None], index: int):
+        self._generator = generator
+        self._index = index
+        self._steps_completed = 0
+        self.step()  # run to first yield
+
+    def step(self) -> bool:
+        try:
+            next(self._generator)
+            self._steps_completed += 1
+            logger.debug("pipeline[%d] completed stage %d", self._index,
+                         self._steps_completed)
+            return True
+        except StopIteration:
+            logger.debug("pipeline[%d] finished after %d stages", self._index,
+                         self._steps_completed)
+            return False
+
+    def close(self):
+        self._generator.close()
+
+
+def run_pipeline(
+    pipelines: Generator[Generator[None, None, None], None, None],
+    max_concurrent: int,
+) -> None:
+    """Run generator-based pipelines with bounded concurrency.
+
+    Each pipeline is a generator that yields at stage boundaries.
+    The runtime interleaves pipelines so communication and computation
+    overlap across chunks.
+    """
+    if max_concurrent <= 0:
+        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
+
+    have_new = True
+    task_index = 0
+    previous_tasks: list[_Task] = []
+
+    try:
+        while have_new or previous_tasks:
+            running_tasks: list[_Task] = []
+
+            # Admit one new pipeline per iteration (staggered admission).
+            # Admitting one at a time ensures that while chunk N does NS
+            # compute on the default stream, chunk N+1's NCCL all-to-all
+            # runs concurrently on the NCCL stream — creating real
+            # communication/computation overlap on the GPU.
+            if have_new and len(previous_tasks) < max_concurrent:
+                try:
+                    gen = next(pipelines)
+                    task = _Task(gen, task_index)
+                    task_index += 1
+                    running_tasks.append(task)
+                except StopIteration:
+                    have_new = False
+
+            # Advance every previously-yielded task by one step.
+            for task in previous_tasks:
+                if task.step():
+                    running_tasks.append(task)
+
+            previous_tasks = running_tasks
+    except BaseException:
+        # Clean up all in-flight generators to release GPU resources.
+        for task in previous_tasks:
+            task.close()
+        raise
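The staggered-admission schedule is easiest to see with plain generators. The sketch below reproduces `run_pipeline`'s scheduling loop (logging and the `_Task` wrapper stripped out) and drives three dummy two-yield chunks, recording the interleaved stage order:

```python
from typing import Generator

log = []

def chunk(i) -> Generator[None, None, None]:
    # Mimics muon_chunk_pipeline's two yields: gather, then scatter.
    log.append(f"{i}:gather"); yield
    log.append(f"{i}:compute+scatter"); yield
    log.append(f"{i}:update")

def run_pipeline(pipelines, max_concurrent):
    # Same scheduling loop as async_utils.run_pipeline, minus logging.
    have_new, previous = True, []
    while have_new or previous:
        running = []
        # Admit at most one new chunk per iteration (staggered admission).
        if have_new and len(previous) < max_concurrent:
            gen = next(pipelines, None)
            if gen is None:
                have_new = False
            else:
                next(gen)  # run to first yield (as _Task.__init__ does)
                running.append(gen)
        # Advance every previously-yielded chunk by one stage.
        for gen in previous:
            try:
                next(gen)
                running.append(gen)
            except StopIteration:
                pass
        previous = running

run_pipeline(iter(chunk(i) for i in range(3)), max_concurrent=2)
print(log)
```

Note how chunk 1's gather is launched while chunk 0 sits between its gather and compute stage; in the real optimizer that gap is where the NCCL all-to-all of one chunk overlaps the Newton-Schulz compute of another.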
build/torch210-cxx11-cu128-x86_64-linux/core.py ADDED
@@ -0,0 +1,116 @@
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from torch.distributed.tensor import DTensor
+
+
+@dataclass
+class _muon_state:
+    worker_rank: int
+    process_group: ProcessGroup
+    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
+    rank_numels: dict[int, int]  # local_rank -> numel
+    name: str
+    qk_clip_state: torch.Tensor | None = None
+
+
+def update_g(optimizer_state, p, g, group, momentum):
+    """Apply momentum update to gradient.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        p: Parameter tensor.
+        g: Gradient tensor.
+        group: Parameter group dict.
+        momentum: Momentum coefficient.
+
+    Returns:
+        Momentum-updated gradient tensor.
+    """
+    state = optimizer_state[p]
+    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
+    torch.add(g, buf, alpha=momentum, out=buf)
+    if group["nesterov"]:
+        g.add_(buf, alpha=momentum)
+        return g
+    return buf
+
+
+def update_p(p, u, lr, adjusted_lr, weight_decay):
+    """Apply weight decay and orthogonalized update to parameter.
+
+    Args:
+        p: Parameter (torch.nn.Parameter or DTensor).
+        u: Orthogonalized update tensor.
+        lr: Base learning rate.
+        adjusted_lr: Size-adjusted learning rate.
+        weight_decay: Weight decay coefficient.
+    """
+    if isinstance(p, torch.nn.Parameter):
+        # apply weight decay
+        p.data.mul_(1 - lr * weight_decay)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    else:
+        p.mul_(1 - lr * weight_decay)
+        p.add_(u, alpha=-adjusted_lr)
+
+
+def adjust_lr_for_muon(lr, param_shape):
+    """Scale learning rate based on parameter matrix dimensions.
+
+    Args:
+        lr: Base learning rate.
+        param_shape: Shape of the parameter tensor.
+
+    Returns:
+        Adjusted learning rate.
+    """
+    A, B = param_shape[:2]
+    # We adjust the learning rate and weight decay based on the size of the
+    # parameter matrix, as described in the paper.
+    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+    adjusted_lr = lr * adjusted_ratio
+    return adjusted_lr
+
+
+def default_is_muon(name, x, expert_keys=None):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    if any(key in name for key in skip_keys):
+        return False
+    effective_ndim = x.ndim
+    if expert_keys and any(key in name for key in expert_keys):
+        effective_ndim -= 1
+    return effective_ndim >= 2
+
+
+def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
+    if is_muon_func is None:
+        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
+
+    muon_params, muon_names = [], []
+    non_muon_params = []
+
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
+
+    return [
+        {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
+        },
+        {
+            "params": non_muon_params,
+            "use_muon": False,
+        },
+    ]
build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py CHANGED
@@ -7,22 +7,40 @@ from torch.distributed.tensor.placement_types import (Placement, Shard,
7
  _StridedShard)
8
 
9
 
 
 
 
 
 
 
 
 
 
 
10
  def get_slices_of_dtensor(
11
  target: DTensor | torch.Tensor,
12
  local_rank: int,
13
  shard_mesh: DeviceMesh,
14
  shard_placements: tuple[Placement],
15
- ) -> tuple[slice]:
16
  """
17
- Get the slice of local tensor for a given rank from a tensor.
 
 
 
 
 
18
  Args:
19
- target (DTensor | torch.Tensor): The target tensor.
20
- rank (int): The local rank of the shard group.
21
- shard_mesh (DeviceMesh): The shard mesh. It consists of global ranks.
22
  shard_placements (tuple[Placement]): The shard placements.
23
- """
24
 
25
- slices: list[slice] = [slice(0, dim_size) for dim_size in target.size()]
 
 
 
 
26
 
27
  # find the global rank of the local rank in the shard mesh
28
  rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
@@ -34,34 +52,75 @@ def get_slices_of_dtensor(
34
 
35
  assert len(rank_coords) == len(shard_placements)
36
 
 
 
 
 
37
  # Caution: Assuming replicate-to-shard of the shard mesh goes with
38
  # left-to-right sharding. This is ensured by the sorting logic of
39
  # construct_shard_mesh function.
40
- for i, (rank_coord,
41
- placement) in enumerate(zip(rank_coords, shard_placements)):
42
- assert isinstance(placement, Shard)
43
 
44
- num_ranks = shard_mesh.mesh.shape[i]
 
45
 
46
- dim = placement.dim
47
- dim_size = (slices[dim].stop - slices[dim].start)
 
 
 
48
 
49
- if dim_size % num_ranks != 0:
50
  raise NotImplementedError(
51
- f"Dimension size {dim_size} is not divisible "
52
- f"by number of ranks {num_ranks} for shard "
53
- f"placement on dim {dim}. (shape: {target.shape})")
54
-
55
- shard_size = dim_size // num_ranks
56
-
57
- start = slices[dim].start + rank_coord * shard_size
58
- end = start + shard_size
59
-
60
- assert start < end <= slices[dim].stop
61
-
62
- slices[dim] = slice(start, end)
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- return tuple(slices)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
@@ -71,105 +130,105 @@ _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
71
  def construct_shard_mesh(
72
  placements: tuple[Placement],
73
  mesh: DeviceMesh,
74
- ) -> (DeviceMesh, ProcessGroup, tuple[Placement]):
75
- """
76
- Construct Shard Mesh and Placements for unsharding.
77
- It removes Replicate placements and constructs a new Mesh and ProcessGroup.
78
- """
79
- my_rank = dist.get_rank()
80
 
81
- assert mesh.mesh.device.type == 'cpu'
 
 
82
 
83
- # Copy mesh to avoid modifying the original mesh
84
- mesh = mesh.mesh.clone()
85
-
86
- # 1. Sort placements. Replicate first, then Shard by dim ascending.
87
-
88
- # For Shard, strided shard comes after regular shard on the same dim
89
- # to preserve left-to-right order of replicate-to-shard.
90
- # This is because that strided shard is using stride to represent
91
- # more fine-grained sharding on the same dim.
92
- # Please check the URL below for _StridedShard.
93
- # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/distributed/tensor/placement_types.py#L366
94
-
95
- def placement_sort_key(
96
- placement_with_index: tuple[float, Placement]
97
- ) -> tuple[int, float, int]: # (dim, split factor, original index)
98
- index, placement = placement_with_index
99
- is_replicate = placement.is_replicate()
100
- is_shard = placement.is_shard()
101
- is_partial = placement.is_partial()
102
-
103
- assert is_replicate or is_shard, f"Unsupported placement type: {type(placement)}"
104
- assert not is_partial, "Partial placement is not supported."
105
-
106
- if is_replicate:
107
- return (-1.0, 0, index)
108
- elif is_shard:
109
- if isinstance(placement, _StridedShard):
110
- return (placement.dim, 1 / placement.split_factor, index)
111
- return (placement.dim, 0, index)
112
- else:
113
- raise TypeError(f"Unknown placement type: {type(placement)}")
114
 
115
- placements_with_index: list[tuple[int,
116
- Placement]] = list(enumerate(placements))
117
- placements_with_index = sorted(placements_with_index,
118
- key=placement_sort_key)
119
 
120
- sorted_indices, sorted_placements = zip(*placements_with_index)
 
121
 
122
- # 2. Permute mesh according to sorted placements.
123
- sorted_mesh = mesh.permute(sorted_indices)
 
 
124
 
125
- # 3. Collect list of shard meshes by removing replicate dims
126
- # For example, (2, 3, 4, 4) with placements [R, R, S(0), S(1)]
127
- # shard_meshes should be list with 2 * 3 = 6 shard meshes of shape (4, 4)
128
- num_replicates = sum(1 for p in sorted_placements if p.is_replicate())
129
 
130
- # merge replicate dims
131
- # shard_meshes became a list of shard meshes with a length of replicate degree
132
- if num_replicates > 0:
133
- sorted_mesh = sorted_mesh.flatten(
134
- 0, num_replicates - 1) if num_replicates > 1 else sorted_mesh
135
  shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
136
  else:
137
  shard_meshes = [sorted_mesh]
138
- shard_placements = sorted_placements[num_replicates:]
139
-
140
- # assume all shard placements are different
141
  assert len(shard_placements) == len(set(shard_placements))
142
 
143
- # 4. Construct ProcessGroups
144
- # Caution: all groups should be created in the same order in all processes,
145
- # even though each process only needs its own group.
146
-
147
- # To use tensor as dict key, convert it to tuple
148
- def tensor_to_tuple(t):
149
- if isinstance(t, torch.Tensor):
150
- t = t.tolist()
151
- if isinstance(t, list):
152
- return tuple(tensor_to_tuple(x) for x in t)
153
- return t
154
-
155
- my_shard_mesh_as_tuple = None
156
- for shard_mesh in shard_meshes:
157
- assert isinstance(shard_mesh, torch.Tensor)
158
- shard_mesh_as_tuple = tensor_to_tuple(shard_mesh)
159
-
160
- if (my_rank == shard_mesh).any().item():
161
- assert my_shard_mesh_as_tuple is None
162
- my_shard_mesh_as_tuple = shard_mesh_as_tuple
163
-
164
- # update global cache
165
- if shard_mesh_as_tuple not in _ranks_to_dist_cache:
166
- shard_process_group = dist.new_group(shard_mesh.flatten().tolist())
167
- _ranks_to_dist_cache[shard_mesh_as_tuple] = (
168
- DeviceMesh(device_type="cuda", mesh=shard_mesh),
169
- shard_process_group,
170
  )
171
 
172
- my_shard_mesh, my_shard_process_group = _ranks_to_dist_cache[
173
- my_shard_mesh_as_tuple]
174
-
175
- return my_shard_mesh, my_shard_process_group, shard_placements
 
7
  _StridedShard)
8
 
9
 
10
+ def _is_shard(placement: Placement) -> bool:
11
+ """Check if a placement is a shard type (Shard or _StridedShard).
12
+
13
+ In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
14
+ ``placement.is_shard()`` returns False for _StridedShard. This helper
15
+ handles both old and new hierarchies.
16
+ """
17
+ return isinstance(placement, (Shard, _StridedShard))
18
+
19
+
20
  def get_slices_of_dtensor(
21
  target: DTensor | torch.Tensor,
22
  local_rank: int,
23
  shard_mesh: DeviceMesh,
24
  shard_placements: tuple[Placement],
25
+ ) -> tuple[slice | torch.Tensor, ...]:
26
  """
27
+ Get per-dimension indices for a given rank's shard of the target tensor.
28
+
29
+ Uses ``Shard.local_shard_size_and_offset`` and
30
+ ``_StridedShard.local_shard_size_and_offset`` for correct handling of
31
+ both contiguous and strided (non-contiguous) sharding.
32
+
33
  Args:
34
+ target (DTensor | torch.Tensor): The target tensor (for its shape).
35
+ local_rank (int): The local rank within the shard group.
36
+ shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
37
  shard_placements (tuple[Placement]): The shard placements.
 
38
 
39
+ Returns:
40
+ A tuple of indices (one per tensor dim). Each element is either:
41
+ - A ``slice`` (for contiguous or unsharded dims)
42
+ - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
43
+ """
44
 
45
  # find the global rank of the local rank in the shard mesh
46
  rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
 
52
 
53
  assert len(rank_coords) == len(shard_placements)
54
 
55
+ # Track per-shard-dim indices.
56
+ # A dim absent from the dict means "not yet sharded on this dim".
57
+ dim_indices: dict[int, torch.Tensor] = {}
58
+
59
  # Caution: Assuming replicate-to-shard of the shard mesh goes with
60
  # left-to-right sharding. This is ensured by the sorting logic of
61
  # construct_shard_mesh function.
62
+ for mesh_dim_idx, (rank_coord, placement) in enumerate(
63
+ zip(rank_coords, shard_placements)):
64
+ assert _is_shard(placement)
65
 
66
+ num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
67
+ shard_dim = placement.dim
68
 
69
+ # Current effective size on this dim (may already be sub-sharded)
70
+ if shard_dim in dim_indices:
71
+ curr_size = len(dim_indices[shard_dim])
72
+ else:
73
+ curr_size = target.size()[shard_dim]
74
 
75
+ if curr_size % num_chunks != 0:
76
  raise NotImplementedError(
77
+ f"Dimension size {curr_size} is not divisible "
78
+ f"by number of ranks {num_chunks} for shard "
79
+ f"placement on dim {shard_dim}. (shape: {target.shape})")
80
+
81
+ # Compute indices for this level of sharding
82
+ if isinstance(placement, _StridedShard):
83
+ _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
84
+ placement,
85
+ curr_size,
86
+ num_chunks,
87
+ rank_coord,
88
+ return_first_offset=False)
89
+ new_indices = torch.tensor(offsets, dtype=torch.long)
90
+ else:
91
+ shard_size, offset = Shard.local_shard_size_and_offset(
92
+ curr_size, num_chunks, rank_coord)
93
+ new_indices = torch.arange(offset,
94
+ offset + shard_size,
95
+ dtype=torch.long)
96
+
97
+ # Compose with previous indices on this dim
98
+ if shard_dim in dim_indices:
99
+ dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
100
+ else:
101
+ dim_indices[shard_dim] = new_indices
102
 
103
+ # Build result tuple
104
+ result: list[slice | torch.Tensor] = []
105
+ for d in range(len(target.size())):
106
+ if d not in dim_indices:
107
+ result.append(slice(None))
108
+ else:
109
+ indices = dim_indices[d]
110
+ # Convert contiguous indices to slice for efficiency
111
+ if len(indices) > 0:
112
+ start = indices[0].item()
113
+ expected = torch.arange(start,
114
+ start + len(indices),
115
+ dtype=torch.long)
116
+ if torch.equal(indices, expected):
117
+ result.append(slice(start, start + len(indices)))
118
+ else:
119
+ result.append(indices)
120
+ else:
121
+ result.append(slice(0, 0))
122
+
123
+ return tuple(result)
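The index composition above (each sharding level selects from the indices produced by the previous level, via `dim_indices[shard_dim][new_indices]`) can be sketched in plain Python. `chunk_indices` below is a hypothetical stand-in for `Shard.local_shard_size_and_offset`, assuming an evenly divisible dim:

```python
# Sketch of composing two levels of sharding on the same tensor dim.
# chunk_indices stands in for Shard.local_shard_size_and_offset: it
# returns the index range owned by `rank` out of `num_chunks` even chunks.
def chunk_indices(size, num_chunks, rank):
    assert size % num_chunks == 0
    shard = size // num_chunks
    return list(range(rank * shard, (rank + 1) * shard))

def compose(size, shardings):
    """shardings: list of (num_chunks, rank_coord), outer level first."""
    indices = list(range(size))  # start unsharded
    for num_chunks, rank in shardings:
        inner = chunk_indices(len(indices), num_chunks, rank)
        # same composition step as dim_indices[shard_dim][new_indices]
        indices = [indices[i] for i in inner]
    return indices

# dim of size 8, sharded 2-ways (outer), then 2-ways again (inner):
# rank coords (1, 0) own the first half of the second outer chunk.
print(compose(8, [(2, 1), (2, 0)]))  # -> [4, 5]
```

The contiguous result is exactly why the function converts runs of consecutive indices back into a `slice` at the end.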
124
 
125
 
126
  _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
 
130
  def construct_shard_mesh(
131
  placements: tuple[Placement],
132
  mesh: DeviceMesh,
133
+ ) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
134
+ """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
 
135
 
136
+ Given a DTensor's placements and device mesh, extracts the "shard group"
137
+ — the set of ranks that together hold all shards of the same replica —
138
+ and creates a ProcessGroup for all-to-all among them.
139
 
140
+ Steps:
141
+ 1. Sort placements: Replicate first, then Shard by (dim, granularity).
142
+ 2. Permute the mesh tensor to match the sorted order.
143
+ 3. Collapse Replicate dims into a list of shard sub-meshes (one per replica).
144
+ 4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
145
 
146
+ Example — 8 GPUs, mesh shape (2, 2, 2),
147
+ placements ``[Shard(0), Replicate, _StridedShard(0)]``::
 
 
148
 
149
+ Step 1 Sort: [Replicate, _StridedShard(0), Shard(0)]
150
+ Permutation: [1, 2, 0]
151
 
152
+ Step 2 Permute mesh dims by [1, 2, 0]:
153
+ Original: Permuted:
154
+ [[[0,1],[2,3]], [[[0,4],[1,5]],
155
+ [[4,5],[6,7]]] [[2,6],[3,7]]]
156
 
157
+ Step 3 Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
158
+ sub-mesh 0 = [[0,4],[1,5]] (replica group 0)
159
+ sub-mesh 1 = [[2,6],[3,7]] (replica group 1)
160
+ shard_placements = (_StridedShard(0), Shard(0))
161
 
162
+ Step 4 Rank 0 → ProcessGroup([0,1,4,5])
163
+ Rank 2 → ProcessGroup([2,3,6,7])
164
+
165
+ Returns:
166
+ ``(shard_mesh, process_group, shard_placements)``
167
+ """
168
+ my_rank = dist.get_rank()
169
+ assert mesh.mesh.device.type == 'cpu'
170
+
171
+ # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
172
+ # This avoids a non-collective dist.new_group() call, which would
173
+ # deadlock when only a subset of ranks call this function (e.g. expert
174
+ # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
175
+ if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
176
+ key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
177
+ if key not in _ranks_to_dist_cache:
178
+ _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
179
+ return (*_ranks_to_dist_cache[key], tuple(placements))
180
+
181
+ mesh_tensor = mesh.mesh.clone()
182
+
183
+ # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
184
+ # _StridedShard comes BEFORE regular Shard on the same dim so that
185
+ # get_slices_of_dtensor applies the outer sharding first, matching
186
+ # DTensor's left-to-right (outer-to-inner) composition order.
187
+ def _sort_key(item):
188
+ index, placement = item
189
+ assert not placement.is_partial(), "Partial placement not supported"
190
+ if placement.is_replicate():
191
+ return (-1, 0, index)
192
+ assert _is_shard(placement), f"Unsupported: {type(placement)}"
193
+ split = (-1 / placement.split_factor if isinstance(
194
+ placement, _StridedShard) else 0)
195
+ return (placement.dim, split, index)
196
+
197
+ indexed = sorted(enumerate(placements), key=_sort_key)
198
+ perm, sorted_placements = zip(*indexed)
199
+
200
+ # -- Step 2: Permute mesh to match sorted placement order. --------------
201
+ sorted_mesh = mesh_tensor.permute(perm)
202
+
203
+ # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
204
+ # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
205
+ num_rep = sum(1 for p in sorted_placements if p.is_replicate())
206
+ if num_rep > 0:
207
+ if num_rep > 1:
208
+ sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
209
  shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
210
  else:
211
  shard_meshes = [sorted_mesh]
212
+ shard_placements = sorted_placements[num_rep:]
 
 
213
  assert len(shard_placements) == len(set(shard_placements))
214
 
215
+ # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
216
+ # All ranks must call dist.new_group in the same order, even though each
217
+ # rank only joins one group.
218
+ def _cache_key(t: torch.Tensor) -> tuple:
219
+ return (*t.shape, *t.flatten().tolist())
220
+
221
+ my_key = None
222
+ for sm in shard_meshes:
223
+ key = _cache_key(sm)
224
+ if (my_rank == sm).any().item():
225
+ assert my_key is None, "Rank appears in multiple shard groups"
226
+ my_key = key
227
+ if key not in _ranks_to_dist_cache:
228
+ pg = dist.new_group(sm.flatten().tolist())
229
+ _ranks_to_dist_cache[key] = (
230
+ DeviceMesh(device_type="cuda", mesh=sm),
231
+ pg,
232
  )
233
 
234
+ return (*_ranks_to_dist_cache[my_key], shard_placements)
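The sort-then-permute-then-unbind steps in the docstring can be checked with a small pure-Python mock. The nested-list `permute_120` below mirrors `torch.Tensor.permute(1, 2, 0)` semantics (`out[a][b][c] == mesh[c][a][b]`) for the 8-GPU example, and the resulting groups match the ProcessGroups listed in Step 4:

```python
# Pure-Python check of the 8-GPU docstring example: permute a (2, 2, 2)
# rank mesh by perm = (1, 2, 0), then unbind the leading (replicate) dim
# into one shard sub-mesh per replica.
mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]

def permute_120(m):
    # out[a][b][c] = m[c][a][b], i.e. torch's x.permute(1, 2, 0)
    n = len(m)
    return [[[m[c][a][b] for c in range(n)] for b in range(n)]
            for a in range(n)]

permuted = permute_120(mesh)
sub_meshes = list(permuted)  # "unbind" dim 0
groups = [sorted(x for row in sm for x in row) for sm in sub_meshes]
print(groups)  # -> [[0, 1, 4, 5], [2, 3, 6, 7]]
```

Rank 0 falls in the first group and rank 2 in the second, matching Step 4.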
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -119,10 +119,3 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
122
-
123
-
124
- def matmul_transpose(d_in):
125
- M, _ = d_in.shape
126
- d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
- matmul_transpose_assign(d_in, d_out)
128
- return d_out
 
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
build/torch210-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1 +1,3 @@
1
- {"python-depends":[]}
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch210-cxx11-cu128-x86_64-linux/muon.py CHANGED
@@ -1,536 +1,121 @@
1
  import logging
2
- import math
3
  import types
4
  from collections import defaultdict
5
- from dataclasses import dataclass
6
- from typing import Any, cast
7
 
8
  import torch
9
  import torch.distributed as dist
10
- from torch.distributed import ProcessGroup
11
- from torch.distributed.device_mesh import DeviceMesh
12
- from torch.distributed.tensor import DTensor, Replicate
13
- from torch.distributed.tensor.placement_types import Placement
14
-
15
- from .distributed.utils import construct_shard_mesh, get_slices_of_dtensor
16
- from .matmul_transpose_triton import matmul_transpose_assign
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- COMM_DTYPE = torch.bfloat16
21
- DEFAULT_CHUNK_SIZE_RATIO = 4
22
-
23
-
24
- # This code snippet is a modified version adapted from the following GitHub repositories:
25
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
26
- # Muon's Newton–Schulz iteration causes high variance in singular values
27
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
28
- @torch.no_grad()
29
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
30
- def _zeropower_via_newtonschulz5(G, steps):
31
- """
32
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
33
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
34
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
35
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
36
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
37
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
38
- performance at all relative to UV^T, where USV^T = G is the SVD.
39
- """
40
- assert len(G.shape) == 2
41
- assert G.dtype == COMM_DTYPE
42
- X = G # no manual typecast
43
-
44
- if G.size(0) > G.size(1):
45
- X = X.T
46
- # Ensure spectral norm is at most 1
47
- X = X / (X.norm() + 1e-7)
48
- buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
49
- buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
50
- # Perform the NS iterations
51
- for a, b, c in [
52
- (4.0848, -6.8946, 2.9270),
53
- (3.9505, -6.3029, 2.6377),
54
- (3.7418, -5.5913, 2.3037),
55
- (2.8769, -3.1427, 1.2046),
56
- (2.8366, -3.0525, 1.2012),
57
- ]:
58
- matmul_transpose_assign(X, buf1)
59
- matmul_transpose_assign(buf1, buf2)
60
- buf1.mul_(b).add_(buf2, alpha=c)
61
- X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
62
-
63
- if G.size(0) > G.size(1):
64
- X = X.T
65
- return X
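The iteration body acts independently on each singular value of `X`. Writing the SVD as X_k = U Σ_k Vᵀ, one step reduces to a quintic polynomial applied entrywise to Σ_k:

```latex
% Action of one Newton--Schulz step on the singular values of X_k
X_{k+1} = a\,X_k + \bigl(b\,(X_k X_k^{\top}) + c\,(X_k X_k^{\top})^{2}\bigr)\,X_k
\quad\Longrightarrow\quad
\sigma_{k+1} = a\,\sigma_k + b\,\sigma_k^{3} + c\,\sigma_k^{5}.
```

Since `X` is pre-normalized by its norm, every σ starts in (0, 1], and the five tuned (a, b, c) triples drive each σ into roughly (0.5, 1.5) rather than exactly 1, as the docstring notes.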
66
-
67
-
68
- @dataclass
69
- class _muon_state:
70
- # TODO: use Optional
71
- worker_rank: int
72
- process_group: ProcessGroup
73
- shard_mesh: DeviceMesh
74
- shard_placements: tuple[Placement, ...]
75
- name: str
76
- qk_clip_state: torch.Tensor | None = None
77
- gathered_grad: torch.Tensor | None = None
78
- scattered_u: DTensor | None = None
79
- computed_u: torch.Tensor | None = None
80
- gather_event: torch.cuda.Event | None = None
81
- compute_event: torch.cuda.Event | None = None
82
- scatter_event: torch.cuda.Event | None = None
83
-
84
-
85
- def numel_for_rank(
86
- param: DTensor,
87
- local_rank: int,
88
- state: _muon_state,
89
- ) -> int:
90
- slices = get_slices_of_dtensor(
91
- param,
92
- local_rank,
93
- state.shard_mesh,
94
- state.shard_placements,
95
- )
96
-
97
- numel = 1
98
- for s, dim in zip(slices, param.shape):
99
- start, stop, step = s.indices(dim)
100
- length = max(0, (stop - start + (step - 1)) // step)
101
- numel *= length
102
-
103
- return numel
104
-
105
-
106
- @torch.no_grad()
107
- def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
108
- """
109
- Pre-allocate gathered_grad buffer on compute_stream
110
- before launching all2all gather
111
- """
112
- with torch.cuda.stream(compute_stream):
113
- for p in params:
114
- state = param_to_state[id(p)]
115
- if rank == state.worker_rank:
116
- state.gathered_grad = torch.empty(p.shape,
117
- dtype=COMM_DTYPE,
118
- device="cuda")
119
- else:
120
- state.gathered_grad = None
121
-
122
- alloc_event = torch.cuda.Event()
123
- alloc_event.record(compute_stream)
124
- return alloc_event
125
-
126
-
127
- @torch.no_grad()
128
- def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
129
- alloc_event):
130
- """
131
- All2all gathers shards so each owner rank reconstructs its full gradient
132
- """
133
- with torch.cuda.stream(comm_stream):
134
- process_group = param_to_state[id(params[0])].process_group
135
- num_ranks = dist.get_world_size(group=process_group)
136
-
137
- # Construct sending buffers
138
- per_dst = [[] for _ in range(num_ranks)]
139
- send_counts = [0] * num_ranks
140
-
141
- for p in params:
142
- state = param_to_state[id(p)]
143
- dst = state.worker_rank
144
- assert dst < num_ranks
145
- shard_elems = numel_for_rank(p, rank, state)
146
- g = p.grad
147
- g = g.to_local().to(COMM_DTYPE).contiguous()
148
- assert g.numel() == shard_elems
149
- per_dst[dst].append(g.view(-1))
150
- send_counts[dst] += shard_elems
151
-
152
- assert any(
153
- len(v) > 0 for v in per_dst
154
- ), "At least one destination rank must receive a sharded tensor"
155
- # list[list[Tensor]] -> list[Tensor]
156
- per_dst = [t for dst in per_dst for t in dst]
157
-
158
- send_buf = torch.cat(per_dst, dim=0)
159
-
160
- owned_params = [
161
- p for p in params if param_to_state[id(p)].worker_rank == rank
162
- ]
163
-
164
- # Compute receive sizes and allocate receiving buffers
165
- recv_counts = [0] * num_ranks
166
-
167
- for src in range(num_ranks):
168
- total = 0
169
- for p in owned_params:
170
- state = param_to_state[id(p)]
171
- assert state.worker_rank == rank
172
- total += numel_for_rank(p, src, state)
173
- recv_counts[src] = total
174
-
175
- recv_total = sum(recv_counts)
176
- recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
177
-
178
- #All2All
179
- logger.debug(f"send_buf size: {send_buf.numel()}, "
180
- f"recv_buf size: {recv_buf.numel()}, "
181
- f"recv_counts: {recv_counts}, "
182
- f"send_counts: {send_counts}, "
183
- f"process_group: {str(process_group)}")
184
- dist.all_to_all_single(
185
- recv_buf,
186
- send_buf,
187
- output_split_sizes=recv_counts,
188
- input_split_sizes=send_counts,
189
- group=process_group,
190
- )
191
-
192
- # Reconstructs gathered grad from the received buffer
193
- #
194
- # recv_buf (num ranks = 3)
195
- #
196
- # From rank 0 From rank 1 From rank 2
197
- # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
198
- #
199
- # Outer loop:
200
- # rank 0 -> rank 1 -> rank2
201
- #
202
- # Inner loop:
203
- # p1_n -> p2_n -> p3_n
204
-
205
- comm_stream.wait_event(alloc_event)
206
-
207
- off = 0
208
- for src in range(num_ranks):
209
- if recv_counts[src] == 0:
210
- continue
211
-
212
- block = recv_counts[src]
213
- inner_off = 0
214
- for p in owned_params:
215
- state = param_to_state[id(p)]
216
- assert state.worker_rank == rank
217
-
218
- # get the slice of the full dtensor corresponding to rank src.
219
- slices = get_slices_of_dtensor(state.gathered_grad, src,
220
- state.shard_mesh,
221
- state.shard_placements)
222
-
223
- dst = state.gathered_grad[slices]
224
- assert dst._base is state.gathered_grad
225
-
226
- n = dst.numel()
227
- assert n > 0
228
-
229
- sg = recv_buf.narrow(0, off + inner_off, n)
230
- sg = sg.reshape_as(dst)
231
- dst.copy_(sg)
232
-
233
- inner_off += n
234
- off += block
235
-
236
- for p in params:
237
- state = param_to_state[id(p)]
238
- if state.worker_rank == rank:
239
- state.gather_event = torch.cuda.Event()
240
- state.gather_event.record(comm_stream)
241
- else:
242
- state.gathered_grad = None
243
- state.gather_event = None
244
- if none_grad:
245
- p.grad = None
246
-
247
-
248
- @torch.no_grad()
249
- def _compute_u(p, state, steps, rank, compute_stream):
250
- """
251
- On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
252
- """
253
- with torch.cuda.stream(compute_stream):
254
- if rank == state.worker_rank:
255
- if state.gather_event is None:
256
- raise RuntimeError("Gather event must be set before compute.")
257
- compute_stream.wait_event(state.gather_event)
258
- u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
259
- state.gathered_grad = None
260
- state.computed_u = u
261
- state.compute_event = torch.cuda.Event()
262
- state.compute_event.record()
263
- else:
264
- state.computed_u = None
265
- state.compute_event = None
266
-
267
-
268
- @torch.no_grad()
269
- def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
270
- """
271
- Pre-allocate scattered_u buffer on compute_stream
272
- before launching all2all gather
273
- """
274
- with torch.cuda.stream(compute_stream):
275
- for p in params:
276
- state = param_to_state[id(p)]
277
- state.scattered_u = torch.empty_like(p.to_local(),
278
- dtype=COMM_DTYPE)
279
-
280
- alloc_event = torch.cuda.Event()
281
- alloc_event.record(compute_stream)
282
- return alloc_event
283
-
284
-
285
- def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
286
- """
287
- All2all scatters full gradients to all ranks
288
- """
289
- with torch.cuda.stream(comm_stream):
290
- process_group = param_to_state[id(params[0])].process_group
291
- num_ranks = dist.get_world_size(group=process_group)
292
- owned_params = [
293
- p for p in params if param_to_state[id(p)].worker_rank == rank
294
- ]
295
-
296
- # Construct sending buffer
297
- per_dst = [[] for _ in range(num_ranks)]
298
- send_counts = [0] * num_ranks
299
-
300
- if owned_params:
301
- for p in owned_params:
302
- state = param_to_state[id(p)]
303
- if state.compute_event is None:
304
- raise RuntimeError(
305
- "Compute event must be set before scatter.")
306
- comm_stream.wait_event(state.compute_event)
307
- state.gathered_grad = None
308
-
309
- assert state.computed_u is not None
310
-
311
- u_full = state.computed_u.to(COMM_DTYPE).contiguous()
312
-
313
- offset = 0
314
- for dst in range(num_ranks):
315
- # get the slice of the full tensor corresponding to rank dst.
316
- slices = get_slices_of_dtensor(u_full, dst,
317
- state.shard_mesh,
318
- state.shard_placements)
319
- su = u_full[slices].flatten()
320
-
321
- n = su.numel()
322
- assert n > 0
323
-
324
- per_dst[dst].append(su)
325
- send_counts[dst] += n
326
- offset += n
327
-
328
- assert offset == u_full.numel()
329
-
330
- lengths = [len(v) for v in per_dst]
331
- if all(l > 0 for l in lengths):
332
- assert all(
333
- l == lengths[0] for l in lengths
334
- ), "All destination ranks must have the same number of sharded tensor"
335
- # list[list[Tensor]] -> list[Tensor]
336
- per_dst = [t for dst in per_dst for t in dst]
337
- send_buf = torch.cat(per_dst, dim=0)
338
- else:
339
- # all_to_all requires participation from all ranks
340
- # Even non-owner ranks must join the collective call
341
- send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
342
-
343
- # Compute receive sizes and allocate receiving buffers
344
- recv_counts = [0] * num_ranks
345
-
346
- for src in range(num_ranks):
347
- total = 0
348
- for p in params:
349
- state = param_to_state[id(p)]
350
- if state.worker_rank != src:
351
- continue
352
- total += numel_for_rank(p, rank, state)
353
- recv_counts[src] = total
354
-
355
- recv_total = sum(recv_counts)
356
- assert recv_total > 0
357
- recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
358
-
359
- #All2All
360
- dist.all_to_all_single(
361
- recv_buf,
362
- send_buf,
363
- output_split_sizes=recv_counts,
364
- input_split_sizes=send_counts,
365
- group=process_group,
366
- )
367
-
368
- # Copy to pre-allocated scattered_u buffer from the received buffer
369
- #
370
- # recv_buf (num ranks = 3, local_rank = 0)
371
- #
372
- # From rank 0 From rank 1 From rank 2
373
- # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
374
- #
375
- # Outer loop:
376
- # rank 0 -> rank 1 -> rank2
377
- #
378
- # Inner loop:
379
- # src(0) : p1_0 -> p2_0 -> p3_0
380
- # src(1) : p4_0
381
- # src(2) : p5_0 -> p6_0
382
-
383
- comm_stream.wait_event(alloc_event)
384
-
385
- off = 0
386
- for src in range(num_ranks):
387
- block = recv_counts[src]
388
- if block == 0:
389
- continue
390
-
391
- inner_off = 0
392
- for p in params:
393
- state = param_to_state[id(p)]
394
- if state.worker_rank != src:
395
- continue
396
- n = numel_for_rank(p, rank, state)
397
- assert n > 0
398
 
399
- flat_local = recv_buf.narrow(0, off + inner_off,
400
- n).view_as(p.to_local())
401
- state.scattered_u.copy_(flat_local)
402
 
403
- state.scatter_event = torch.cuda.Event()
404
- state.scatter_event.record(comm_stream)
405
- inner_off += n
 
 
406
 
407
- assert inner_off == block
408
- off += block
409
 
 
410
 
411
- def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
412
- compute_stream):
413
- """
414
- Update sharded parameter p with the scattered_u.
415
- Only worker_rank frees computed_u.
416
  """
417
- with torch.cuda.stream(compute_stream):
418
- if state.scatter_event is None:
419
- raise RuntimeError("Scatter event must be set before update")
420
- compute_stream.wait_event(state.scatter_event)
421
- u_dtensor = DTensor.from_local(
422
- state.scattered_u,
423
- placements=p.placements,
424
- device_mesh=p.device_mesh,
425
- )
426
-
427
- state.scattered_u = u_dtensor
428
-
429
- if rank == state.worker_rank:
430
- # Free computed_u
431
- state.computed_u = None
432
-
433
- Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
434
- state.scattered_u = None
435
- u_dtensor = None
436
-
437
- scales_full = Muon._compute_scales(
438
- p,
439
- state.qk_clip_state) if state.qk_clip_state is not None else None
440
- if scales_full is not None:
441
- # Have to slice scales_full among dim 0
442
- weight_slices = get_slices_of_dtensor(p, rank, state.shard_mesh,
443
- state.shard_placements)
444
- ratio = p.shape[0] // scales_full.shape[0]
445
- scales_slice = slice(
446
- None if weight_slices[0].start is None else
447
- weight_slices[0].start // ratio,
448
- None if weight_slices[0].stop is None else
449
- weight_slices[0].stop // ratio,
450
- None,
451
- )
452
-
453
- scales_local = scales_full[scales_slice]
454
- scales_local = DTensor.from_local(
455
- scales_local,
456
- placements=p.placements,
457
- device_mesh=p.device_mesh,
458
- )
459
- Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
460
-
461
-
462
- def default_is_muon(name, x):
463
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
464
- return x.ndim >= 2 and not any(key in name for key in skip_keys)
465
-
466
-
467
- def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
468
- muon_params, muon_names = [], []
469
- non_muon_params = []
470
-
471
- for n, p in model.named_parameters():
472
- if not p.requires_grad:
473
  continue
474
- if is_muon_func(n, p):
475
- muon_params.append(p)
476
- muon_names.append(n)
477
- else:
478
- non_muon_params.append(p)
479
-
480
- return [
481
- {
482
- "params": muon_params,
483
- "names": muon_names,
484
- "use_muon": True,
485
- },
486
- {
487
- "params": non_muon_params,
488
- "use_muon": False,
489
- },
490
- ]
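The grouping rule above (2-D and larger parameters go to Muon unless their name marks them as embedding/output layers) can be exercised with dummy parameter objects instead of a real `nn.Module` — the `P` mock below is hypothetical, only `ndim` and `requires_grad` matter:

```python
# Minimal stand-in check of the Muon/AdamW grouping rule, using dummy
# parameter objects (hypothetical mock) instead of a real nn.Module.
class P:
    def __init__(self, ndim, requires_grad=True):
        self.ndim, self.requires_grad = ndim, requires_grad

def default_is_muon(name, x):
    skip = ("embed_tokens", "lm_head", "tok_embeddings", "output")
    return x.ndim >= 2 and not any(k in name for k in skip)

named = [("model.layers.0.attn.wq.weight", P(2)),   # 2-D -> Muon
         ("model.embed_tokens.weight", P(2)),       # embedding -> AdamW
         ("model.norm.weight", P(1))]               # 1-D -> AdamW
muon = [n for n, p in named if p.requires_grad and default_is_muon(n, p)]
print(muon)  # -> ['model.layers.0.attn.wq.weight']
```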
491
-
492
-
493
- def parse_qk_layer(name: str) -> tuple[str | None, int]:
494
- """
495
- Parse a parameter name to check if it is a query/key projection layer
496
- ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
497
-
498
- Returns:
499
- (kind, layer_idx) or (None, -1) if not matched.
500
-
501
- Example:
502
- 'model.3.attn.wq.weight' -> ('wq', 3)
503
- 'model.5.attn.wk.weight' -> ('wk', 5)
504
- 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
505
- 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
506
- 'model.4.attn.v_proj.weight' -> (None, -1)
507
- """
508
- parts = name.split('.')
509
- if len(parts) < 3:
510
- return None, -1
511
-
512
- kind = parts[-2]
513
-
514
- layer_idx = -1
515
- for part in reversed(parts):
516
- if part.isdigit():
517
- layer_idx = int(part)
518
- break
519
 
520
- if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
521
- return kind, layer_idx
 
522
 
523
- return None, -1
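A condensed restatement of the parsing logic (kind is the second-to-last dotted component, layer index the last all-digit component) makes the docstring examples easy to check:

```python
# Condensed restatement of parse_qk_layer for checking the docstring
# examples; the generator-based index lookup is a compact variant of the
# reversed-parts loop above.
def parse_qk_layer(name):
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]
    layer_idx = next((int(p) for p in reversed(parts) if p.isdigit()), -1)
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1

assert parse_qk_layer('model.3.attn.wq.weight') == ('wq', 3)
assert parse_qk_layer('model.2.attn.q_proj.weight') == ('q_proj', 2)
assert parse_qk_layer('model.4.attn.v_proj.weight') == (None, -1)
```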
 
524
 
 
525
 
526
- @dataclass
527
- class QKClipInfo:
528
- """Per-parameter dynamic info computed from config + runtime logits."""
529
- kind: str | None # 'wq'/'q_proj' or 'wk'/'k_proj' or None
530
- indices: list[int] # which heads to consider for clipping
531
- head_dim: int # from config
532
- threshold: float # from config
533
- logit: torch.Tensor | None
534
 
535
 
536
  class Muon(torch.optim.Optimizer):
@@ -554,7 +139,7 @@ class Muon(torch.optim.Optimizer):
554
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
555
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
556
  weight_decay: The weight decay for Muon and AdamW.
557
- {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
558
  adamw_lr: The learning rate for the internal AdamW.
559
  adamw_betas: The betas for the internal AdamW.
560
  adamw_eps: The epsilon for the internal AdamW.
@@ -564,7 +149,7 @@ class Muon(torch.optim.Optimizer):
564
  - "q_indices" (list[int]): Indices of query heads to consider.
565
  - "k_indices" (list[int]): Indices of key heads to consider.
566
  - "head_dim" (int): Dimensionality of each attention head.
567
- - "threshold" (float): Threshold value; heads whose QK logits exceed
568
  this value will be scaled down.
569
  Default is:
570
  {
@@ -584,6 +169,13 @@ class Muon(torch.optim.Optimizer):
584
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
585
  For testing purpose only.
586
  small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
587
  """
588
 
589
  def __init__(self,
@@ -597,16 +189,12 @@ class Muon(torch.optim.Optimizer):
597
  adamw_eps=1e-8,
598
  none_grad=True,
599
  debug=False,
600
- clip_config={
601
- "q_indices": [],
602
- "k_indices": [],
603
- "head_dim": 128,
604
- "threshold": 100
605
- },
606
  warmup_step=5,
607
  chunk_size=-1,
608
  use_distributed_muon=False,
609
- small_param_numel_threshold=65536):
 
610
  defaults = dict(
611
  lr=lr,
612
  weight_decay=weight_decay,
@@ -630,16 +218,18 @@ class Muon(torch.optim.Optimizer):
630
 
631
  super().__init__(params, defaults)
632
 
633
- self.rank = None
634
-
635
- self.comm_stream = torch.cuda.Stream()
636
- self.compute_stream = torch.cuda.Stream()
637
  self.debug = debug
638
- self.clip_config = clip_config
639
  self.warmup_step = warmup_step
640
  self.chunk_size = chunk_size
641
  self.use_distributed_muon = use_distributed_muon
642
  self.small_param_numel_threshold = small_param_numel_threshold
 
643
 
644
  def _calc_flops(self, G, steps):
645
  assert len(G.shape) == 2
@@ -649,20 +239,6 @@ class Muon(torch.optim.Optimizer):
649
 
650
  return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
651
 
652
- def adjust_lr_for_muon(self, lr, param_shape):
653
- A, B = param_shape[:2]
654
- # We adjust the learning rate and weight decay based on the size of the parameter matrix
655
- # as described in the paper
656
- adjusted_ratio = 0.2 * math.sqrt(max(A, B))
657
- adjusted_lr = lr * adjusted_ratio
658
- return adjusted_lr
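The adjustment above scales the base learning rate by 0.2·√max(A, B) of the parameter matrix shape, so larger matrices receive a larger effective step:

```python
import math

# Muon lr adjustment: scale by 0.2 * sqrt(max(A, B)) of the parameter
# matrix shape (same rule as adjust_lr_for_muon above).
def adjust_lr_for_muon(lr, param_shape):
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))

print(adjust_lr_for_muon(1e-3, (4096, 1024)))  # 0.2 * 64 * 1e-3 ≈ 0.0128
```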
659
-
660
- def set_rank_once(self, rank):
661
- if self.rank is None:
662
- self.rank = rank
663
- else:
664
- assert self.rank == rank
665
-
666
  def get_shard_mesh(self, p):
667
  """
668
  Get the shard mesh for a parameter p on the given rank.
@@ -673,9 +249,6 @@ class Muon(torch.optim.Optimizer):
673
  shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
674
  p.placements, p.device_mesh)
675
 
676
- # set rank with the local rank in the shard process group
677
- self.set_rank_once(dist.get_rank(group=shard_pg))
678
-
679
  return shard_mesh, shard_pg, shard_placements
680
 
681
  def init_state_and_assign_params(self, names, params, group, qk_logits):
@@ -694,8 +267,8 @@ class Muon(torch.optim.Optimizer):
694
  total_flops += flops
695
 
696
  if self.debug:
697
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
698
- flush=True)
699
 
700
  paired = list(zip(names, params))
701
 
@@ -724,44 +297,54 @@ class Muon(torch.optim.Optimizer):
724
 
725
  worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
726
  round_robin = (round_robin + 1) % len(shard_mesh_flattened)
727
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
728
 
729
  param_to_state[id(p)] = _muon_state(
730
  worker_rank=worker_rank,
731
  process_group=shard_pg,
732
- shard_mesh=shard_mesh,
733
- shard_placements=shard_placements,
734
  name=n,
735
  qk_clip_state=qk_clip_state,
736
  )
737
 
738
  return param_to_state, ordered_params
739
 
740
- def base(self, names, params, group, lr, weight_decay, momentum,
741
- qk_logits):
742
- # generate weight updates in distributed fashion
743
  for n, p in zip(names, params):
744
  g = p.grad
745
  if g is None:
746
  continue
747
- if g.ndim > 2:
748
- g = g.view(g.size(0), -1)
749
- assert g is not None
750
-
751
- g = self._update_g(p, g, group, momentum)
752
 
753
  u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
754
  steps=group["ns_steps"])
755
 
756
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
757
- Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
758
 
759
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
760
 
761
- scales_full = self._compute_scales(
762
  p, qk_clip_state) if qk_clip_state is not None else None
763
  if scales_full is not None:
764
- Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
765
 
766
  def distributed_muon(
767
  self,
@@ -770,20 +353,15 @@ class Muon(torch.optim.Optimizer):
770
  group: dict[str, Any],
771
  lr: float,
772
  weight_decay: float,
773
- momentum: float,
774
  qk_logits: list[torch.Tensor | DTensor] | None,
775
  ):
776
  """ Implementation of Distributed Muon by Liu et al. """
777
 
 
778
  for n, p in zip(names, params):
779
  g = p.grad
780
  if g is None:
781
  continue
782
- if g.ndim > 2:
783
- g = g.view(g.size(0), -1)
784
- assert g is not None
785
-
786
- g = self._update_g(p, g, group, momentum)
787
 
788
  # Gather G
789
  if isinstance(p.data, DTensor):
@@ -796,16 +374,16 @@ class Muon(torch.optim.Optimizer):
796
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
797
  steps=group["ns_steps"])
798
 
799
- adjusted_lr = self.adjust_lr_for_muon(lr, p_full.shape)
800
- Muon._update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
801
 
802
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
803
 
804
- scales_full = self._compute_scales(
805
  p_full, qk_clip_state) if qk_clip_state is not None else None
806
 
807
  if scales_full is not None:
808
- Muon._qk_clip(p_full, scales_full, qk_clip_state.head_dim)
809
 
810
  if isinstance(p.data, DTensor):
811
  ndims = len(p.device_mesh.mesh.shape)
@@ -822,244 +400,53 @@ class Muon(torch.optim.Optimizer):
822
 
823
  p.copy_(p_sharded)
824
 
825
- def _update_g(self, p, g, group, momentum):
826
- # calc update
827
- state = self.state[p]
828
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
829
- torch.add(g, buf, alpha=momentum, out=buf)
830
- if group["nesterov"]:
831
- g.add_(buf, alpha=momentum)
832
- return g
833
- return buf
834
-
835
- @staticmethod
836
- def _update_p(p, u, lr, adjusted_lr, weight_decay):
837
- if isinstance(p, torch.nn.Parameter):
838
- # apply weight decay
839
- p.data.mul_(1 - lr * weight_decay)
840
- # apply update
841
- p.data.add_(u, alpha=-adjusted_lr)
842
- else:
843
- p.mul_(1 - lr * weight_decay)
844
- p.add_(u, alpha=-adjusted_lr)
845
-
846
- def get_qk_clip_info(self, n, qk_logits):
847
- if self.clip_config is None:
848
- return None
849
-
850
- head_dim = self.clip_config.get('head_dim')
851
- threshold = self.clip_config.get('threshold')
852
- kind, layer_idx = parse_qk_layer(n)
853
-
854
- logit, indices = None, []
855
- if qk_logits is not None and kind is not None:
856
- logit = qk_logits[layer_idx]
857
- indices_key = 'q_indices' if 'q' in kind else 'k_indices'
858
- indices = self.clip_config.get(indices_key, []) or []
859
-
860
- if isinstance(logit, DTensor):
861
- # In TP settings, qk_logits may be DTensor
862
- # We convert it to full tensor here for simplicity
863
- logit = logit.full_tensor()
864
-
865
- return QKClipInfo(
866
- kind=kind,
867
- indices=indices,
868
- head_dim=head_dim,
869
- threshold=threshold,
870
- logit=logit,
871
- )
872
-
873
- @staticmethod
874
- def _compute_scales(p, qk_clip_state):
875
- kind = qk_clip_state.kind
876
- indices = qk_clip_state.indices
877
- head_dim = qk_clip_state.head_dim
878
- threshold = qk_clip_state.threshold
879
- logit = qk_clip_state.logit
880
-
881
- H_global = p.shape[0] // head_dim
882
- scales_full = torch.ones(H_global, device=p.data.device)
883
- scaling = 0
884
-
885
- for logit_idx, head_idx in enumerate(indices):
886
- v_ele = float(logit[logit_idx])
887
- if v_ele > threshold:
888
- new_scale = math.sqrt(threshold / v_ele)
889
- if new_scale < scales_full[head_idx]:
890
- scales_full[head_idx] = new_scale
891
- logger.info(
892
- f"[{kind}] Head {head_idx} exceeded threshold "
893
- f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
894
- )
895
- scaling += 1
896
-
897
- return scales_full if scaling > 0 else None
898
-
899
- @staticmethod
900
- def _qk_clip(p, scales, head_dim):
901
- if isinstance(p, torch.nn.Parameter):
902
- W = p.data.view(-1, head_dim, p.data.shape[1])
903
- W.mul_(scales.view(-1, 1, 1))
904
- else:
905
- W = p.view(-1, head_dim, p.shape[1])
906
- W.mul_(scales.view(-1, 1, 1))
907
-
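For reference, the per-head scale that the removed `_compute_scales` computes (now in `qk_clip.py`) reduces to `sqrt(threshold / logit)` for heads whose max QK logit exceeds the threshold; the square root appears because both the query and key projections are scaled, so their product restores the full ratio. A plain-Python sketch with an illustrative helper name:

```python
import math

def qk_scales(logits, threshold):
    """Per-head clipping factor: heads over the threshold get
    sqrt(threshold / logit); all other heads are left at 1.0."""
    return [math.sqrt(threshold / v) if v > threshold else 1.0
            for v in logits]

# Head 2's logit of 400 exceeds the threshold of 100, so its
# weights are scaled by sqrt(100 / 400) = 0.5.
print(qk_scales([50.0, 100.0, 400.0], threshold=100.0))
```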
908
- def parallel(self, names, params, group, lr, weight_decay, momentum,
909
- qk_logits):
910
  """
911
  Perform a parallel optimization step using Muon.
912
- """
913
 
914
- for p in params:
915
- g = p.grad
916
- if g is None:
917
- continue
918
- if g.ndim > 2:
919
- g = g.view(g.size(0), -1)
920
 
921
- # Update g in the local rank
922
- g = self._update_g(
923
- p,
924
- g,
925
- group,
926
- momentum=momentum,
927
- )
928
- p.grad = g
929
 
930
  param_to_state, ordered_params = self.init_state_and_assign_params(
931
  names, params, group, qk_logits)
932
 
933
- assert self.rank is not None
934
-
935
- def enqueue_all2all_gather(start_idx, chunk_size):
936
- target_params = ordered_params[start_idx:start_idx + chunk_size]
937
- if target_params:
938
- alloc_event = _alloc_gathered_grad(target_params,
939
- param_to_state, self.rank,
940
- self.compute_stream)
941
- _all2all_gather(target_params, param_to_state, self.rank,
942
- self.comm_stream, group["none_grad"],
943
- alloc_event)
944
-
945
- def enqueue_computes(start_idx, chunk_size):
946
- for p in ordered_params[start_idx:start_idx + chunk_size]:
947
- state = param_to_state[id(p)]
948
- _compute_u(p, state, group["ns_steps"], self.rank,
949
- self.compute_stream)
950
-
951
- def enqueue_all2all_scatter(start_idx, chunk_size):
952
- target_params = ordered_params[start_idx:start_idx + chunk_size]
953
- if target_params:
954
- alloc_event = _alloc_scattered_u(target_params, param_to_state,
955
- self.rank,
956
- self.compute_stream)
957
- _all2all_scatter(target_params, param_to_state, self.rank,
958
- self.comm_stream, alloc_event)
959
-
960
- def enqueue_update_param(start_idx, chunk_size):
961
- for p in ordered_params[start_idx:start_idx + chunk_size]:
962
- state = param_to_state[id(p)]
963
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
964
- _update_param(p, state, lr, adjusted_lr, weight_decay,
965
- self.rank, self.compute_stream)
966
 
967
  if self.chunk_size == -1:
968
  shard_ranks = dist.get_world_size(param_to_state[id(
969
- params[0])].process_group)
970
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
971
  elif self.chunk_size > 0:
972
  chunk_size = self.chunk_size
973
  else:
974
  raise ValueError("chunk_size must be -1 or a positive integer.")
975
 
976
- # Wait grad update
977
- self.comm_stream.wait_stream(torch.cuda.current_stream())
978
-
979
- warmup_step = self.warmup_step
980
- for i in range(0, warmup_step):
981
- enqueue_all2all_gather(i * chunk_size, chunk_size)
982
- enqueue_computes(i * chunk_size, chunk_size)
983
-
984
- for i in range(0, len(params) + chunk_size - 1, chunk_size):
985
- enqueue_all2all_scatter(i, chunk_size)
986
- enqueue_all2all_gather(i + warmup_step * chunk_size, chunk_size)
987
- enqueue_update_param(i, chunk_size)
988
- enqueue_computes(i + warmup_step * chunk_size, chunk_size)
989
-
990
- # Wait the last update_param to finish
991
- torch.cuda.current_stream().wait_stream(self.compute_stream)
992
-
993
- @staticmethod
994
- def _fused_adamw(
995
- params: list[torch.Tensor],
996
- grads: list[torch.Tensor],
997
- exp_avgs: list[torch.Tensor],
998
- exp_avg_sqs: list[torch.Tensor],
999
- max_exp_avg_sqs: list[torch.Tensor],
1000
- state_steps: list[torch.Tensor],
1001
- amsgrad: bool,
1002
- beta1: float,
1003
- beta2: float,
1004
- lr: float | torch.Tensor,
1005
- weight_decay: float,
1006
- eps: float,
1007
- maximize: bool,
1008
- ) -> None:
1009
- if not params:
1010
- return
1011
 
1012
- # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
1013
- # treating it as a scalar.
1014
- lr_dict: DeviceDict | None = ({
1015
- lr.device: lr
1016
- } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
1017
- None)
1018
- grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
1019
- [
1020
- params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
1021
- state_steps
1022
- ] # type: ignore[list-item]
1023
- )
1024
- for (device, _), (
1025
- (
1026
- device_params_,
1027
- device_grads_,
1028
- device_exp_avgs_,
1029
- device_exp_avg_sqs_,
1030
- device_max_exp_avg_sqs,
1031
- device_state_steps_,
1032
- ),
1033
- _,
1034
- ) in grouped_tensors.items():
1035
- device_params = cast(list[torch.Tensor], device_params_)
1036
- device_grads = cast(list[torch.Tensor], device_grads_)
1037
- device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
1038
- device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
1039
- device_state_steps = cast(list[torch.Tensor], device_state_steps_)
1040
-
1041
- if lr_dict is not None and device not in lr_dict:
1042
- lr_dict[device] = lr.to(
1043
- device=device,
1044
- non_blocking=True) # type: ignore[union-attr]
1045
- lr = lr_dict[device]
1046
- torch._foreach_add_(device_state_steps, 1)
1047
- func = torch._fused_adamw_
1048
- func(
1049
- device_params,
1050
- device_grads,
1051
- device_exp_avgs,
1052
- device_exp_avg_sqs,
1053
- device_max_exp_avg_sqs, # type: ignore[arg-type]
1054
- device_state_steps,
1055
- amsgrad=amsgrad,
1056
- lr=lr, # type: ignore[arg-type]
1057
- beta1=beta1,
1058
- beta2=beta2,
1059
- weight_decay=weight_decay,
1060
- eps=eps,
1061
- maximize=maximize,
1062
- )
1063
 
1064
  def _step_muon(self, group, qk_logits=None):
1065
  params = group["params"]
@@ -1068,6 +455,18 @@ class Muon(torch.optim.Optimizer):
1068
  momentum = group["momentum"]
1069
  names = group["names"]
1070
1071
  param_dtensors = []
1072
  name_dtensors = []
1073
 
@@ -1083,7 +482,6 @@ class Muon(torch.optim.Optimizer):
1083
  group=group,
1084
  lr=lr,
1085
  weight_decay=weight_decay,
1086
- momentum=momentum,
1087
  qk_logits=qk_logits)
1088
  return
1089
 
@@ -1119,7 +517,6 @@ class Muon(torch.optim.Optimizer):
1119
  # and run parallel Muon on each group.
1120
 
1121
  placement_to_params = defaultdict(lambda: ([], []))
1122
- # type: dict[tuple[Placement, DeviceMesh], tuple[list[str], list[DTensor]]]
1123
 
1124
  assert len(dtensors) == len(names)
1125
  for p, n in zip(dtensors, names):
@@ -1141,7 +538,6 @@ class Muon(torch.optim.Optimizer):
1141
  group=group,
1142
  lr=lr,
1143
  weight_decay=weight_decay,
1144
- momentum=momentum,
1145
  qk_logits=qk_logits,
1146
  )
1147
 
@@ -1159,7 +555,6 @@ class Muon(torch.optim.Optimizer):
1159
  group,
1160
  lr=lr,
1161
  weight_decay=weight_decay,
1162
- momentum=momentum,
1163
  qk_logits=qk_logits,
1164
  )
1165
 
@@ -1170,78 +565,9 @@ class Muon(torch.optim.Optimizer):
1170
  group,
1171
  lr=lr,
1172
  weight_decay=weight_decay,
1173
- momentum=momentum,
1174
  qk_logits=qk_logits,
1175
  )
1176
 
1177
- def _step_adamw_params(self, params, group):
1178
- params_with_grads = []
1179
- grads = []
1180
- moment1 = []
1181
- moment2 = []
1182
- max_exp_avg_sqs = []
1183
- state_steps = []
1184
- lr = group["lr"]
1185
- beta1, beta2 = group["adamw_betas"]
1186
- eps = group["adamw_eps"]
1187
- weight_decay = group["weight_decay"]
1188
-
1189
- for p in params:
1190
- g = p.grad
1191
- if g is None:
1192
- continue
1193
- state = self.state[p]
1194
- params_with_grads.append(p)
1195
- grads.append(g)
1196
- if "step" not in state:
1197
- state["step"] = (torch.zeros((),
1198
- dtype=torch.float32,
1199
- device=p.device))
1200
- state["moment1"] = torch.zeros_like(g)
1201
- state["moment2"] = torch.zeros_like(g)
1202
- moment1.append(state["moment1"])
1203
- moment2.append(state["moment2"])
1204
- if not isinstance(state["step"], torch.Tensor):
1205
- step_tensor = torch.tensor(state["step"],
1206
- dtype=torch.float32,
1207
- device=p.device)
1208
- else:
1209
- step_tensor = state["step"]
1210
- state_steps.append(step_tensor)
1211
-
1212
- self._fused_adamw(
1213
- params_with_grads,
1214
- grads,
1215
- moment1,
1216
- moment2,
1217
- max_exp_avg_sqs,
1218
- state_steps,
1219
- amsgrad=False,
1220
- beta1=beta1,
1221
- beta2=beta2,
1222
- lr=lr,
1223
- weight_decay=weight_decay,
1224
- eps=eps,
1225
- maximize=False,
1226
- )
1227
-
1228
- def _step_adamw(self, group):
1229
- params = group["params"]
1230
-
1231
- # group params with it's type and placement
1232
- placement_to_params: dict[tuple[Placement | type,
1233
- DeviceMesh | None]] = defaultdict(list)
1234
- for p in params:
1235
- match p:
1236
- case DTensor():
1237
- placement_to_params[tuple([p.placements,
1238
- p.device_mesh])].append(p)
1239
- case torch.Tensor():
1240
- placement_to_params[tuple([torch.Tensor, None])].append(p)
1241
-
1242
- for params in placement_to_params.values():
1243
- self._step_adamw_params(params, group)
1244
-
1245
  @torch.no_grad
1246
  def step(self, closure=None, qk_logits=None):
1247
  """Perform a single optimization step.
@@ -1249,9 +575,9 @@ class Muon(torch.optim.Optimizer):
1249
  Args:
1250
  closure (Callable, optional): A closure that reevaluates the model
1251
  and returns the loss.
1252
- qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
1253
- to 1D tensors of shape (num_heads,), representing the maximum
1254
- QK logits across all tokens, computed as
1255
  (1 / sqrt(head_dim)) * (Q @ K^T).
1256
  """
1257
  loss = None
@@ -1263,6 +589,6 @@ class Muon(torch.optim.Optimizer):
1263
  if group["use_muon"]:
1264
  self._step_muon(group, qk_logits=qk_logits)
1265
  else:
1266
- self._step_adamw(group)
1267
 
1268
  return loss
 
1
  import logging
 
2
  import types
3
  from collections import defaultdict
4
+ from typing import Any
 
5
 
6
  import torch
7
  import torch.distributed as dist
8
+ from torch.distributed.tensor import DTensor, Replicate, Shard
9
+ from torch.profiler import record_function
10
+
11
+ from .adamw import step_adamw
12
+ from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon,
14
+ get_default_muon_param_groups, update_g, update_p)
15
+ from .distributed.utils import (_is_shard, construct_shard_mesh,
16
+ get_slices_of_dtensor)
17
+ from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
+ _zeropower_via_newtonschulz5)
19
+ from .pipeline import muon_chunk_pipeline
20
+ from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
23
 
24
 
25
+ def _expand_expert_params(names, params, expert_keys):
26
+ """Expand expert params by splitting on dim 0 (expert dimension).
 
27
 
28
+ Params whose name matches any key in ``expert_keys`` are treated as
29
+ expert-parallel tensors. Their outermost dimension is the expert
30
+ dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
31
+ ``nn.Parameter`` views so that in-place updates propagate back to
32
+ the original storage.
33
 
34
+ Non-expert params with ``ndim > 2`` trigger an ``AssertionError``;
35
+ if they are expert params, their key must be added to ``expert_keys``.
36
 
37
+ The grad must already be set on each expert param (e.g. after momentum).
38
 
39
+ For DTensor expert params, placements that shard on dim 0 (expert dim)
40
+ are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
41
+ preserved: each 2D slice is wrapped as a DTensor on the corresponding
42
+ submesh so the parallel pipeline handles the TP communication.
 
43
  """
44
+ expanded_names = []
45
+ expanded_params = []
46
+
47
+ for n, p in zip(names, params):
48
+ is_expert = expert_keys and any(key in n for key in expert_keys)
49
+ is_dtensor = isinstance(p.data, DTensor)
50
+
51
+ if not is_expert:
52
+ assert p.data.ndim <= 2, (
53
+ f"Param {n} has ndim={p.data.ndim} but does not match "
54
+ f"expert_keys={expert_keys}. If this is an expert param, "
55
+ f"add its key to expert_keys.")
56
+ expanded_names.append(n)
57
+ expanded_params.append(p)
 
58
  continue
59
 
60
+ g = p.grad
61
+ assert g is not None, (
62
+ f"Expert param {n} must have grad set before expansion")
63
+
64
+ tp_mesh = None
65
+ tp_placements_2d = None
66
+
67
+ if is_dtensor:
68
+ local_data = p.to_local()
69
+ local_grad = g.to_local() if isinstance(g, DTensor) else g
70
+
71
+ # Find non-dim-0 shard placements (e.g. TP sharding).
72
+ # After splitting on dim 0, Shard(k) becomes Shard(k-1).
73
+ tp_dim_indices = []
74
+ tp_placements_2d = []
75
+ for i, pl in enumerate(p.placements):
76
+ if _is_shard(pl) and pl.dim != 0:
77
+ tp_dim_indices.append(i)
78
+ tp_placements_2d.append(Shard(pl.dim - 1))
79
+
80
+ if tp_dim_indices:
81
+ tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
82
+ for i in tp_dim_indices)
83
+ if len(tp_dim_names) == 1:
84
+ tp_mesh = p.device_mesh[tp_dim_names[0]]
85
+ else:
86
+ tp_mesh = p.device_mesh[tp_dim_names]
87
+ else:
88
+ local_data = p.data
89
+ local_grad = g
90
+
91
+ # Expand: split dim 0, reshape each slice to 2D.
92
+ num_local_experts = local_data.shape[0]
93
+ for i in range(num_local_experts):
94
+ slice_data = local_data[i]
95
+ slice_grad = local_grad[i]
96
+
97
+ if tp_mesh is not None:
98
+ # Wrap as DTensor on TP submesh so the pipeline handles
99
+ # TP communication (gather/scatter across TP ranks).
100
+ dt_data = DTensor.from_local(slice_data,
101
+ device_mesh=tp_mesh,
102
+ placements=tp_placements_2d)
103
+ dt_grad = DTensor.from_local(slice_grad,
104
+ device_mesh=tp_mesh,
105
+ placements=tp_placements_2d)
106
+ expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
107
+ expert_param.grad = dt_grad
108
+ else:
109
+ expert_param = torch.nn.Parameter(slice_data,
110
+ requires_grad=False)
111
+ expert_param.grad = slice_grad
112
 
113
+ expanded_names.append(f"{n}[{i}]")
114
+ expanded_params.append(expert_param)
115
 
116
+ p.grad = None # allow expert grad storage to be freed after pipeline
117
 
118
+ return expanded_names, expanded_params
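The docstring's view invariant can be sketched outside torch: slicing the expert dimension yields views, so in-place updates on a 2D expert slice write through to the original 3D storage (NumPy stands in for torch here):

```python
import numpy as np

# Sketch of the expert-expansion invariant: indexing dim 0 of an
# (E, out, in) tensor returns views, so an in-place update on one
# per-expert 2D slice mutates the shared underlying buffer.
E, out_dim, in_dim = 4, 3, 2
w = np.zeros((E, out_dim, in_dim))
experts = [w[i] for i in range(E)]  # per-expert 2D views, no copies

experts[2] += 1.0  # in-place update on one expert
print(w[2].sum(), w[0].sum())
```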
119
 
120
 
121
  class Muon(torch.optim.Optimizer):
 
139
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
140
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
141
  weight_decay: The weight decay for Muon and AdamW.
142
+ Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
143
  adamw_lr: The learning rate for the internal AdamW.
144
  adamw_betas: The betas for the internal AdamW.
145
  adamw_eps: The epsilon for the internal AdamW.
 
149
  - "q_indices" (list[int]): Indices of query heads to consider.
150
  - "k_indices" (list[int]): Indices of key heads to consider.
151
  - "head_dim" (int): Dimensionality of each attention head.
152
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
153
  this value will be scaled down.
154
  Default is:
155
  {
 
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
  small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
+ expert_keys: List of strings to identify expert-parallel parameters.
173
+ If any key appears in a parameter's name, its outermost
174
+ dimension is treated as the expert dimension and expanded
175
+ into per-expert 2D params for Muon. For example,
176
+ ``expert_keys=["experts"]`` matches any param whose name
177
+ contains "experts". 3D+ params not matched by any key
178
+ will raise an error.
179
  """
180
 
181
  def __init__(self,
 
189
  adamw_eps=1e-8,
190
  none_grad=True,
191
  debug=False,
192
+ clip_config=None,
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
+ small_param_numel_threshold=65536,
197
+ expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
 
218
 
219
  super().__init__(params, defaults)
220
 
221
  self.debug = debug
222
+ self.clip_config = clip_config if clip_config is not None else {
223
+ "q_indices": [],
224
+ "k_indices": [],
225
+ "head_dim": 128,
226
+ "threshold": 100,
227
+ }
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
  self.small_param_numel_threshold = small_param_numel_threshold
232
+ self.expert_keys = expert_keys
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
 
239
 
240
  return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
241
242
  def get_shard_mesh(self, p):
243
  """
244
  Get the shard mesh for a parameter p on the given rank.
 
249
  shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
250
  p.placements, p.device_mesh)
251
 
252
  return shard_mesh, shard_pg, shard_placements
253
 
254
  def init_state_and_assign_params(self, names, params, group, qk_logits):
 
267
  total_flops += flops
268
 
269
  if self.debug:
270
+ logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
271
+ total_flops / 1e12)
272
 
273
  paired = list(zip(names, params))
274
 
 
297
 
298
  worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
299
  round_robin = (round_robin + 1) % len(shard_mesh_flattened)
300
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
301
+
302
+ # Precompute per-rank indices and numels for all-to-all.
303
+ rank_indices: dict[int, tuple] = {}
304
+ rank_numels: dict[int, int] = {}
305
+ for r in range(num_ranks):
306
+ indices = get_slices_of_dtensor(p, r, shard_mesh,
307
+ shard_placements)
308
+ rank_indices[r] = indices
309
+ numel = 1
310
+ for idx, dim_size in zip(indices, p.shape):
311
+ if isinstance(idx, slice):
312
+ start, stop, step = idx.indices(dim_size)
313
+ numel *= max(0, (stop - start + (step - 1)) // step)
314
+ else:
315
+ numel *= len(idx)
316
+ rank_numels[r] = numel
317
 
318
  param_to_state[id(p)] = _muon_state(
319
  worker_rank=worker_rank,
320
  process_group=shard_pg,
321
+ rank_indices=rank_indices,
322
+ rank_numels=rank_numels,
323
  name=n,
324
  qk_clip_state=qk_clip_state,
325
  )
326
 
327
  return param_to_state, ordered_params
328
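The per-rank numel precomputation added in this hunk counts elements via `slice.indices()` for slices and `len()` for explicit index lists; a standalone plain-Python sketch of that counting logic (the helper name is hypothetical):

```python
def slice_numel(indices, shape):
    """Count elements selected by per-dimension indices.

    Mirrors the rank_numels precomputation: a slice contributes
    ceil((stop - start) / step) elements via slice.indices(),
    an explicit index list contributes its length.
    """
    numel = 1
    for idx, dim_size in zip(indices, shape):
        if isinstance(idx, slice):
            start, stop, step = idx.indices(dim_size)
            numel *= max(0, (stop - start + (step - 1)) // step)
        else:
            numel *= len(idx)
    return numel

# An (8, 6) matrix sharded row-wise across 2 ranks: each rank
# owns a 4x6 = 24-element shard.
print(slice_numel((slice(0, 4), slice(None)), (8, 6)))
```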
 
329
+ def base(self, names, params, group, lr, weight_decay, qk_logits):
330
+ # Momentum is already applied by _step_muon before this method.
 
331
  for n, p in zip(names, params):
332
  g = p.grad
333
  if g is None:
334
  continue
 
335
 
336
  u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
  steps=group["ns_steps"])
338
 
339
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
+ update_p(p, u, lr, adjusted_lr, weight_decay)
341
 
342
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
343
 
344
+ scales_full = compute_scales(
345
  p, qk_clip_state) if qk_clip_state is not None else None
346
  if scales_full is not None:
347
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
348
 
349
  def distributed_muon(
350
  self,
 
353
  group: dict[str, Any],
354
  lr: float,
355
  weight_decay: float,
 
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
  """ Implementation of Distributed Muon by Liu et al. """
359
 
360
+ # Momentum is already applied by _step_muon before this method.
361
  for n, p in zip(names, params):
362
  g = p.grad
363
  if g is None:
364
  continue
 
365
 
366
  # Gather G
367
  if isinstance(p.data, DTensor):
 
374
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
  steps=group["ns_steps"])
376
 
377
+ adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
+ update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
381
 
382
+ scales_full = compute_scales(
383
  p_full, qk_clip_state) if qk_clip_state is not None else None
384
 
385
  if scales_full is not None:
386
+ qk_clip(p_full, scales_full, qk_clip_state.head_dim)
387
 
388
  if isinstance(p.data, DTensor):
389
  ndims = len(p.device_mesh.mesh.shape)
 
400
 
401
  p.copy_(p_sharded)
402
 
403
+ def parallel(self, names, params, group, lr, weight_decay, qk_logits):
404
  """
405
  Perform a parallel optimization step using Muon.
 
406
 
407
+ Parameters are chunked and each chunk is processed by a
408
+ :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
409
+ interleaves multiple chunks so that communication and computation
410
+ overlap across chunks (the same overlap previously achieved by the
411
+ warmup + main-loop index scheduling).
412
+ """
413
 
414
+ # Momentum is already applied by _step_muon before this method.
415
 
416
  param_to_state, ordered_params = self.init_state_and_assign_params(
417
  names, params, group, qk_logits)
418
 
419
+ # Compute local rank for this group's shard process group.
420
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
421
+ rank = dist.get_rank(group=shard_pg)
422
 
423
  if self.chunk_size == -1:
424
  shard_ranks = dist.get_world_size(param_to_state[id(
425
+ ordered_params[0])].process_group)
426
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
  elif self.chunk_size > 0:
428
  chunk_size = self.chunk_size
429
  else:
430
  raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
+ def pipelines():
433
+ for start in range(0, len(ordered_params), chunk_size):
434
+ chunk = ordered_params[start:start + chunk_size]
435
+ if chunk:
436
+ yield muon_chunk_pipeline(
437
+ params=chunk,
438
+ param_to_state=param_to_state,
439
+ rank=rank,
440
+ ns_steps=group["ns_steps"],
441
+ lr=lr,
442
+ weight_decay=weight_decay,
443
+ none_grad=group["none_grad"],
444
+ )
445
 
446
+ with record_function("muon::barrier"):
447
+ dist.barrier()
448
+ with record_function("muon::pipeline"):
449
+ run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
450
 
451
  def _step_muon(self, group, qk_logits=None):
452
  params = group["params"]
 
455
  momentum = group["momentum"]
456
  names = group["names"]
457
 
458
+ # Apply momentum to all params before routing/expansion.
459
+ with record_function("muon::momentum"):
460
+ for n, p in zip(names, params):
461
+ g = p.grad
462
+ if g is None:
463
+ continue
464
+ g = update_g(self.state, p, g, group, momentum)
465
+ p.grad = g
466
+
467
+ # Expand expert params by splitting on dim 0.
468
+ names, params = _expand_expert_params(names, params, self.expert_keys)
469
+
470
  param_dtensors = []
471
  name_dtensors = []
472
 
 
482
  group=group,
483
  lr=lr,
484
  weight_decay=weight_decay,
 
485
  qk_logits=qk_logits)
486
  return
487
 
 
517
  # and run parallel Muon on each group.
518
 
519
  placement_to_params = defaultdict(lambda: ([], []))
 
520
 
521
  assert len(dtensors) == len(names)
522
  for p, n in zip(dtensors, names):
 
538
  group=group,
539
  lr=lr,
540
  weight_decay=weight_decay,
 
541
  qk_logits=qk_logits,
542
  )
543
 
 
555
  group,
556
  lr=lr,
557
  weight_decay=weight_decay,
 
558
  qk_logits=qk_logits,
559
  )
560
 
 
565
  group,
566
  lr=lr,
567
  weight_decay=weight_decay,
 
568
  qk_logits=qk_logits,
569
  )
570
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
 
575
  Args:
576
  closure (Callable, optional): A closure that reevaluates the model
577
  and returns the loss.
578
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
579
+ to 1D tensors of shape (num_heads,), representing the maximum
580
+ QK logits across all tokens, computed as
581
  (1 / sqrt(head_dim)) * (Q @ K^T).
582
  """
583
  loss = None
 
589
  if group["use_muon"]:
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
592
+ step_adamw(self.state, group)
593
 
594
  return loss
build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py ADDED
@@ -0,0 +1,50 @@
+import torch
+
+from .matmul_transpose_triton import matmul_transpose_assign
+
+COMM_DTYPE = torch.bfloat16
+DEFAULT_CHUNK_SIZE_RATIO = 4
+
+
+# This code snippet is a modified version adapted from the following GitHub repositories:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+# Muon's Newton-Schulz iteration causes high variance in singular values.
+# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
+@torch.no_grad()
+# matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert len(G.shape) == 2
+    assert G.dtype == COMM_DTYPE
+    X = G  # no manual typecast
+
+    if G.size(0) > G.size(1):
+        X = X.T
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm() + 1e-7)
+    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+    # Perform the NS iterations
+    for a, b, c in [
+        (4.0848, -6.8946, 2.9270),
+        (3.9505, -6.3029, 2.6377),
+        (3.7418, -5.5913, 2.3037),
+        (2.8769, -3.1427, 1.2046),
+        (2.8366, -3.0525, 1.2012),
+    ]:
+        matmul_transpose_assign(X, buf1)
+        matmul_transpose_assign(buf1, buf2)
+        buf1.mul_(b).add_(buf2, alpha=c)
+        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
+
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X
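The iteration above can be sketched without the Triton `matmul_transpose_assign` kernel by substituting plain matmuls. This is a minimal NumPy sketch under that assumption (the coefficients and the Frobenius-norm pre-scaling are taken from the file above); it is an illustration, not the shipped implementation:

```python
import numpy as np

# Per-iteration quintic coefficients from newton_schulz.py above.
NS_COEFFS = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]


def newton_schulz5(G: np.ndarray) -> np.ndarray:
    """Approximately orthogonalize G: push its singular values toward 1."""
    X = G.T if G.shape[0] > G.shape[1] else G  # work with a wide matrix
    X = X / (np.linalg.norm(X) + 1e-7)  # spectral norm <= Frobenius norm <= 1
    for a, b, c in NS_COEFFS:
        A = X @ X.T          # plays the role of matmul_transpose_assign(X, buf1)
        B = A @ A.T          # plays the role of matmul_transpose_assign(buf1, buf2)
        X = a * X + (b * A + c * B) @ X  # quintic polynomial in the singular values
    return X.T if G.shape[0] > G.shape[1] else X
```

Running this on a random matrix shows the advertised behavior: the output's singular values cluster loosely around 1 rather than matching the exact SVD factor UV^T.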
build/torch210-cxx11-cu128-x86_64-linux/pipeline.py ADDED
@@ -0,0 +1,390 @@
+import logging
+from typing import Generator
+
+import torch
+import torch.distributed as dist
+from torch.distributed.tensor import DTensor
+from torch.profiler import record_function
+
+from .core import _muon_state, adjust_lr_for_muon, update_p
+from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
+from .qk_clip import compute_scales
+
+logger = logging.getLogger(__name__)
+
+# ======================================================================
+# Stage helpers
+# ======================================================================
+
+
+def _launch_gather(
+    params: list[DTensor],
+    owned_params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    num_ranks: int,
+    process_group: dist.ProcessGroup,
+) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
+    """Allocate gather buffers, build send/recv, and launch async all-to-all.
+
+    Returns:
+        work: Async operation handle.
+        recv_buf: Flat receive buffer (needed by ``_complete_gather``).
+        gathered_grads: ``{id(p): empty_tensor}`` for owned params,
+            ``None`` for non-owned.
+        recv_counts: Per-source-rank element counts.
+    """
+    # Allocate gathered-grad buffers
+    gathered_grads: dict[int, torch.Tensor | None] = {}
+    for p in params:
+        state = param_to_state[id(p)]
+        if rank == state.worker_rank:
+            gathered_grads[id(p)] = torch.empty(p.shape,
+                                                dtype=COMM_DTYPE,
+                                                device="cuda")
+        else:
+            gathered_grads[id(p)] = None
+
+    # Build send buffer
+    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+    send_counts = [0] * num_ranks
+
+    for p in params:
+        state = param_to_state[id(p)]
+        dst = state.worker_rank
+        assert dst < num_ranks
+        shard_elems = state.rank_numels[rank]
+        g = p.grad
+        g = g.to_local().to(COMM_DTYPE).contiguous()
+        assert g.numel() == shard_elems
+        per_dst[dst].append(g.view(-1))
+        send_counts[dst] += shard_elems
+
+    assert any(
+        len(v) > 0 for v in
+        per_dst), "At least one destination rank must receive a sharded tensor"
+    per_dst_flat = [t for dst in per_dst for t in dst]
+    send_buf = torch.cat(per_dst_flat, dim=0)
+
+    # Build recv buffer
+    recv_counts = [0] * num_ranks
+    for src in range(num_ranks):
+        total = 0
+        for p in owned_params:
+            state = param_to_state[id(p)]
+            assert state.worker_rank == rank
+            total += state.rank_numels[src]
+        recv_counts[src] = total
+
+    recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
+
+    # Launch async all-to-all
+    logger.debug(f"send_buf size: {send_buf.numel()}, "
+                 f"recv_buf size: {recv_buf.numel()}, "
+                 f"recv_counts: {recv_counts}, "
+                 f"send_counts: {send_counts}, "
+                 f"process_group: {str(process_group)}")
+    work = dist.all_to_all_single(
+        recv_buf,
+        send_buf,
+        output_split_sizes=recv_counts,
+        input_split_sizes=send_counts,
+        group=process_group,
+        async_op=True,
+    )
+
+    return work, recv_buf, gathered_grads, recv_counts
+
+
+def _complete_gather(
+    recv_buf: torch.Tensor,
+    recv_counts: list[int],
+    owned_params: list[DTensor],
+    gathered_grads: dict[int, torch.Tensor | None],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+) -> None:
+    """Reconstruct gathered grads from the recv buffer (in-place)."""
+    off = 0
+    for src in range(len(recv_counts)):
+        if recv_counts[src] == 0:
+            continue
+
+        block = recv_counts[src]
+        inner_off = 0
+        for p in owned_params:
+            state = param_to_state[id(p)]
+            assert state.worker_rank == rank
+
+            indices = state.rank_indices[src]
+
+            shard_view = gathered_grads[id(p)][indices]
+            n = shard_view.numel()
+            assert n > 0
+
+            sg = recv_buf.narrow(0, off + inner_off, n)
+            sg = sg.reshape(shard_view.shape)
+            gathered_grads[id(p)][indices] = sg
+
+            inner_off += n
+        assert inner_off == block
+        off += block
+
+
+def _compute_ns(
+    owned_params: list[DTensor],
+    gathered_grads: dict[int, torch.Tensor | None],
+    ns_steps: int,
+) -> dict[int, torch.Tensor | None]:
+    """Run Newton-Schulz orthogonalization on owned parameters.
+
+    Returns:
+        computed_us: ``{id(p): orthogonalized_update}`` for owned params.
+    """
+    computed_us: dict[int, torch.Tensor | None] = {}
+    for p in owned_params:
+        u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
+        gathered_grads[id(p)] = None  # free gathered grad
+        computed_us[id(p)] = u
+    return computed_us
+
+
+def _launch_scatter(
+    params: list[DTensor],
+    owned_params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    num_ranks: int,
+    process_group: dist.ProcessGroup,
+    computed_us: dict[int, torch.Tensor | None],
+) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
+    """Allocate scatter buffers, build send/recv, and launch async all-to-all.
+
+    Returns:
+        work: Async operation handle.
+        recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
+        scattered_us: ``{id(p): empty_local_tensor}`` for all params.
+        recv_counts: Per-source-rank element counts.
+    """
+    # Allocate scattered-u buffers
+    scattered_us: dict[int, torch.Tensor] = {}
+    for p in params:
+        scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
+
+    # Build send buffer (from computed_us on owner ranks)
+    per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+    send_counts = [0] * num_ranks
+
+    if owned_params:
+        for p in owned_params:
+            state = param_to_state[id(p)]
+
+            assert computed_us[id(p)] is not None
+            u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
+
+            total_sent = 0
+            for dst_rank in range(num_ranks):
+                indices = state.rank_indices[dst_rank]
+                su = u_full[indices].flatten()
+
+                n = su.numel()
+                assert n > 0
+
+                per_dst[dst_rank].append(su)
+                send_counts[dst_rank] += n
+                total_sent += n
+
+            assert total_sent == u_full.numel()
+
+    lengths = [len(v) for v in per_dst]
+    if all(l > 0 for l in lengths):
+        assert all(
+            l == lengths[0] for l in lengths
+        ), "All destination ranks must have the same number of sharded tensors"
+        per_dst_flat = [t for dst in per_dst for t in dst]
+        send_buf = torch.cat(per_dst_flat, dim=0)
+    else:
+        send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
+
+    # Build recv buffer
+    recv_counts = [0] * num_ranks
+    for src in range(num_ranks):
+        total = 0
+        for p in params:
+            state = param_to_state[id(p)]
+            if state.worker_rank != src:
+                continue
+            total += state.rank_numels[rank]
+        recv_counts[src] = total
+
+    recv_total = sum(recv_counts)
+    assert recv_total > 0
+    recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
+
+    # Launch async all-to-all
+    work = dist.all_to_all_single(
+        recv_buf,
+        send_buf,
+        output_split_sizes=recv_counts,
+        input_split_sizes=send_counts,
+        group=process_group,
+        async_op=True,
+    )
+
+    return work, recv_buf, scattered_us, recv_counts
+
+
+def _complete_scatter(
+    recv_buf: torch.Tensor,
+    recv_counts: list[int],
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    scattered_us: dict[int, torch.Tensor],
+) -> None:
+    """Copy recv buffer into scattered_us (in-place)."""
+    off = 0
+    for src in range(len(recv_counts)):
+        block = recv_counts[src]
+        if block == 0:
+            continue
+
+        inner_off = 0
+        for p in params:
+            state = param_to_state[id(p)]
+            if state.worker_rank != src:
+                continue
+            n = state.rank_numels[rank]
+            assert n > 0
+
+            flat_local = recv_buf.narrow(0, off + inner_off,
+                                         n).view_as(p.to_local())
+            scattered_us[id(p)].copy_(flat_local)
+
+            inner_off += n
+
+        assert inner_off == block
+        off += block
+
+
+def _update_params(
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    scattered_us: dict[int, torch.Tensor],
+    lr: float,
+    weight_decay: float,
+) -> None:
+    """Apply weight decay, Muon update, and optional QK clipping."""
+    for p in params:
+        state = param_to_state[id(p)]
+        u_dtensor = DTensor.from_local(
+            scattered_us[id(p)],
+            placements=p.placements,
+            device_mesh=p.device_mesh,
+        )
+
+        adjusted_lr = adjust_lr_for_muon(lr, p.shape)
+        update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
+
+        # QK clipping - applied directly on the local tensor to
+        # avoid DTensor sharding-propagation issues with _StridedShard.
+        scales_full = compute_scales(
+            p,
+            state.qk_clip_state) if state.qk_clip_state is not None else None
+        if scales_full is not None:
+            ratio = p.shape[0] // scales_full.shape[0]
+            idx0 = state.rank_indices[rank][0]
+            if isinstance(idx0, slice):
+                start = idx0.start or 0
+                idx0 = torch.arange(start,
+                                    idx0.stop,
+                                    device=scales_full.device)
+            row_scales = scales_full[idx0 // ratio]
+            p._local_tensor.mul_(row_scales.view(-1, 1))
+
+
+# ======================================================================
+# Main generator - thin orchestrator that wires stages together.
+# ======================================================================
+
+
+@torch.no_grad()
+def muon_chunk_pipeline(
+    params: list[DTensor],
+    param_to_state: dict[int, _muon_state],
+    rank: int,
+    ns_steps: int,
+    lr: float,
+    weight_decay: float,
+    none_grad: bool,
+) -> Generator[None, None, None]:
+    """Process one chunk of parameters through the full Muon pipeline.
+
+    Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
+
+    Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
+    that communication and computation overlap across chunks. Async
+    communication is launched via ``async_op=True`` and completed after
+    the yield with ``work.wait()``.
+
+    Overlap happens because :func:`run_pipeline` admits one new chunk
+    per iteration (staggered admission). While chunk *N* does NS
+    compute on the default CUDA stream, chunk *N+1*'s async all-to-all
+    runs concurrently on the NCCL stream; no separate ``comm_stream``
+    is required.
+
+    Yields exactly **2** times:
+
+    1. After launching the async all-to-all gather.
+    2. After launching the async all-to-all scatter.
+    """
+    process_group = param_to_state[id(params[0])].process_group
+    num_ranks = dist.get_world_size(group=process_group)
+    owned_params = [
+        p for p in params if param_to_state[id(p)].worker_rank == rank
+    ]
+
+    # Stages 1-2: launch async gather.
+    with record_function("muon::launch_gather"):
+        work, recv_buf, gathered_grads, recv_counts = _launch_gather(
+            params, owned_params, param_to_state, rank, num_ranks,
+            process_group)
+
+    if none_grad:
+        for p in params:
+            p.grad = None
+
+    yield  # --- YIELD 1: other chunks can launch their gather ---
+
+    with record_function("muon::wait_gather"):
+        work.wait()
+        _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
+                         param_to_state, rank)
+        del recv_buf
+
+    # Stage 3: Newton-Schulz orthogonalization.
+    with record_function("muon::newton_schulz"):
+        computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
+        gathered_grads.clear()
+
+    # Stages 4-5: launch async scatter.
+    with record_function("muon::launch_scatter"):
+        work, recv_buf, scattered_us, recv_counts = _launch_scatter(
+            params, owned_params, param_to_state, rank, num_ranks,
+            process_group, computed_us)
+        computed_us.clear()
+
+    yield  # --- YIELD 2: other chunks can launch their scatter ---
+
+    with record_function("muon::wait_scatter"):
+        work.wait()
+        _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
+                          scattered_us)
+        del recv_buf
+
+    # Stage 6: apply parameter updates.
+    with record_function("muon::update_params"):
+        _update_params(params, param_to_state, rank, scattered_us, lr,
+                       weight_decay)
+        scattered_us.clear()
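The flat-buffer bookkeeping behind `_launch_gather`/`_complete_gather` can be shown without torch or NCCL. In this hypothetical single-process sketch, plain Python lists stand in for CUDA tensors: `pack` concatenates per-destination shards into one send buffer plus `send_counts` (the `input_split_sizes` of the all-to-all), and `unpack` walks `recv_counts` offsets to recover one contiguous block per source rank:

```python
def pack(per_dst: list[list[list[float]]]) -> tuple[list[float], list[int]]:
    """Flatten per-destination shard lists into (send_buf, send_counts)."""
    send_buf: list[float] = []
    send_counts: list[int] = []
    for shards in per_dst:
        n = 0
        for shard in shards:
            send_buf.extend(shard)  # mirrors torch.cat(per_dst_flat)
            n += len(shard)
        send_counts.append(n)
    return send_buf, send_counts


def unpack(recv_buf: list[float], recv_counts: list[int]) -> list[list[float]]:
    """Recover one contiguous block per source rank from the flat buffer."""
    blocks, off = [], 0
    for n in recv_counts:
        blocks.append(recv_buf[off:off + n])  # mirrors recv_buf.narrow(0, off, n)
        off += n
    assert off == len(recv_buf)
    return blocks
```

The real pipeline does the same arithmetic, except the "exchange" in between is `dist.all_to_all_single` with these counts as its split sizes.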
build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py ADDED
@@ -0,0 +1,129 @@
+import logging
+import math
+from dataclasses import dataclass
+
+import torch
+from torch.distributed.tensor import DTensor
+
+logger = logging.getLogger(__name__)
+
+
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+
+    Example:
+        'model.3.attn.wq.weight'     -> ('wq', 3)
+        'model.5.attn.wk.weight'     -> ('wk', 5)
+        'model.2.attn.q_proj.weight' -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight' -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight' -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+
+    kind = parts[-2]
+
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+
+    return None, -1
+
+
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: list[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: torch.Tensor | None
+
+
+def get_qk_clip_info(clip_config, n, qk_logits):
+    """Extract QK clipping info for a named parameter.
+
+    Args:
+        clip_config: QK clipping configuration dict (or None).
+        n: Parameter name string.
+        qk_logits: Dict mapping layer indices to logit tensors (or None).
+
+    Returns:
+        QKClipInfo instance with clipping configuration for this parameter.
+    """
+    if clip_config is None:
+        return None
+
+    head_dim = clip_config.get('head_dim')
+    threshold = clip_config.get('threshold')
+    kind, layer_idx = parse_qk_layer(n)
+
+    logit, indices = None, []
+    if qk_logits is not None and kind is not None:
+        logit = qk_logits[layer_idx]
+        indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+        indices = clip_config.get(indices_key, []) or []
+
+    if isinstance(logit, DTensor):
+        # In TP settings, qk_logits may be DTensor.
+        # We convert it to a full tensor here for simplicity.
+        logit = logit.full_tensor()
+
+    return QKClipInfo(
+        kind=kind,
+        indices=indices,
+        head_dim=head_dim,
+        threshold=threshold,
+        logit=logit,
+    )
+
+
+def compute_scales(p, qk_clip_state):
+    """Compute per-head scaling factors for QK clipping.
+
+    Returns scales tensor if any head exceeds threshold, else None.
+    """
+    kind = qk_clip_state.kind
+    indices = qk_clip_state.indices
+    head_dim = qk_clip_state.head_dim
+    threshold = qk_clip_state.threshold
+    logit = qk_clip_state.logit
+
+    H_global = p.shape[0] // head_dim
+    scales_full = torch.ones(H_global, device=p.data.device)
+    scaling = 0
+
+    for logit_idx, head_idx in enumerate(indices):
+        v_ele = float(logit[logit_idx])
+        if v_ele > threshold:
+            new_scale = math.sqrt(threshold / v_ele)
+            if new_scale < scales_full[head_idx]:
+                scales_full[head_idx] = new_scale
+                logger.info(
+                    f"[{kind}] Head {head_idx} exceeded threshold "
+                    f"(value={v_ele:.4f}, threshold={threshold:.4f}) "
+                    f"-> applying scale={new_scale:.4f}")
+                scaling += 1
+
+    return scales_full if scaling > 0 else None
+
+
+def qk_clip(p, scales, head_dim):
+    """Apply per-head scaling to a Q/K projection weight matrix."""
+    if isinstance(p, torch.nn.Parameter):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+    else:
+        W = p.view(-1, head_dim, p.shape[1])
+    W.mul_(scales.view(-1, 1, 1))
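The name parsing and scale arithmetic in this file are pure Python and easy to check in isolation. Below is a sketch that reimplements `parse_qk_layer` as defined above, plus a small helper (`head_scales`, a name introduced here for illustration) applying the sqrt(threshold / logit) rule from `compute_scales` to a plain list of per-head logits:

```python
import math


def parse_qk_layer(name: str):
    """Return (kind, layer_idx) for Q/K projection weights, else (None, -1)."""
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]
    layer_idx = -1
    for part in reversed(parts):  # last numeric component is the layer index
        if part.isdigit():
            layer_idx = int(part)
            break
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1


def head_scales(logits: list[float], threshold: float) -> list[float]:
    """1.0 for well-behaved heads, sqrt(threshold / logit) for loud ones."""
    return [1.0 if v <= threshold else math.sqrt(threshold / v) for v in logits]
```

A head whose max QK logit is twice the threshold is scaled by sqrt(1/2), so after the weight is multiplied in, the logit (quadratic in the scaled factor when applied to both Q and K, linear when applied to one side) is pulled back toward the threshold.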
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_06a260a_dirty
-ops = torch.ops._optimizer_06a260a_dirty
+from . import _optimizer_7aef62f_dirty
+ops = torch.ops._optimizer_7aef62f_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_06a260a_dirty::{op_name}"
+    return f"_optimizer_7aef62f_dirty::{op_name}"
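The change above only swaps the build hash in the op namespace. A tiny sketch of what the helper produces (the op name `"fused_step"` below is a hypothetical example, not an op registered by this package):

```python
# Build-specific namespace from the rebuilt shared object above.
NAMESPACE = "_optimizer_7aef62f_dirty"


def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace, as _ops.py does for torch.ops lookups."""
    return f"{NAMESPACE}::{op_name}"
```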
build/torch210-cxx11-cu130-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:330aaa6cb247ba3b5df7a13ced6ef7eff3e5d7a72a0b88f674f948aeaed66ee2
+oid sha256:b9c7bb12bc030d4959e880a959b39ea07eb03e16175d7cf03829f9860f52525d
 size 2004728
build/torch210-cxx11-cu130-x86_64-linux/adamw.py ADDED
@@ -0,0 +1,154 @@
+from collections import defaultdict
+from typing import cast
+
+import torch
+from torch.distributed.tensor import DTensor
+
+
+def fused_adamw(
+    params: list[torch.Tensor],
+    grads: list[torch.Tensor],
+    exp_avgs: list[torch.Tensor],
+    exp_avg_sqs: list[torch.Tensor],
+    max_exp_avg_sqs: list[torch.Tensor],
+    state_steps: list[torch.Tensor],
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float | torch.Tensor,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+) -> None:
+    if not params:
+        return
+
+    # We only shuffle around the lr when it is a Tensor and on CUDA; otherwise
+    # we prefer treating it as a scalar.
+    lr_dict: dict | None = ({
+        lr.device: lr
+    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
+    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
+        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
+         state_steps]  # type: ignore[list-item]
+    )
+    for (device, _), (
+        (
+            device_params_,
+            device_grads_,
+            device_exp_avgs_,
+            device_exp_avg_sqs_,
+            device_max_exp_avg_sqs,
+            device_state_steps_,
+        ),
+        _,
+    ) in grouped_tensors.items():
+        device_params = cast(list[torch.Tensor], device_params_)
+        device_grads = cast(list[torch.Tensor], device_grads_)
+        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
+        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
+        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
+
+        if lr_dict is not None and device not in lr_dict:
+            lr_dict[device] = lr.to(
+                device=device, non_blocking=True)  # type: ignore[union-attr]
+            lr = lr_dict[device]
+        torch._foreach_add_(device_state_steps, 1)
+        func = torch._fused_adamw_
+        func(
+            device_params,
+            device_grads,
+            device_exp_avgs,
+            device_exp_avg_sqs,
+            device_max_exp_avg_sqs,  # type: ignore[arg-type]
+            device_state_steps,
+            amsgrad=amsgrad,
+            lr=lr,  # type: ignore[arg-type]
+            beta1=beta1,
+            beta2=beta2,
+            weight_decay=weight_decay,
+            eps=eps,
+            maximize=maximize,
+        )
+
+
+def step_adamw_params(optimizer_state, params, group):
+    """Run fused AdamW on a list of parameters sharing the same placement.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        params: List of parameters to update.
+        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
+    """
+    params_with_grads = []
+    grads = []
+    moment1 = []
+    moment2 = []
+    max_exp_avg_sqs = []
+    state_steps = []
+    lr = group["lr"]
+    beta1, beta2 = group["adamw_betas"]
+    eps = group["adamw_eps"]
+    weight_decay = group["weight_decay"]
+
+    for p in params:
+        g = p.grad
+        if g is None:
+            continue
+        state = optimizer_state[p]
+        params_with_grads.append(p)
+        grads.append(g)
+        if "step" not in state:
+            state["step"] = (torch.zeros((),
+                                         dtype=torch.float32,
+                                         device=p.device))
+            state["moment1"] = torch.zeros_like(g)
+            state["moment2"] = torch.zeros_like(g)
+        moment1.append(state["moment1"])
+        moment2.append(state["moment2"])
+        if not isinstance(state["step"], torch.Tensor):
+            step_tensor = torch.tensor(state["step"],
+                                       dtype=torch.float32,
+                                       device=p.device)
+        else:
+            step_tensor = state["step"]
+        state_steps.append(step_tensor)
+
+    fused_adamw(
+        params_with_grads,
+        grads,
+        moment1,
+        moment2,
+        max_exp_avg_sqs,
+        state_steps,
+        amsgrad=False,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        eps=eps,
+        maximize=False,
+    )
+
+
+def step_adamw(optimizer_state, group):
+    """Dispatch AdamW step, grouping parameters by type and placement.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        group: Parameter group dict.
+    """
+    params = group["params"]
+
+    # group params with its type and placement
+    placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
+    for p in params:
+        match p:
+            case DTensor():
+                placement_to_params[tuple([p.placements,
+                                           p.device_mesh])].append(p)
+            case torch.Tensor():
+                placement_to_params[tuple([torch.Tensor, None])].append(p)
+
+    for group_params in placement_to_params.values():
+        step_adamw_params(optimizer_state, group_params, group)
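The per-element math that `torch._fused_adamw_` applies can be sketched for a single scalar parameter. This is a hypothetical pure-Python rendering of one decoupled-AdamW step (bias-corrected Adam plus weight decay applied directly to the parameter), not the fused kernel itself:

```python
import math


def adamw_step(p, g, m, v, step, lr, beta1, beta2, eps, weight_decay):
    """One decoupled AdamW step on scalars; returns updated (p, m, v, step)."""
    step += 1
    m = beta1 * m + (1 - beta1) * g        # first moment (exp_avg)
    v = beta2 * v + (1 - beta2) * g * g    # second moment (exp_avg_sq)
    m_hat = m / (1 - beta1 ** step)        # bias correction
    v_hat = v / (1 - beta2 ** step)
    p = p * (1 - lr * weight_decay)        # decoupled weight decay
    p = p - lr * m_hat / (math.sqrt(v_hat) + eps)
    return p, m, v, step
```

With zero decay and eps, the very first step moves the parameter by exactly `lr` in the gradient's direction, since both bias-corrected moments normalize to the gradient magnitude.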
build/torch210-cxx11-cu130-x86_64-linux/async_utils.py ADDED
@@ -0,0 +1,77 @@
+import logging
+from typing import Generator
+
+logger = logging.getLogger(__name__)
+
+
+class _Task:
+    """Internal: wraps a generator, advances one yield at a time."""
+
+    def __init__(self, generator: Generator[None, None, None], index: int):
+        self._generator = generator
+        self._index = index
+        self._steps_completed = 0
+        self.step()  # run to first yield
+
+    def step(self) -> bool:
+        try:
+            next(self._generator)
+            self._steps_completed += 1
+            logger.debug("pipeline[%d] completed stage %d", self._index,
+                         self._steps_completed)
+            return True
+        except StopIteration:
+            logger.debug("pipeline[%d] finished after %d stages", self._index,
+                         self._steps_completed)
+            return False
+
+    def close(self):
+        self._generator.close()
+
+
+def run_pipeline(
+    pipelines: Generator[Generator[None, None, None], None, None],
+    max_concurrent: int,
+) -> None:
+    """Run generator-based pipelines with bounded concurrency.
+
+    Each pipeline is a generator that yields at stage boundaries.
+    The runtime interleaves pipelines so communication and computation
+    overlap across chunks.
+    """
+    if max_concurrent <= 0:
+        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
+
+    have_new = True
+    task_index = 0
+    previous_tasks: list[_Task] = []
+
+    try:
+        while have_new or previous_tasks:
+            running_tasks: list[_Task] = []
+
+            # Admit one new pipeline per iteration (staggered admission).
+            # Admitting one at a time ensures that while chunk N does NS
+            # compute on the default stream, chunk N+1's NCCL all-to-all
+            # runs concurrently on the NCCL stream, creating real
+            # communication/computation overlap on the GPU.
+            if have_new and len(previous_tasks) < max_concurrent:
+                try:
+                    gen = next(pipelines)
+                    task = _Task(gen, task_index)
+                    task_index += 1
+                    running_tasks.append(task)
+                except StopIteration:
+                    have_new = False
+
+            # Advance every previously-yielded task by one step.
+            for task in previous_tasks:
+                if task.step():
+                    running_tasks.append(task)
+
+            previous_tasks = running_tasks
+    except BaseException:
+        # Clean up all in-flight generators to release GPU resources.
+        for task in previous_tasks:
+            task.close()
+        raise
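The staggered interleaving this runner produces is easiest to see with toy generators. The sketch below is a simplified re-implementation of the scheduling loop (no logging or cleanup, names like `run_pipeline_toy` and `chunk` are illustrative only): each chunk yields twice, matching the two yields of `muon_chunk_pipeline`, and the runner admits one new chunk per outer iteration while advancing all older ones:

```python
from typing import Generator, Iterator


def run_pipeline_toy(pipelines: Iterator[Generator[None, None, None]],
                     max_concurrent: int) -> None:
    """Simplified scheduling loop: staggered admission, one stage per pass."""
    tasks: list[Generator[None, None, None]] = []
    have_new = True
    while have_new or tasks:
        alive: list[Generator[None, None, None]] = []
        if have_new and len(tasks) < max_concurrent:
            try:
                gen = next(pipelines)
            except StopIteration:
                have_new = False
            else:
                next(gen)  # run the new task to its first yield
                alive.append(gen)
        for gen in tasks:  # advance every older task one stage
            try:
                next(gen)
                alive.append(gen)
            except StopIteration:
                pass
        tasks = alive


log: list[str] = []


def chunk(i: int) -> Generator[None, None, None]:
    log.append(f"c{i}:gather")
    yield
    log.append(f"c{i}:compute+scatter")
    yield
    log.append(f"c{i}:update")


run_pipeline_toy(iter(chunk(i) for i in range(2)), max_concurrent=2)
```

The resulting log alternates between chunks stage by stage, which is exactly what lets chunk 1's communication run while chunk 0 computes.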
build/torch210-cxx11-cu130-x86_64-linux/core.py ADDED
@@ -0,0 +1,116 @@
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from torch.distributed.tensor import DTensor
+
+
+@dataclass
+class _muon_state:
+    worker_rank: int
+    process_group: ProcessGroup
+    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
+    rank_numels: dict[int, int]  # local_rank -> numel
+    name: str
+    qk_clip_state: torch.Tensor | None = None
+
+
+def update_g(optimizer_state, p, g, group, momentum):
+    """Apply momentum update to gradient.
+
+    Args:
+        optimizer_state: The optimizer's state dict (self.state in Muon).
+        p: Parameter tensor.
+        g: Gradient tensor.
+        group: Parameter group dict.
+        momentum: Momentum coefficient.
+
+    Returns:
+        Momentum-updated gradient tensor.
+    """
+    state = optimizer_state[p]
+    buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
+    torch.add(g, buf, alpha=momentum, out=buf)
+    if group["nesterov"]:
+        g.add_(buf, alpha=momentum)
+        return g
+    return buf
+
+
+def update_p(p, u, lr, adjusted_lr, weight_decay):
+    """Apply weight decay and orthogonalized update to parameter.
+
+    Args:
+        p: Parameter (torch.nn.Parameter or DTensor).
+        u: Orthogonalized update tensor.
+        lr: Base learning rate.
+        adjusted_lr: Size-adjusted learning rate.
+        weight_decay: Weight decay coefficient.
+    """
+    if isinstance(p, torch.nn.Parameter):
+        # apply weight decay
+        p.data.mul_(1 - lr * weight_decay)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    else:
+        p.mul_(1 - lr * weight_decay)
+        p.add_(u, alpha=-adjusted_lr)
+
+
+def adjust_lr_for_muon(lr, param_shape):
+    """Scale learning rate based on parameter matrix dimensions.
+
+    Args:
+        lr: Base learning rate.
+        param_shape: Shape of the parameter tensor.
+
+    Returns:
+        Adjusted learning rate.
+    """
+    A, B = param_shape[:2]
+    # We adjust the learning rate and weight decay based on the size of the
+    # parameter matrix as described in the paper.
+    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+    adjusted_lr = lr * adjusted_ratio
+    return adjusted_lr
+
+
+def default_is_muon(name, x, expert_keys=None):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    if any(key in name for key in skip_keys):
+        return False
+    effective_ndim = x.ndim
+    if expert_keys and any(key in name for key in expert_keys):
+        effective_ndim -= 1
+    return effective_ndim >= 2
+
+
+def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
+    if is_muon_func is None:
+        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
+
+    muon_params, muon_names = [], []
+    non_muon_params = []
+
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
+
+    return [
+        {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
+        },
+        {
+            "params": non_muon_params,
+            "use_muon": False,
+        },
+    ]
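The learning-rate adjustment above depends only on the larger of the first two matrix dimensions. A quick standalone check of the arithmetic (same formula as `adjust_lr_for_muon` in core.py):

```python
import math


def adjust_lr_for_muon(lr: float, param_shape: tuple[int, ...]) -> float:
    """lr * 0.2 * sqrt(max(A, B)) for a parameter of shape (A, B, ...)."""
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))
```

For a 1024x512 weight with base lr 0.02, the adjusted lr is 0.02 * 0.2 * 32 = 0.128; note a 512x1024 weight gets the same value, since only max(A, B) enters.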
build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py CHANGED
@@ -7,22 +7,40 @@ from torch.distributed.tensor.placement_types import (Placement, Shard,
                                                       _StridedShard)
 
 
 def get_slices_of_dtensor(
         target: DTensor | torch.Tensor,
         local_rank: int,
         shard_mesh: DeviceMesh,
         shard_placements: tuple[Placement],
-) -> tuple[slice]:
     """
-    Get the slice of local tensor for a given rank from a tensor.
     Args:
-        target (DTensor | torch.Tensor): The target tensor.
-        rank (int): The local rank of the shard group.
-        shard_mesh (DeviceMesh): The shard mesh. It consists of global ranks.
         shard_placements (tuple[Placement]): The shard placements.
-    """
 
-    slices: list[slice] = [slice(0, dim_size) for dim_size in target.size()]
 
     # find the global rank of the local rank in the shard mesh
     rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
@@ -34,34 +52,75 @@ def get_slices_of_dtensor(
 
     assert len(rank_coords) == len(shard_placements)
 
     # Caution: Assuming replicate-to-shard of the shard mesh goes with
     # left-to-right sharding. This is ensured by the sorting logic of
     # construct_shard_mesh function.
-    for i, (rank_coord,
-            placement) in enumerate(zip(rank_coords, shard_placements)):
-        assert isinstance(placement, Shard)
 
-        num_ranks = shard_mesh.mesh.shape[i]
 
-        dim = placement.dim
-        dim_size = (slices[dim].stop - slices[dim].start)
 
-        if dim_size % num_ranks != 0:
             raise NotImplementedError(
-                f"Dimension size {dim_size} is not divisible "
-                f"by number of ranks {num_ranks} for shard "
-                f"placement on dim {dim}. (shape: {target.shape})")
-
-        shard_size = dim_size // num_ranks
-
-        start = slices[dim].start + rank_coord * shard_size
-        end = start + shard_size
-
-        assert start < end <= slices[dim].stop
-
-        slices[dim] = slice(start, end)
 
-    return tuple(slices)
 
 
 _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
@@ -71,105 +130,105 @@ _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
 def construct_shard_mesh(
     placements: tuple[Placement],
     mesh: DeviceMesh,
-) -> (DeviceMesh, ProcessGroup, tuple[Placement]):
-    """
-    Construct Shard Mesh and Placements for unsharding.
-    It removes Replicate placements and constructs a new Mesh and ProcessGroup.
-    """
-    my_rank = dist.get_rank()
 
-    assert mesh.mesh.device.type == 'cpu'
 
-    # Copy mesh to avoid modifying the original mesh
-    mesh = mesh.mesh.clone()
-
-    # 1. Sort placements. Replicate first, then Shard by dim ascending.
-
-    # For Shard, strided shard comes after regular shard on the same dim
-    # to preserve left-to-right order of replicate-to-shard.
-    # This is because that strided shard is using stride to represent
-    # more fine-grained sharding on the same dim.
-    # Please check the URL below for _StridedShard.
-    # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/distributed/tensor/placement_types.py#L366
-
-    def placement_sort_key(
-        placement_with_index: tuple[float, Placement]
-    ) -> tuple[int, float, int]:  # (dim, split factor, original index)
-        index, placement = placement_with_index
-        is_replicate = placement.is_replicate()
100
- is_shard = placement.is_shard()
101
- is_partial = placement.is_partial()
102
-
103
- assert is_replicate or is_shard, f"Unsupported placement type: {type(placement)}"
104
- assert not is_partial, "Partial placement is not supported."
105
-
106
- if is_replicate:
107
- return (-1.0, 0, index)
108
- elif is_shard:
109
- if isinstance(placement, _StridedShard):
110
- return (placement.dim, 1 / placement.split_factor, index)
111
- return (placement.dim, 0, index)
112
- else:
113
- raise TypeError(f"Unknown placement type: {type(placement)}")
114
 
115
- placements_with_index: list[tuple[int,
116
- Placement]] = list(enumerate(placements))
117
- placements_with_index = sorted(placements_with_index,
118
- key=placement_sort_key)
119
 
120
- sorted_indices, sorted_placements = zip(*placements_with_index)
 
121
 
122
- # 2. Permute mesh according to sorted placements.
123
- sorted_mesh = mesh.permute(sorted_indices)
 
 
124
 
125
- # 3. Collect list of shard meshes by removing replicate dims
126
- # For example, (2, 3, 4, 4) with placements [R, R, S(0), S(1)]
127
- # shard_meshes should be list with 2 * 3 = 6 shard meshes of shape (4, 4)
128
- num_replicates = sum(1 for p in sorted_placements if p.is_replicate())
129
 
130
- # merge replicate dims
131
- # shard_meshes became a list of shard meshes with a length of replicate degree
132
- if num_replicates > 0:
133
- sorted_mesh = sorted_mesh.flatten(
134
- 0, num_replicates - 1) if num_replicates > 1 else sorted_mesh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
136
  else:
137
  shard_meshes = [sorted_mesh]
138
- shard_placements = sorted_placements[num_replicates:]
139
-
140
- # assume all shard placements are different
141
  assert len(shard_placements) == len(set(shard_placements))
142
 
143
- # 4. Construct ProcessGroups
144
- # Caution: all groups should be created in the same order in all processes,
145
- # even though each process only needs its own group.
146
-
147
- # To use tensor as dict key, convert it to tuple
148
- def tensor_to_tuple(t):
149
- if isinstance(t, torch.Tensor):
150
- t = t.tolist()
151
- if isinstance(t, list):
152
- return tuple(tensor_to_tuple(x) for x in t)
153
- return t
154
-
155
- my_shard_mesh_as_tuple = None
156
- for shard_mesh in shard_meshes:
157
- assert isinstance(shard_mesh, torch.Tensor)
158
- shard_mesh_as_tuple = tensor_to_tuple(shard_mesh)
159
-
160
- if (my_rank == shard_mesh).any().item():
161
- assert my_shard_mesh_as_tuple is None
162
- my_shard_mesh_as_tuple = shard_mesh_as_tuple
163
-
164
- # update global cache
165
- if shard_mesh_as_tuple not in _ranks_to_dist_cache:
166
- shard_process_group = dist.new_group(shard_mesh.flatten().tolist())
167
- _ranks_to_dist_cache[shard_mesh_as_tuple] = (
168
- DeviceMesh(device_type="cuda", mesh=shard_mesh),
169
- shard_process_group,
170
  )
171
 
172
- my_shard_mesh, my_shard_process_group = _ranks_to_dist_cache[
173
- my_shard_mesh_as_tuple]
174
-
175
- return my_shard_mesh, my_shard_process_group, shard_placements
 
7
                                                       _StridedShard)
 
 
+def _is_shard(placement: Placement) -> bool:
+    """Check if a placement is a shard type (Shard or _StridedShard).
+
+    In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
+    ``placement.is_shard()`` returns False for _StridedShard. This helper
+    handles both old and new hierarchies.
+    """
+    return isinstance(placement, (Shard, _StridedShard))
+
+
 def get_slices_of_dtensor(
         target: DTensor | torch.Tensor,
         local_rank: int,
         shard_mesh: DeviceMesh,
         shard_placements: tuple[Placement],
+) -> tuple[slice | torch.Tensor, ...]:
     """
+    Get per-dimension indices for a given rank's shard of the target tensor.
+
+    Uses ``Shard.local_shard_size_and_offset`` and
+    ``_StridedShard.local_shard_size_and_offset`` for correct handling of
+    both contiguous and strided (non-contiguous) sharding.
+
     Args:
+        target (DTensor | torch.Tensor): The target tensor (for its shape).
+        local_rank (int): The local rank within the shard group.
+        shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
         shard_placements (tuple[Placement]): The shard placements.
+
+    Returns:
+        A tuple of indices (one per tensor dim). Each element is either:
+        - A ``slice`` (for contiguous or unsharded dims)
+        - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
+    """
 
     # find the global rank of the local rank in the shard mesh
     rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
 
     assert len(rank_coords) == len(shard_placements)
 
+    # Track per-shard-dim indices.
+    # Absence of a dim means "not yet sharded on this dim".
+    dim_indices: dict[int, torch.Tensor] = {}
+
     # Caution: Assuming replicate-to-shard of the shard mesh goes with
     # left-to-right sharding. This is ensured by the sorting logic of
     # construct_shard_mesh function.
+    for mesh_dim_idx, (rank_coord, placement) in enumerate(
+            zip(rank_coords, shard_placements)):
+        assert _is_shard(placement)
 
+        num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
+        shard_dim = placement.dim
 
+        # Current effective size on this dim (may already be sub-sharded)
+        if shard_dim in dim_indices:
+            curr_size = len(dim_indices[shard_dim])
+        else:
+            curr_size = target.size()[shard_dim]
 
+        if curr_size % num_chunks != 0:
             raise NotImplementedError(
+                f"Dimension size {curr_size} is not divisible "
+                f"by number of ranks {num_chunks} for shard "
+                f"placement on dim {shard_dim}. (shape: {target.shape})")
+
+        # Compute indices for this level of sharding
+        if isinstance(placement, _StridedShard):
+            _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
+                placement,
+                curr_size,
+                num_chunks,
+                rank_coord,
+                return_first_offset=False)
+            new_indices = torch.tensor(offsets, dtype=torch.long)
+        else:
+            shard_size, offset = Shard.local_shard_size_and_offset(
+                curr_size, num_chunks, rank_coord)
+            new_indices = torch.arange(offset,
+                                       offset + shard_size,
+                                       dtype=torch.long)
+
+        # Compose with previous indices on this dim
+        if shard_dim in dim_indices:
+            dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
+        else:
+            dim_indices[shard_dim] = new_indices
 
+    # Build result tuple
+    result: list[slice | torch.Tensor] = []
+    for d in range(len(target.size())):
+        if d not in dim_indices:
+            result.append(slice(None))
+        else:
+            indices = dim_indices[d]
+            # Convert contiguous indices to slice for efficiency
+            if len(indices) > 0:
+                start = indices[0].item()
+                expected = torch.arange(start,
+                                        start + len(indices),
+                                        dtype=torch.long)
+                if torch.equal(indices, expected):
+                    result.append(slice(start, start + len(indices)))
+                else:
+                    result.append(indices)
+            else:
+                result.append(slice(0, 0))
+
+    return tuple(result)
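The index-composition step in the new `get_slices_of_dtensor` (`dim_indices[shard_dim][new_indices]`) can be illustrated without torch or DTensor. The list-based helpers here (`contiguous_shard`, `compose`) are hypothetical stand-ins for the tensor indexing, covering only the evenly divisible contiguous case:

```python
def contiguous_shard(curr_size, num_chunks, coord):
    # Plain Shard: equal contiguous blocks (divisible case only)
    assert curr_size % num_chunks == 0
    size = curr_size // num_chunks
    return list(range(coord * size, coord * size + size))

def compose(outer, inner):
    # Second-level sharding selects positions *within* the first-level shard
    return [outer[i] for i in inner]

# Dim of size 8, sharded twice over 2 ranks each (e.g. [Shard(0), Shard(0)])
outer = contiguous_shard(8, 2, 1)           # rank coord 1 -> [4, 5, 6, 7]
inner = contiguous_shard(len(outer), 2, 0)  # rank coord 0 -> [0, 1]
print(compose(outer, inner))                # global rows [4, 5]
```

The second level of sharding never sees global coordinates; it only re-indexes the first level's output, which is exactly what the composed tensor lookup does.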
 
125
 
 
 _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
 
 def construct_shard_mesh(
     placements: tuple[Placement],
     mesh: DeviceMesh,
+) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
+    """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
+
+    Given a DTensor's placements and device mesh, extracts the "shard group"
+    — the set of ranks that together hold all shards of the same replica —
+    and creates a ProcessGroup for all-to-all among them.
+
+    Steps:
+    1. Sort placements: Replicate first, then Shard by (dim, granularity).
+    2. Permute the mesh tensor to match the sorted order.
+    3. Collapse Replicate dims into a list of shard sub-meshes (one per
+       replica).
+    4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
+
+    Example — 8 GPUs, mesh shape (2, 2, 2),
+    placements ``[Shard(0), Replicate, _StridedShard(0)]``::
+
+        Step 1  Sort: [Replicate, _StridedShard(0), Shard(0)]
+                Permutation: [1, 2, 0]
+
+        Step 2  Permute mesh dims by [1, 2, 0]:
+                Original:            Permuted:
+                [[[0,1],[2,3]],      [[[0,4],[1,5]],
+                 [[4,5],[6,7]]]       [[2,6],[3,7]]]
+
+        Step 3  Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
+                sub-mesh 0 = [[0,4],[1,5]]  (replica group 0)
+                sub-mesh 1 = [[2,6],[3,7]]  (replica group 1)
+                shard_placements = (_StridedShard(0), Shard(0))
+
+        Step 4  Rank 0 → ProcessGroup([0,1,4,5])
+                Rank 2 → ProcessGroup([2,3,6,7])
+
+    Returns:
+        ``(shard_mesh, process_group, shard_placements)``
+    """
+    my_rank = dist.get_rank()
+    assert mesh.mesh.device.type == 'cpu'
+
+    # -- Fast path: 1D all-shard mesh → reuse existing PG. ------------------
+    # This avoids a non-collective dist.new_group() call, which would
+    # deadlock when only a subset of ranks call this function (e.g. expert
+    # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
+    if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
+        key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
+        if key not in _ranks_to_dist_cache:
+            _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
+        return (*_ranks_to_dist_cache[key], tuple(placements))
+
+    mesh_tensor = mesh.mesh.clone()
+
+    # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
+    # _StridedShard comes BEFORE regular Shard on the same dim so that
+    # get_slices_of_dtensor applies the outer sharding first, matching
+    # DTensor's left-to-right (outer-to-inner) composition order.
+    def _sort_key(item):
+        index, placement = item
+        assert not placement.is_partial(), "Partial placement not supported"
+        if placement.is_replicate():
+            return (-1, 0, index)
+        assert _is_shard(placement), f"Unsupported: {type(placement)}"
+        split = (-1 / placement.split_factor if isinstance(
+            placement, _StridedShard) else 0)
+        return (placement.dim, split, index)
+
+    indexed = sorted(enumerate(placements), key=_sort_key)
+    perm, sorted_placements = zip(*indexed)
+
+    # -- Step 2: Permute mesh to match sorted placement order. --------------
+    sorted_mesh = mesh_tensor.permute(perm)
+
+    # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
+    # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
+    num_rep = sum(1 for p in sorted_placements if p.is_replicate())
+    if num_rep > 0:
+        if num_rep > 1:
+            sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
         shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
     else:
         shard_meshes = [sorted_mesh]
+    shard_placements = sorted_placements[num_rep:]
 
     assert len(shard_placements) == len(set(shard_placements))
 
+    # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
+    # All ranks must call dist.new_group in the same order, even though each
+    # rank only joins one group.
+    def _cache_key(t: torch.Tensor) -> tuple:
+        return (*t.shape, *t.flatten().tolist())
+
+    my_key = None
+    for sm in shard_meshes:
+        key = _cache_key(sm)
+        if (my_rank == sm).any().item():
+            assert my_key is None, "Rank appears in multiple shard groups"
+            my_key = key
+        if key not in _ranks_to_dist_cache:
+            pg = dist.new_group(sm.flatten().tolist())
+            _ranks_to_dist_cache[key] = (
+                DeviceMesh(device_type="cuda", mesh=sm),
+                pg,
             )
 
+    return (*_ranks_to_dist_cache[my_key], shard_placements)
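The Step 1–4 walk-through in `construct_shard_mesh`'s docstring can be checked with plain-Python index arithmetic (no torch needed). `permute3` below is a hypothetical nested-list reimplementation of `Tensor.permute` for the 3-D case:

```python
import itertools

def permute3(mesh, dims):
    # out[a0][a1][a2] = mesh[b0][b1][b2] where b[dims[k]] = a[k],
    # matching torch.Tensor.permute semantics for a 3-D tensor
    shape = (len(mesh), len(mesh[0]), len(mesh[0][0]))
    out_shape = tuple(shape[d] for d in dims)
    out = [[[None] * out_shape[2] for _ in range(out_shape[1])]
           for _ in range(out_shape[0])]
    for a in itertools.product(*(range(s) for s in out_shape)):
        b = [0, 0, 0]
        for k, d in enumerate(dims):
            b[d] = a[k]
        out[a[0]][a[1]][a[2]] = mesh[b[0]][b[1]][b[2]]
    return out

mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]  # shape (2, 2, 2)
perm = permute3(mesh, (1, 2, 0))             # Replicate dim moves to front
sub_meshes = perm                            # "unbind" along dim 0
groups = [sorted(x for row in sm for x in row) for sm in sub_meshes]
print(groups)  # [[0, 1, 4, 5], [2, 3, 6, 7]]
```

The flattened sub-meshes reproduce the docstring's Step 4 process groups: rank 0 lands in [0, 1, 4, 5] and rank 2 in [2, 3, 6, 7].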
build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -119,10 +119,3 @@ def matmul_transpose_assign(d_in, d_out):
     with torch.cuda.device(d_in.device.index):
         mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                          d_out.stride(0), d_out.stride(1))
-
-
-def matmul_transpose(d_in):
-    M, _ = d_in.shape
-    d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
-    matmul_transpose_assign(d_in, d_out)
-    return d_out
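The removed `matmul_transpose` wrapper computed `d_in @ d_in.T` into a fresh `(M, M)` buffer via the Triton kernel. The same contract in pure Python (illustrative only; the real kernel runs on GPU):

```python
def matmul_transpose(d_in):
    # d_out[i][j] = sum_k d_in[i][k] * d_in[j][k], i.e. X @ X.T
    M = len(d_in)
    return [[sum(a * b for a, b in zip(d_in[i], d_in[j])) for j in range(M)]
            for i in range(M)]

X = [[1.0, 2.0], [3.0, 4.0]]
print(matmul_transpose(X))  # [[5.0, 11.0], [11.0, 25.0]]
```

The result is always symmetric positive semi-definite, which is what the Newton-Schulz iteration in muon.py relies on.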
build/torch210-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1 +1,3 @@
-{"python-depends":[]}
+{
+  "python-depends": []
+}
build/torch210-cxx11-cu130-x86_64-linux/muon.py CHANGED
@@ -1,536 +1,121 @@
 import logging
-import math
 import types
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import Any, cast
 
 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor import DTensor, Replicate
-from torch.distributed.tensor.placement_types import Placement
-
-from .distributed.utils import construct_shard_mesh, get_slices_of_dtensor
-from .matmul_transpose_triton import matmul_transpose_assign
 
 logger = logging.getLogger(__name__)
 
-COMM_DTYPE = torch.bfloat16
-DEFAULT_CHUNK_SIZE_RATIO = 4
-
-
-# This code snippet is a modified version adapted from the following GitHub repositories:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-# Muon's Newton–Schulz iteration causes high variance in singular values
-# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
-@torch.no_grad()
-# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    assert G.dtype == COMM_DTYPE
-    X = G  # no manual typecast
-
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-    # Perform the NS iterations
-    for a, b, c in [
-        (4.0848, -6.8946, 2.9270),
-        (3.9505, -6.3029, 2.6377),
-        (3.7418, -5.5913, 2.3037),
-        (2.8769, -3.1427, 1.2046),
-        (2.8366, -3.0525, 1.2012),
-    ]:
-        matmul_transpose_assign(X, buf1)
-        matmul_transpose_assign(buf1, buf2)
-        buf1.mul_(b).add_(buf2, alpha=c)
-        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X
-
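For X with SVD USVᵀ, each quintic update `X ← aX + (bA + cA²)X` with `A = XXᵀ` acts on every singular value independently as `s ← a·s + b·s³ + c·s⁵`. Under that (standard) observation, the removed docstring's convergence claim can be checked on a scalar in pure Python, no GPU required:

```python
# The five coefficient triples from _zeropower_via_newtonschulz5
coeffs = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]

def ns_singular_value(s):
    # Apply the five quintic steps to one singular value in (0, 1]
    for a, b, c in coeffs:
        s = a * s + b * s**3 + c * s**5
    return s

for s0 in (0.1, 0.5, 1.0):
    s = ns_singular_value(s0)
    # The docstring promises roughly Uniform(0.5, 1.5), not exactly 1
    assert abs(s - 1.0) < 0.5, (s0, s)
    print(s0, "->", s)
```

Even a tiny starting singular value of 0.1 is pulled close to 1 after only five steps, which is why the iteration orthogonalizes the gradient so cheaply.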
-
-@dataclass
-class _muon_state:
-    # TODO: use Optional
-    worker_rank: int
-    process_group: ProcessGroup
-    shard_mesh: DeviceMesh
-    shard_placements: tuple[Placement, ...]
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-    gathered_grad: torch.Tensor | None = None
-    scattered_u: DTensor | None = None
-    computed_u: torch.Tensor | None = None
-    gather_event: torch.cuda.Event | None = None
-    compute_event: torch.cuda.Event | None = None
-    scatter_event: torch.cuda.Event | None = None
-
-
-def numel_for_rank(
-    param: DTensor,
-    local_rank: int,
-    state: _muon_state,
-) -> int:
-    slices = get_slices_of_dtensor(
-        param,
-        local_rank,
-        state.shard_mesh,
-        state.shard_placements,
-    )
-
-    numel = 1
-    for s, dim in zip(slices, param.shape):
-        start, stop, step = s.indices(dim)
-        length = max(0, (stop - start + (step - 1)) // step)
-        numel *= length
-
-    return numel
-
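The length formula in the removed `numel_for_rank` (`max(0, (stop - start + (step - 1)) // step)`) is the standard ceiling-division slice length. A pure-Python check of the per-dimension product (`numel_from_slices` is a hypothetical name for the sketch):

```python
def numel_from_slices(slices, shape):
    # Product of per-dimension slice lengths, as in the removed helper
    numel = 1
    for s, dim in zip(slices, shape):
        start, stop, step = s.indices(dim)
        numel *= max(0, (stop - start + (step - 1)) // step)
    return numel

# An (8, 8) parameter sharded to rows 0:4 and columns 2:6 -> 4 * 4 elements
print(numel_from_slices((slice(0, 4), slice(2, 6)), (8, 8)))  # 16
```

`slice.indices(dim)` clamps out-of-range bounds and resolves negative indices, so the formula stays correct for empty or partial slices as well.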
-
-@torch.no_grad()
-def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
-    """
-    Pre-allocate gathered_grad buffer on compute_stream
-    before launching all2all gather
-    """
-    with torch.cuda.stream(compute_stream):
-        for p in params:
-            state = param_to_state[id(p)]
-            if rank == state.worker_rank:
-                state.gathered_grad = torch.empty(p.shape,
-                                                  dtype=COMM_DTYPE,
-                                                  device="cuda")
-            else:
-                state.gathered_grad = None
-
-    alloc_event = torch.cuda.Event()
-    alloc_event.record(compute_stream)
-    return alloc_event
-
-
-@torch.no_grad()
-def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
-                    alloc_event):
-    """
-    All2all gathers shards so each owner rank reconstructs its full gradient
-    """
-    with torch.cuda.stream(comm_stream):
-        process_group = param_to_state[id(params[0])].process_group
-        num_ranks = dist.get_world_size(group=process_group)
-
-        # Construct sending buffers
-        per_dst = [[] for _ in range(num_ranks)]
-        send_counts = [0] * num_ranks
-
-        for p in params:
-            state = param_to_state[id(p)]
-            dst = state.worker_rank
-            assert dst < num_ranks
-            shard_elems = numel_for_rank(p, rank, state)
-            g = p.grad
-            g = g.to_local().to(COMM_DTYPE).contiguous()
-            assert g.numel() == shard_elems
-            per_dst[dst].append(g.view(-1))
-            send_counts[dst] += shard_elems
-
-        assert any(
-            len(v) > 0 for v in per_dst
-        ), "At least one destination rank must receive a sharded tensor"
-        # list[list[Tensor]] -> list[Tensor]
-        per_dst = [t for dst in per_dst for t in dst]
-
-        send_buf = torch.cat(per_dst, dim=0)
-
-        owned_params = [
-            p for p in params if param_to_state[id(p)].worker_rank == rank
-        ]
-
-        # Compute receive sizes and allocate receiving buffers
-        recv_counts = [0] * num_ranks
-
-        for src in range(num_ranks):
-            total = 0
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                assert state.worker_rank == rank
-                total += numel_for_rank(p, src, state)
-            recv_counts[src] = total
-
-        recv_total = sum(recv_counts)
-        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-        # All2All
-        logger.debug(f"send_buf size: {send_buf.numel()}, "
-                     f"recv_buf size: {recv_buf.numel()}, "
-                     f"recv_counts: {recv_counts}, "
-                     f"send_counts: {send_counts}, "
-                     f"process_group: {str(process_group)}")
-        dist.all_to_all_single(
-            recv_buf,
-            send_buf,
-            output_split_sizes=recv_counts,
-            input_split_sizes=send_counts,
-            group=process_group,
-        )
-
-        # Reconstructs gathered grad from the received buffer
-        #
-        # recv_buf (num ranks = 3)
-        #
-        #   From rank 0        From rank 1        From rank 2
-        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
-        #
-        # Outer loop:
-        #   rank 0 -> rank 1 -> rank 2
-        #
-        # Inner loop:
-        #   p1_n -> p2_n -> p3_n
-
-        comm_stream.wait_event(alloc_event)
-
-        off = 0
-        for src in range(num_ranks):
-            if recv_counts[src] == 0:
-                continue
-
-            block = recv_counts[src]
-            inner_off = 0
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                assert state.worker_rank == rank
-
-                # get the slice of the full dtensor corresponding to rank src.
-                slices = get_slices_of_dtensor(state.gathered_grad, src,
-                                               state.shard_mesh,
-                                               state.shard_placements)
-
-                dst = state.gathered_grad[slices]
-                assert dst._base is state.gathered_grad
-
-                n = dst.numel()
-                assert n > 0
-
-                sg = recv_buf.narrow(0, off + inner_off, n)
-                sg = sg.reshape_as(dst)
-                dst.copy_(sg)
-
-                inner_off += n
-            off += block
-
-        for p in params:
-            state = param_to_state[id(p)]
-            if state.worker_rank == rank:
-                state.gather_event = torch.cuda.Event()
-                state.gather_event.record(comm_stream)
-            else:
-                state.gathered_grad = None
-                state.gather_event = None
-            if none_grad:
-                p.grad = None
-
-
-@torch.no_grad()
-def _compute_u(p, state, steps, rank, compute_stream):
-    """
-    On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
-    """
-    with torch.cuda.stream(compute_stream):
-        if rank == state.worker_rank:
-            if state.gather_event is None:
-                raise RuntimeError("Gather event must be set before compute.")
-            compute_stream.wait_event(state.gather_event)
-            u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
-            state.gathered_grad = None
-            state.computed_u = u
-            state.compute_event = torch.cuda.Event()
-            state.compute_event.record()
-        else:
-            state.computed_u = None
-            state.compute_event = None
-
-
-@torch.no_grad()
-def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
-    """
-    Pre-allocate scattered_u buffer on compute_stream
-    before launching all2all scatter
-    """
-    with torch.cuda.stream(compute_stream):
-        for p in params:
-            state = param_to_state[id(p)]
-            state.scattered_u = torch.empty_like(p.to_local(),
-                                                 dtype=COMM_DTYPE)
-
-    alloc_event = torch.cuda.Event()
-    alloc_event.record(compute_stream)
-    return alloc_event
-
-
-def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
-    """
-    All2all scatters full gradients to all ranks
-    """
-    with torch.cuda.stream(comm_stream):
-        process_group = param_to_state[id(params[0])].process_group
-        num_ranks = dist.get_world_size(group=process_group)
-        owned_params = [
-            p for p in params if param_to_state[id(p)].worker_rank == rank
-        ]
-
-        # Construct sending buffer
-        per_dst = [[] for _ in range(num_ranks)]
-        send_counts = [0] * num_ranks
-
-        if owned_params:
-            for p in owned_params:
-                state = param_to_state[id(p)]
-                if state.compute_event is None:
-                    raise RuntimeError(
-                        "Compute event must be set before scatter.")
-                comm_stream.wait_event(state.compute_event)
-                state.gathered_grad = None
-
-                assert state.computed_u is not None
-
-                u_full = state.computed_u.to(COMM_DTYPE).contiguous()
-
-                offset = 0
-                for dst in range(num_ranks):
-                    # get the slice of the full tensor corresponding to rank dst.
-                    slices = get_slices_of_dtensor(u_full, dst,
-                                                   state.shard_mesh,
-                                                   state.shard_placements)
-                    su = u_full[slices].flatten()
-
-                    n = su.numel()
-                    assert n > 0
-
-                    per_dst[dst].append(su)
-                    send_counts[dst] += n
-                    offset += n
-
-                assert offset == u_full.numel()
-
-        lengths = [len(v) for v in per_dst]
-        if all(l > 0 for l in lengths):
-            assert all(
-                l == lengths[0] for l in lengths
-            ), "All destination ranks must have the same number of sharded tensors"
-            # list[list[Tensor]] -> list[Tensor]
-            per_dst = [t for dst in per_dst for t in dst]
-            send_buf = torch.cat(per_dst, dim=0)
-        else:
-            # all_to_all requires participation from all ranks
-            # Even non-owner ranks must join the collective call
-            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-        # Compute receive sizes and allocate receiving buffers
-        recv_counts = [0] * num_ranks
-
-        for src in range(num_ranks):
-            total = 0
-            for p in params:
-                state = param_to_state[id(p)]
-                if state.worker_rank != src:
-                    continue
-                total += numel_for_rank(p, rank, state)
-            recv_counts[src] = total
-
-        recv_total = sum(recv_counts)
-        assert recv_total > 0
-        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-        # All2All
-        dist.all_to_all_single(
-            recv_buf,
-            send_buf,
-            output_split_sizes=recv_counts,
-            input_split_sizes=send_counts,
-            group=process_group,
-        )
-
-        # Copy to pre-allocated scattered_u buffer from the received buffer
-        #
-        # recv_buf (num ranks = 3, local_rank = 0)
-        #
-        #   From rank 0        From rank 1   From rank 2
-        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
-        #
-        # Outer loop:
-        #   rank 0 -> rank 1 -> rank 2
-        #
-        # Inner loop:
-        #   src(0) : p1_0 -> p2_0 -> p3_0
-        #   src(1) : p4_0
-        #   src(2) : p5_0 -> p6_0
-
-        comm_stream.wait_event(alloc_event)
-
-        off = 0
-        for src in range(num_ranks):
-            block = recv_counts[src]
-            if block == 0:
-                continue
-
-            inner_off = 0
-            for p in params:
-                state = param_to_state[id(p)]
-                if state.worker_rank != src:
-                    continue
-                n = numel_for_rank(p, rank, state)
-                assert n > 0
-
-                flat_local = recv_buf.narrow(0, off + inner_off,
-                                             n).view_as(p.to_local())
-                state.scattered_u.copy_(flat_local)
-
-                state.scatter_event = torch.cuda.Event()
-                state.scatter_event.record(comm_stream)
-                inner_off += n
-
-            assert inner_off == block
-            off += block
-
-
-def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
-                  compute_stream):
-    """
-    Update sharded parameter p with the scattered_u.
-    Only worker_rank frees computed_u.
-    """
-    with torch.cuda.stream(compute_stream):
-        if state.scatter_event is None:
-            raise RuntimeError("Scatter event must be set before update")
-        compute_stream.wait_event(state.scatter_event)
-        u_dtensor = DTensor.from_local(
-            state.scattered_u,
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-
-        state.scattered_u = u_dtensor
-
-        if rank == state.worker_rank:
-            # Free computed_u
-            state.computed_u = None
-
-        Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
-        state.scattered_u = None
-        u_dtensor = None
-
-        scales_full = Muon._compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            # Have to slice scales_full along dim 0
-            weight_slices = get_slices_of_dtensor(p, rank, state.shard_mesh,
-                                                  state.shard_placements)
-            ratio = p.shape[0] // scales_full.shape[0]
-            scales_slice = slice(
-                None if weight_slices[0].start is None else
-                weight_slices[0].start // ratio,
-                None if weight_slices[0].stop is None else
-                weight_slices[0].stop // ratio,
-                None,
-            )
-
-            scales_local = scales_full[scales_slice]
-            scales_local = DTensor.from_local(
-                scales_local,
-                placements=p.placements,
-                device_mesh=p.device_mesh,
-            )
-            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
-
-
-def default_is_muon(name, x):
-    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
-    return x.ndim >= 2 and not any(key in name for key in skip_keys)
-
-
-def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
-    muon_params, muon_names = [], []
-    non_muon_params = []
-
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
             continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]
-
-def parse_qk_layer(name: str) -> tuple[str | None, int]:
-    """
-    Parse a parameter name to check if it is a query/key projection layer
-    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
-
-    Returns:
-        (kind, layer_idx) or (None, -1) if not matched.
-
-    Example:
-        'model.3.attn.wq.weight' -> ('wq', 3)
-        'model.5.attn.wk.weight' -> ('wk', 5)
-        'model.2.attn.q_proj.weight' -> ('q_proj', 2)
-        'model.7.attn.k_proj.weight' -> ('k_proj', 7)
-        'model.4.attn.v_proj.weight' -> (None, -1)
-    """
-    parts = name.split('.')
-    if len(parts) < 3:
-        return None, -1
-
-    kind = parts[-2]
-
-    layer_idx = -1
-    for part in reversed(parts):
-        if part.isdigit():
-            layer_idx = int(part)
-            break
-
-    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
-        return kind, layer_idx
-
-    return None, -1
-
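The removed `parse_qk_layer` is self-contained Python, so it can be exercised directly against its docstring's examples:

```python
def parse_qk_layer(name):
    # Identify q/k projection weights and recover their layer index
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]
    layer_idx = -1
    for part in reversed(parts):
        if part.isdigit():
            layer_idx = int(part)
            break
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1

print(parse_qk_layer('model.3.attn.wq.weight'))      # ('wq', 3)
print(parse_qk_layer('model.4.attn.v_proj.weight'))  # (None, -1)
```

Note that the layer index is taken from the last numeric path component, so nested module paths still resolve correctly.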
526
- @dataclass
527
- class QKClipInfo:
528
- """Per-parameter dynamic info computed from config + runtime logits."""
529
- kind: str | None # 'wq'/'q_proj' or 'wk'/'k_proj' or None
530
- indices: list[int] # which heads to consider for clipping
531
- head_dim: int # from config
532
- threshold: float # from config
533
- logit: torch.Tensor | None
534
 
535
 
536
  class Muon(torch.optim.Optimizer):
@@ -554,7 +139,7 @@ class Muon(torch.optim.Optimizer):
554
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
555
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
556
  weight_decay: The weight decay for Muon and AdamW.
557
- {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
558
  adamw_lr: The learning rate for the internal AdamW.
559
  adamw_betas: The betas for the internal AdamW.
560
  adamw_eps: The epsilon for the internal AdamW.
@@ -564,7 +149,7 @@ class Muon(torch.optim.Optimizer):
564
  - "q_indices" (list[int]): Indices of query heads to consider.
565
  - "k_indices" (list[int]): Indices of key heads to consider.
566
  - "head_dim" (int): Dimensionality of each attention head.
567
- - "threshold" (float): Threshold value; heads whose QK logits exceed
568
  this value will be scaled down.
569
  Default is:
570
  {
@@ -584,6 +169,13 @@ class Muon(torch.optim.Optimizer):
584
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
585
  For testing purposes only.
586
  small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
587
  """
588
 
589
  def __init__(self,
@@ -597,16 +189,12 @@ class Muon(torch.optim.Optimizer):
597
  adamw_eps=1e-8,
598
  none_grad=True,
599
  debug=False,
600
- clip_config={
601
- "q_indices": [],
602
- "k_indices": [],
603
- "head_dim": 128,
604
- "threshold": 100
605
- },
606
  warmup_step=5,
607
  chunk_size=-1,
608
  use_distributed_muon=False,
609
- small_param_numel_threshold=65536):
 
610
  defaults = dict(
611
  lr=lr,
612
  weight_decay=weight_decay,
@@ -630,16 +218,18 @@ class Muon(torch.optim.Optimizer):
630
 
631
  super().__init__(params, defaults)
632
 
633
- self.rank = None
634
-
635
- self.comm_stream = torch.cuda.Stream()
636
- self.compute_stream = torch.cuda.Stream()
637
  self.debug = debug
638
- self.clip_config = clip_config
 
 
 
 
 
639
  self.warmup_step = warmup_step
640
  self.chunk_size = chunk_size
641
  self.use_distributed_muon = use_distributed_muon
642
  self.small_param_numel_threshold = small_param_numel_threshold
 
643
 
644
  def _calc_flops(self, G, steps):
645
  assert len(G.shape) == 2
@@ -649,20 +239,6 @@ class Muon(torch.optim.Optimizer):
649
 
650
  return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
651
 
652
- def adjust_lr_for_muon(self, lr, param_shape):
653
- A, B = param_shape[:2]
654
- # We adjust the learning rate and weight decay based on the size of the parameter matrix
655
- # as described in the paper
656
- adjusted_ratio = 0.2 * math.sqrt(max(A, B))
657
- adjusted_lr = lr * adjusted_ratio
658
- return adjusted_lr
659
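The helper removed above computes a shape-dependent learning-rate scale. As a standalone sketch (a hypothetical free function with no optimizer state; the formula matches the removed method):

```python
# Sketch of the removed adjust_lr_for_muon: the Muon update for an
# (A, B) weight matrix scales the base lr by 0.2 * sqrt(max(A, B)).
import math


def adjust_lr_for_muon(lr, param_shape):
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))


adjusted = adjust_lr_for_muon(0.02, (1024, 4096))
# 0.02 * 0.2 * sqrt(4096) = 0.02 * 0.2 * 64 ≈ 0.256
```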
-
660
- def set_rank_once(self, rank):
661
- if self.rank is None:
662
- self.rank = rank
663
- else:
664
- assert self.rank == rank
665
-
666
  def get_shard_mesh(self, p):
667
  """
668
  Get the shard mesh for a parameter p on the given rank.
@@ -673,9 +249,6 @@ class Muon(torch.optim.Optimizer):
673
  shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
674
  p.placements, p.device_mesh)
675
 
676
- # set rank with the local rank in the shard process group
677
- self.set_rank_once(dist.get_rank(group=shard_pg))
678
-
679
  return shard_mesh, shard_pg, shard_placements
680
 
681
  def init_state_and_assign_params(self, names, params, group, qk_logits):
@@ -694,8 +267,8 @@ class Muon(torch.optim.Optimizer):
694
  total_flops += flops
695
 
696
  if self.debug:
697
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
698
- flush=True)
699
 
700
  paired = list(zip(names, params))
701
 
@@ -724,44 +297,54 @@ class Muon(torch.optim.Optimizer):
724
 
725
  worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
726
  round_robin = (round_robin + 1) % len(shard_mesh_flattened)
727
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
728
 
729
  param_to_state[id(p)] = _muon_state(
730
  worker_rank=worker_rank,
731
  process_group=shard_pg,
732
- shard_mesh=shard_mesh,
733
- shard_placements=shard_placements,
734
  name=n,
735
  qk_clip_state=qk_clip_state,
736
  )
737
 
738
  return param_to_state, ordered_params
739
 
740
- def base(self, names, params, group, lr, weight_decay, momentum,
741
- qk_logits):
742
- # generate weight updates in distributed fashion
743
  for n, p in zip(names, params):
744
  g = p.grad
745
  if g is None:
746
  continue
747
- if g.ndim > 2:
748
- g = g.view(g.size(0), -1)
749
- assert g is not None
750
-
751
- g = self._update_g(p, g, group, momentum)
752
 
753
  u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
754
  steps=group["ns_steps"])
755
 
756
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
757
- Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
758
 
759
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
760
 
761
- scales_full = self._compute_scales(
762
  p, qk_clip_state) if qk_clip_state is not None else None
763
  if scales_full is not None:
764
- Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
765
 
766
  def distributed_muon(
767
  self,
@@ -770,20 +353,15 @@ class Muon(torch.optim.Optimizer):
770
  group: dict[str, Any],
771
  lr: float,
772
  weight_decay: float,
773
- momentum: float,
774
  qk_logits: list[torch.Tensor | DTensor] | None,
775
  ):
776
  """ Implementation of Distributed Muon by Liu et al. """
777
 
 
778
  for n, p in zip(names, params):
779
  g = p.grad
780
  if g is None:
781
  continue
782
- if g.ndim > 2:
783
- g = g.view(g.size(0), -1)
784
- assert g is not None
785
-
786
- g = self._update_g(p, g, group, momentum)
787
 
788
  # Gather G
789
  if isinstance(p.data, DTensor):
@@ -796,16 +374,16 @@ class Muon(torch.optim.Optimizer):
796
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
797
  steps=group["ns_steps"])
798
 
799
- adjusted_lr = self.adjust_lr_for_muon(lr, p_full.shape)
800
- Muon._update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
801
 
802
- qk_clip_state = self.get_qk_clip_info(n, qk_logits)
803
 
804
- scales_full = self._compute_scales(
805
  p_full, qk_clip_state) if qk_clip_state is not None else None
806
 
807
  if scales_full is not None:
808
- Muon._qk_clip(p_full, scales_full, qk_clip_state.head_dim)
809
 
810
  if isinstance(p.data, DTensor):
811
  ndims = len(p.device_mesh.mesh.shape)
@@ -822,244 +400,53 @@ class Muon(torch.optim.Optimizer):
822
 
823
  p.copy_(p_sharded)
824
 
825
- def _update_g(self, p, g, group, momentum):
826
- # calc update
827
- state = self.state[p]
828
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
829
- torch.add(g, buf, alpha=momentum, out=buf)
830
- if group["nesterov"]:
831
- g.add_(buf, alpha=momentum)
832
- return g
833
- return buf
834
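The momentum rule in the removed `_update_g` can be restated as scalar arithmetic (a hedged standalone form; the real helper stores the buffer in optimizer state and operates in-place on tensors):

```python
# Scalar restatement of the removed _update_g momentum rule:
#   buf <- g + momentum * buf
# With Nesterov the returned update is g + momentum * buf,
# otherwise the buffer itself is returned.
def update_g(g, buf, momentum, nesterov):
    buf = g + momentum * buf                       # momentum buffer update
    update = g + momentum * buf if nesterov else buf
    return update, buf


update, buf = update_g(g=1.0, buf=0.0, momentum=0.9, nesterov=True)
# buf = 1.0; Nesterov update = 1 + 0.9 * 1 = 1.9
```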
-
835
- @staticmethod
836
- def _update_p(p, u, lr, adjusted_lr, weight_decay):
837
- if isinstance(p, torch.nn.Parameter):
838
- # apply weight decay
839
- p.data.mul_(1 - lr * weight_decay)
840
- # apply update
841
- p.data.add_(u, alpha=-adjusted_lr)
842
- else:
843
- p.mul_(1 - lr * weight_decay)
844
- p.add_(u, alpha=-adjusted_lr)
845
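The removed `_update_p` applies decoupled weight decay with the base lr while the orthogonalized update uses the shape-adjusted lr. A scalar restatement (hedged sketch, not the in-place tensor code):

```python
# Scalar restatement of the removed _update_p:
# weight decay uses the base lr, the Muon update uses adjusted_lr.
def update_p(p, u, lr, adjusted_lr, weight_decay):
    p = p * (1 - lr * weight_decay)  # decoupled weight decay
    p = p - adjusted_lr * u          # orthogonalized Muon update
    return p


p = update_p(p=1.0, u=1.0, lr=0.1, adjusted_lr=0.5, weight_decay=0.1)
# 1 * (1 - 0.01) - 0.5 = 0.49
```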
-
846
- def get_qk_clip_info(self, n, qk_logits):
847
- if self.clip_config is None:
848
- return None
849
-
850
- head_dim = self.clip_config.get('head_dim')
851
- threshold = self.clip_config.get('threshold')
852
- kind, layer_idx = parse_qk_layer(n)
853
-
854
- logit, indices = None, []
855
- if qk_logits is not None and kind is not None:
856
- logit = qk_logits[layer_idx]
857
- indices_key = 'q_indices' if 'q' in kind else 'k_indices'
858
- indices = self.clip_config.get(indices_key, []) or []
859
-
860
- if isinstance(logit, DTensor):
861
- # In TP settings, qk_logits may be DTensor
862
- # We convert it to full tensor here for simplicity
863
- logit = logit.full_tensor()
864
-
865
- return QKClipInfo(
866
- kind=kind,
867
- indices=indices,
868
- head_dim=head_dim,
869
- threshold=threshold,
870
- logit=logit,
871
- )
872
-
873
- @staticmethod
874
- def _compute_scales(p, qk_clip_state):
875
- kind = qk_clip_state.kind
876
- indices = qk_clip_state.indices
877
- head_dim = qk_clip_state.head_dim
878
- threshold = qk_clip_state.threshold
879
- logit = qk_clip_state.logit
880
-
881
- H_global = p.shape[0] // head_dim
882
- scales_full = torch.ones(H_global, device=p.data.device)
883
- scaling = 0
884
-
885
- for logit_idx, head_idx in enumerate(indices):
886
- v_ele = float(logit[logit_idx])
887
- if v_ele > threshold:
888
- new_scale = math.sqrt(threshold / v_ele)
889
- if new_scale < scales_full[head_idx]:
890
- scales_full[head_idx] = new_scale
891
- logger.info(
892
- f"[{kind}] Head {head_idx} exceeded threshold "
893
- f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
894
- )
895
- scaling += 1
896
-
897
- return scales_full if scaling > 0 else None
898
-
899
- @staticmethod
900
- def _qk_clip(p, scales, head_dim):
901
- if isinstance(p, torch.nn.Parameter):
902
- W = p.data.view(-1, head_dim, p.data.shape[1])
903
- W.mul_(scales.view(-1, 1, 1))
904
- else:
905
- W = p.view(-1, head_dim, p.shape[1])
906
- W.mul_(scales.view(-1, 1, 1))
907
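The clipping rule in the removed `_compute_scales`/`_qk_clip` pair (now moved to `qk_clip.py`) reduces to a one-line scale per head. A pure-Python sketch with hypothetical per-head logits:

```python
# Sketch of the QK-clip scale rule: a head whose max QK logit v exceeds
# the threshold gets scale sqrt(threshold / v). If both the query and
# key projections of that head are scaled (the head appears in both
# q_indices and k_indices), the post-clip logit is v * scale**2, which
# lands exactly at the threshold.
import math

head_logits = [50.0, 400.0, 100.0]  # hypothetical per-head max QK logits
threshold = 100.0

scales = [
    math.sqrt(threshold / v) if v > threshold else 1.0
    for v in head_logits
]
# head 1: sqrt(100 / 400) = 0.5, and 400 * 0.5**2 == 100
```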
-
908
- def parallel(self, names, params, group, lr, weight_decay, momentum,
909
- qk_logits):
910
  """
911
  Perform a parallel optimization step using Muon.
912
- """
913
 
914
- for p in params:
915
- g = p.grad
916
- if g is None:
917
- continue
918
- if g.ndim > 2:
919
- g = g.view(g.size(0), -1)
920
 
921
- # Update g in the local rank
922
- g = self._update_g(
923
- p,
924
- g,
925
- group,
926
- momentum=momentum,
927
- )
928
- p.grad = g
929
 
930
  param_to_state, ordered_params = self.init_state_and_assign_params(
931
  names, params, group, qk_logits)
932
 
933
- assert self.rank is not None
934
-
935
- def enqueue_all2all_gather(start_idx, chunk_size):
936
- target_params = ordered_params[start_idx:start_idx + chunk_size]
937
- if target_params:
938
- alloc_event = _alloc_gathered_grad(target_params,
939
- param_to_state, self.rank,
940
- self.compute_stream)
941
- _all2all_gather(target_params, param_to_state, self.rank,
942
- self.comm_stream, group["none_grad"],
943
- alloc_event)
944
-
945
- def enqueue_computes(start_idx, chunk_size):
946
- for p in ordered_params[start_idx:start_idx + chunk_size]:
947
- state = param_to_state[id(p)]
948
- _compute_u(p, state, group["ns_steps"], self.rank,
949
- self.compute_stream)
950
-
951
- def enqueue_all2all_scatter(start_idx, chunk_size):
952
- target_params = ordered_params[start_idx:start_idx + chunk_size]
953
- if target_params:
954
- alloc_event = _alloc_scattered_u(target_params, param_to_state,
955
- self.rank,
956
- self.compute_stream)
957
- _all2all_scatter(target_params, param_to_state, self.rank,
958
- self.comm_stream, alloc_event)
959
-
960
- def enqueue_update_param(start_idx, chunk_size):
961
- for p in ordered_params[start_idx:start_idx + chunk_size]:
962
- state = param_to_state[id(p)]
963
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
964
- _update_param(p, state, lr, adjusted_lr, weight_decay,
965
- self.rank, self.compute_stream)
966
 
967
  if self.chunk_size == -1:
968
  shard_ranks = dist.get_world_size(param_to_state[id(
969
- params[0])].process_group)
970
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
971
  elif self.chunk_size > 0:
972
  chunk_size = self.chunk_size
973
  else:
974
  raise ValueError("chunk_size must be -1 or a positive integer.")
975
 
976
- # Wait grad update
977
- self.comm_stream.wait_stream(torch.cuda.current_stream())
978
-
979
- warmup_step = self.warmup_step
980
- for i in range(0, warmup_step):
981
- enqueue_all2all_gather(i * chunk_size, chunk_size)
982
- enqueue_computes(i * chunk_size, chunk_size)
983
-
984
- for i in range(0, len(params) + chunk_size - 1, chunk_size):
985
- enqueue_all2all_scatter(i, chunk_size)
986
- enqueue_all2all_gather(i + warmup_step * chunk_size, chunk_size)
987
- enqueue_update_param(i, chunk_size)
988
- enqueue_computes(i + warmup_step * chunk_size, chunk_size)
989
-
990
- # Wait the last update_param to finish
991
- torch.cuda.current_stream().wait_stream(self.compute_stream)
992
-
993
- @staticmethod
994
- def _fused_adamw(
995
- params: list[torch.Tensor],
996
- grads: list[torch.Tensor],
997
- exp_avgs: list[torch.Tensor],
998
- exp_avg_sqs: list[torch.Tensor],
999
- max_exp_avg_sqs: list[torch.Tensor],
1000
- state_steps: list[torch.Tensor],
1001
- amsgrad: bool,
1002
- beta1: float,
1003
- beta2: float,
1004
- lr: float | torch.Tensor,
1005
- weight_decay: float,
1006
- eps: float,
1007
- maximize: bool,
1008
- ) -> None:
1009
- if not params:
1010
- return
1011
 
1012
- # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
1013
- # treating it as a scalar.
1014
- lr_dict: DeviceDict | None = ({
1015
- lr.device: lr
1016
- } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
1017
- None)
1018
- grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
1019
- [
1020
- params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
1021
- state_steps
1022
- ] # type: ignore[list-item]
1023
- )
1024
- for (device, _), (
1025
- (
1026
- device_params_,
1027
- device_grads_,
1028
- device_exp_avgs_,
1029
- device_exp_avg_sqs_,
1030
- device_max_exp_avg_sqs,
1031
- device_state_steps_,
1032
- ),
1033
- _,
1034
- ) in grouped_tensors.items():
1035
- device_params = cast(list[torch.Tensor], device_params_)
1036
- device_grads = cast(list[torch.Tensor], device_grads_)
1037
- device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
1038
- device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
1039
- device_state_steps = cast(list[torch.Tensor], device_state_steps_)
1040
-
1041
- if lr_dict is not None and device not in lr_dict:
1042
- lr_dict[device] = lr.to(
1043
- device=device,
1044
- non_blocking=True) # type: ignore[union-attr]
1045
- lr = lr_dict[device]
1046
- torch._foreach_add_(device_state_steps, 1)
1047
- func = torch._fused_adamw_
1048
- func(
1049
- device_params,
1050
- device_grads,
1051
- device_exp_avgs,
1052
- device_exp_avg_sqs,
1053
- device_max_exp_avg_sqs, # type: ignore[arg-type]
1054
- device_state_steps,
1055
- amsgrad=amsgrad,
1056
- lr=lr, # type: ignore[arg-type]
1057
- beta1=beta1,
1058
- beta2=beta2,
1059
- weight_decay=weight_decay,
1060
- eps=eps,
1061
- maximize=maximize,
1062
- )
1063
 
1064
  def _step_muon(self, group, qk_logits=None):
1065
  params = group["params"]
@@ -1068,6 +455,18 @@ class Muon(torch.optim.Optimizer):
1068
  momentum = group["momentum"]
1069
  names = group["names"]
1070
 
1071
  param_dtensors = []
1072
  name_dtensors = []
1073
 
@@ -1083,7 +482,6 @@ class Muon(torch.optim.Optimizer):
1083
  group=group,
1084
  lr=lr,
1085
  weight_decay=weight_decay,
1086
- momentum=momentum,
1087
  qk_logits=qk_logits)
1088
  return
1089
 
@@ -1119,7 +517,6 @@ class Muon(torch.optim.Optimizer):
1119
  # and run parallel Muon on each group.
1120
 
1121
  placement_to_params = defaultdict(lambda: ([], []))
1122
- # type: dict[tuple[Placement, DeviceMesh], tuple[list[str], list[DTensor]]]
1123
 
1124
  assert len(dtensors) == len(names)
1125
  for p, n in zip(dtensors, names):
@@ -1141,7 +538,6 @@ class Muon(torch.optim.Optimizer):
1141
  group=group,
1142
  lr=lr,
1143
  weight_decay=weight_decay,
1144
- momentum=momentum,
1145
  qk_logits=qk_logits,
1146
  )
1147
 
@@ -1159,7 +555,6 @@ class Muon(torch.optim.Optimizer):
1159
  group,
1160
  lr=lr,
1161
  weight_decay=weight_decay,
1162
- momentum=momentum,
1163
  qk_logits=qk_logits,
1164
  )
1165
 
@@ -1170,78 +565,9 @@ class Muon(torch.optim.Optimizer):
1170
  group,
1171
  lr=lr,
1172
  weight_decay=weight_decay,
1173
- momentum=momentum,
1174
  qk_logits=qk_logits,
1175
  )
1176
 
1177
- def _step_adamw_params(self, params, group):
1178
- params_with_grads = []
1179
- grads = []
1180
- moment1 = []
1181
- moment2 = []
1182
- max_exp_avg_sqs = []
1183
- state_steps = []
1184
- lr = group["lr"]
1185
- beta1, beta2 = group["adamw_betas"]
1186
- eps = group["adamw_eps"]
1187
- weight_decay = group["weight_decay"]
1188
-
1189
- for p in params:
1190
- g = p.grad
1191
- if g is None:
1192
- continue
1193
- state = self.state[p]
1194
- params_with_grads.append(p)
1195
- grads.append(g)
1196
- if "step" not in state:
1197
- state["step"] = (torch.zeros((),
1198
- dtype=torch.float32,
1199
- device=p.device))
1200
- state["moment1"] = torch.zeros_like(g)
1201
- state["moment2"] = torch.zeros_like(g)
1202
- moment1.append(state["moment1"])
1203
- moment2.append(state["moment2"])
1204
- if not isinstance(state["step"], torch.Tensor):
1205
- step_tensor = torch.tensor(state["step"],
1206
- dtype=torch.float32,
1207
- device=p.device)
1208
- else:
1209
- step_tensor = state["step"]
1210
- state_steps.append(step_tensor)
1211
-
1212
- self._fused_adamw(
1213
- params_with_grads,
1214
- grads,
1215
- moment1,
1216
- moment2,
1217
- max_exp_avg_sqs,
1218
- state_steps,
1219
- amsgrad=False,
1220
- beta1=beta1,
1221
- beta2=beta2,
1222
- lr=lr,
1223
- weight_decay=weight_decay,
1224
- eps=eps,
1225
- maximize=False,
1226
- )
1227
-
1228
- def _step_adamw(self, group):
1229
- params = group["params"]
1230
-
1231
- # group params with it's type and placement
1232
- placement_to_params: dict[tuple[Placement | type,
1233
- DeviceMesh | None]] = defaultdict(list)
1234
- for p in params:
1235
- match p:
1236
- case DTensor():
1237
- placement_to_params[tuple([p.placements,
1238
- p.device_mesh])].append(p)
1239
- case torch.Tensor():
1240
- placement_to_params[tuple([torch.Tensor, None])].append(p)
1241
-
1242
- for params in placement_to_params.values():
1243
- self._step_adamw_params(params, group)
1244
-
1245
  @torch.no_grad
1246
  def step(self, closure=None, qk_logits=None):
1247
  """Perform a single optimization step.
@@ -1249,9 +575,9 @@ class Muon(torch.optim.Optimizer):
1249
  Args:
1250
  closure (Callable, optional): A closure that reevaluates the model
1251
  and returns the loss.
1252
- qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
1253
- to 1D tensors of shape (num_heads,), representing the maximum
1254
- QK logits across all tokens, computed as
1255
  (1 / sqrt(head_dim)) * (Q @ K^T).
1256
  """
1257
  loss = None
@@ -1263,6 +589,6 @@ class Muon(torch.optim.Optimizer):
1263
  if group["use_muon"]:
1264
  self._step_muon(group, qk_logits=qk_logits)
1265
  else:
1266
- self._step_adamw(group)
1267
 
1268
  return loss
 
1
  import logging
 
2
  import types
3
  from collections import defaultdict
4
+ from typing import Any
 
5
 
6
  import torch
7
  import torch.distributed as dist
8
+ from torch.distributed.tensor import DTensor, Replicate, Shard
9
+ from torch.profiler import record_function
10
+
11
+ from .adamw import step_adamw
12
+ from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon,
14
+ get_default_muon_param_groups, update_g, update_p)
15
+ from .distributed.utils import (_is_shard, construct_shard_mesh,
16
+ get_slices_of_dtensor)
17
+ from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
+ _zeropower_via_newtonschulz5)
19
+ from .pipeline import muon_chunk_pipeline
20
+ from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
23
 
24
 
25
+ def _expand_expert_params(names, params, expert_keys):
26
+ """Expand expert params by splitting on dim 0 (expert dimension).
 
27
 
28
+ Params whose name matches any key in ``expert_keys`` are treated as
29
+ expert-parallel tensors. Their outermost dimension is the expert
30
+ dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
31
+ ``nn.Parameter`` views so that in-place updates propagate back to
32
+ the original storage.
33
 
34
+ Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
35
+ if they are expert params, their key must be added to ``expert_keys``.
36
 
37
+ The grad must already be set on each expert param (e.g. after momentum).
38
 
39
+ For DTensor expert params, placements that shard on dim 0 (expert dim)
40
+ are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
41
+ preserved: each 2D slice is wrapped as a DTensor on the corresponding
42
+ submesh so the parallel pipeline handles the TP communication.
 
43
  """
44
+ expanded_names = []
45
+ expanded_params = []
46
+
47
+ for n, p in zip(names, params):
48
+ is_expert = expert_keys and any(key in n for key in expert_keys)
49
+ is_dtensor = isinstance(p.data, DTensor)
50
+
51
+ if not is_expert:
52
+ assert p.data.ndim <= 2, (
53
+ f"Param {n} has ndim={p.data.ndim} but does not match "
54
+ f"expert_keys={expert_keys}. If this is an expert param, "
55
+ f"add its key to expert_keys.")
56
+ expanded_names.append(n)
57
+ expanded_params.append(p)
 
58
  continue
 
59
 
60
+ g = p.grad
61
+ assert g is not None, (
62
+ f"Expert param {n} must have grad set before expansion")
63
+
64
+ tp_mesh = None
65
+ tp_placements_2d = None
66
+
67
+ if is_dtensor:
68
+ local_data = p.to_local()
69
+ local_grad = g.to_local() if isinstance(g, DTensor) else g
70
+
71
+ # Find non-dim-0 shard placements (e.g. TP sharding).
72
+ # After splitting on dim 0, Shard(k) becomes Shard(k-1).
73
+ tp_dim_indices = []
74
+ tp_placements_2d = []
75
+ for i, pl in enumerate(p.placements):
76
+ if _is_shard(pl) and pl.dim != 0:
77
+ tp_dim_indices.append(i)
78
+ tp_placements_2d.append(Shard(pl.dim - 1))
79
+
80
+ if tp_dim_indices:
81
+ tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
82
+ for i in tp_dim_indices)
83
+ if len(tp_dim_names) == 1:
84
+ tp_mesh = p.device_mesh[tp_dim_names[0]]
85
+ else:
86
+ tp_mesh = p.device_mesh[tp_dim_names]
87
+ else:
88
+ local_data = p.data
89
+ local_grad = g
90
+
91
+ # Expand: split dim 0, reshape each slice to 2D.
92
+ num_local_experts = local_data.shape[0]
93
+ for i in range(num_local_experts):
94
+ slice_data = local_data[i]
95
+ slice_grad = local_grad[i]
96
+
97
+ if tp_mesh is not None:
98
+ # Wrap as DTensor on TP submesh so the pipeline handles
99
+ # TP communication (gather/scatter across TP ranks).
100
+ dt_data = DTensor.from_local(slice_data,
101
+ device_mesh=tp_mesh,
102
+ placements=tp_placements_2d)
103
+ dt_grad = DTensor.from_local(slice_grad,
104
+ device_mesh=tp_mesh,
105
+ placements=tp_placements_2d)
106
+ expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
107
+ expert_param.grad = dt_grad
108
+ else:
109
+ expert_param = torch.nn.Parameter(slice_data,
110
+ requires_grad=False)
111
+ expert_param.grad = slice_grad
112
 
113
+ expanded_names.append(f"{n}[{i}]")
114
+ expanded_params.append(expert_param)
115
 
116
+ p.grad = None # allow expert grad storage to be freed after pipeline
117
 
118
+ return expanded_names, expanded_params
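The key property the docstring relies on is that each per-expert 2D `nn.Parameter` is a *view* of the original 3D storage. A minimal sketch of the non-DTensor path (hypothetical shapes, no EP/TP sharding):

```python
# Minimal sketch of the expansion above: an (E, out, in) expert weight
# is split into E per-expert 2D Parameter views, so an in-place update
# on a view writes back into the original storage.
import torch

E, out_f, in_f = 4, 8, 16
w = torch.nn.Parameter(torch.randn(E, out_f, in_f), requires_grad=False)
w.grad = torch.randn_like(w)

expanded = []
for i in range(E):
    p_i = torch.nn.Parameter(w.data[i], requires_grad=False)  # view, not copy
    p_i.grad = w.grad[i]
    expanded.append(p_i)

expanded[0].data.mul_(0.0)  # update one expert in place ...
# ... and the parent 3D storage reflects it: w.data[0] is now all zeros.
```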
 
119
 
120
 
121
  class Muon(torch.optim.Optimizer):
 
139
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
140
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
141
  weight_decay: The weight decay for Muon and AdamW.
142
+ Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
143
  adamw_lr: The learning rate for the internal AdamW.
144
  adamw_betas: The betas for the internal AdamW.
145
  adamw_eps: The epsilon for the internal AdamW.
 
149
  - "q_indices" (list[int]): Indices of query heads to consider.
150
  - "k_indices" (list[int]): Indices of key heads to consider.
151
  - "head_dim" (int): Dimensionality of each attention head.
152
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
153
  this value will be scaled down.
154
  Default is:
155
  {
 
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purposes only.
171
  small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
+ expert_keys: List of strings to identify expert-parallel parameters.
173
+ If any key appears in a parameter's name, its outermost
174
+ dimension is treated as the expert dimension and expanded
175
+ into per-expert 2D params for Muon. For example,
176
+ ``expert_keys=["experts"]`` matches any param whose name
177
+ contains "experts". 3D+ params not matched by any key
178
+ will raise an error.
179
  """
180
 
181
  def __init__(self,
 
189
  adamw_eps=1e-8,
190
  none_grad=True,
191
  debug=False,
192
+ clip_config=None,
 
 
 
 
 
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
+ small_param_numel_threshold=65536,
197
+ expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
 
218
 
219
  super().__init__(params, defaults)
220
 
 
 
 
 
221
  self.debug = debug
222
+ self.clip_config = clip_config if clip_config is not None else {
223
+ "q_indices": [],
224
+ "k_indices": [],
225
+ "head_dim": 128,
226
+ "threshold": 100,
227
+ }
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
  self.small_param_numel_threshold = small_param_numel_threshold
232
+ self.expert_keys = expert_keys
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
 
239
 
240
  return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
241
 
242
  def get_shard_mesh(self, p):
243
  """
244
  Get the shard mesh for a parameter p on the given rank.
 
249
  shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
250
  p.placements, p.device_mesh)
251
 
 
 
 
252
  return shard_mesh, shard_pg, shard_placements
253
 
254
  def init_state_and_assign_params(self, names, params, group, qk_logits):
 
267
  total_flops += flops
268
 
269
  if self.debug:
270
+ logger.debug("Total compute for Muon: %.2f TFLOPs",
271
+ total_flops / 1e12)
272
 
273
  paired = list(zip(names, params))
274
 
 
297
 
298
  worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
299
  round_robin = (round_robin + 1) % len(shard_mesh_flattened)
300
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
301
+
302
+ # Precompute per-rank indices and numels for all-to-all.
303
+ rank_indices: dict[int, tuple] = {}
304
+ rank_numels: dict[int, int] = {}
305
+ for r in range(num_ranks):
306
+ indices = get_slices_of_dtensor(p, r, shard_mesh,
307
+ shard_placements)
308
+ rank_indices[r] = indices
309
+ numel = 1
310
+ for idx, dim_size in zip(indices, p.shape):
311
+ if isinstance(idx, slice):
312
+ start, stop, step = idx.indices(dim_size)
313
+ numel *= max(0, (stop - start + (step - 1)) // step)
314
+ else:
315
+ numel *= len(idx)
316
+ rank_numels[r] = numel
317
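The per-dimension element count above follows from `slice.indices`, which clamps start/stop/step to the dimension size. A standalone illustration with hypothetical per-rank slices:

```python
# A slice over a dimension contributes ceil((stop - start) / step)
# elements; slice.indices(dim_size) yields the clamped start/stop/step.
shape = (10, 8)                               # hypothetical tensor shape
indices = (slice(2, 9, 2), slice(0, 8, 1))    # hypothetical per-rank slices

numel = 1
for idx, dim_size in zip(indices, shape):
    if isinstance(idx, slice):
        start, stop, step = idx.indices(dim_size)
        numel *= max(0, (stop - start + (step - 1)) // step)
    else:
        numel *= len(idx)

# slice(2, 9, 2) selects rows 2, 4, 6, 8 -> 4 rows; 4 * 8 = 32 elements
```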
 
318
  param_to_state[id(p)] = _muon_state(
319
  worker_rank=worker_rank,
320
  process_group=shard_pg,
321
+ rank_indices=rank_indices,
322
+ rank_numels=rank_numels,
323
  name=n,
324
  qk_clip_state=qk_clip_state,
325
  )
326
 
327
  return param_to_state, ordered_params
328
 
329
+ def base(self, names, params, group, lr, weight_decay, qk_logits):
330
+ # Momentum is already applied by _step_muon before this method.
 
331
  for n, p in zip(names, params):
332
  g = p.grad
333
  if g is None:
334
  continue
 
 
 
 
 
335
 
336
  u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
  steps=group["ns_steps"])
338
 
339
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
+ update_p(p, u, lr, adjusted_lr, weight_decay)
341
 
342
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
343
 
344
+ scales_full = compute_scales(
345
  p, qk_clip_state) if qk_clip_state is not None else None
346
  if scales_full is not None:
347
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
348
 
349
  def distributed_muon(
350
  self,
 
353
  group: dict[str, Any],
354
  lr: float,
355
  weight_decay: float,
 
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
  """ Implementation of Distributed Muon by Liu et al. """
359
 
360
+ # Momentum is already applied by _step_muon before this method.
361
  for n, p in zip(names, params):
362
  g = p.grad
363
  if g is None:
364
  continue
 
 
 
 
 
365
 
366
  # Gather G
367
  if isinstance(p.data, DTensor):
 
374
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
  steps=group["ns_steps"])
376
 
377
+ adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
+ update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
381
 
382
+ scales_full = compute_scales(
383
  p_full, qk_clip_state) if qk_clip_state is not None else None
384
 
385
  if scales_full is not None:
386
+ qk_clip(p_full, scales_full, qk_clip_state.head_dim)
387
 
388
  if isinstance(p.data, DTensor):
389
  ndims = len(p.device_mesh.mesh.shape)
 
400
 
401
  p.copy_(p_sharded)
402
 
403
+ def parallel(self, names, params, group, lr, weight_decay, qk_logits):
 
404
  """
405
  Perform a parallel optimization step using Muon.
 
406
 
407
+ Parameters are chunked and each chunk is processed by a
408
+ :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
409
+ interleaves multiple chunks so that communication and computation
410
+ overlap across chunks (the same overlap previously achieved by the
411
+ warmup + main-loop index scheduling).
412
+ """
413
 
414
+ # Momentum is already applied by _step_muon before this method.
 
415
 
416
  param_to_state, ordered_params = self.init_state_and_assign_params(
417
  names, params, group, qk_logits)
418
 
419
+ # Compute local rank for this group's shard process group.
420
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
421
+ rank = dist.get_rank(group=shard_pg)
 
 
422
 
423
  if self.chunk_size == -1:
424
  shard_ranks = dist.get_world_size(param_to_state[id(
425
+ ordered_params[0])].process_group)
426
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
  elif self.chunk_size > 0:
428
  chunk_size = self.chunk_size
429
  else:
430
  raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
+ def pipelines():
433
+ for start in range(0, len(ordered_params), chunk_size):
434
+ chunk = ordered_params[start:start + chunk_size]
435
+ if chunk:
436
+ yield muon_chunk_pipeline(
437
+ params=chunk,
438
+ param_to_state=param_to_state,
439
+ rank=rank,
440
+ ns_steps=group["ns_steps"],
441
+ lr=lr,
442
+ weight_decay=weight_decay,
443
+ none_grad=group["none_grad"],
444
+ )
 
445
 
446
+ with record_function("muon::barrier"):
447
+ dist.barrier()
448
+ with record_function("muon::pipeline"):
449
+ run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
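The real `run_pipeline` lives in `async_utils.py` and is not shown in this diff; the following is only a toy model of the scheduling idea it implements. Each chunk is a generator that yields at its communication boundaries, and a driver keeps up to `max_concurrent` generators in flight so one chunk's compute overlaps another chunk's communication:

```python
# Toy round-robin driver for generator pipelines (hypothetical name
# run_pipeline_sketch to avoid confusion with the real async_utils API).
from collections import deque


def run_pipeline_sketch(pipelines, max_concurrent):
    pipelines = iter(pipelines)
    active = deque()
    while True:
        while len(active) < max_concurrent:     # keep the window full
            nxt = next(pipelines, None)
            if nxt is None:
                break
            active.append(nxt)
        if not active:
            break
        gen = active.popleft()
        try:
            next(gen)                           # advance one stage
            active.append(gen)                  # not finished: requeue
        except StopIteration:
            pass                                # chunk complete


log = []


def chunk(i):
    log.append(f"gather {i}"); yield            # comm stage
    log.append(f"compute {i}"); yield           # compute stage
    log.append(f"scatter {i}")                  # comm stage


run_pipeline_sketch((chunk(i) for i in range(3)), max_concurrent=2)
# Stages of chunks 0 and 1 interleave before chunk 2 starts.
```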
 
450
 
451
  def _step_muon(self, group, qk_logits=None):
452
  params = group["params"]
 
455
  momentum = group["momentum"]
456
  names = group["names"]
457
 
458
+ # Apply momentum to all params before routing/expansion.
459
+ with record_function("muon::momentum"):
460
+ for n, p in zip(names, params):
461
+ g = p.grad
462
+ if g is None:
463
+ continue
464
+ g = update_g(self.state, p, g, group, momentum)
465
+ p.grad = g
466
+
467
+ # Expand expert params by splitting on dim 0.
468
+ names, params = _expand_expert_params(names, params, self.expert_keys)
469
+
470
  param_dtensors = []
471
  name_dtensors = []
472
 
 
482
  group=group,
483
  lr=lr,
484
  weight_decay=weight_decay,
 
485
  qk_logits=qk_logits)
486
  return
487
 
 
517
  # and run parallel Muon on each group.
518
 
519
  placement_to_params = defaultdict(lambda: ([], []))
 
520
 
521
  assert len(dtensors) == len(names)
522
  for p, n in zip(dtensors, names):
 
538
  group=group,
539
  lr=lr,
540
  weight_decay=weight_decay,
 
541
  qk_logits=qk_logits,
542
  )
543
 
 
555
  group,
556
  lr=lr,
557
  weight_decay=weight_decay,
 
558
  qk_logits=qk_logits,
559
  )
560
 
 
565
  group,
566
  lr=lr,
567
  weight_decay=weight_decay,
 
568
  qk_logits=qk_logits,
569
  )
570
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
 
575
  Args:
576
  closure (Callable, optional): A closure that reevaluates the model
577
  and returns the loss.
578
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
579
+ to 1D tensors of shape (num_heads,), representing the maximum
580
+ QK logits across all tokens, computed as
581
  (1 / sqrt(head_dim)) * (Q @ K^T).
582
  """
583
  loss = None
 
589
  if group["use_muon"]:
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
592
+ step_adamw(self.state, group)
593
 
594
  return loss
build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py ADDED
@@ -0,0 +1,50 @@
 
1
+ import torch
2
+
3
+ from .matmul_transpose_triton import matmul_transpose_assign
4
+
5
+ COMM_DTYPE = torch.bfloat16
6
+ DEFAULT_CHUNK_SIZE_RATIO = 4
7
+
8
+
9
+ # This code snippet is a modified version adapted from the following GitHub repositories:
10
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+ # Muon's Newton–Schulz iteration causes high variance in singular values
12
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
13
+ @torch.no_grad()
14
+ # matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
15
+ def _zeropower_via_newtonschulz5(G, steps):
16
+ """
17
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
+ performance at all relative to UV^T, where USV^T = G is the SVD.
24
+ """
25
+ assert len(G.shape) == 2
26
+ assert G.dtype == COMM_DTYPE
27
+ X = G # no manual typecast
28
+
29
+ if G.size(0) > G.size(1):
30
+ X = X.T
31
+ # Ensure spectral norm is at most 1
32
+ X = X / (X.norm() + 1e-7)
33
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
+ # Perform the NS iterations
36
+ for a, b, c in [
37
+ (4.0848, -6.8946, 2.9270),
38
+ (3.9505, -6.3029, 2.6377),
39
+ (3.7418, -5.5913, 2.3037),
40
+ (2.8769, -3.1427, 1.2046),
41
+ (2.8366, -3.0525, 1.2012),
42
+ ]:
43
+ matmul_transpose_assign(X, buf1)
44
+ matmul_transpose_assign(buf1, buf2)
45
+ buf1.mul_(b).add_(buf2, alpha=c)
46
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
47
+
48
+ if G.size(0) > G.size(1):
49
+ X = X.T
50
+ return X
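For readers without the Triton kernel, the two buffered `matmul_transpose_assign` calls above reduce to plain matrix products: each iteration computes `X <- a*X + (b*A + c*A^2) @ X` with `A = X @ X^T`. A minimal pure-PyTorch sketch of the same quintic iteration (the helper name `newton_schulz5_reference` is ours; it skips the bf16/CUDA requirements of the kernel version):

```python
import torch

# Per-iteration quintic coefficients, copied from the module above.
_NS_COEFFS = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]


def newton_schulz5_reference(G: torch.Tensor) -> torch.Tensor:
    """Quintic Newton-Schulz orthogonalization without the fused kernel."""
    assert G.ndim == 2
    X = G.T if G.size(0) > G.size(1) else G   # work on the short side
    X = X / (X.norm() + 1e-7)                 # Frobenius norm bounds the spectral norm
    for a, b, c in _NS_COEFFS:
        A = X @ X.T                           # matmul_transpose_assign(X, buf1)
        X = a * X + (b * A + c * (A @ A)) @ X
    return X.T if G.size(0) > G.size(1) else X
```

After the five iterations the singular values of the output cluster near 1, which is all Muon needs from the orthogonalized update.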
build/torch210-cxx11-cu130-x86_64-linux/pipeline.py ADDED
@@ -0,0 +1,390 @@
 
1
+ import logging
2
+ from typing import Generator
3
+
4
+ import torch
5
+ import torch.distributed as dist
6
+ from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ from .core import _muon_state, adjust_lr_for_muon, update_p
10
+ from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
11
+ from .qk_clip import compute_scales
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # ======================================================================
16
+ # Stage helpers
17
+ # ======================================================================
18
+
19
+
20
+ def _launch_gather(
21
+ params: list[DTensor],
22
+ owned_params: list[DTensor],
23
+ param_to_state: dict[int, _muon_state],
24
+ rank: int,
25
+ num_ranks: int,
26
+ process_group: dist.ProcessGroup,
27
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
28
+ """Allocate gather buffers, build send/recv, and launch async all-to-all.
29
+
30
+ Returns:
31
+ work: Async operation handle.
32
+ recv_buf: Flat receive buffer (needed by ``_complete_gather``).
33
+ gathered_grads: ``{id(p): empty_tensor}`` for owned params,
34
+ ``None`` for non-owned.
35
+ recv_counts: Per-source-rank element counts.
36
+ """
37
+ # Allocate gathered-grad buffers
38
+ gathered_grads: dict[int, torch.Tensor | None] = {}
39
+ for p in params:
40
+ state = param_to_state[id(p)]
41
+ if rank == state.worker_rank:
42
+ gathered_grads[id(p)] = torch.empty(p.shape,
43
+ dtype=COMM_DTYPE,
44
+ device="cuda")
45
+ else:
46
+ gathered_grads[id(p)] = None
47
+
48
+ # Build send buffer
49
+ per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
50
+ send_counts = [0] * num_ranks
51
+
52
+ for p in params:
53
+ state = param_to_state[id(p)]
54
+ dst = state.worker_rank
55
+ assert dst < num_ranks
56
+ shard_elems = state.rank_numels[rank]
57
+ g = p.grad
58
+ g = g.to_local().to(COMM_DTYPE).contiguous()
59
+ assert g.numel() == shard_elems
60
+ per_dst[dst].append(g.view(-1))
61
+ send_counts[dst] += shard_elems
62
+
63
+ assert any(
64
+ len(v) > 0 for v in
65
+ per_dst), "At least one destination rank must receive a sharded tensor"
66
+ per_dst_flat = [t for dst in per_dst for t in dst]
67
+ send_buf = torch.cat(per_dst_flat, dim=0)
68
+
69
+ # Build recv buffer
70
+ recv_counts = [0] * num_ranks
71
+ for src in range(num_ranks):
72
+ total = 0
73
+ for p in owned_params:
74
+ state = param_to_state[id(p)]
75
+ assert state.worker_rank == rank
76
+ total += state.rank_numels[src]
77
+ recv_counts[src] = total
78
+
79
+ recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
80
+
81
+ # Launch async all-to-all
82
+ logger.debug(f"send_buf size: {send_buf.numel()}, "
83
+ f"recv_buf size: {recv_buf.numel()}, "
84
+ f"recv_counts: {recv_counts}, "
85
+ f"send_counts: {send_counts}, "
86
+ f"process_group: {str(process_group)}")
87
+ work = dist.all_to_all_single(
88
+ recv_buf,
89
+ send_buf,
90
+ output_split_sizes=recv_counts,
91
+ input_split_sizes=send_counts,
92
+ group=process_group,
93
+ async_op=True,
94
+ )
95
+
96
+ return work, recv_buf, gathered_grads, recv_counts
97
+
98
+
99
+ def _complete_gather(
100
+ recv_buf: torch.Tensor,
101
+ recv_counts: list[int],
102
+ owned_params: list[DTensor],
103
+ gathered_grads: dict[int, torch.Tensor | None],
104
+ param_to_state: dict[int, _muon_state],
105
+ rank: int,
106
+ ) -> None:
107
+ """Reconstruct gathered grads from the recv buffer (in-place)."""
108
+ off = 0
109
+ for src in range(len(recv_counts)):
110
+ if recv_counts[src] == 0:
111
+ continue
112
+
113
+ block = recv_counts[src]
114
+ inner_off = 0
115
+ for p in owned_params:
116
+ state = param_to_state[id(p)]
117
+ assert state.worker_rank == rank
118
+
119
+ indices = state.rank_indices[src]
120
+
121
+ shard_view = gathered_grads[id(p)][indices]
122
+ n = shard_view.numel()
123
+ assert n > 0
124
+
125
+ sg = recv_buf.narrow(0, off + inner_off, n)
126
+ sg = sg.reshape(shard_view.shape)
127
+ gathered_grads[id(p)][indices] = sg
128
+
129
+ inner_off += n
130
+ assert inner_off == block
131
+ off += block
132
+
133
+
134
+ def _compute_ns(
135
+ owned_params: list[DTensor],
136
+ gathered_grads: dict[int, torch.Tensor | None],
137
+ ns_steps: int,
138
+ ) -> dict[int, torch.Tensor | None]:
139
+ """Run Newton-Schulz orthogonalization on owned parameters.
140
+
141
+ Returns:
142
+ computed_us: ``{id(p): orthogonalized_update}`` for owned params.
143
+ """
144
+ computed_us: dict[int, torch.Tensor | None] = {}
145
+ for p in owned_params:
146
+ u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
147
+ gathered_grads[id(p)] = None # free gathered grad
148
+ computed_us[id(p)] = u
149
+ return computed_us
150
+
151
+
152
+ def _launch_scatter(
153
+ params: list[DTensor],
154
+ owned_params: list[DTensor],
155
+ param_to_state: dict[int, _muon_state],
156
+ rank: int,
157
+ num_ranks: int,
158
+ process_group: dist.ProcessGroup,
159
+ computed_us: dict[int, torch.Tensor | None],
160
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
161
+ """Allocate scatter buffers, build send/recv, and launch async all-to-all.
162
+
163
+ Returns:
164
+ work: Async operation handle.
165
+ recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
166
+ scattered_us: ``{id(p): empty_local_tensor}`` for all params.
167
+ recv_counts: Per-source-rank element counts.
168
+ """
169
+ # Allocate scattered-u buffers
170
+ scattered_us: dict[int, torch.Tensor] = {}
171
+ for p in params:
172
+ scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
173
+
174
+ # Build send buffer (from computed_us on owner ranks)
175
+ per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
176
+ send_counts = [0] * num_ranks
177
+
178
+ if owned_params:
179
+ for p in owned_params:
180
+ state = param_to_state[id(p)]
181
+
182
+ assert computed_us[id(p)] is not None
183
+ u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
184
+
185
+ total_sent = 0
186
+ for dst_rank in range(num_ranks):
187
+ indices = state.rank_indices[dst_rank]
188
+ su = u_full[indices].flatten()
189
+
190
+ n = su.numel()
191
+ assert n > 0
192
+
193
+ per_dst[dst_rank].append(su)
194
+ send_counts[dst_rank] += n
195
+ total_sent += n
196
+
197
+ assert total_sent == u_full.numel()
198
+
199
+ lengths = [len(v) for v in per_dst]
200
+ if all(l > 0 for l in lengths):
201
+ assert all(
202
+ l == lengths[0] for l in lengths
203
+ ), "All destination ranks must have the same number of sharded tensor"
204
+ per_dst_flat = [t for dst in per_dst for t in dst]
205
+ send_buf = torch.cat(per_dst_flat, dim=0)
206
+ else:
207
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
208
+
209
+ # Build recv buffer
210
+ recv_counts = [0] * num_ranks
211
+ for src in range(num_ranks):
212
+ total = 0
213
+ for p in params:
214
+ state = param_to_state[id(p)]
215
+ if state.worker_rank != src:
216
+ continue
217
+ total += state.rank_numels[rank]
218
+ recv_counts[src] = total
219
+
220
+ recv_total = sum(recv_counts)
221
+ assert recv_total > 0
222
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
223
+
224
+ # Launch async all-to-all
225
+ work = dist.all_to_all_single(
226
+ recv_buf,
227
+ send_buf,
228
+ output_split_sizes=recv_counts,
229
+ input_split_sizes=send_counts,
230
+ group=process_group,
231
+ async_op=True,
232
+ )
233
+
234
+ return work, recv_buf, scattered_us, recv_counts
235
+
236
+
237
+ def _complete_scatter(
238
+ recv_buf: torch.Tensor,
239
+ recv_counts: list[int],
240
+ params: list[DTensor],
241
+ param_to_state: dict[int, _muon_state],
242
+ rank: int,
243
+ scattered_us: dict[int, torch.Tensor],
244
+ ) -> None:
245
+ """Copy recv buffer into scattered_us (in-place)."""
246
+ off = 0
247
+ for src in range(len(recv_counts)):
248
+ block = recv_counts[src]
249
+ if block == 0:
250
+ continue
251
+
252
+ inner_off = 0
253
+ for p in params:
254
+ state = param_to_state[id(p)]
255
+ if state.worker_rank != src:
256
+ continue
257
+ n = state.rank_numels[rank]
258
+ assert n > 0
259
+
260
+ flat_local = recv_buf.narrow(0, off + inner_off,
261
+ n).view_as(p.to_local())
262
+ scattered_us[id(p)].copy_(flat_local)
263
+
264
+ inner_off += n
265
+
266
+ assert inner_off == block
267
+ off += block
268
+
269
+
270
+ def _update_params(
271
+ params: list[DTensor],
272
+ param_to_state: dict[int, _muon_state],
273
+ rank: int,
274
+ scattered_us: dict[int, torch.Tensor],
275
+ lr: float,
276
+ weight_decay: float,
277
+ ) -> None:
278
+ """Apply weight decay, Muon update, and optional QK clipping."""
279
+ for p in params:
280
+ state = param_to_state[id(p)]
281
+ u_dtensor = DTensor.from_local(
282
+ scattered_us[id(p)],
283
+ placements=p.placements,
284
+ device_mesh=p.device_mesh,
285
+ )
286
+
287
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
288
+ update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
289
+
290
+ # QK clipping – applied directly on the local tensor to
291
+ # avoid DTensor sharding-propagation issues with _StridedShard.
292
+ scales_full = compute_scales(
293
+ p,
294
+ state.qk_clip_state) if state.qk_clip_state is not None else None
295
+ if scales_full is not None:
296
+ ratio = p.shape[0] // scales_full.shape[0]
297
+ idx0 = state.rank_indices[rank][0]
298
+ if isinstance(idx0, slice):
299
+ start = idx0.start or 0
300
+ idx0 = torch.arange(start,
301
+ idx0.stop,
302
+ device=scales_full.device)
303
+ row_scales = scales_full[idx0 // ratio]
304
+ p._local_tensor.mul_(row_scales.view(-1, 1))
305
+
306
+
307
+ # ======================================================================
308
+ # Main generator – thin orchestrator that wires stages together.
309
+ # ======================================================================
310
+
311
+
312
+ @torch.no_grad()
313
+ def muon_chunk_pipeline(
314
+ params: list[DTensor],
315
+ param_to_state: dict[int, _muon_state],
316
+ rank: int,
317
+ ns_steps: int,
318
+ lr: float,
319
+ weight_decay: float,
320
+ none_grad: bool,
321
+ ) -> Generator[None, None, None]:
322
+ """Process one chunk of parameters through the full Muon pipeline.
323
+
324
+ Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
325
+
326
+ Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
327
+ that communication and computation overlap across chunks. Async
328
+ communication is launched via ``async_op=True`` and completed after
329
+ the yield with ``work.wait()``.
330
+
331
+ Overlap happens because :func:`run_pipeline` admits one new chunk
332
+ per iteration (staggered admission). While chunk *N* does NS
333
+ compute on the default CUDA stream, chunk *N+1*'s async all-to-all
334
+ runs concurrently on the NCCL stream — no separate ``comm_stream``
335
+ is required.
336
+
337
+ Yields exactly **2** times:
338
+
339
+ 1. After launching async all-to-all gather.
340
+ 2. After launching async all-to-all scatter.
341
+ """
342
+ process_group = param_to_state[id(params[0])].process_group
343
+ num_ranks = dist.get_world_size(group=process_group)
344
+ owned_params = [
345
+ p for p in params if param_to_state[id(p)].worker_rank == rank
346
+ ]
347
+
348
+ # Stages 1-2: launch async gather.
349
+ with record_function("muon::launch_gather"):
350
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
351
+ params, owned_params, param_to_state, rank, num_ranks,
352
+ process_group)
353
+
354
+ if none_grad:
355
+ for p in params:
356
+ p.grad = None
357
+
358
+ yield # --- YIELD 1: other chunks can launch their gather ---
359
+
360
+ with record_function("muon::wait_gather"):
361
+ work.wait()
362
+ _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
363
+ param_to_state, rank)
364
+ del recv_buf
365
+
366
+ # Stage 3: Newton-Schulz orthogonalization.
367
+ with record_function("muon::newton_schulz"):
368
+ computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
369
+ gathered_grads.clear()
370
+
371
+ # Stages 4-5: launch async scatter.
372
+ with record_function("muon::launch_scatter"):
373
+ work, recv_buf, scattered_us, recv_counts = _launch_scatter(
374
+ params, owned_params, param_to_state, rank, num_ranks,
375
+ process_group, computed_us)
376
+ computed_us.clear()
377
+
378
+ yield # --- YIELD 2: other chunks can launch their scatter ---
379
+
380
+ with record_function("muon::wait_scatter"):
381
+ work.wait()
382
+ _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
383
+ scattered_us)
384
+ del recv_buf
385
+
386
+ # Stage 6: apply parameter updates.
387
+ with record_function("muon::update_params"):
388
+ _update_params(params, param_to_state, rank, scattered_us, lr,
389
+ weight_decay)
390
+ scattered_us.clear()
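The trickiest part of the gather/scatter stages is the flat-buffer bookkeeping around `all_to_all_single`. A single-process sketch of that packing (the helper `pack_by_destination` is hypothetical; no process group is involved here):

```python
import torch


def pack_by_destination(shards, dest, world_size):
    """Flatten shards into one buffer ordered by destination rank, plus
    per-rank element counts -- the same layout _launch_gather builds for
    all_to_all_single's input_split_sizes."""
    per_dst = [[] for _ in range(world_size)]
    counts = [0] * world_size
    for s, d in zip(shards, dest):
        per_dst[d].append(s.reshape(-1))
        counts[d] += s.numel()
    flat = torch.cat([t for bucket in per_dst for t in bucket])
    return flat, counts
```

On the receiving side, `_complete_gather` walks the buffer in the same rank order, so `narrow(0, offset, n)` recovers each shard.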
build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py ADDED
@@ -0,0 +1,129 @@
 
1
+ import logging
2
+ import math
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ from torch.distributed.tensor import DTensor
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
12
+ """
13
+ Parse a parameter name to check if it is a query/key projection layer
14
+ ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
15
+
16
+ Returns:
17
+ (kind, layer_idx) or (None, -1) if not matched.
18
+
19
+ Example:
20
+ 'model.3.attn.wq.weight' -> ('wq', 3)
21
+ 'model.5.attn.wk.weight' -> ('wk', 5)
22
+ 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
23
+ 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
24
+ 'model.4.attn.v_proj.weight' -> (None, -1)
25
+ """
26
+ parts = name.split('.')
27
+ if len(parts) < 3:
28
+ return None, -1
29
+
30
+ kind = parts[-2]
31
+
32
+ layer_idx = -1
33
+ for part in reversed(parts):
34
+ if part.isdigit():
35
+ layer_idx = int(part)
36
+ break
37
+
38
+ if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
39
+ return kind, layer_idx
40
+
41
+ return None, -1
42
+
43
+
44
+ @dataclass
45
+ class QKClipInfo:
46
+ """Per-parameter dynamic info computed from config + runtime logits."""
47
+ kind: str | None # 'wq'/'q_proj' or 'wk'/'k_proj' or None
48
+ indices: list[int] # which heads to consider for clipping
49
+ head_dim: int # from config
50
+ threshold: float # from config
51
+ logit: torch.Tensor | None
52
+
53
+
54
+ def get_qk_clip_info(clip_config, n, qk_logits):
55
+ """Extract QK clipping info for a named parameter.
56
+
57
+ Args:
58
+ clip_config: QK clipping configuration dict (or None).
59
+ n: Parameter name string.
60
+ qk_logits: Dict mapping layer indices to logit tensors (or None).
61
+
62
+ Returns:
63
+ QKClipInfo instance with clipping configuration for this parameter.
64
+ """
65
+ if clip_config is None:
66
+ return None
67
+
68
+ head_dim = clip_config.get('head_dim')
69
+ threshold = clip_config.get('threshold')
70
+ kind, layer_idx = parse_qk_layer(n)
71
+
72
+ logit, indices = None, []
73
+ if qk_logits is not None and kind is not None:
74
+ logit = qk_logits[layer_idx]
75
+ indices_key = 'q_indices' if 'q' in kind else 'k_indices'
76
+ indices = clip_config.get(indices_key, []) or []
77
+
78
+ if isinstance(logit, DTensor):
79
+ # In TP settings, qk_logits may be DTensor
80
+ # We convert it to full tensor here for simplicity
81
+ logit = logit.full_tensor()
82
+
83
+ return QKClipInfo(
84
+ kind=kind,
85
+ indices=indices,
86
+ head_dim=head_dim,
87
+ threshold=threshold,
88
+ logit=logit,
89
+ )
90
+
91
+
92
+ def compute_scales(p, qk_clip_state):
93
+ """Compute per-head scaling factors for QK clipping.
94
+
95
+ Returns scales tensor if any head exceeds threshold, else None.
96
+ """
97
+ kind = qk_clip_state.kind
98
+ indices = qk_clip_state.indices
99
+ head_dim = qk_clip_state.head_dim
100
+ threshold = qk_clip_state.threshold
101
+ logit = qk_clip_state.logit
102
+
103
+ H_global = p.shape[0] // head_dim
104
+ scales_full = torch.ones(H_global, device=p.data.device)
105
+ scaling = 0
106
+
107
+ for logit_idx, head_idx in enumerate(indices):
108
+ v_ele = float(logit[logit_idx])
109
+ if v_ele > threshold:
110
+ new_scale = math.sqrt(threshold / v_ele)
111
+ if new_scale < scales_full[head_idx]:
112
+ scales_full[head_idx] = new_scale
113
+ logger.info(
114
+ f"[{kind}] Head {head_idx} exceeded threshold "
115
+ f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
+ )
117
+ scaling += 1
118
+
119
+ return scales_full if scaling > 0 else None
120
+
121
+
122
+ def qk_clip(p, scales, head_dim):
123
+ """Apply per-head scaling to a Q/K projection weight matrix."""
124
+ if isinstance(p, torch.nn.Parameter):
125
+ W = p.data.view(-1, head_dim, p.data.shape[1])
126
+ W.mul_(scales.view(-1, 1, 1))
127
+ else:
128
+ W = p.view(-1, head_dim, p.shape[1])
129
+ W.mul_(scales.view(-1, 1, 1))
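Numerically, the clip rescales any head whose max QK logit exceeds the threshold by `sqrt(threshold / logit)`. A small self-contained sketch of that per-head scaling (the helper `per_head_scales` is ours; the real `compute_scales` also handles `indices` and DTensor logits):

```python
import math

import torch


def per_head_scales(logits: torch.Tensor, threshold: float) -> torch.Tensor:
    """scale_h = sqrt(threshold / logit_h) when logit_h > threshold, else 1."""
    return torch.tensor([
        math.sqrt(threshold / v) if v > threshold else 1.0
        for v in logits.tolist()
    ])


head_dim = 4
W = torch.randn(3 * head_dim, 8)             # 3 heads stacked along dim 0
before = W.clone()
logits = torch.tensor([50.0, 120.0, 80.0])   # max QK logit per head
scales = per_head_scales(logits, threshold=100.0)
# Same in-place pattern as qk_clip(): view as (heads, head_dim, in) and scale.
W.view(-1, head_dim, W.shape[1]).mul_(scales.view(-1, 1, 1))
```

Only head 1 is touched here; since both the Q and K projections receive the square-root factor, the product of the two scales brings that head's logit back to the threshold.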
build/torch210-cxx11-rocm70-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_06a260a_dirty
3
- ops = torch.ops._optimizer_06a260a_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_06a260a_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_7aef62f_dirty
3
+ ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_7aef62f_dirty::{op_name}"
build/torch210-cxx11-rocm70-x86_64-linux/{_optimizer_06a260a_dirty.abi3.so → _optimizer_7aef62f_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3562c68e8ee85fc5b268e079150ffff69d52860092d59e44fb9b3c4526c5d497
3
  size 1866400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00e9d9e1c2306badb97c3b8f2454a47d6335a302101a38c804ad3c7b075168cc
3
  size 1866400
build/torch210-cxx11-rocm70-x86_64-linux/adamw.py ADDED
@@ -0,0 +1,154 @@
 
1
+ from collections import defaultdict
2
+ from typing import cast
3
+
4
+ import torch
5
+ from torch.distributed.tensor import DTensor
6
+
7
+
8
+ def fused_adamw(
9
+ params: list[torch.Tensor],
10
+ grads: list[torch.Tensor],
11
+ exp_avgs: list[torch.Tensor],
12
+ exp_avg_sqs: list[torch.Tensor],
13
+ max_exp_avg_sqs: list[torch.Tensor],
14
+ state_steps: list[torch.Tensor],
15
+ amsgrad: bool,
16
+ beta1: float,
17
+ beta2: float,
18
+ lr: float | torch.Tensor,
19
+ weight_decay: float,
20
+ eps: float,
21
+ maximize: bool,
22
+ ) -> None:
23
+ if not params:
24
+ return
25
+
26
+ # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
27
+ # treating it as a scalar.
28
+ lr_dict: dict | None = ({
29
+ lr.device: lr
30
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
31
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
32
+ [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
33
+ state_steps] # type: ignore[list-item]
34
+ )
35
+ for (device, _), (
36
+ (
37
+ device_params_,
38
+ device_grads_,
39
+ device_exp_avgs_,
40
+ device_exp_avg_sqs_,
41
+ device_max_exp_avg_sqs,
42
+ device_state_steps_,
43
+ ),
44
+ _,
45
+ ) in grouped_tensors.items():
46
+ device_params = cast(list[torch.Tensor], device_params_)
47
+ device_grads = cast(list[torch.Tensor], device_grads_)
48
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
49
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
50
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
51
+
52
+ if lr_dict is not None and device not in lr_dict:
53
+ lr_dict[device] = lr.to(
54
+ device=device, non_blocking=True) # type: ignore[union-attr]
55
+ lr = lr_dict[device]
56
+ torch._foreach_add_(device_state_steps, 1)
57
+ func = torch._fused_adamw_
58
+ func(
59
+ device_params,
60
+ device_grads,
61
+ device_exp_avgs,
62
+ device_exp_avg_sqs,
63
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
64
+ device_state_steps,
65
+ amsgrad=amsgrad,
66
+ lr=lr, # type: ignore[arg-type]
67
+ beta1=beta1,
68
+ beta2=beta2,
69
+ weight_decay=weight_decay,
70
+ eps=eps,
71
+ maximize=maximize,
72
+ )
73
+
74
+
75
+ def step_adamw_params(optimizer_state, params, group):
76
+ """Run fused AdamW on a list of parameters sharing the same placement.
77
+
78
+ Args:
79
+ optimizer_state: The optimizer's state dict (self.state in Muon).
80
+ params: List of parameters to update.
81
+ group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
+ """
83
+ params_with_grads = []
84
+ grads = []
85
+ moment1 = []
86
+ moment2 = []
87
+ max_exp_avg_sqs = []
88
+ state_steps = []
89
+ lr = group["lr"]
90
+ beta1, beta2 = group["adamw_betas"]
91
+ eps = group["adamw_eps"]
92
+ weight_decay = group["weight_decay"]
93
+
94
+ for p in params:
95
+ g = p.grad
96
+ if g is None:
97
+ continue
98
+ state = optimizer_state[p]
99
+ params_with_grads.append(p)
100
+ grads.append(g)
101
+ if "step" not in state:
102
+ state["step"] = (torch.zeros((),
103
+ dtype=torch.float32,
104
+ device=p.device))
105
+ state["moment1"] = torch.zeros_like(g)
106
+ state["moment2"] = torch.zeros_like(g)
107
+ moment1.append(state["moment1"])
108
+ moment2.append(state["moment2"])
109
+ if not isinstance(state["step"], torch.Tensor):
110
+ step_tensor = torch.tensor(state["step"],
111
+ dtype=torch.float32,
112
+ device=p.device)
113
+ else:
114
+ step_tensor = state["step"]
115
+ state_steps.append(step_tensor)
116
+
117
+ fused_adamw(
118
+ params_with_grads,
119
+ grads,
120
+ moment1,
121
+ moment2,
122
+ max_exp_avg_sqs,
123
+ state_steps,
124
+ amsgrad=False,
125
+ beta1=beta1,
126
+ beta2=beta2,
127
+ lr=lr,
128
+ weight_decay=weight_decay,
129
+ eps=eps,
130
+ maximize=False,
131
+ )
132
+
133
+
134
+ def step_adamw(optimizer_state, group):
135
+ """Dispatch AdamW step, grouping parameters by type and placement.
136
+
137
+ Args:
138
+ optimizer_state: The optimizer's state dict (self.state in Muon).
139
+ group: Parameter group dict.
140
+ """
141
+ params = group["params"]
142
+
143
+ # group params with its type and placement
144
+ placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
+ for p in params:
146
+ match p:
147
+ case DTensor():
148
+ placement_to_params[tuple([p.placements,
149
+ p.device_mesh])].append(p)
150
+ case torch.Tensor():
151
+ placement_to_params[tuple([torch.Tensor, None])].append(p)
152
+
153
+ for group_params in placement_to_params.values():
154
+ step_adamw_params(optimizer_state, group_params, group)
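For intuition about the state `step_adamw_params` accumulates (`moment1`, `moment2`, `step`), here is a single-tensor, non-fused sketch of one decoupled-weight-decay Adam step. The helper name `adamw_step_reference` is ours, not part of the module:

```python
import torch


def adamw_step_reference(p, g, state, lr=1e-3, betas=(0.9, 0.999),
                         eps=1e-8, weight_decay=0.01):
    """One AdamW step on a single tensor, mirroring what the fused kernel
    does per parameter: decoupled decay, then bias-corrected Adam."""
    beta1, beta2 = betas
    state["step"] = state.get("step", 0) + 1
    m = state.setdefault("moment1", torch.zeros_like(g))
    v = state.setdefault("moment2", torch.zeros_like(g))
    p.mul_(1 - lr * weight_decay)                  # decoupled weight decay
    m.mul_(beta1).add_(g, alpha=1 - beta1)         # first moment
    v.mul_(beta2).addcmul_(g, g, value=1 - beta2)  # second moment
    bc1 = 1 - beta1 ** state["step"]
    bc2 = 1 - beta2 ** state["step"]
    p.addcdiv_(m, (v / bc2).sqrt().add_(eps), value=-lr / bc1)
```

A single step of this sketch should agree with `torch.optim.AdamW` under the same hyperparameters, which is a cheap sanity check when porting optimizer state.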
build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py ADDED
@@ -0,0 +1,77 @@
1
+ import logging
2
+ from typing import Generator
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+
7
+ class _Task:
8
+ """Internal: wraps a generator, advances one yield at a time."""
9
+
10
+ def __init__(self, generator: Generator[None, None, None], index: int):
11
+ self._generator = generator
12
+ self._index = index
13
+ self._steps_completed = 0
14
+ self.step() # run to first yield
15
+
16
+ def step(self) -> bool:
17
+ try:
18
+ next(self._generator)
19
+ self._steps_completed += 1
20
+ logger.debug("pipeline[%d] completed stage %d", self._index,
21
+ self._steps_completed)
22
+ return True
23
+ except StopIteration:
24
+ logger.debug("pipeline[%d] finished after %d stages", self._index,
25
+ self._steps_completed)
26
+ return False
27
+
28
+ def close(self):
29
+ self._generator.close()
30
+
31
+
32
+ def run_pipeline(
33
+ pipelines: Generator[Generator[None, None, None], None, None],
34
+ max_concurrent: int,
35
+ ) -> None:
36
+ """Run generator-based pipelines with bounded concurrency.
37
+
38
+ Each pipeline is a generator that yields at stage boundaries.
39
+ The runtime interleaves pipelines so communication and computation
40
+ overlap across chunks.
41
+ """
42
+ if max_concurrent <= 0:
43
+ raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
44
+
45
+ have_new = True
46
+ task_index = 0
47
+ previous_tasks: list[_Task] = []
48
+
49
+ try:
50
+ while have_new or previous_tasks:
51
+ running_tasks: list[_Task] = []
52
+
53
+ # Admit one new pipeline per iteration (staggered admission).
54
+ # Admitting one at a time ensures that while chunk N does NS
55
+ # compute on the default stream, chunk N+1's NCCL all-to-all
56
+ # runs concurrently on the NCCL stream — creating real
57
+ # communication/computation overlap on the GPU.
58
+ if have_new and len(previous_tasks) < max_concurrent:
59
+ try:
60
+ gen = next(pipelines)
61
+ task = _Task(gen, task_index)
62
+ task_index += 1
63
+ running_tasks.append(task)
64
+ except StopIteration:
65
+ have_new = False
66
+
67
+ # Advance every previously-yielded task by one step.
68
+ for task in previous_tasks:
69
+ if task.step():
70
+ running_tasks.append(task)
71
+
72
+ previous_tasks = running_tasks
73
+ except BaseException:
74
+ # Clean up all in-flight generators to release GPU resources.
75
+ for task in previous_tasks:
76
+ task.close()
77
+ raise
build/torch210-cxx11-rocm70-x86_64-linux/core.py ADDED
@@ -0,0 +1,116 @@
 
+ import math
+ from dataclasses import dataclass
+
+ import torch
+ import torch.distributed as dist
+ from torch.distributed import ProcessGroup
+ from torch.distributed.tensor import DTensor
+
+
+ @dataclass
+ class _muon_state:
+     worker_rank: int
+     process_group: ProcessGroup
+     rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
+     rank_numels: dict[int, int]  # local_rank -> numel
+     name: str
+     qk_clip_state: torch.Tensor | None = None
+
+
+ def update_g(optimizer_state, p, g, group, momentum):
+     """Apply momentum update to gradient.
+
+     Args:
+         optimizer_state: The optimizer's state dict (self.state in Muon).
+         p: Parameter tensor.
+         g: Gradient tensor.
+         group: Parameter group dict.
+         momentum: Momentum coefficient.
+
+     Returns:
+         Momentum-updated gradient tensor.
+     """
+     state = optimizer_state[p]
+     buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
+     torch.add(g, buf, alpha=momentum, out=buf)
+     if group["nesterov"]:
+         g.add_(buf, alpha=momentum)
+         return g
+     return buf
+
+
+ def update_p(p, u, lr, adjusted_lr, weight_decay):
+     """Apply weight decay and orthogonalized update to parameter.
+
+     Args:
+         p: Parameter (torch.nn.Parameter or DTensor).
+         u: Orthogonalized update tensor.
+         lr: Base learning rate.
+         adjusted_lr: Size-adjusted learning rate.
+         weight_decay: Weight decay coefficient.
+     """
+     if isinstance(p, torch.nn.Parameter):
+         # apply weight decay
+         p.data.mul_(1 - lr * weight_decay)
+         # apply update
+         p.data.add_(u, alpha=-adjusted_lr)
+     else:
+         p.mul_(1 - lr * weight_decay)
+         p.add_(u, alpha=-adjusted_lr)
+
+
+ def adjust_lr_for_muon(lr, param_shape):
+     """Scale learning rate based on parameter matrix dimensions.
+
+     Args:
+         lr: Base learning rate.
+         param_shape: Shape of the parameter tensor.
+
+     Returns:
+         Adjusted learning rate.
+     """
+     A, B = param_shape[:2]
+     # We adjust the learning rate and weight decay based on the size of the
+     # parameter matrix as described in the paper
+     adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+     adjusted_lr = lr * adjusted_ratio
+     return adjusted_lr
+
+
+ def default_is_muon(name, x, expert_keys=None):
+     skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+     if any(key in name for key in skip_keys):
+         return False
+     effective_ndim = x.ndim
+     if expert_keys and any(key in name for key in expert_keys):
+         effective_ndim -= 1
+     return effective_ndim >= 2
+
+
+ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
+     if is_muon_func is None:
+         is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
+
+     muon_params, muon_names = [], []
+     non_muon_params = []
+
+     for n, p in model.named_parameters():
+         if not p.requires_grad:
+             continue
+         if is_muon_func(n, p):
+             muon_params.append(p)
+             muon_names.append(n)
+         else:
+             non_muon_params.append(p)
+
+     return [
+         {
+             "params": muon_params,
+             "names": muon_names,
+             "use_muon": True,
+         },
+         {
+             "params": non_muon_params,
+             "use_muon": False,
+         },
+     ]
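As a quick illustration of the two helpers in `core.py` above, here is a torch-free adaptation. It takes `ndim` directly instead of a tensor, so it is a sketch of the logic rather than the module's exact signatures:

```python
import math


def adjust_lr_for_muon(lr, param_shape):
    # lr is scaled by 0.2 * sqrt(max(fan_out, fan_in)), so larger
    # matrices get proportionally larger steps.
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))


def default_is_muon(name, ndim, expert_keys=None):
    # Embedding/output layers are excluded; MoE expert weights carry a
    # leading expert dimension, so one dim is discounted before the
    # ">= 2 dims" matrix check.
    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
    if any(key in name for key in skip_keys):
        return False
    effective_ndim = ndim
    if expert_keys and any(key in name for key in expert_keys):
        effective_ndim -= 1
    return effective_ndim >= 2


# A 1024x4096 projection: 0.02 * 0.2 * sqrt(4096) = 0.256
print(adjust_lr_for_muon(0.02, (1024, 4096)))
```

A stacked expert weight such as a 3-D `experts.w1` tensor still qualifies once its expert dimension is discounted, while 1-D biases and embedding tables do not.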
build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py CHANGED
@@ -7,22 +7,40 @@ from torch.distributed.tensor.placement_types import (Placement, Shard,
7
  _StridedShard)
8
 
9
 
10
  def get_slices_of_dtensor(
11
  target: DTensor | torch.Tensor,
12
  local_rank: int,
13
  shard_mesh: DeviceMesh,
14
  shard_placements: tuple[Placement],
15
- ) -> tuple[slice]:
16
  """
17
- Get the slice of local tensor for a given rank from a tensor.
 
 
 
 
 
18
  Args:
19
- target (DTensor | torch.Tensor): The target tensor.
20
- rank (int): The local rank of the shard group.
21
- shard_mesh (DeviceMesh): The shard mesh. It consists of global ranks.
22
  shard_placements (tuple[Placement]): The shard placements.
23
- """
24
 
25
- slices: list[slice] = [slice(0, dim_size) for dim_size in target.size()]
 
 
 
 
26
 
27
  # find the global rank of the local rank in the shard mesh
28
  rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
@@ -34,34 +52,75 @@ def get_slices_of_dtensor(
34
 
35
  assert len(rank_coords) == len(shard_placements)
36
 
 
 
 
 
37
  # Caution: Assuming replicate-to-shard of the shard mesh goes with
38
  # left-to-right sharding. This is ensured by the sorting logic of
39
  # construct_shard_mesh function.
40
- for i, (rank_coord,
41
- placement) in enumerate(zip(rank_coords, shard_placements)):
42
- assert isinstance(placement, Shard)
43
 
44
- num_ranks = shard_mesh.mesh.shape[i]
 
45
 
46
- dim = placement.dim
47
- dim_size = (slices[dim].stop - slices[dim].start)
 
 
 
48
 
49
- if dim_size % num_ranks != 0:
50
  raise NotImplementedError(
51
- f"Dimension size {dim_size} is not divisible "
52
- f"by number of ranks {num_ranks} for shard "
53
- f"placement on dim {dim}. (shape: {target.shape})")
54
-
55
- shard_size = dim_size // num_ranks
56
-
57
- start = slices[dim].start + rank_coord * shard_size
58
- end = start + shard_size
59
-
60
- assert start < end <= slices[dim].stop
61
-
62
- slices[dim] = slice(start, end)
63
 
64
- return tuple(slices)
65
 
66
 
67
  _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
@@ -71,105 +130,105 @@ _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
71
  def construct_shard_mesh(
72
  placements: tuple[Placement],
73
  mesh: DeviceMesh,
74
- ) -> (DeviceMesh, ProcessGroup, tuple[Placement]):
75
- """
76
- Construct Shard Mesh and Placements for unsharding.
77
- It removes Replicate placements and constructs a new Mesh and ProcessGroup.
78
- """
79
- my_rank = dist.get_rank()
80
 
81
- assert mesh.mesh.device.type == 'cpu'
 
 
82
 
83
- # Copy mesh to avoid modifying the original mesh
84
- mesh = mesh.mesh.clone()
85
-
86
- # 1. Sort placements. Replicate first, then Shard by dim ascending.
87
-
88
- # For Shard, strided shard comes after regular shard on the same dim
89
- # to preserve left-to-right order of replicate-to-shard.
90
- # This is because that strided shard is using stride to represent
91
- # more fine-grained sharding on the same dim.
92
- # Please check the URL below for _StridedShard.
93
- # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/distributed/tensor/placement_types.py#L366
94
-
95
- def placement_sort_key(
96
- placement_with_index: tuple[float, Placement]
97
- ) -> tuple[int, float, int]: # (dim, split factor, original index)
98
- index, placement = placement_with_index
99
- is_replicate = placement.is_replicate()
100
- is_shard = placement.is_shard()
101
- is_partial = placement.is_partial()
102
-
103
- assert is_replicate or is_shard, f"Unsupported placement type: {type(placement)}"
104
- assert not is_partial, "Partial placement is not supported."
105
-
106
- if is_replicate:
107
- return (-1.0, 0, index)
108
- elif is_shard:
109
- if isinstance(placement, _StridedShard):
110
- return (placement.dim, 1 / placement.split_factor, index)
111
- return (placement.dim, 0, index)
112
- else:
113
- raise TypeError(f"Unknown placement type: {type(placement)}")
114
 
115
- placements_with_index: list[tuple[int,
116
- Placement]] = list(enumerate(placements))
117
- placements_with_index = sorted(placements_with_index,
118
- key=placement_sort_key)
119
 
120
- sorted_indices, sorted_placements = zip(*placements_with_index)
 
121
 
122
- # 2. Permute mesh according to sorted placements.
123
- sorted_mesh = mesh.permute(sorted_indices)
 
 
124
 
125
- # 3. Collect list of shard meshes by removing replicate dims
126
- # For example, (2, 3, 4, 4) with placements [R, R, S(0), S(1)]
127
- # shard_meshes should be list with 2 * 3 = 6 shard meshes of shape (4, 4)
128
- num_replicates = sum(1 for p in sorted_placements if p.is_replicate())
129
 
130
- # merge replicate dims
131
- # shard_meshes became a list of shard meshes with a length of replicate degree
132
- if num_replicates > 0:
133
- sorted_mesh = sorted_mesh.flatten(
134
- 0, num_replicates - 1) if num_replicates > 1 else sorted_mesh
135
  shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
136
  else:
137
  shard_meshes = [sorted_mesh]
138
- shard_placements = sorted_placements[num_replicates:]
139
-
140
- # assume all shard placements are different
141
  assert len(shard_placements) == len(set(shard_placements))
142
 
143
- # 4. Construct ProcessGroups
144
- # Caution: all groups should be created in the same order in all processes,
145
- # even though each process only needs its own group.
146
-
147
- # To use tensor as dict key, convert it to tuple
148
- def tensor_to_tuple(t):
149
- if isinstance(t, torch.Tensor):
150
- t = t.tolist()
151
- if isinstance(t, list):
152
- return tuple(tensor_to_tuple(x) for x in t)
153
- return t
154
-
155
- my_shard_mesh_as_tuple = None
156
- for shard_mesh in shard_meshes:
157
- assert isinstance(shard_mesh, torch.Tensor)
158
- shard_mesh_as_tuple = tensor_to_tuple(shard_mesh)
159
-
160
- if (my_rank == shard_mesh).any().item():
161
- assert my_shard_mesh_as_tuple is None
162
- my_shard_mesh_as_tuple = shard_mesh_as_tuple
163
-
164
- # update global cache
165
- if shard_mesh_as_tuple not in _ranks_to_dist_cache:
166
- shard_process_group = dist.new_group(shard_mesh.flatten().tolist())
167
- _ranks_to_dist_cache[shard_mesh_as_tuple] = (
168
- DeviceMesh(device_type="cuda", mesh=shard_mesh),
169
- shard_process_group,
170
  )
171
 
172
- my_shard_mesh, my_shard_process_group = _ranks_to_dist_cache[
173
- my_shard_mesh_as_tuple]
174
-
175
- return my_shard_mesh, my_shard_process_group, shard_placements
 
7
  _StridedShard)
8
 
9
 
10
+ def _is_shard(placement: Placement) -> bool:
11
+ """Check if a placement is a shard type (Shard or _StridedShard).
12
+
13
+ In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
14
+ ``placement.is_shard()`` returns False for _StridedShard. This helper
15
+ handles both old and new hierarchies.
16
+ """
17
+ return isinstance(placement, (Shard, _StridedShard))
18
+
19
+
20
  def get_slices_of_dtensor(
21
  target: DTensor | torch.Tensor,
22
  local_rank: int,
23
  shard_mesh: DeviceMesh,
24
  shard_placements: tuple[Placement],
25
+ ) -> tuple[slice | torch.Tensor, ...]:
26
  """
27
+ Get per-dimension indices for a given rank's shard of the target tensor.
28
+
29
+ Uses ``Shard.local_shard_size_and_offset`` and
30
+ ``_StridedShard.local_shard_size_and_offset`` for correct handling of
31
+ both contiguous and strided (non-contiguous) sharding.
32
+
33
  Args:
34
+ target (DTensor | torch.Tensor): The target tensor (for its shape).
35
+ local_rank (int): The local rank within the shard group.
36
+ shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
37
  shard_placements (tuple[Placement]): The shard placements.
 
38
 
39
+ Returns:
40
+ A tuple of indices (one per tensor dim). Each element is either:
41
+ - A ``slice`` (for contiguous or unsharded dims)
42
+ - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
43
+ """
44
 
45
  # find the global rank of the local rank in the shard mesh
46
  rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
 
52
 
53
  assert len(rank_coords) == len(shard_placements)
54
 
55
+ # Track per-shard-dim indices.
56
+ # None means "not yet sharded on this dim".
57
+ dim_indices: dict[int, torch.Tensor] = {}
58
+
59
  # Caution: Assuming replicate-to-shard of the shard mesh goes with
60
  # left-to-right sharding. This is ensured by the sorting logic of
61
  # construct_shard_mesh function.
62
+ for mesh_dim_idx, (rank_coord, placement) in enumerate(
63
+ zip(rank_coords, shard_placements)):
64
+ assert _is_shard(placement)
65
 
66
+ num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
67
+ shard_dim = placement.dim
68
 
69
+ # Current effective size on this dim (may already be sub-sharded)
70
+ if shard_dim in dim_indices:
71
+ curr_size = len(dim_indices[shard_dim])
72
+ else:
73
+ curr_size = target.size()[shard_dim]
74
 
75
+ if curr_size % num_chunks != 0:
76
  raise NotImplementedError(
77
+ f"Dimension size {curr_size} is not divisible "
78
+ f"by number of ranks {num_chunks} for shard "
79
+ f"placement on dim {shard_dim}. (shape: {target.shape})")
80
+
81
+ # Compute indices for this level of sharding
82
+ if isinstance(placement, _StridedShard):
83
+ _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
84
+ placement,
85
+ curr_size,
86
+ num_chunks,
87
+ rank_coord,
88
+ return_first_offset=False)
89
+ new_indices = torch.tensor(offsets, dtype=torch.long)
90
+ else:
91
+ shard_size, offset = Shard.local_shard_size_and_offset(
92
+ curr_size, num_chunks, rank_coord)
93
+ new_indices = torch.arange(offset,
94
+ offset + shard_size,
95
+ dtype=torch.long)
96
+
97
+ # Compose with previous indices on this dim
98
+ if shard_dim in dim_indices:
99
+ dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
100
+ else:
101
+ dim_indices[shard_dim] = new_indices
102
 
103
+ # Build result tuple
104
+ result: list[slice | torch.Tensor] = []
105
+ for d in range(len(target.size())):
106
+ if d not in dim_indices:
107
+ result.append(slice(None))
108
+ else:
109
+ indices = dim_indices[d]
110
+ # Convert contiguous indices to slice for efficiency
111
+ if len(indices) > 0:
112
+ start = indices[0].item()
113
+ expected = torch.arange(start,
114
+ start + len(indices),
115
+ dtype=torch.long)
116
+ if torch.equal(indices, expected):
117
+ result.append(slice(start, start + len(indices)))
118
+ else:
119
+ result.append(indices)
120
+ else:
121
+ result.append(slice(0, 0))
122
+
123
+ return tuple(result)
124
 
125
 
126
  _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
 
130
  def construct_shard_mesh(
131
  placements: tuple[Placement],
132
  mesh: DeviceMesh,
133
+ ) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
134
+ """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
 
 
 
 
135
 
136
+ Given a DTensor's placements and device mesh, extracts the "shard group"
137
+ — the set of ranks that together hold all shards of the same replica —
138
+ and creates a ProcessGroup for all-to-all among them.
139
 
140
+ Steps:
141
+ 1. Sort placements: Replicate first, then Shard by (dim, granularity).
142
+ 2. Permute the mesh tensor to match the sorted order.
143
+ 3. Collapse Replicate dims into a list of shard sub-meshes (one per replica).
144
+ 4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
 
145
 
146
+ Example — 8 GPUs, mesh shape (2, 2, 2),
147
+ placements ``[Shard(0), Replicate, _StridedShard(0)]``::
 
 
148
 
149
+ Step 1 Sort: [Replicate, _StridedShard(0), Shard(0)]
150
+ Permutation: [1, 2, 0]
151
 
152
+ Step 2 Permute mesh dims by [1, 2, 0]:
153
+ Original: Permuted:
154
+ [[[0,1],[2,3]], [[[0,2],[1,3]],
155
+ [[4,5],[6,7]]] [[4,6],[5,7]]]
156
 
157
+ Step 3 Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
158
+ sub-mesh 0 = [[0,2],[1,3]] (replica group 0)
159
+ sub-mesh 1 = [[4,6],[5,7]] (replica group 1)
160
+ shard_placements = (_StridedShard(0), Shard(0))
161
 
162
+ Step 4 Rank 0 → ProcessGroup([0,1,4,5])
163
+ Rank 2 → ProcessGroup([2,3,6,7])
164
+
165
+ Returns:
166
+ ``(shard_mesh, process_group, shard_placements)``
167
+ """
168
+ my_rank = dist.get_rank()
169
+ assert mesh.mesh.device.type == 'cpu'
170
+
171
+ # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
172
+ # This avoids a non-collective dist.new_group() call, which would
173
+ # deadlock when only a subset of ranks call this function (e.g. expert
174
+ # DTensors on a TP submesh where ranks 0-3 and 4-7 call separately).
175
+ if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
176
+ key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
177
+ if key not in _ranks_to_dist_cache:
178
+ _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
179
+ return (*_ranks_to_dist_cache[key], tuple(placements))
180
+
181
+ mesh_tensor = mesh.mesh.clone()
182
+
183
+ # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
184
+ # _StridedShard comes BEFORE regular Shard on the same dim so that
185
+ # get_slices_of_dtensor applies the outer sharding first, matching
186
+ # DTensor's left-to-right (outer-to-inner) composition order.
187
+ def _sort_key(item):
188
+ index, placement = item
189
+ assert not placement.is_partial(), "Partial placement not supported"
190
+ if placement.is_replicate():
191
+ return (-1, 0, index)
192
+ assert _is_shard(placement), f"Unsupported: {type(placement)}"
193
+ split = (-1 / placement.split_factor if isinstance(
194
+ placement, _StridedShard) else 0)
195
+ return (placement.dim, split, index)
196
+
197
+ indexed = sorted(enumerate(placements), key=_sort_key)
198
+ perm, sorted_placements = zip(*indexed)
199
+
200
+ # -- Step 2: Permute mesh to match sorted placement order. --------------
201
+ sorted_mesh = mesh_tensor.permute(perm)
202
+
203
+ # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
204
+ # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
205
+ num_rep = sum(1 for p in sorted_placements if p.is_replicate())
206
+ if num_rep > 0:
207
+ if num_rep > 1:
208
+ sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
209
  shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
210
  else:
211
  shard_meshes = [sorted_mesh]
212
+ shard_placements = sorted_placements[num_rep:]
 
 
213
  assert len(shard_placements) == len(set(shard_placements))
214
 
215
+ # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
216
+ # All ranks must call dist.new_group in the same order, even though each
217
+ # rank only joins one group.
218
+ def _cache_key(t: torch.Tensor) -> tuple:
219
+ return (*t.shape, *t.flatten().tolist())
220
+
221
+ my_key = None
222
+ for sm in shard_meshes:
223
+ key = _cache_key(sm)
224
+ if (my_rank == sm).any().item():
225
+ assert my_key is None, "Rank appears in multiple shard groups"
226
+ my_key = key
227
+ if key not in _ranks_to_dist_cache:
228
+ pg = dist.new_group(sm.flatten().tolist())
229
+ _ranks_to_dist_cache[key] = (
230
+ DeviceMesh(device_type="cuda", mesh=sm),
231
+ pg,
 
232
  )
233
 
234
+ return (*_ranks_to_dist_cache[my_key], shard_placements)
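The placement ordering in Step 1 can be exercised in isolation. A torch-free sketch with simplified stand-in placement classes (illustrative, not the `torch.distributed` types), reproducing the docstring's 8-GPU example:

```python
from dataclasses import dataclass


# Simplified stand-ins for DTensor placements (illustrative only).
@dataclass(frozen=True)
class Replicate:
    pass


@dataclass(frozen=True)
class Shard:
    dim: int


@dataclass(frozen=True)
class StridedShard:
    dim: int
    split_factor: int


def sort_key(item):
    index, p = item
    if isinstance(p, Replicate):
        return (-1, 0, index)  # replicate dims sort first
    if isinstance(p, StridedShard):
        # Strided (coarser, outer) sharding precedes regular sharding
        # on the same tensor dim, since -1 / split_factor < 0.
        return (p.dim, -1 / p.split_factor, index)
    return (p.dim, 0, index)


placements = [Shard(0), Replicate(), StridedShard(0, split_factor=2)]
indexed = sorted(enumerate(placements), key=sort_key)
perm, ordered = zip(*indexed)
print(perm)  # (1, 2, 0): Replicate, then StridedShard(0), then Shard(0)
```

This matches the docstring example: the mesh is permuted by `[1, 2, 0]`, the replicate dim is collapsed away, and `(_StridedShard(0), Shard(0))` remain as the shard placements.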
 
 
 
build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -119,10 +119,3 @@ def matmul_transpose_assign(d_in, d_out):
      with torch.cuda.device(d_in.device.index):
          mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                           d_out.stride(0), d_out.stride(1))
-
-
- def matmul_transpose(d_in):
-     M, _ = d_in.shape
-     d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
-     matmul_transpose_assign(d_in, d_out)
-     return d_out
build/torch210-cxx11-rocm70-x86_64-linux/metadata.json CHANGED
@@ -1 +1,3 @@
- {"python-depends":[]}
+ {
+   "python-depends": []
+ }
build/torch210-cxx11-rocm70-x86_64-linux/muon.py CHANGED
@@ -1,536 +1,121 @@
1
  import logging
2
- import math
3
  import types
4
  from collections import defaultdict
5
- from dataclasses import dataclass
6
- from typing import Any, cast
7
 
8
  import torch
9
  import torch.distributed as dist
10
- from torch.distributed import ProcessGroup
11
- from torch.distributed.device_mesh import DeviceMesh
12
- from torch.distributed.tensor import DTensor, Replicate
13
- from torch.distributed.tensor.placement_types import Placement
14
-
15
- from .distributed.utils import construct_shard_mesh, get_slices_of_dtensor
16
- from .matmul_transpose_triton import matmul_transpose_assign
 
 
 
 
 
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- COMM_DTYPE = torch.bfloat16
21
- DEFAULT_CHUNK_SIZE_RATIO = 4
22
-
23
-
24
- # This code snippet is a modified version adapted from the following GitHub repositories:
25
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
26
- # Muon's Newton–Schulz iteration causes high variance in singular values
27
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
28
- @torch.no_grad()
29
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
30
- def _zeropower_via_newtonschulz5(G, steps):
31
- """
32
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
33
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
34
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
35
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
36
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
37
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
38
- performance at all relative to UV^T, where USV^T = G is the SVD.
39
- """
40
- assert len(G.shape) == 2
41
- assert G.dtype == COMM_DTYPE
42
- X = G # no manual typecast
43
-
44
- if G.size(0) > G.size(1):
45
- X = X.T
46
- # Ensure spectral norm is at most 1
47
- X = X / (X.norm() + 1e-7)
48
- buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
49
- buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
50
- # Perform the NS iterations
51
- for a, b, c in [
52
- (4.0848, -6.8946, 2.9270),
53
- (3.9505, -6.3029, 2.6377),
54
- (3.7418, -5.5913, 2.3037),
55
- (2.8769, -3.1427, 1.2046),
56
- (2.8366, -3.0525, 1.2012),
57
- ]:
58
- matmul_transpose_assign(X, buf1)
59
- matmul_transpose_assign(buf1, buf2)
60
- buf1.mul_(b).add_(buf2, alpha=c)
61
- X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
62
-
63
- if G.size(0) > G.size(1):
64
- X = X.T
65
- return X
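For intuition about the removed `_zeropower_via_newtonschulz5`: when X is diagonal, X @ X.T is also diagonal, so each iteration applies the scalar quintic s -> a*s + b*s**3 + c*s**5 to every singular value independently. A torch-free sketch using the per-iteration coefficients from the diff (the helper name is illustrative):

```python
# Per-iteration coefficients, copied from the diff above.
NS_COEFFS = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]


def ns_singular_values(sigmas):
    """Track how the Newton-Schulz iteration transforms singular values.

    Valid for diagonal inputs, where each iteration reduces to the
    scalar map s -> a*s + b*s**3 + c*s**5 applied per singular value.
    """
    # Normalize by the Frobenius norm so the spectral norm is at most 1.
    norm = sum(s * s for s in sigmas) ** 0.5 + 1e-7
    xs = [s / norm for s in sigmas]
    for a, b, c in NS_COEFFS:
        xs = [a * s + b * s**3 + c * s**5 for s in xs]
    return xs


out = ns_singular_values([3.0, 1.0])
print([round(s, 3) for s in out])
```

Starting from singular values 3.0 and 1.0 (a 3:1 ratio), five iterations land both values in the (0.5, 1.5) band the docstring describes, which is the "approximate UV^T" behavior: singular values are flattened toward 1 rather than matched exactly.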
66
-
67
-
68
- @dataclass
69
- class _muon_state:
70
- # TODO: use Optional
71
- worker_rank: int
72
- process_group: ProcessGroup
73
- shard_mesh: DeviceMesh
74
- shard_placements: tuple[Placement, ...]
75
- name: str
76
- qk_clip_state: torch.Tensor | None = None
77
- gathered_grad: torch.Tensor | None = None
78
- scattered_u: DTensor | None = None
79
- computed_u: torch.Tensor | None = None
80
- gather_event: torch.cuda.Event | None = None
81
- compute_event: torch.cuda.Event | None = None
82
- scatter_event: torch.cuda.Event | None = None
83
-
84
-
85
- def numel_for_rank(
86
- param: DTensor,
87
- local_rank: int,
88
- state: _muon_state,
89
- ) -> int:
90
- slices = get_slices_of_dtensor(
91
- param,
92
- local_rank,
93
- state.shard_mesh,
94
- state.shard_placements,
95
- )
96
-
97
- numel = 1
98
- for s, dim in zip(slices, param.shape):
99
- start, stop, step = s.indices(dim)
100
- length = max(0, (stop - start + (step - 1)) // step)
101
- numel *= length
102
-
103
- return numel
104
-
105
-
106
- @torch.no_grad()
107
- def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
108
- """
109
- Pre-allocate gathered_grad buffer on compute_stream
110
- before launching all2all gather
111
- """
112
- with torch.cuda.stream(compute_stream):
113
- for p in params:
114
- state = param_to_state[id(p)]
115
- if rank == state.worker_rank:
116
- state.gathered_grad = torch.empty(p.shape,
117
- dtype=COMM_DTYPE,
118
- device="cuda")
119
- else:
120
- state.gathered_grad = None
121
-
122
- alloc_event = torch.cuda.Event()
123
- alloc_event.record(compute_stream)
124
- return alloc_event
125
-
126
-
127
- @torch.no_grad()
128
- def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
129
- alloc_event):
130
- """
131
- All2all gathers shards so each owner rank reconstructs its full gradient
132
- """
133
- with torch.cuda.stream(comm_stream):
134
- process_group = param_to_state[id(params[0])].process_group
135
- num_ranks = dist.get_world_size(group=process_group)
136
-
137
- # Construct sending buffers
138
- per_dst = [[] for _ in range(num_ranks)]
139
- send_counts = [0] * num_ranks
140
-
141
- for p in params:
142
- state = param_to_state[id(p)]
143
- dst = state.worker_rank
144
- assert dst < num_ranks
145
- shard_elems = numel_for_rank(p, rank, state)
146
- g = p.grad
147
- g = g.to_local().to(COMM_DTYPE).contiguous()
148
- assert g.numel() == shard_elems
149
- per_dst[dst].append(g.view(-1))
150
- send_counts[dst] += shard_elems
151
-
152
- assert any(
153
- len(v) > 0 for v in per_dst
154
- ), "At least one destination rank must receive a sharded tensor"
155
- # list[list[Tensor]] -> list[Tensor]
156
- per_dst = [t for dst in per_dst for t in dst]
157
-
158
- send_buf = torch.cat(per_dst, dim=0)
159
-
160
- owned_params = [
161
- p for p in params if param_to_state[id(p)].worker_rank == rank
162
- ]
163
-
164
- # Compute receive sizes and allocate receiving buffers
165
- recv_counts = [0] * num_ranks
166
-
167
- for src in range(num_ranks):
168
- total = 0
169
- for p in owned_params:
170
- state = param_to_state[id(p)]
171
- assert state.worker_rank == rank
172
- total += numel_for_rank(p, src, state)
173
- recv_counts[src] = total
174
-
175
- recv_total = sum(recv_counts)
176
- recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
177
-
178
- #All2All
179
- logger.debug(f"send_buf size: {send_buf.numel()}, "
180
- f"recv_buf size: {recv_buf.numel()}, "
181
- f"recv_counts: {recv_counts}, "
182
- f"send_counts: {send_counts}, "
183
- f"process_group: {str(process_group)}")
184
- dist.all_to_all_single(
185
- recv_buf,
186
- send_buf,
187
- output_split_sizes=recv_counts,
188
- input_split_sizes=send_counts,
189
- group=process_group,
190
- )
191
-
192
- # Reconstructs gathered grad from the received buffer
193
- #
194
- # recv_buf (num ranks = 3)
195
- #
196
- # From rank 0 From rank 1 From rank 2
197
- # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
198
- #
199
- # Outer loop:
200
- # rank 0 -> rank 1 -> rank2
201
- #
202
- # Inner loop:
203
- # p1_n -> p2_n -> p3_n
204
-
205
- comm_stream.wait_event(alloc_event)
206
-
207
- off = 0
208
- for src in range(num_ranks):
209
- if recv_counts[src] == 0:
210
- continue
211
-
212
- block = recv_counts[src]
213
- inner_off = 0
214
- for p in owned_params:
215
- state = param_to_state[id(p)]
216
- assert state.worker_rank == rank
217
-
218
- # get the slice of the full dtensor corresponding to rank src.
219
- slices = get_slices_of_dtensor(state.gathered_grad, src,
220
- state.shard_mesh,
221
- state.shard_placements)
222
-
223
- dst = state.gathered_grad[slices]
224
- assert dst._base is state.gathered_grad
225
-
226
- n = dst.numel()
227
- assert n > 0
228
-
229
- sg = recv_buf.narrow(0, off + inner_off, n)
230
- sg = sg.reshape_as(dst)
231
- dst.copy_(sg)
232
-
233
- inner_off += n
234
- off += block
235
-
236
- for p in params:
237
- state = param_to_state[id(p)]
238
- if state.worker_rank == rank:
239
- state.gather_event = torch.cuda.Event()
240
- state.gather_event.record(comm_stream)
241
- else:
242
- state.gathered_grad = None
243
- state.gather_event = None
244
- if none_grad:
245
- p.grad = None
246
-
247
-
248
- @torch.no_grad()
249
- def _compute_u(p, state, steps, rank, compute_stream):
250
- """
251
- On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
252
- """
253
- with torch.cuda.stream(compute_stream):
254
- if rank == state.worker_rank:
255
- if state.gather_event is None:
256
- raise RuntimeError("Gather event must be set before compute.")
257
- compute_stream.wait_event(state.gather_event)
258
- u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
259
- state.gathered_grad = None
260
- state.computed_u = u
261
- state.compute_event = torch.cuda.Event()
262
- state.compute_event.record()
263
- else:
264
- state.computed_u = None
265
- state.compute_event = None
266
-
267
-
268
- @torch.no_grad()
269
- def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
270
- """
271
- Pre-allocate scattered_u buffer on compute_stream
272
- before launching all2all gather
273
- """
274
- with torch.cuda.stream(compute_stream):
275
- for p in params:
276
- state = param_to_state[id(p)]
277
- state.scattered_u = torch.empty_like(p.to_local(),
278
- dtype=COMM_DTYPE)
279
-
280
- alloc_event = torch.cuda.Event()
281
- alloc_event.record(compute_stream)
282
- return alloc_event
283
-
284
-
285
- def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
286
- """
287
- All2all scatters full gradients to all ranks
288
- """
289
- with torch.cuda.stream(comm_stream):
290
- process_group = param_to_state[id(params[0])].process_group
291
- num_ranks = dist.get_world_size(group=process_group)
292
- owned_params = [
293
- p for p in params if param_to_state[id(p)].worker_rank == rank
294
- ]
295
-
296
- # Construct sending buffer
297
- per_dst = [[] for _ in range(num_ranks)]
298
- send_counts = [0] * num_ranks
299
-
300
- if owned_params:
301
- for p in owned_params:
302
- state = param_to_state[id(p)]
303
- if state.compute_event is None:
304
- raise RuntimeError(
305
- "Compute event must be set before scatter.")
306
- comm_stream.wait_event(state.compute_event)
307
- state.gathered_grad = None
308
-
309
- assert state.computed_u is not None
310
-
311
- u_full = state.computed_u.to(COMM_DTYPE).contiguous()
312
-
313
- offset = 0
314
- for dst in range(num_ranks):
315
- # get the slice of the full tensor corresponding to rank dst.
316
- slices = get_slices_of_dtensor(u_full, dst,
317
- state.shard_mesh,
318
- state.shard_placements)
319
- su = u_full[slices].flatten()
320
-
321
- n = su.numel()
322
- assert n > 0
323
-
324
- per_dst[dst].append(su)
325
- send_counts[dst] += n
326
- offset += n
327
-
328
- assert offset == u_full.numel()
329
-
330
- lengths = [len(v) for v in per_dst]
331
- if all(l > 0 for l in lengths):
332
- assert all(
333
- l == lengths[0] for l in lengths
334
- ), "All destination ranks must have the same number of sharded tensor"
335
- # list[list[Tensor]] -> list[Tensor]
336
- per_dst = [t for dst in per_dst for t in dst]
337
- send_buf = torch.cat(per_dst, dim=0)
338
- else:
339
- # all_to_all requires participation from all ranks
340
- # Even non-owner ranks must join the collective call
341
- send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
342
-
343
- # Compute receive sizes and allocate receiving buffers
344
- recv_counts = [0] * num_ranks
345
-
346
- for src in range(num_ranks):
347
- total = 0
348
- for p in params:
349
- state = param_to_state[id(p)]
350
- if state.worker_rank != src:
351
- continue
352
- total += numel_for_rank(p, rank, state)
353
- recv_counts[src] = total
354
-
355
- recv_total = sum(recv_counts)
356
- assert recv_total > 0
357
- recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
358
-
359
- #All2All
360
- dist.all_to_all_single(
361
- recv_buf,
362
- send_buf,
363
- output_split_sizes=recv_counts,
364
- input_split_sizes=send_counts,
365
- group=process_group,
366
- )
367
-
368
- # Copy to pre-allocated scattered_u buffer from the received buffer
369
- #
370
- # recv_buf (num ranks = 3, local_rank = 0)
371
- #
372
- # From rank 0 From rank 1 From rank 2
373
- # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
374
- #
375
- # Outer loop:
376
- # rank 0 -> rank 1 -> rank2
377
- #
378
- # Inner loop:
379
- # src(0) : p1_0 -> p2_0 -> p3_0
380
- # src(1) : p4_0
381
- # src(2) : p5_0 -> p6_0
382
-
383
- comm_stream.wait_event(alloc_event)
384
-
385
- off = 0
386
- for src in range(num_ranks):
387
- block = recv_counts[src]
388
- if block == 0:
389
- continue
390
-
391
- inner_off = 0
392
- for p in params:
393
- state = param_to_state[id(p)]
394
- if state.worker_rank != src:
395
- continue
396
- n = numel_for_rank(p, rank, state)
397
- assert n > 0
398
 
399
- flat_local = recv_buf.narrow(0, off + inner_off,
400
- n).view_as(p.to_local())
401
- state.scattered_u.copy_(flat_local)
402
 
403
- state.scatter_event = torch.cuda.Event()
404
- state.scatter_event.record(comm_stream)
405
- inner_off += n
 
 
406
 
407
- assert inner_off == block
408
- off += block
409
 
 
410
 
411
-def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
-                  compute_stream):
-    """
-    Update sharded parameter p with the scattered_u.
-    Only worker_rank frees computed_u.
-    """
-    with torch.cuda.stream(compute_stream):
-        if state.scatter_event is None:
-            raise RuntimeError("Scatter event must be set before update")
-        compute_stream.wait_event(state.scatter_event)
-        u_dtensor = DTensor.from_local(
-            state.scattered_u,
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-
-        state.scattered_u = u_dtensor
-
-        if rank == state.worker_rank:
-            # Free computed_u
-            state.computed_u = None
-
-        Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
-        state.scattered_u = None
-        u_dtensor = None
-
-        scales_full = Muon._compute_scales(
-            p,
-            state.qk_clip_state) if state.qk_clip_state is not None else None
-        if scales_full is not None:
-            # Have to slice scales_full along dim 0
-            weight_slices = get_slices_of_dtensor(p, rank, state.shard_mesh,
-                                                  state.shard_placements)
-            ratio = p.shape[0] // scales_full.shape[0]
-            scales_slice = slice(
-                None if weight_slices[0].start is None else
-                weight_slices[0].start // ratio,
-                None if weight_slices[0].stop is None else
-                weight_slices[0].stop // ratio,
-                None,
-            )
-
-            scales_local = scales_full[scales_slice]
-            scales_local = DTensor.from_local(
-                scales_local,
-                placements=p.placements,
-                device_mesh=p.device_mesh,
-            )
-            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
-
-
-def default_is_muon(name, x):
-    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
-    return x.ndim >= 2 and not any(key in name for key in skip_keys)
-
-
-def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
-    muon_params, muon_names = [], []
-    non_muon_params = []
-
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]
-
-
-def parse_qk_layer(name: str) -> tuple[str | None, int]:
-    """
-    Parse a parameter name to check if it is a query/key projection layer
-    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
-
-    Returns:
-        (kind, layer_idx) or (None, -1) if not matched.
-
-    Example:
-        'model.3.attn.wq.weight' -> ('wq', 3)
-        'model.5.attn.wk.weight' -> ('wk', 5)
-        'model.2.attn.q_proj.weight' -> ('q_proj', 2)
-        'model.7.attn.k_proj.weight' -> ('k_proj', 7)
-        'model.4.attn.v_proj.weight' -> (None, -1)
-    """
-    parts = name.split('.')
-    if len(parts) < 3:
-        return None, -1
-
-    kind = parts[-2]
-
-    layer_idx = -1
-    for part in reversed(parts):
-        if part.isdigit():
-            layer_idx = int(part)
-            break
-
-    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
-        return kind, layer_idx
-
-    return None, -1
 
-
-
-@dataclass
-class QKClipInfo:
-    """Per-parameter dynamic info computed from config + runtime logits."""
-    kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
-    indices: list[int]  # which heads to consider for clipping
-    head_dim: int  # from config
-    threshold: float  # from config
-    logit: torch.Tensor | None
-
-
 class Muon(torch.optim.Optimizer):
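As a reference for the `threshold` field above: the scaling rule applied by `Muon._compute_scales` / `Muon._qk_clip` can be sketched in plain Python. `compute_head_scales` is a hypothetical name used only for this illustration:

```python
import math

def compute_head_scales(logits, threshold):
    """Per-head scale used by QK clip: a head whose max QK logit exceeds
    the threshold gets sqrt(threshold / logit); others stay at 1.0.
    Applying this factor to both W_q and W_k shrinks Q @ K^T by
    threshold / logit, i.e. back down to the threshold."""
    return [
        math.sqrt(threshold / v) if v > threshold else 1.0 for v in logits
    ]
```

For example, with `threshold=100`, a head whose logit reached 400 is scaled by 0.5 on both projections, bringing its QK product back to 100.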
@@ -554,7 +139,7 @@ class Muon(torch.optim.Optimizer):
         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
         weight_decay: The weight decay for Muon and AdamW.
-            {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
         adamw_lr: The learning rate for the internal AdamW.
         adamw_betas: The betas for the internal AdamW.
         adamw_eps: The epsilon for the internal AdamW.
@@ -564,7 +149,7 @@ class Muon(torch.optim.Optimizer):
             - "q_indices" (list[int]): Indices of query heads to consider.
             - "k_indices" (list[int]): Indices of key heads to consider.
             - "head_dim" (int): Dimensionality of each attention head.
-            - "threshold" (float): Threshold value; heads whose QK logits exceed
               this value will be scaled down.
           Default is:
           {
@@ -584,6 +169,13 @@ class Muon(torch.optim.Optimizer):
         use_distributed_muon: Use distributed muon by Liu et al. (2024).
             For testing purpose only.
         small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
     """

     def __init__(self,
@@ -597,16 +189,12 @@ class Muon(torch.optim.Optimizer):
                  adamw_eps=1e-8,
                  none_grad=True,
                  debug=False,
-                 clip_config={
-                     "q_indices": [],
-                     "k_indices": [],
-                     "head_dim": 128,
-                     "threshold": 100
-                 },
                  warmup_step=5,
                  chunk_size=-1,
                  use_distributed_muon=False,
-                 small_param_numel_threshold=65536):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -630,16 +218,18 @@ class Muon(torch.optim.Optimizer):

         super().__init__(params, defaults)

-        self.rank = None
-
-        self.comm_stream = torch.cuda.Stream()
-        self.compute_stream = torch.cuda.Stream()
         self.debug = debug
-        self.clip_config = clip_config
         self.warmup_step = warmup_step
         self.chunk_size = chunk_size
         self.use_distributed_muon = use_distributed_muon
         self.small_param_numel_threshold = small_param_numel_threshold

     def _calc_flops(self, G, steps):
@@ -649,20 +239,6 @@ class Muon(torch.optim.Optimizer):

         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)

-    def adjust_lr_for_muon(self, lr, param_shape):
-        A, B = param_shape[:2]
-        # We adjust the learning rate and weight decay based on the size of the parameter matrix
-        # as described in the paper
-        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-        adjusted_lr = lr * adjusted_ratio
-        return adjusted_lr
-
-    def set_rank_once(self, rank):
-        if self.rank is None:
-            self.rank = rank
-        else:
-            assert self.rank == rank
-
     def get_shard_mesh(self, p):
         """
         Get the shard mesh for a parameter p on the given rank.
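The `adjust_lr_for_muon` removed in the hunk above (relocated to `core.py` by this refactor) applies a shape-dependent learning-rate scale. A minimal standalone sketch of the same rule:

```python
import math

def adjust_lr_for_muon(lr, param_shape):
    """Shape-dependent lr scale: lr * 0.2 * sqrt(max(A, B)) for an
    (A, B) weight matrix, so larger matrices take larger steps."""
    A, B = param_shape[:2]
    return lr * 0.2 * math.sqrt(max(A, B))
```

For a 1024x256 matrix and base lr 0.02, this yields 0.02 * 0.2 * 32 = 0.128.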
@@ -673,9 +249,6 @@ class Muon(torch.optim.Optimizer):
         shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
             p.placements, p.device_mesh)

-        # set rank with the local rank in the shard process group
-        self.set_rank_once(dist.get_rank(group=shard_pg))
-
         return shard_mesh, shard_pg, shard_placements

     def init_state_and_assign_params(self, names, params, group, qk_logits):
@@ -694,8 +267,8 @@ class Muon(torch.optim.Optimizer):
             total_flops += flops

         if self.debug:
-            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
-                  flush=True)

         paired = list(zip(names, params))
@@ -724,44 +297,54 @@ class Muon(torch.optim.Optimizer):

             worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
             round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)

             param_to_state[id(p)] = _muon_state(
                 worker_rank=worker_rank,
                 process_group=shard_pg,
-                shard_mesh=shard_mesh,
-                shard_placements=shard_placements,
                 name=n,
                 qk_clip_state=qk_clip_state,
             )

         return param_to_state, ordered_params

-    def base(self, names, params, group, lr, weight_decay, momentum,
-             qk_logits):
-        # generate weight updates in distributed fashion
         for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-            assert g is not None
-
-            g = self._update_g(p, g, group, momentum)

             u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
                                              steps=group["ns_steps"])

-            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-            Muon._update_p(p, u, lr, adjusted_lr, weight_decay)

-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)

-            scales_full = self._compute_scales(
                 p, qk_clip_state) if qk_clip_state is not None else None
             if scales_full is not None:
-                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)

     def distributed_muon(
         self,
@@ -770,20 +353,15 @@ class Muon(torch.optim.Optimizer):
         group: dict[str, Any],
         lr: float,
         weight_decay: float,
-        momentum: float,
         qk_logits: list[torch.Tensor | DTensor] | None,
     ):
         """ Implementation of Distributed Muon by Liu et al. """

         for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-            assert g is not None
-
-            g = self._update_g(p, g, group, momentum)

             # Gather G
             if isinstance(p.data, DTensor):
@@ -796,16 +374,16 @@ class Muon(torch.optim.Optimizer):
             u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
                                                   steps=group["ns_steps"])

-            adjusted_lr = self.adjust_lr_for_muon(lr, p_full.shape)
-            Muon._update_p(p_full, u_full, lr, adjusted_lr, weight_decay)

-            qk_clip_state = self.get_qk_clip_info(n, qk_logits)

-            scales_full = self._compute_scales(
                 p_full, qk_clip_state) if qk_clip_state is not None else None

             if scales_full is not None:
-                Muon._qk_clip(p_full, scales_full, qk_clip_state.head_dim)

             if isinstance(p.data, DTensor):
                 ndims = len(p.device_mesh.mesh.shape)
822
 
823
  p.copy_(p_sharded)
824
 
825
- def _update_g(self, p, g, group, momentum):
826
- # calc update
827
- state = self.state[p]
828
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
829
- torch.add(g, buf, alpha=momentum, out=buf)
830
- if group["nesterov"]:
831
- g.add_(buf, alpha=momentum)
832
- return g
833
- return buf
834
-
835
- @staticmethod
836
- def _update_p(p, u, lr, adjusted_lr, weight_decay):
837
- if isinstance(p, torch.nn.Parameter):
838
- # apply weight decay
839
- p.data.mul_(1 - lr * weight_decay)
840
- # apply update
841
- p.data.add_(u, alpha=-adjusted_lr)
842
- else:
843
- p.mul_(1 - lr * weight_decay)
844
- p.add_(u, alpha=-adjusted_lr)
845
-
846
- def get_qk_clip_info(self, n, qk_logits):
847
- if self.clip_config is None:
848
- return None
849
-
850
- head_dim = self.clip_config.get('head_dim')
851
- threshold = self.clip_config.get('threshold')
852
- kind, layer_idx = parse_qk_layer(n)
853
-
854
- logit, indices = None, []
855
- if qk_logits is not None and kind is not None:
856
- logit = qk_logits[layer_idx]
857
- indices_key = 'q_indices' if 'q' in kind else 'k_indices'
858
- indices = self.clip_config.get(indices_key, []) or []
859
-
860
- if isinstance(logit, DTensor):
861
- # In TP settings, qk_logits may be DTensor
862
- # We convert it to full tensor here for simplicity
863
- logit = logit.full_tensor()
864
-
865
- return QKClipInfo(
866
- kind=kind,
867
- indices=indices,
868
- head_dim=head_dim,
869
- threshold=threshold,
870
- logit=logit,
871
- )
872
-
873
- @staticmethod
874
- def _compute_scales(p, qk_clip_state):
875
- kind = qk_clip_state.kind
876
- indices = qk_clip_state.indices
877
- head_dim = qk_clip_state.head_dim
878
- threshold = qk_clip_state.threshold
879
- logit = qk_clip_state.logit
880
-
881
- H_global = p.shape[0] // head_dim
882
- scales_full = torch.ones(H_global, device=p.data.device)
883
- scaling = 0
884
-
885
- for logit_idx, head_idx in enumerate(indices):
886
- v_ele = float(logit[logit_idx])
887
- if v_ele > threshold:
888
- new_scale = math.sqrt(threshold / v_ele)
889
- if new_scale < scales_full[head_idx]:
890
- scales_full[head_idx] = new_scale
891
- logger.info(
892
- f"[{kind}] Head {head_idx} exceeded threshold "
893
- f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
894
- )
895
- scaling += 1
896
-
897
- return scales_full if scaling > 0 else None
898
-
899
- @staticmethod
900
- def _qk_clip(p, scales, head_dim):
901
- if isinstance(p, torch.nn.Parameter):
902
- W = p.data.view(-1, head_dim, p.data.shape[1])
903
- W.mul_(scales.view(-1, 1, 1))
904
- else:
905
- W = p.view(-1, head_dim, p.shape[1])
906
- W.mul_(scales.view(-1, 1, 1))
907
-
908
- def parallel(self, names, params, group, lr, weight_decay, momentum,
909
- qk_logits):
910
  """
911
  Perform a parallel optimization step using Muon.
912
- """
913
 
914
- for p in params:
915
- g = p.grad
916
- if g is None:
917
- continue
918
- if g.ndim > 2:
919
- g = g.view(g.size(0), -1)
920
 
921
- # Update g in the local rank
922
- g = self._update_g(
923
- p,
924
- g,
925
- group,
926
- momentum=momentum,
927
- )
928
- p.grad = g
929
 
930
  param_to_state, ordered_params = self.init_state_and_assign_params(
931
  names, params, group, qk_logits)
932
 
933
- assert self.rank is not None
934
-
935
- def enqueue_all2all_gather(start_idx, chunk_size):
936
- target_params = ordered_params[start_idx:start_idx + chunk_size]
937
- if target_params:
938
- alloc_event = _alloc_gathered_grad(target_params,
939
- param_to_state, self.rank,
940
- self.compute_stream)
941
- _all2all_gather(target_params, param_to_state, self.rank,
942
- self.comm_stream, group["none_grad"],
943
- alloc_event)
944
-
945
- def enqueue_computes(start_idx, chunk_size):
946
- for p in ordered_params[start_idx:start_idx + chunk_size]:
947
- state = param_to_state[id(p)]
948
- _compute_u(p, state, group["ns_steps"], self.rank,
949
- self.compute_stream)
950
-
951
- def enqueue_all2all_scatter(start_idx, chunk_size):
952
- target_params = ordered_params[start_idx:start_idx + chunk_size]
953
- if target_params:
954
- alloc_event = _alloc_scattered_u(target_params, param_to_state,
955
- self.rank,
956
- self.compute_stream)
957
- _all2all_scatter(target_params, param_to_state, self.rank,
958
- self.comm_stream, alloc_event)
959
-
960
- def enqueue_update_param(start_idx, chunk_size):
961
- for p in ordered_params[start_idx:start_idx + chunk_size]:
962
- state = param_to_state[id(p)]
963
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
964
- _update_param(p, state, lr, adjusted_lr, weight_decay,
965
- self.rank, self.compute_stream)
966
 
967
  if self.chunk_size == -1:
968
  shard_ranks = dist.get_world_size(param_to_state[id(
969
- params[0])].process_group)
970
  chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
971
  elif self.chunk_size > 0:
972
  chunk_size = self.chunk_size
973
  else:
974
  raise ValueError("chunk_size must be -1 or a positive integer.")
975
 
976
- # Wait grad update
977
- self.comm_stream.wait_stream(torch.cuda.current_stream())
978
-
979
- warmup_step = self.warmup_step
980
- for i in range(0, warmup_step):
981
- enqueue_all2all_gather(i * chunk_size, chunk_size)
982
- enqueue_computes(i * chunk_size, chunk_size)
983
-
984
- for i in range(0, len(params) + chunk_size - 1, chunk_size):
985
- enqueue_all2all_scatter(i, chunk_size)
986
- enqueue_all2all_gather(i + warmup_step * chunk_size, chunk_size)
987
- enqueue_update_param(i, chunk_size)
988
- enqueue_computes(i + warmup_step * chunk_size, chunk_size)
989
-
990
- # Wait the last update_param to finish
991
- torch.cuda.current_stream().wait_stream(self.compute_stream)
992
-
993
- @staticmethod
994
- def _fused_adamw(
995
- params: list[torch.Tensor],
996
- grads: list[torch.Tensor],
997
- exp_avgs: list[torch.Tensor],
998
- exp_avg_sqs: list[torch.Tensor],
999
- max_exp_avg_sqs: list[torch.Tensor],
1000
- state_steps: list[torch.Tensor],
1001
- amsgrad: bool,
1002
- beta1: float,
1003
- beta2: float,
1004
- lr: float | torch.Tensor,
1005
- weight_decay: float,
1006
- eps: float,
1007
- maximize: bool,
1008
- ) -> None:
1009
- if not params:
1010
- return
1011
 
1012
- # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
1013
- # treating it as a scalar.
1014
- lr_dict: DeviceDict | None = ({
1015
- lr.device: lr
1016
- } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
1017
- None)
1018
- grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
1019
- [
1020
- params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
1021
- state_steps
1022
- ] # type: ignore[list-item]
1023
- )
1024
- for (device, _), (
1025
- (
1026
- device_params_,
1027
- device_grads_,
1028
- device_exp_avgs_,
1029
- device_exp_avg_sqs_,
1030
- device_max_exp_avg_sqs,
1031
- device_state_steps_,
1032
- ),
1033
- _,
1034
- ) in grouped_tensors.items():
1035
- device_params = cast(list[torch.Tensor], device_params_)
1036
- device_grads = cast(list[torch.Tensor], device_grads_)
1037
- device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
1038
- device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
1039
- device_state_steps = cast(list[torch.Tensor], device_state_steps_)
1040
-
1041
- if lr_dict is not None and device not in lr_dict:
1042
- lr_dict[device] = lr.to(
1043
- device=device,
1044
- non_blocking=True) # type: ignore[union-attr]
1045
- lr = lr_dict[device]
1046
- torch._foreach_add_(device_state_steps, 1)
1047
- func = torch._fused_adamw_
1048
- func(
1049
- device_params,
1050
- device_grads,
1051
- device_exp_avgs,
1052
- device_exp_avg_sqs,
1053
- device_max_exp_avg_sqs, # type: ignore[arg-type]
1054
- device_state_steps,
1055
- amsgrad=amsgrad,
1056
- lr=lr, # type: ignore[arg-type]
1057
- beta1=beta1,
1058
- beta2=beta2,
1059
- weight_decay=weight_decay,
1060
- eps=eps,
1061
- maximize=maximize,
1062
- )
1063
 
1064
  def _step_muon(self, group, qk_logits=None):
1065
  params = group["params"]
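The warmup + main-loop index scheduling removed in the hunk above (superseded by the async generator pipeline) can be sketched as a flat event list; `schedule` is a hypothetical helper for illustration, with out-of-range indices dropped the way empty parameter slices become no-ops in the original:

```python
def schedule(num_params, chunk_size, warmup_step):
    """Flatten the removed warmup + main-loop scheduling into a list of
    (op, start_idx) events so the interleaving is visible."""
    events = []

    # Warmup: prefetch gather + compute for the first warmup_step chunks.
    for i in range(warmup_step):
        events.append(("gather", i * chunk_size))
        events.append(("compute", i * chunk_size))

    # Main loop: scatter/update chunk i while gathering/computing the
    # chunk that is warmup_step chunks ahead.
    for i in range(0, num_params + chunk_size - 1, chunk_size):
        events.append(("scatter", i))
        events.append(("gather", i + warmup_step * chunk_size))
        events.append(("update", i))
        events.append(("compute", i + warmup_step * chunk_size))

    return [(op, s) for op, s in events if s < num_params]
```

With 4 params, chunk size 2, and one warmup step, chunk 1's gather/compute is enqueued between chunk 0's scatter and update, which is exactly the communication/compute overlap the new pipeline reproduces.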
@@ -1068,6 +455,18 @@ class Muon(torch.optim.Optimizer):
         momentum = group["momentum"]
         names = group["names"]

         param_dtensors = []
         name_dtensors = []

@@ -1083,7 +482,6 @@ class Muon(torch.optim.Optimizer):
                 group=group,
                 lr=lr,
                 weight_decay=weight_decay,
-                momentum=momentum,
                 qk_logits=qk_logits)
             return
@@ -1119,7 +517,6 @@ class Muon(torch.optim.Optimizer):
         # and run parallel Muon on each group.

         placement_to_params = defaultdict(lambda: ([], []))
-        # type: dict[tuple[Placement, DeviceMesh], tuple[list[str], list[DTensor]]]

         assert len(dtensors) == len(names)
         for p, n in zip(dtensors, names):
@@ -1141,7 +538,6 @@ class Muon(torch.optim.Optimizer):
                 group=group,
                 lr=lr,
                 weight_decay=weight_decay,
-                momentum=momentum,
                 qk_logits=qk_logits,
             )

@@ -1159,7 +555,6 @@ class Muon(torch.optim.Optimizer):
             group,
             lr=lr,
             weight_decay=weight_decay,
-            momentum=momentum,
             qk_logits=qk_logits,
         )

@@ -1170,78 +565,9 @@ class Muon(torch.optim.Optimizer):
             group,
             lr=lr,
             weight_decay=weight_decay,
-            momentum=momentum,
             qk_logits=qk_logits,
         )

-    def _step_adamw_params(self, params, group):
-        params_with_grads = []
-        grads = []
-        moment1 = []
-        moment2 = []
-        max_exp_avg_sqs = []
-        state_steps = []
-        lr = group["lr"]
-        beta1, beta2 = group["adamw_betas"]
-        eps = group["adamw_eps"]
-        weight_decay = group["weight_decay"]
-
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            state = self.state[p]
-            params_with_grads.append(p)
-            grads.append(g)
-            if "step" not in state:
-                state["step"] = (torch.zeros((),
-                                             dtype=torch.float32,
-                                             device=p.device))
-                state["moment1"] = torch.zeros_like(g)
-                state["moment2"] = torch.zeros_like(g)
-            moment1.append(state["moment1"])
-            moment2.append(state["moment2"])
-            if not isinstance(state["step"], torch.Tensor):
-                step_tensor = torch.tensor(state["step"],
-                                           dtype=torch.float32,
-                                           device=p.device)
-            else:
-                step_tensor = state["step"]
-            state_steps.append(step_tensor)
-
-        self._fused_adamw(
-            params_with_grads,
-            grads,
-            moment1,
-            moment2,
-            max_exp_avg_sqs,
-            state_steps,
-            amsgrad=False,
-            beta1=beta1,
-            beta2=beta2,
-            lr=lr,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=False,
-        )
-
-    def _step_adamw(self, group):
-        params = group["params"]
-
-        # group params with its type and placement
-        placement_to_params: dict[tuple[Placement | type,
-                                        DeviceMesh | None]] = defaultdict(list)
-        for p in params:
-            match p:
-                case DTensor():
-                    placement_to_params[tuple([p.placements,
-                                               p.device_mesh])].append(p)
-                case torch.Tensor():
-                    placement_to_params[tuple([torch.Tensor, None])].append(p)
-
-        for params in placement_to_params.values():
-            self._step_adamw_params(params, group)
-
     @torch.no_grad
     def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
@@ -1249,9 +575,9 @@ class Muon(torch.optim.Optimizer):
         Args:
            closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
-            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
-                to 1D tensors of shape (num_heads,), representing the maximum
-                QK logits across all tokens, computed as
                 (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
@@ -1263,6 +589,6 @@ class Muon(torch.optim.Optimizer):
             if group["use_muon"]:
                 self._step_muon(group, qk_logits=qk_logits)
             else:
-                self._step_adamw(group)

         return loss
 import logging
 import types
 from collections import defaultdict
+from typing import Any

 import torch
 import torch.distributed as dist
+from torch.distributed.tensor import DTensor, Replicate, Shard
+from torch.profiler import record_function
+
+from .adamw import step_adamw
+from .async_utils import run_pipeline
+from .core import (_muon_state, adjust_lr_for_muon,
+                   get_default_muon_param_groups, update_g, update_p)
+from .distributed.utils import (_is_shard, construct_shard_mesh,
+                                get_slices_of_dtensor)
+from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
+                            _zeropower_via_newtonschulz5)
+from .pipeline import muon_chunk_pipeline
+from .qk_clip import compute_scales, get_qk_clip_info, qk_clip

 logger = logging.getLogger(__name__)

+def _expand_expert_params(names, params, expert_keys):
+    """Expand expert params by splitting on dim 0 (expert dimension).
+
+    Params whose name matches any key in ``expert_keys`` are treated as
+    expert-parallel tensors. Their outermost dimension is the expert
+    dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
+    ``nn.Parameter`` views so that in-place updates propagate back to
+    the original storage.
+
+    Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` --
+    if they are expert params, their key must be added to ``expert_keys``.
+
+    The grad must already be set on each expert param (e.g. after momentum).
+
+    For DTensor expert params, placements that shard on dim 0 (expert dim)
+    are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
+    preserved: each 2D slice is wrapped as a DTensor on the corresponding
+    submesh so the parallel pipeline handles the TP communication.
+    """
+    expanded_names = []
+    expanded_params = []
+
+    for n, p in zip(names, params):
+        is_expert = expert_keys and any(key in n for key in expert_keys)
+        is_dtensor = isinstance(p.data, DTensor)
+
+        if not is_expert:
+            assert p.data.ndim <= 2, (
+                f"Param {n} has ndim={p.data.ndim} but does not match "
+                f"expert_keys={expert_keys}. If this is an expert param, "
+                f"add its key to expert_keys.")
+            expanded_names.append(n)
+            expanded_params.append(p)
+            continue
+
+        g = p.grad
+        assert g is not None, (
+            f"Expert param {n} must have grad set before expansion")
+
+        tp_mesh = None
+        tp_placements_2d = None
+
+        if is_dtensor:
+            local_data = p.to_local()
+            local_grad = g.to_local() if isinstance(g, DTensor) else g
+
+            # Find non-dim-0 shard placements (e.g. TP sharding).
+            # After splitting on dim 0, Shard(k) becomes Shard(k-1).
+            tp_dim_indices = []
+            tp_placements_2d = []
+            for i, pl in enumerate(p.placements):
+                if _is_shard(pl) and pl.dim != 0:
+                    tp_dim_indices.append(i)
+                    tp_placements_2d.append(Shard(pl.dim - 1))
+
+            if tp_dim_indices:
+                tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
+                                     for i in tp_dim_indices)
+                if len(tp_dim_names) == 1:
+                    tp_mesh = p.device_mesh[tp_dim_names[0]]
+                else:
+                    tp_mesh = p.device_mesh[tp_dim_names]
+        else:
+            local_data = p.data
+            local_grad = g
+
+        # Expand: split dim 0, reshape each slice to 2D.
+        num_local_experts = local_data.shape[0]
+        for i in range(num_local_experts):
+            slice_data = local_data[i]
+            slice_grad = local_grad[i]
+
+            if tp_mesh is not None:
+                # Wrap as DTensor on TP submesh so the pipeline handles
+                # TP communication (gather/scatter across TP ranks).
+                dt_data = DTensor.from_local(slice_data,
+                                             device_mesh=tp_mesh,
+                                             placements=tp_placements_2d)
+                dt_grad = DTensor.from_local(slice_grad,
+                                             device_mesh=tp_mesh,
+                                             placements=tp_placements_2d)
+                expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
+                expert_param.grad = dt_grad
+            else:
+                expert_param = torch.nn.Parameter(slice_data,
+                                                  requires_grad=False)
+                expert_param.grad = slice_grad
+
+            expanded_names.append(f"{n}[{i}]")
+            expanded_params.append(expert_param)
+
+        p.grad = None  # allow expert grad storage to be freed after pipeline
+
+    return expanded_names, expanded_params


 class Muon(torch.optim.Optimizer):
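The name/shape bookkeeping of `_expand_expert_params` above can be sketched without torch. `expand_expert_names` is a hypothetical helper covering only the naming and shape logic, not the storage-sharing `nn.Parameter` views or DTensor handling:

```python
def expand_expert_names(named_shapes, expert_keys):
    """A 3D (E, out, in) expert tensor expands to E entries named
    'name[i]' with 2D shapes; non-expert params pass through and must
    already be at most 2D, mirroring the assertion in the real code."""
    expanded = []
    for name, shape in named_shapes:
        if expert_keys and any(key in name for key in expert_keys):
            num_experts = shape[0]
            expanded.extend(
                (f"{name}[{i}]", shape[1:]) for i in range(num_experts))
        else:
            # Mirrors the ndim <= 2 assertion on non-expert params.
            assert len(shape) <= 2, f"{name} needs an expert_keys entry"
            expanded.append((name, shape))
    return expanded
```

For example, a `(4, 8, 16)` tensor named `layers.0.experts.w1` expands to four `(8, 16)` entries `layers.0.experts.w1[0]` through `layers.0.experts.w1[3]`.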
 
     nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
     ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
     weight_decay: The weight decay for Muon and AdamW.
+        Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
     adamw_lr: The learning rate for the internal AdamW.
     adamw_betas: The betas for the internal AdamW.
     adamw_eps: The epsilon for the internal AdamW.

         - "q_indices" (list[int]): Indices of query heads to consider.
         - "k_indices" (list[int]): Indices of key heads to consider.
         - "head_dim" (int): Dimensionality of each attention head.
+        - "threshold" (float): Threshold value; heads whose QK logits exceed
           this value will be scaled down.
       Default is:
       {

     use_distributed_muon: Use distributed muon by Liu et al. (2024).
         For testing purpose only.
     small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
+    expert_keys: List of strings to identify expert-parallel parameters.
+        If any key appears in a parameter's name, its outermost
+        dimension is treated as the expert dimension and expanded
+        into per-expert 2D params for Muon. For example,
+        ``expert_keys=["experts"]`` matches any param whose name
+        contains "experts". 3D+ params not matched by any key
+        will raise an error.
     """

     def __init__(self,
 
                  adamw_eps=1e-8,
                  none_grad=True,
                  debug=False,
+                 clip_config=None,
                  warmup_step=5,
                  chunk_size=-1,
                  use_distributed_muon=False,
+                 small_param_numel_threshold=65536,
+                 expert_keys=None):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,

         super().__init__(params, defaults)

         self.debug = debug
+        self.clip_config = clip_config if clip_config is not None else {
+            "q_indices": [],
+            "k_indices": [],
+            "head_dim": 128,
+            "threshold": 100,
+        }
         self.warmup_step = warmup_step
         self.chunk_size = chunk_size
         self.use_distributed_muon = use_distributed_muon
         self.small_param_numel_threshold = small_param_numel_threshold
+        self.expert_keys = expert_keys

     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2

         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)

     def get_shard_mesh(self, p):
         """
         Get the shard mesh for a parameter p on the given rank.

         shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
             p.placements, p.device_mesh)

         return shard_mesh, shard_pg, shard_placements

     def init_state_and_assign_params(self, names, params, group, qk_logits):
             total_flops += flops

         if self.debug:
+            logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
+                         total_flops / 1e12)

         paired = list(zip(names, params))


             worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
             round_robin = (round_robin + 1) % len(shard_mesh_flattened)
+            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
+
+            # Precompute per-rank indices and numels for all-to-all.
+            rank_indices: dict[int, tuple] = {}
+            rank_numels: dict[int, int] = {}
+            for r in range(num_ranks):
+                indices = get_slices_of_dtensor(p, r, shard_mesh,
+                                                shard_placements)
+                rank_indices[r] = indices
+                numel = 1
+                for idx, dim_size in zip(indices, p.shape):
+                    if isinstance(idx, slice):
+                        start, stop, step = idx.indices(dim_size)
+                        numel *= max(0, (stop - start + (step - 1)) // step)
+                    else:
+                        numel *= len(idx)
+                rank_numels[r] = numel

             param_to_state[id(p)] = _muon_state(
                 worker_rank=worker_rank,
                 process_group=shard_pg,
+                rank_indices=rank_indices,
+                rank_numels=rank_numels,
                 name=n,
                 qk_clip_state=qk_clip_state,
             )

         return param_to_state, ordered_params

+    def base(self, names, params, group, lr, weight_decay, qk_logits):
+        # Momentum is already applied by _step_muon before this method.
         for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue

             u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
                                              steps=group["ns_steps"])

+            adjusted_lr = adjust_lr_for_muon(lr, p.shape)
+            update_p(p, u, lr, adjusted_lr, weight_decay)

+            qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)

+            scales_full = compute_scales(
                 p, qk_clip_state) if qk_clip_state is not None else None
             if scales_full is not None:
+                qk_clip(p, scales_full, qk_clip_state.head_dim)

     def distributed_muon(
         self,
353
  group: dict[str, Any],
354
  lr: float,
355
  weight_decay: float,
 
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
  """ Implementation of Distributed Muon by Liu et al. """
359
 
360
+ # Momentum is already applied by _step_muon before this method.
361
  for n, p in zip(names, params):
362
  g = p.grad
363
  if g is None:
364
  continue
 
 
 
 
 
365
 
366
  # Gather G
367
  if isinstance(p.data, DTensor):
 
374
  u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
  steps=group["ns_steps"])
376
 
377
+ adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
+ update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
+ qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
381
 
382
+ scales_full = compute_scales(
383
  p_full, qk_clip_state) if qk_clip_state is not None else None
384
 
385
  if scales_full is not None:
386
+ qk_clip(p_full, scales_full, qk_clip_state.head_dim)
387
 
388
  if isinstance(p.data, DTensor):
389
  ndims = len(p.device_mesh.mesh.shape)
 
400
 
401
  p.copy_(p_sharded)
402
 
+     def parallel(self, names, params, group, lr, weight_decay, qk_logits):
          """
          Perform a parallel optimization step using Muon.
 
+         Parameters are chunked and each chunk is processed by a
+         :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
+         interleaves multiple chunks so that communication and computation
+         overlap across chunks (the same overlap previously achieved by the
+         warmup + main-loop index scheduling).
+         """
 
+         # Momentum is already applied by _step_muon before this method.
 
          param_to_state, ordered_params = self.init_state_and_assign_params(
              names, params, group, qk_logits)
 
+         # Compute local rank for this group's shard process group.
+         shard_pg = param_to_state[id(ordered_params[0])].process_group
+         rank = dist.get_rank(group=shard_pg)
 
          if self.chunk_size == -1:
              shard_ranks = dist.get_world_size(param_to_state[id(
+                 ordered_params[0])].process_group)
              chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
          elif self.chunk_size > 0:
              chunk_size = self.chunk_size
          else:
              raise ValueError("chunk_size must be -1 or a positive integer.")
 
+         def pipelines():
+             for start in range(0, len(ordered_params), chunk_size):
+                 chunk = ordered_params[start:start + chunk_size]
+                 if chunk:
+                     yield muon_chunk_pipeline(
+                         params=chunk,
+                         param_to_state=param_to_state,
+                         rank=rank,
+                         ns_steps=group["ns_steps"],
+                         lr=lr,
+                         weight_decay=weight_decay,
+                         none_grad=group["none_grad"],
+                     )
 
+         with record_function("muon::barrier"):
+             dist.barrier()
+         with record_function("muon::pipeline"):
+             run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
 
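`run_pipeline` itself lives in `async_utils.py` and is not shown in this chunk; the scheduling idea — admit up to `max_concurrent` generators, then round-robin `next()` over the live ones — can be sketched in plain Python. Names here (`run_pipeline_sketch`, the toy `chunk` generator) are illustrative, not the actual implementation:

```python
def run_pipeline_sketch(pipelines, max_concurrent):
    """Drive generators with staggered admission.

    Each generator yields right after launching async work; stepping
    another generator while the first's communication is in flight is
    what overlaps comm and compute across chunks.
    """
    pipelines = iter(pipelines)
    active, done = [], False
    while active or not done:
        # Admit at most one new pipeline per outer iteration, up to the cap.
        if not done and len(active) < max_concurrent:
            try:
                active.append(next(pipelines))
            except StopIteration:
                done = True
        # Step every live pipeline once (round-robin).
        for gen in list(active):
            try:
                next(gen)
            except StopIteration:
                active.remove(gen)

def chunk(i, log):
    log.append(f"{i}:gather"); yield    # launched async gather
    log.append(f"{i}:scatter"); yield   # launched async scatter
    log.append(f"{i}:update")           # applied parameter updates

log = []
run_pipeline_sketch((chunk(i, log) for i in range(3)), max_concurrent=2)
# Chunk 1's gather is launched before chunk 0 finishes, so its
# communication overlaps chunk 0's compute.
```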
      def _step_muon(self, group, qk_logits=None):
          params = group["params"]
 
          momentum = group["momentum"]
          names = group["names"]
 
+         # Apply momentum to all params before routing/expansion.
+         with record_function("muon::momentum"):
+             for n, p in zip(names, params):
+                 g = p.grad
+                 if g is None:
+                     continue
+                 g = update_g(self.state, p, g, group, momentum)
+                 p.grad = g
+
+         # Expand expert params by splitting on dim 0.
+         names, params = _expand_expert_params(names, params, self.expert_keys)
+
          param_dtensors = []
          name_dtensors = []
 
              group=group,
              lr=lr,
              weight_decay=weight_decay,
              qk_logits=qk_logits)
          return
 
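The expert-expansion step above splits stacked MoE weights of shape `(n_experts, m, n)` on dim 0 so each expert matrix is orthogonalized independently. A hypothetical sketch of that behavior using nested lists in place of tensors — the real `_expand_expert_params` in `muon.py` may differ in naming and matching details:

```python
def ndim(x):
    # Depth of nested lists, standing in for tensor rank.
    d = 0
    while isinstance(x, list):
        d += 1
        x = x[0]
    return d

def expand_expert_params(names, params, expert_keys):
    """Split any 3-D expert weight matching an expert key into
    per-expert 2-D entries (illustrative sketch)."""
    out_names, out_params = [], []
    for n, p in zip(names, params):
        if any(k in n for k in expert_keys) and ndim(p) == 3:
            for i, expert in enumerate(p):
                out_names.append(f"{n}.{i}")
                out_params.append(expert)
        else:
            out_names.append(n)
            out_params.append(p)
    return out_names, out_params

# 2 experts, each a 2x3 matrix, plus one ordinary dense weight.
experts = [[[0] * 3 for _ in range(2)] for _ in range(2)]
dense = [[0] * 4 for _ in range(4)]
new_names, new_params = expand_expert_params(
    ["blk.0.moe.experts.w1", "blk.0.attn.wq.weight"],
    [experts, dense], ["experts"])
# The expert weight becomes two 2-D entries; the dense one passes through.
```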
          # and run parallel Muon on each group.
 
          placement_to_params = defaultdict(lambda: ([], []))
 
          assert len(dtensors) == len(names)
          for p, n in zip(dtensors, names):
 
              group=group,
              lr=lr,
              weight_decay=weight_decay,
              qk_logits=qk_logits,
          )
 
          group,
          lr=lr,
          weight_decay=weight_decay,
          qk_logits=qk_logits,
          )
 
          group,
          lr=lr,
          weight_decay=weight_decay,
          qk_logits=qk_logits,
          )
 
      @torch.no_grad
      def step(self, closure=None, qk_logits=None):
          """Perform a single optimization step.
 
          Args:
              closure (Callable, optional): A closure that reevaluates the model
                  and returns the loss.
+             qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                 to 1D tensors of shape (num_heads,), representing the maximum
+                 QK logits across all tokens, computed as
                  (1 / sqrt(head_dim)) * (Q @ K^T).
          """
          loss = None
 
              if group["use_muon"]:
                  self._step_muon(group, qk_logits=qk_logits)
              else:
+                 step_adamw(self.state, group)
 
          return loss
build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+
+ from .matmul_transpose_triton import matmul_transpose_assign
+
+ COMM_DTYPE = torch.bfloat16
+ DEFAULT_CHUNK_SIZE_RATIO = 4
+
+
+ # This code snippet is a modified version adapted from the following GitHub repository:
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
+ # Muon's Newton-Schulz iteration causes high variance in singular values.
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
+ @torch.no_grad()
+ # matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
+ def _zeropower_via_newtonschulz5(G, steps):
+     """
+     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+     zero even beyond the point where the iteration no longer converges all the way to one everywhere
+     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+     where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+     performance at all relative to UV^T, where USV^T = G is the SVD.
+     """
+     assert len(G.shape) == 2
+     assert G.dtype == COMM_DTYPE
+     X = G  # no manual typecast
+
+     if G.size(0) > G.size(1):
+         X = X.T
+     # Ensure spectral norm is at most 1
+     X = X / (X.norm() + 1e-7)
+     buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+     buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+     # Perform the NS iterations
+     for a, b, c in [
+         (4.0848, -6.8946, 2.9270),
+         (3.9505, -6.3029, 2.6377),
+         (3.7418, -5.5913, 2.3037),
+         (2.8769, -3.1427, 1.2046),
+         (2.8366, -3.0525, 1.2012),
+     ]:
+         matmul_transpose_assign(X, buf1)
+         matmul_transpose_assign(buf1, buf2)
+         buf1.mul_(b).add_(buf2, alpha=c)
+         X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
+
+     if G.size(0) > G.size(1):
+         X = X.T
+     return X
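The effect of the iteration can be checked numerically without the Triton kernel. In the loop above, `matmul_transpose_assign(X, buf1)` computes `A = X @ X.T`, so each step is `X <- a*X + (b*A + c*A@A) @ X`. A small NumPy sketch of the same recurrence with the same coefficients (an illustration under that reading, not the shipped kernel):

```python
import numpy as np

def newton_schulz5(G, coeffs):
    """Quintic Newton-Schulz orthogonalization (NumPy illustration)."""
    X = G.T if G.shape[0] > G.shape[1] else G
    X = X / (np.linalg.norm(X) + 1e-7)  # Frobenius norm bounds spectral norm
    for a, b, c in coeffs:
        A = X @ X.T
        B = b * A + c * (A @ A)
        X = a * X + B @ X
    return X.T if G.shape[0] > G.shape[1] else X

coeffs = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]
rng = np.random.default_rng(0)
G = rng.standard_normal((8, 16))
X = newton_schulz5(G, coeffs)
s = np.linalg.svd(X, compute_uv=False)
# Singular values land roughly in [0.5, 1.5] rather than exactly at 1,
# matching the US'V^T description in the docstring above.
```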
build/torch210-cxx11-rocm70-x86_64-linux/pipeline.py ADDED
@@ -0,0 +1,390 @@
+ import logging
+ from typing import Generator
+
+ import torch
+ import torch.distributed as dist
+ from torch.distributed.tensor import DTensor
+ from torch.profiler import record_function
+
+ from .core import _muon_state, adjust_lr_for_muon, update_p
+ from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
+ from .qk_clip import compute_scales
+
+ logger = logging.getLogger(__name__)
+
+ # ======================================================================
+ # Stage helpers
+ # ======================================================================
+
+
+ def _launch_gather(
+     params: list[DTensor],
+     owned_params: list[DTensor],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+     num_ranks: int,
+     process_group: dist.ProcessGroup,
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
+     """Allocate gather buffers, build send/recv, and launch async all-to-all.
+
+     Returns:
+         work: Async operation handle.
+         recv_buf: Flat receive buffer (needed by ``_complete_gather``).
+         gathered_grads: ``{id(p): empty_tensor}`` for owned params,
+             ``None`` for non-owned.
+         recv_counts: Per-source-rank element counts.
+     """
+     # Allocate gathered-grad buffers
+     gathered_grads: dict[int, torch.Tensor | None] = {}
+     for p in params:
+         state = param_to_state[id(p)]
+         if rank == state.worker_rank:
+             gathered_grads[id(p)] = torch.empty(p.shape,
+                                                 dtype=COMM_DTYPE,
+                                                 device="cuda")
+         else:
+             gathered_grads[id(p)] = None
+
+     # Build send buffer
+     per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+     send_counts = [0] * num_ranks
+
+     for p in params:
+         state = param_to_state[id(p)]
+         dst = state.worker_rank
+         assert dst < num_ranks
+         shard_elems = state.rank_numels[rank]
+         g = p.grad
+         g = g.to_local().to(COMM_DTYPE).contiguous()
+         assert g.numel() == shard_elems
+         per_dst[dst].append(g.view(-1))
+         send_counts[dst] += shard_elems
+
+     assert any(
+         len(v) > 0 for v in
+         per_dst), "At least one destination rank must receive a sharded tensor"
+     per_dst_flat = [t for dst in per_dst for t in dst]
+     send_buf = torch.cat(per_dst_flat, dim=0)
+
+     # Build recv buffer
+     recv_counts = [0] * num_ranks
+     for src in range(num_ranks):
+         total = 0
+         for p in owned_params:
+             state = param_to_state[id(p)]
+             assert state.worker_rank == rank
+             total += state.rank_numels[src]
+         recv_counts[src] = total
+
+     recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
+
+     # Launch async all-to-all
+     logger.debug(f"send_buf size: {send_buf.numel()}, "
+                  f"recv_buf size: {recv_buf.numel()}, "
+                  f"recv_counts: {recv_counts}, "
+                  f"send_counts: {send_counts}, "
+                  f"process_group: {str(process_group)}")
+     work = dist.all_to_all_single(
+         recv_buf,
+         send_buf,
+         output_split_sizes=recv_counts,
+         input_split_sizes=send_counts,
+         group=process_group,
+         async_op=True,
+     )
+
+     return work, recv_buf, gathered_grads, recv_counts
+
+
+ def _complete_gather(
+     recv_buf: torch.Tensor,
+     recv_counts: list[int],
+     owned_params: list[DTensor],
+     gathered_grads: dict[int, torch.Tensor | None],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+ ) -> None:
+     """Reconstruct gathered grads from the recv buffer (in-place)."""
+     off = 0
+     for src in range(len(recv_counts)):
+         if recv_counts[src] == 0:
+             continue
+
+         block = recv_counts[src]
+         inner_off = 0
+         for p in owned_params:
+             state = param_to_state[id(p)]
+             assert state.worker_rank == rank
+
+             indices = state.rank_indices[src]
+
+             shard_view = gathered_grads[id(p)][indices]
+             n = shard_view.numel()
+             assert n > 0
+
+             sg = recv_buf.narrow(0, off + inner_off, n)
+             sg = sg.reshape(shard_view.shape)
+             gathered_grads[id(p)][indices] = sg
+
+             inner_off += n
+         assert inner_off == block
+         off += block
+
+
+ def _compute_ns(
+     owned_params: list[DTensor],
+     gathered_grads: dict[int, torch.Tensor | None],
+     ns_steps: int,
+ ) -> dict[int, torch.Tensor | None]:
+     """Run Newton-Schulz orthogonalization on owned parameters.
+
+     Returns:
+         computed_us: ``{id(p): orthogonalized_update}`` for owned params.
+     """
+     computed_us: dict[int, torch.Tensor | None] = {}
+     for p in owned_params:
+         u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
+         gathered_grads[id(p)] = None  # free gathered grad
+         computed_us[id(p)] = u
+     return computed_us
+
+
+ def _launch_scatter(
+     params: list[DTensor],
+     owned_params: list[DTensor],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+     num_ranks: int,
+     process_group: dist.ProcessGroup,
+     computed_us: dict[int, torch.Tensor | None],
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
+     """Allocate scatter buffers, build send/recv, and launch async all-to-all.
+
+     Returns:
+         work: Async operation handle.
+         recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
+         scattered_us: ``{id(p): empty_local_tensor}`` for all params.
+         recv_counts: Per-source-rank element counts.
+     """
+     # Allocate scattered-u buffers
+     scattered_us: dict[int, torch.Tensor] = {}
+     for p in params:
+         scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
+
+     # Build send buffer (from computed_us on owner ranks)
+     per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
+     send_counts = [0] * num_ranks
+
+     if owned_params:
+         for p in owned_params:
+             state = param_to_state[id(p)]
+
+             assert computed_us[id(p)] is not None
+             u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
+
+             total_sent = 0
+             for dst_rank in range(num_ranks):
+                 indices = state.rank_indices[dst_rank]
+                 su = u_full[indices].flatten()
+
+                 n = su.numel()
+                 assert n > 0
+
+                 per_dst[dst_rank].append(su)
+                 send_counts[dst_rank] += n
+                 total_sent += n
+
+             assert total_sent == u_full.numel()
+
+     lengths = [len(v) for v in per_dst]
+     if all(l > 0 for l in lengths):
+         assert all(
+             l == lengths[0] for l in lengths
+         ), "All destination ranks must have the same number of sharded tensors"
+         per_dst_flat = [t for dst in per_dst for t in dst]
+         send_buf = torch.cat(per_dst_flat, dim=0)
+     else:
+         send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
+
+     # Build recv buffer
+     recv_counts = [0] * num_ranks
+     for src in range(num_ranks):
+         total = 0
+         for p in params:
+             state = param_to_state[id(p)]
+             if state.worker_rank != src:
+                 continue
+             total += state.rank_numels[rank]
+         recv_counts[src] = total
+
+     recv_total = sum(recv_counts)
+     assert recv_total > 0
+     recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
+
+     # Launch async all-to-all
+     work = dist.all_to_all_single(
+         recv_buf,
+         send_buf,
+         output_split_sizes=recv_counts,
+         input_split_sizes=send_counts,
+         group=process_group,
+         async_op=True,
+     )
+
+     return work, recv_buf, scattered_us, recv_counts
+
+
+ def _complete_scatter(
+     recv_buf: torch.Tensor,
+     recv_counts: list[int],
+     params: list[DTensor],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+     scattered_us: dict[int, torch.Tensor],
+ ) -> None:
+     """Copy recv buffer into scattered_us (in-place)."""
+     off = 0
+     for src in range(len(recv_counts)):
+         block = recv_counts[src]
+         if block == 0:
+             continue
+
+         inner_off = 0
+         for p in params:
+             state = param_to_state[id(p)]
+             if state.worker_rank != src:
+                 continue
+             n = state.rank_numels[rank]
+             assert n > 0
+
+             flat_local = recv_buf.narrow(0, off + inner_off,
+                                          n).view_as(p.to_local())
+             scattered_us[id(p)].copy_(flat_local)
+
+             inner_off += n
+
+         assert inner_off == block
+         off += block
+
+
+ def _update_params(
+     params: list[DTensor],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+     scattered_us: dict[int, torch.Tensor],
+     lr: float,
+     weight_decay: float,
+ ) -> None:
+     """Apply weight decay, Muon update, and optional QK clipping."""
+     for p in params:
+         state = param_to_state[id(p)]
+         u_dtensor = DTensor.from_local(
+             scattered_us[id(p)],
+             placements=p.placements,
+             device_mesh=p.device_mesh,
+         )
+
+         adjusted_lr = adjust_lr_for_muon(lr, p.shape)
+         update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
+
+         # QK clipping - applied directly on the local tensor to
+         # avoid DTensor sharding-propagation issues with _StridedShard.
+         scales_full = compute_scales(
+             p,
+             state.qk_clip_state) if state.qk_clip_state is not None else None
+         if scales_full is not None:
+             ratio = p.shape[0] // scales_full.shape[0]
+             idx0 = state.rank_indices[rank][0]
+             if isinstance(idx0, slice):
+                 start = idx0.start or 0
+                 idx0 = torch.arange(start,
+                                     idx0.stop,
+                                     device=scales_full.device)
+             row_scales = scales_full[idx0 // ratio]
+             p._local_tensor.mul_(row_scales.view(-1, 1))
+
+
+ # ======================================================================
+ # Main generator - thin orchestrator that wires stages together.
+ # ======================================================================
+
+
+ @torch.no_grad()
+ def muon_chunk_pipeline(
+     params: list[DTensor],
+     param_to_state: dict[int, _muon_state],
+     rank: int,
+     ns_steps: int,
+     lr: float,
+     weight_decay: float,
+     none_grad: bool,
+ ) -> Generator[None, None, None]:
+     """Process one chunk of parameters through the full Muon pipeline.
+
+     Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
+
+     Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
+     that communication and computation overlap across chunks. Async
+     communication is launched via ``async_op=True`` and completed after
+     the yield with ``work.wait()``.
+
+     Overlap happens because :func:`run_pipeline` admits one new chunk
+     per iteration (staggered admission). While chunk *N* does NS
+     compute on the default CUDA stream, chunk *N+1*'s async all-to-all
+     runs concurrently on the NCCL stream - no separate ``comm_stream``
+     is required.
+
+     Yields exactly **2** times:
+
+     1. After launching the async all-to-all gather.
+     2. After launching the async all-to-all scatter.
+     """
+     process_group = param_to_state[id(params[0])].process_group
+     num_ranks = dist.get_world_size(group=process_group)
+     owned_params = [
+         p for p in params if param_to_state[id(p)].worker_rank == rank
+     ]
+
+     # Stages 1-2: launch async gather.
+     with record_function("muon::launch_gather"):
+         work, recv_buf, gathered_grads, recv_counts = _launch_gather(
+             params, owned_params, param_to_state, rank, num_ranks,
+             process_group)
+
+     if none_grad:
+         for p in params:
+             p.grad = None
+
+     yield  # --- YIELD 1: other chunks can launch their gather ---
+
+     with record_function("muon::wait_gather"):
+         work.wait()
+         _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
+                          param_to_state, rank)
+         del recv_buf
+
+     # Stage 3: Newton-Schulz orthogonalization.
+     with record_function("muon::newton_schulz"):
+         computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
+         gathered_grads.clear()
+
+     # Stages 4-5: launch async scatter.
+     with record_function("muon::launch_scatter"):
+         work, recv_buf, scattered_us, recv_counts = _launch_scatter(
+             params, owned_params, param_to_state, rank, num_ranks,
+             process_group, computed_us)
+         computed_us.clear()
+
+     yield  # --- YIELD 2: other chunks can launch their scatter ---
+
+     with record_function("muon::wait_scatter"):
+         work.wait()
+         _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
+                           scattered_us)
+         del recv_buf
+
+     # Stage 6: apply parameter updates.
+     with record_function("muon::update_params"):
+         _update_params(params, param_to_state, rank, scattered_us, lr,
+                        weight_decay)
+         scattered_us.clear()
build/torch210-cxx11-rocm70-x86_64-linux/qk_clip.py ADDED
@@ -0,0 +1,129 @@
+ import logging
+ import math
+ from dataclasses import dataclass
+
+ import torch
+ from torch.distributed.tensor import DTensor
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
+     """
+     Parse a parameter name to check if it is a query/key projection layer
+     ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+
+     Returns:
+         (kind, layer_idx) or (None, -1) if not matched.
+
+     Example:
+         'model.3.attn.wq.weight' -> ('wq', 3)
+         'model.5.attn.wk.weight' -> ('wk', 5)
+         'model.2.attn.q_proj.weight' -> ('q_proj', 2)
+         'model.7.attn.k_proj.weight' -> ('k_proj', 7)
+         'model.4.attn.v_proj.weight' -> (None, -1)
+     """
+     parts = name.split('.')
+     if len(parts) < 3:
+         return None, -1
+
+     kind = parts[-2]
+
+     layer_idx = -1
+     for part in reversed(parts):
+         if part.isdigit():
+             layer_idx = int(part)
+             break
+
+     if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+         return kind, layer_idx
+
+     return None, -1
+
+
+ @dataclass
+ class QKClipInfo:
+     """Per-parameter dynamic info computed from config + runtime logits."""
+     kind: str | None  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+     indices: list[int]  # which heads to consider for clipping
+     head_dim: int  # from config
+     threshold: float  # from config
+     logit: torch.Tensor | None
+
+
+ def get_qk_clip_info(clip_config, n, qk_logits):
+     """Extract QK clipping info for a named parameter.
+
+     Args:
+         clip_config: QK clipping configuration dict (or None).
+         n: Parameter name string.
+         qk_logits: Dict mapping layer indices to logit tensors (or None).
+
+     Returns:
+         QKClipInfo instance with clipping configuration for this parameter.
+     """
+     if clip_config is None:
+         return None
+
+     head_dim = clip_config.get('head_dim')
+     threshold = clip_config.get('threshold')
+     kind, layer_idx = parse_qk_layer(n)
+
+     logit, indices = None, []
+     if qk_logits is not None and kind is not None:
+         logit = qk_logits[layer_idx]
+         indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+         indices = clip_config.get(indices_key, []) or []
+
+     if isinstance(logit, DTensor):
+         # In TP settings, qk_logits may be a DTensor;
+         # we convert it to a full tensor here for simplicity.
+         logit = logit.full_tensor()
+
+     return QKClipInfo(
+         kind=kind,
+         indices=indices,
+         head_dim=head_dim,
+         threshold=threshold,
+         logit=logit,
+     )
+
+
+ def compute_scales(p, qk_clip_state):
+     """Compute per-head scaling factors for QK clipping.
+
+     Returns a scales tensor if any head exceeds the threshold, else None.
+     """
+     kind = qk_clip_state.kind
+     indices = qk_clip_state.indices
+     head_dim = qk_clip_state.head_dim
+     threshold = qk_clip_state.threshold
+     logit = qk_clip_state.logit
+
+     H_global = p.shape[0] // head_dim
+     scales_full = torch.ones(H_global, device=p.data.device)
+     scaling = 0
+
+     for logit_idx, head_idx in enumerate(indices):
+         v_ele = float(logit[logit_idx])
+         if v_ele > threshold:
+             new_scale = math.sqrt(threshold / v_ele)
+             if new_scale < scales_full[head_idx]:
+                 scales_full[head_idx] = new_scale
+                 logger.info(
+                     f"[{kind}] Head {head_idx} exceeded threshold "
+                     f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                 )
+                 scaling += 1
+
+     return scales_full if scaling > 0 else None
+
+
+ def qk_clip(p, scales, head_dim):
+     """Apply per-head scaling to a Q/K projection weight matrix."""
+     if isinstance(p, torch.nn.Parameter):
+         W = p.data.view(-1, head_dim, p.data.shape[1])
+         W.mul_(scales.view(-1, 1, 1))
+     else:
+         W = p.view(-1, head_dim, p.shape[1])
+         W.mul_(scales.view(-1, 1, 1))
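The clipping arithmetic is easy to sanity-check in isolation. A dependency-free sketch that reimplements the name parsing and the per-head scale formula in plain Python (a mirror of the functions above for illustration; the module itself needs torch):

```python
import math

def parse_qk_layer(name):
    # Mirrors qk_clip.parse_qk_layer: the second-to-last path component
    # is the projection kind; the layer index is the last numeric component.
    parts = name.split('.')
    if len(parts) < 3:
        return None, -1
    kind = parts[-2]
    layer_idx = -1
    for part in reversed(parts):
        if part.isdigit():
            layer_idx = int(part)
            break
    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
        return kind, layer_idx
    return None, -1

def head_scale(logit, threshold):
    # QK-clip scale: sqrt(threshold / logit) when a head's max QK logit
    # exceeds the threshold, else 1.0 (no change).
    return math.sqrt(threshold / logit) if logit > threshold else 1.0

print(parse_qk_layer('model.3.attn.wq.weight'))      # ('wq', 3)
print(parse_qk_layer('model.4.attn.v_proj.weight'))  # (None, -1)
print(round(head_scale(200.0, 100.0), 4))            # 0.7071
```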