Kernels
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/actionlint.yaml +0 -3
  2. .github/workflows/build-and-commit.yml +0 -120
  3. .github/workflows/pre-commit.yml +0 -30
  4. .github/workflows/push-to-hf.yml +0 -40
  5. .gitignore +0 -21
  6. .pre-commit-config.yaml +0 -33
  7. CLAUDE.md +0 -108
  8. README.md +4 -75
  9. _typos.toml +0 -3
  10. build.toml +14 -24
  11. build/torch210-cxx11-cu126-x86_64-linux/adamw.py +0 -271
  12. build/torch210-cxx11-cu126-x86_64-linux/async_utils.py +0 -77
  13. build/torch210-cxx11-cu126-x86_64-linux/core.py +0 -219
  14. build/torch210-cxx11-cu126-x86_64-linux/cpu_offload.py +0 -206
  15. build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py +0 -232
  16. build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py +0 -122
  17. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +0 -3
  18. build/torch210-cxx11-cu126-x86_64-linux/muon.py +0 -1068
  19. build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py +0 -240
  20. build/torch210-cxx11-cu126-x86_64-linux/optimizer/__init__.py +0 -26
  21. build/torch210-cxx11-cu126-x86_64-linux/pipeline.py +0 -468
  22. build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py +0 -198
  23. build/torch210-cxx11-cu128-x86_64-linux/adamw.py +0 -271
  24. build/torch210-cxx11-cu128-x86_64-linux/async_utils.py +0 -77
  25. build/torch210-cxx11-cu128-x86_64-linux/core.py +0 -219
  26. build/torch210-cxx11-cu128-x86_64-linux/cpu_offload.py +0 -206
  27. build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py +0 -232
  28. build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py +0 -122
  29. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +0 -3
  30. build/torch210-cxx11-cu128-x86_64-linux/muon.py +0 -1068
  31. build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py +0 -240
  32. build/torch210-cxx11-cu128-x86_64-linux/optimizer/__init__.py +0 -26
  33. build/torch210-cxx11-cu128-x86_64-linux/pipeline.py +0 -468
  34. build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py +0 -198
  35. build/torch210-cxx11-cu130-x86_64-linux/adamw.py +0 -271
  36. build/torch210-cxx11-cu130-x86_64-linux/async_utils.py +0 -77
  37. build/torch210-cxx11-cu130-x86_64-linux/core.py +0 -219
  38. build/torch210-cxx11-cu130-x86_64-linux/cpu_offload.py +0 -206
  39. build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py +0 -232
  40. build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py +0 -122
  41. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +0 -3
  42. build/torch210-cxx11-cu130-x86_64-linux/muon.py +0 -1068
  43. build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py +0 -240
  44. build/torch210-cxx11-cu130-x86_64-linux/optimizer/__init__.py +0 -26
  45. build/torch210-cxx11-cu130-x86_64-linux/pipeline.py +0 -468
  46. build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py +0 -198
  47. build/torch210-cxx11-rocm70-x86_64-linux/adamw.py +0 -271
  48. build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py +0 -77
  49. build/torch210-cxx11-rocm70-x86_64-linux/core.py +0 -219
  50. build/torch210-cxx11-rocm70-x86_64-linux/cpu_offload.py +0 -206
.github/actionlint.yaml DELETED
@@ -1,3 +0,0 @@
-self-hosted-runner:
-  labels:
-    - docker-builder-01

.github/workflows/build-and-commit.yml DELETED
@@ -1,120 +0,0 @@
-name: Nix build and commit
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-  workflow_dispatch:
-
-permissions:
-  contents: write
-
-jobs:
-  check-commit:
-    runs-on: ubuntu-latest
-    outputs:
-      skip: ${{ steps.check.outputs.skip }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - id: check
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            msg=$(git log -1 --pretty=%B "${{ github.event.pull_request.head.sha }}")
-          else
-            msg="manual dispatch"
-          fi
-          echo "Commit message: $msg"
-          if echo "$msg" | grep -q '\[skip-build\]'; then
-            echo "skip=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "skip=false" >> "$GITHUB_OUTPUT"
-          fi
-
-  build_and_commit:
-    needs: check-commit
-    if: needs.check-commit.outputs.skip == 'false'
-    runs-on: docker-builder-01
-    steps:
-      - name: Show disk usage
-        run: df -h
-
-      - name: Notify build start on Slack
-        id: slack_start
-        run: |
-          msg="*Build started* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          response=$(curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\"}" \
-            https://slack.com/api/chat.postMessage)
-          ts=$(echo "$response" | jq -r '.ts')
-          echo "thread_ts=$ts" >> "$GITHUB_OUTPUT"
-          echo "$response"
-
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-          ref: ${{ github.head_ref || github.ref }}
-
-      - name: Install Nix
-        uses: cachix/install-nix-action@v31
-
-      - name: Setup huggingface cachix
-        uses: cachix/cachix-action@v15
-        with:
-          name: huggingface
-
-      - name: Clean build directory
-        run: |
-          rm -rf build
-
-      - name: Build with Nix
-        run: |
-          nix run .#build-and-copy \
-            --override-input kernel-builder github:huggingface/kernel-builder \
-            --max-jobs 8 \
-            -j 8 \
-            -L
-
-      - name: List built binaries
-        run: |
-          ls build
-
-      - name: Commit build artifact
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add build/*
-          git commit -m "Add built binary [skip-build]"
-
-      - name: Push changes
-        run: |
-          git push origin HEAD:"$HEAD_REF"
-        env:
-          HEAD_REF: ${{ github.head_ref || github.ref }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Notify success on Slack (thread)
-        if: success()
-        run: |
-          ts="${{ steps.slack_start.outputs.thread_ts }}"
-          msg="*Build succeeded* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\",\"thread_ts\":\"$ts\"}" \
-            https://slack.com/api/chat.postMessage
-
-      - name: Notify failure on Slack (thread)
-        if: failure()
-        run: |
-          ts="${{ steps.slack_start.outputs.thread_ts }}"
-          msg="*Build failed* for \`${{ github.repository }}\`\nBranch: \`${{ github.ref_name }}\`\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Workflow>"
-          curl -s -X POST \
-            -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \
-            -H "Content-type: application/json; charset=utf-8" \
-            --data "{\"channel\":\"${{ secrets.SLACK_CHANNEL_ID }}\",\"text\":\"$msg\",\"thread_ts\":\"$ts\"}" \
-            https://slack.com/api/chat.postMessage

.github/workflows/pre-commit.yml DELETED
@@ -1,30 +0,0 @@
-name: pre-commit
-
-on:
-  pull_request:
-  push:
-    branches: [ main, master ]
-
-jobs:
-  run-pre-commit:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: read
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Cache pre-commit
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pre-commit
-          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
-          restore-keys: |
-            pre-commit-${{ runner.os }}-
-
-      - name: Run pre-commit
-        uses: pre-commit/action@v3.0.1

.github/workflows/push-to-hf.yml DELETED
@@ -1,40 +0,0 @@
-name: Push to HF Repo
-
-on:
-  push:
-    branches:
-      - main
-  workflow_dispatch:
-
-jobs:
-  push_to_hf:
-    runs-on: ubuntu-latest
-    steps:
-      # 1. Checkout the repo
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - name: Install Git LFS
-        run: |
-          git lfs install
-          git lfs fetch --all
-          git lfs pull
-      # 2. Set up Git
-      - name: Configure Git
-        run: |
-          git config user.name "MotifTech"
-          git config user.email "huggingface@motiftech.io"
-
-      # 3. Add HF remote
-      - name: Add Hugging Face remote
-        run: |
-          git remote add hf https://huggingface.co/Motif-Technologies/optimizer
-          git fetch hf || true
-
-      # 4. Push to HF repo
-      - name: Push to Hugging Face
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          git push "https://hf_token:${HF_TOKEN}@huggingface.co/Motif-Technologies/optimizer" HEAD:main

.gitignore DELETED
@@ -1,21 +0,0 @@
-__pycache__
-.idea
-.DS_Store
-*.egg-info
-outputs
-dist/*
-.vscode
-
-# data
-data
-out
-wandb
-
-torchtitan/datasets/**/*.model
-torchtitan/experiments/flux/assets/*
-
-# temp files
-*.log
-error.json
-_remote_module_non_scriptable.py
-.git_disabled/

.pre-commit-config.yaml DELETED
@@ -1,33 +0,0 @@
-default_install_hook_types:
-  - pre-commit
-  - commit-msg
-default_stages:
-  - pre-commit  # Run locally
-  - manual  # Run in CI
-exclude: '(build|result)/.*|__pycache__/.*|.*\.(png|html)$'
-repos:
-  - repo: https://github.com/google/yapf
-    rev: v0.43.0
-    hooks:
-      - id: yapf
-        args: [--in-place, --verbose]
-  - repo: https://github.com/crate-ci/typos
-    rev: v1.34.0
-    hooks:
-      - id: typos
-        exclude: '.gitattributes'
-  - repo: https://github.com/PyCQA/isort
-    rev: 6.0.1
-    hooks:
-      - id: isort
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v20.1.3
-    hooks:
-      - id: clang-format
-        types_or: [c++, cuda]
-        args: [--style=file, --verbose]
-  - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
-    hooks:
-      - id: pymarkdown
-        args: [fix]

CLAUDE.md DELETED
@@ -1,108 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project Overview
-
-Optimizer is a PyTorch package implementing the **Muon optimizer** with support for N-D sharding parallelism for large-scale distributed training. Based on the paper at https://arxiv.org/abs/2511.07464. It supports general N-D sharding configurations (FSDP2 through hybrid setups like 2 TP + 2 DP-Replicate + 2 DP-Shard).
-
-## Commands
-
-### Lint & Format
-
-```bash
-pre-commit run --all-files        # Run all pre-commit hooks
-pre-commit run isort --all-files  # Run a specific hook (e.g., isort)
-```
-
-Hooks: yapf (Python formatter), isort (import sorter), typos (spell checker), clang-format (C++/CUDA), pymarkdown (Markdown linter), actionlint (GitHub Actions).
-
-### Tests
-
-Tests require **8 GPUs**, access to `Motif-Technologies/Motif-2.6B-4layer-random` on HuggingFace (`HF_TOKEN` env var), and PyTorch >= 2.8.0.
-
-```bash
-cd test && ./run_test.sh
-# Equivalent to:
-cd test && torchrun --nproc-per-node=8 --local-ranks-filter=0 -m pytest test_muon.py
-```
-
-Useful pytest flags: `--measure-perf` (timing/memory), `--do-profile` (profiling, requires `--measure-perf`), `--skip-verify` (skip correctness check against sequential implementation).
-
-### Build
-
-Uses kernel-builder infrastructure (`build.toml`, `flake.nix`). Pre-built binaries for various PyTorch/CUDA/ROCm combinations are stored in `build/`.
-
-### Commit Convention
-
-**Always append `[skip-build]` to every commit message.** This prevents CI from triggering unnecessary build jobs on development branches.
-
-## Architecture
-
-### Source Layout
-
-```
-torch-ext/optimizer/
-├── __init__.py                 # Public API: exports Muon
-├── muon.py                     # Muon optimizer class (~430 lines)
-├── newton_schulz.py            # Newton-Schulz iteration (~50 lines)
-├── qk_clip.py                  # QK clipping for attention heads (~130 lines)
-├── core.py                     # Shared state, helpers, param grouping (~110 lines)
-├── pipeline.py                 # Async generator pipeline for parallel mode (~290 lines)
-├── async_utils.py              # AsyncTask / AsyncRuntime scheduling (~75 lines)
-├── adamw.py                    # Fused AdamW for non-Muon parameters (~160 lines)
-├── matmul_transpose_triton.py  # Triton kernel for X @ X.T (~130 lines)
-└── distributed/
-    └── utils.py                # Shard mesh construction, DTensor slicing (~175 lines)
-```
-
-### Optimizer Modes
-
-The `Muon` optimizer has three execution paths selected per-parameter based on its tensor type and mesh structure:
-
-1. **Base mode** (`base()`) — Single-device / non-sharded tensors. Standard Muon with Newton-Schulz orthogonalization.
-2. **Distributed mode** (`distributed_muon()`) — Gathers full tensors via all-gather, computes updates, redistributes. Used for small parameters or fallback.
-3. **Parallel mode** (`parallel()`) — Pipelined all2all communication overlapped with compute. Uses an async generator pipeline scheduled by `run_pipeline()`. This is the main advanced feature.
-
-### Parallel Mode Pipeline
-
-The parallel pipeline is implemented as a single generator function `muon_chunk_pipeline()` in `pipeline.py`. Parameters are split into chunks, and each chunk flows through:
-
-```
-build bufs + async all2all_gather → yield → wait + Newton-Schulz compute + async all2all_scatter → yield → wait + update_param
-```
-
-The generator yields 2 times (after launching async gather and async scatter via `async_op=True`), allowing `run_pipeline()` to interleave multiple chunks for communication overlap. `work.wait()` completes each async operation after the yield.
-
-`warmup_step` maps to `max_concurrent_tasks = warmup_step + 1` in `run_pipeline()`.
-
-For detailed implementation documentation (pipeline internals, distributed utilities, QK clipping with strided sharding, etc.), see [`docs/implementation.md`](docs/implementation.md).
-
-### Key Abstractions
-
-- **`get_default_muon_param_groups(model, is_muon_func)`** (`core.py`) — Separates parameters into Muon-optimizable (2D+) and AdamW groups. Skips embeddings and output layers by default.
-- **`_muon_state` dataclass** (`core.py`) — Per-parameter config: rank ownership (`worker_rank`), process group, precomputed shard indices (`rank_indices`, `rank_numels`), and optional QK clip state. Config-only; no transient pipeline state.
-- **`muon_chunk_pipeline()` generator** (`pipeline.py`) — Processes one chunk through the full gather→compute→scatter→update pipeline. Uses `async_op=True` for non-blocking all-to-all and yields to allow chunk interleaving. All intermediate buffers are generator-local variables.
-- **`run_pipeline()`** (`async_utils.py`) — Generator-based pipeline scheduling with bounded concurrency. Interleaves multiple chunk pipelines at yield points.
-- **`construct_shard_mesh()` / `get_slices_of_dtensor()`** (`distributed/utils.py`) — Utilities for building shard meshes from DTensor placements and computing per-rank local slices. Handles both `Shard` and `_StridedShard` (PyTorch 2.10+).
-- **Newton-Schulz iteration** (`newton_schulz.py`) — `_zeropower_via_newtonschulz5()`: 5 quintic iterations in bfloat16 with pre-optimized coefficients for gradient orthogonalization. Uses Triton kernel `matmul_transpose_assign` for efficient X @ X.T.
-- **QK Clipping** (`qk_clip.py`) — Optional dynamic clipping of attention head projections when QK logits exceed a threshold. Configured via `q_indices`, `k_indices`, `head_dim`, `threshold`.
-- **Fused AdamW** (`adamw.py`) — Uses PyTorch's `torch._fused_adamw_` for non-Muon parameters, grouping tensors by device/dtype and DTensor placement.
-
-### Dependency Graph
-
-```
-matmul_transpose_triton.py (leaf)
-        │
-newton_schulz.py (leaf + triton)
-        │
-core.py ──── qk_clip.py (leaf, distributed/utils)
-  │      │          │
-  │   pipeline.py ─── async_utils.py
-  │      │
-  │   adamw.py
-  │      │
-muon.py (all above)
-  │
-__init__.py
-```

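CLAUDE.md's Key Abstractions list names `_zeropower_via_newtonschulz5()`, but `newton_schulz.py` itself falls outside this 50-file view. For orientation, here is a minimal sketch of the commonly published quintic Newton-Schulz iteration from the Muon literature; the deleted kernel fuses the `X @ X.T` product through the Triton `matmul_transpose_assign` kernel, and its exact pre-optimized coefficients may differ from the widely circulated ones used below.

```python
import torch


def zeropower_via_newtonschulz5(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
    # Coefficients of the widely published quintic Muon iteration; the
    # repository's own coefficients are assumed, not verified here.
    a, b, c = 3.4445, -4.7750, 2.0315
    X = G.bfloat16()
    if G.size(0) > G.size(1):
        X = X.mT  # iterate on the wide orientation
    X = X / (X.norm() + 1e-7)  # Frobenius normalization bounds the spectral norm
    for _ in range(steps):
        A = X @ X.mT  # the X @ X.T product the Triton kernel accelerates
        B = b * A + c * A @ A
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.mT
    return X
```
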
README.md CHANGED
@@ -1,7 +1,6 @@
 ---
 tags:
-- kernels
-license: apache-2.0
+- kernel
 ---

 # Optimizer
@@ -10,14 +9,8 @@ Optimizer is a python package that provides:
 - PyTorch implementation of recent optimizer algorithms
 - with support for parallelism techniques for efficient large-scale training.

-## Currently implemented
-- Parallel Muon with N-D sharding
-  - [arxiv URL](https://arxiv.org/abs/2511.07464)
-- Supports **general N-D sharding configurations**
-  - The implementation is not tied to any specific parallel strategy.
-  - Verified from basic FSDP2 setups up to hybrid configurations such as
-    **(2 TP + 2 DP-Replicate + 2 DP-Shard)**.
-- Verified configurations can be found in [test_muon.py](./test/test_muon.py)
+### Currently implemented
+- [Parallel Muon with FSDP2](./docs/muon/parallel_muon.pdf)

 ## Usage

@@ -27,78 +20,14 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from kernels import get_kernel

 optimizer = get_kernel("motif-technologies/optimizer")
-get_default_muon_param_groups = optimizer.muon.get_default_muon_param_groups

 model = None  # your model here
 fsdp_model = FSDP(model)

-# muon, in nature, cannot use 1-d tensor
-# we provide helper function to group such tensors
-# you can use your own function, if necessary
-params = get_default_muon_param_groups(model)  # user can write own is_muon_func, if necessary
-
 optim = optimizer.Muon(
-    params,
+    fsdp_model.parameters(),
     lr=0.01,
     momentum=0.9,
     weight_decay=1e-4,
 )
 ```
-
-## Documentation
-
-- [Implementation Guide](./docs/implementation.md) — Detailed walkthrough of the internal architecture, parallel pipeline, distributed utilities, and QK clipping. Recommended for code reviewers and new contributors.
-- [PyTorch 2.10 TP Fix](./docs/pytorch-2.10-tp-fix.md) — Root cause analysis and fixes for `_StridedShard` compatibility with PyTorch 2.10+.
-
-## Test
-
-- Check [test/README.md](./test/README.md) for how to run the tests.
-
-## Pre-commit Hooks
-
-This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.
-
-### Setup
-
-1. Install pre-commit:
-
-   ```bash
-   pip install pre-commit
-   ```
-
-2. Install the git hooks:
-
-   ```bash
-   pre-commit install
-   ```
-
-Once installed, the configured hooks will run automatically on each commit.
-
-### Included Hooks
-
-The following tools are run via pre-commit:
-
-- **[yapf](https://github.com/google/yapf)** – Python code formatter
-- **[typos](https://github.com/crate-ci/typos)** – Spell checker for common typos
-- **[isort](https://github.com/PyCQA/isort)** – Organizes and sorts Python imports
-- **[clang-format](https://clang.llvm.org/docs/ClangFormat.html)** – Formats C++/CUDA code (`--style=file`)
-- **[pymarkdown](https://github.com/jackdewinter/pymarkdown)** – Lints and auto-fixes Markdown files
-- **[actionlint](https://github.com/rhysd/actionlint)** – Validates GitHub Actions workflows
-
-### Usage
-
-- Run all checks on the entire codebase:
-
-  ```bash
-  pre-commit run --all-files
-  ```
-
-- Run a specific hook (example: isort):
-
-  ```bash
-  pre-commit run isort --all-files
-  ```
-
-### Test
-
-- There is a [simple unittest for Parallel Muon](./test/test_muon/README.md)

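One behavioral note on the simplified Usage block: the removed text explained that Muon by nature cannot handle 1-D tensors and routed them to AdamW through a helper, while the new example passes `fsdp_model.parameters()` directly. For reference, the pre-change pattern (names exactly as in the deleted README, not re-verified against the current export) looked like this:

```python
from kernels import get_kernel

optimizer = get_kernel("motif-technologies/optimizer")
get_default_muon_param_groups = optimizer.muon.get_default_muon_param_groups

model = None  # your model here

# Route 1-D tensors (biases, norms) to the AdamW group; 2-D+ go to Muon.
params = get_default_muon_param_groups(model)  # or pass your own is_muon_func
optim = optimizer.Muon(params, lr=0.01, momentum=0.9, weight_decay=1e-4)
```
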
_typos.toml DELETED
@@ -1,3 +0,0 @@
-[default.extend-words]
-# Math notation used in docs/muon-clip.md (O subscript t, update step output)
-Ot = "Ot"

build.toml CHANGED
@@ -1,33 +1,23 @@
 [general]
 name = "optimizer"
-backends = [
-    "cuda",
-    "rocm",
-]
+universal = false

 [torch]
 src = [
     "torch-ext/torch_binding.cpp",
     "torch-ext/torch_binding.h",
 ]

-[kernel.optimizer]
-backend = "cuda"
-depends = ["torch"]
-src = ["optimizer/dummy.cu"]
-
-[kernel.optimizer_rocm]
+[kernel.activation]
 backend = "rocm"
-rocm-archs = [
-    "gfx906",
-    "gfx908",
-    "gfx90a",
-    "gfx940",
-    "gfx941",
-    "gfx942",
-    "gfx1030",
-    "gfx1100",
-    "gfx1101",
-]
-depends = ["torch"]
-src = ["optimizer/dummy.cu"]
+src = [
+    "optimizer/dummy.cu",
+]
+depends = [ "torch" ]
+
+[kernel.activation_cuda]
+backend = "cuda"
+src = [
+    "optimizer/dummy.cu",
+]
+depends = [ "torch" ]

build/torch210-cxx11-cu126-x86_64-linux/adamw.py DELETED
@@ -1,271 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import cast
-
-import torch
-from torch.distributed.tensor import DTensor
-from torch.profiler import record_function
-
-logger = logging.getLogger(__name__)
-
-
-def fused_adamw(
-    params: list[torch.Tensor],
-    grads: list[torch.Tensor],
-    exp_avgs: list[torch.Tensor],
-    exp_avg_sqs: list[torch.Tensor],
-    max_exp_avg_sqs: list[torch.Tensor],
-    state_steps: list[torch.Tensor],
-    amsgrad: bool,
-    beta1: float,
-    beta2: float,
-    lr: float | torch.Tensor,
-    weight_decay: float,
-    eps: float,
-    maximize: bool,
-) -> None:
-    if not params:
-        return
-
-    # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-    # treating it as a scalar.
-    lr_dict: dict | None = ({
-        lr.device: lr
-    } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-    grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-         state_steps]  # type: ignore[list-item]
-    )
-    for (device, _), (
-        (
-            device_params_,
-            device_grads_,
-            device_exp_avgs_,
-            device_exp_avg_sqs_,
-            device_max_exp_avg_sqs,
-            device_state_steps_,
-        ),
-        _,
-    ) in grouped_tensors.items():
-        device_params = cast(list[torch.Tensor], device_params_)
-        device_grads = cast(list[torch.Tensor], device_grads_)
-        device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-
-        if lr_dict is not None and device not in lr_dict:
-            lr_dict[device] = lr.to(
-                device=device, non_blocking=True)  # type: ignore[union-attr]
-            lr = lr_dict[device]
-        torch._foreach_add_(device_state_steps, 1)
-        func = torch._fused_adamw_
-        func(
-            device_params,
-            device_grads,
-            device_exp_avgs,
-            device_exp_avg_sqs,
-            device_max_exp_avg_sqs,  # type: ignore[arg-type]
-            device_state_steps,
-            amsgrad=amsgrad,
-            lr=lr,  # type: ignore[arg-type]
-            beta1=beta1,
-            beta2=beta2,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=maximize,
-        )
-
-
-def _to_local(t):
-    """Unwrap DTensor to local tensor for fused ops."""
-    return t._local_tensor if isinstance(t, DTensor) else t
-
-
-# ---------------------------------------------------------------------------
-# Caches for eliminating per-step Python overhead.
-#
-# Placement grouping and tensor list assembly are identical every step
-# (params don't change placement, moment/step tensors are the same objects
-# after initialisation). We cache them keyed by id() of the param list
-# stored in param_groups (stable across steps).
-#
-# Only gradients change each step and must be collected fresh.
-# ---------------------------------------------------------------------------
-
-# id(group["params"]) → dict[placement_key, list[param]]
-_placement_cache: dict[int, dict[tuple, list]] = {}
-
-# id(placement_group_list) → (params_local, moment1, moment2, state_steps)
-_tensor_cache: dict[int, tuple[list, list, list, list]] = {}
-
-
-def _step_adamw_params_slow(optimizer_state, params, group):
-    """Uncached fallback for the rare case where some params lack grads."""
-    params_with_grads = []
-    grads = []
-    moment1 = []
-    moment2 = []
-    state_steps = []
-
-    for p in params:
-        g = p.grad
-        if g is None:
-            continue
-        state = optimizer_state[p]
-        params_with_grads.append(_to_local(p))
-        grads.append(_to_local(g))
-        if "step" not in state:
-            state["step"] = torch.zeros((),
-                                        dtype=torch.float32,
-                                        device=p.device)
-            state["moment1"] = torch.zeros_like(g)
-            state["moment2"] = torch.zeros_like(g)
-        moment1.append(_to_local(state["moment1"]))
-        moment2.append(_to_local(state["moment2"]))
-        if not isinstance(state["step"], torch.Tensor):
-            state["step"] = torch.tensor(state["step"],
-                                         dtype=torch.float32,
-                                         device=p.device)
-        state_steps.append(state["step"])
-
-    if not params_with_grads:
-        return
-
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-
-    fused_adamw(
-        params_with_grads,
-        grads,
-        moment1,
-        moment2,
-        [],
-        state_steps,
-        amsgrad=False,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        eps=eps,
-        maximize=False,
-    )
-
-
-def step_adamw_params(optimizer_state, params, group):
-    """Run fused AdamW on a list of parameters sharing the same placement.
-
-    After the first call, cached tensor lists (params_local, moment1,
-    moment2, state_steps) are reused — only gradients are collected fresh.
-
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        params: List of parameters to update.
-        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-    """
-    # Collect grads — the only thing that changes each step.
-    with record_function("adamw::collect_grads"):
-        grads = []
-        for p in params:
-            g = p.grad
-            if g is None:
-                # Rare: fall back to slow path that filters per-param.
-                _step_adamw_params_slow(optimizer_state, params, group)
-                return
-            grads.append(_to_local(g))
-
-    tensor_key = id(params)
-    if tensor_key not in _tensor_cache:
-        with record_function("adamw::init_tensor_cache"):
-            params_local = []
-            moment1 = []
-            moment2 = []
-            state_steps = []
-
-            for p in params:
-                state = optimizer_state[p]
-                params_local.append(_to_local(p))
-                if "step" not in state:
-                    state["step"] = torch.zeros((),
-                                                dtype=torch.float32,
-                                                device=p.device)
-                    state["moment1"] = torch.zeros_like(p.grad)
-                    state["moment2"] = torch.zeros_like(p.grad)
-                moment1.append(_to_local(state["moment1"]))
-                moment2.append(_to_local(state["moment2"]))
-                if not isinstance(state["step"], torch.Tensor):
-                    state["step"] = torch.tensor(state["step"],
-                                                 dtype=torch.float32,
-                                                 device=p.device)
-                state_steps.append(state["step"])
-
-            _tensor_cache[tensor_key] = (params_local, moment1, moment2,
-                                         state_steps)
-
-    params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
-
-    lr = group["lr"]
-    beta1, beta2 = group["adamw_betas"]
-    eps = group["adamw_eps"]
-    weight_decay = group["weight_decay"]
-
-    with record_function("adamw::fused_adamw"):
-        fused_adamw(
-            params_local,
-            grads,
-            moment1,
-            moment2,
-            [],
-            state_steps,
-            amsgrad=False,
-            beta1=beta1,
-            beta2=beta2,
-            lr=lr,
-            weight_decay=weight_decay,
-            eps=eps,
-            maximize=False,
-        )
-
-
-def step_adamw(optimizer_state, group):
-    """Dispatch AdamW step, grouping parameters by type and placement.
-
-    Placement grouping is cached after the first call since params never
-    change their placement between steps.
-
-    Args:
-        optimizer_state: The optimizer's state dict (self.state in Muon).
-        group: Parameter group dict.
-    """
-    params = group["params"]
-    placement_key = id(params)
-
-    if placement_key not in _placement_cache:
-        with record_function("adamw::group_by_placement"):
-            placement_to_params: dict[tuple,
-                                      list[torch.Tensor]] = defaultdict(list)
-            for p in params:
-                match p:
-                    case DTensor():
-                        logger.debug(
-                            "[AdamW] DTensor param: shape=%s, placements=%s, "
-                            "mesh=%s, grad=%s", p.shape, p.placements,
-                            p.device_mesh.mesh_dim_names,
-                            p.grad.shape if p.grad is not None else None)
-                        placement_to_params[tuple(
-                            [p.placements, p.device_mesh])].append(p)
-                    case torch.Tensor():
-                        logger.debug(
-                            "[AdamW] plain param: shape=%s, grad=%s", p.shape,
-                            p.grad.shape if p.grad is not None else None)
-                        placement_to_params[tuple([torch.Tensor,
-                                                   None])].append(p)
-
-            logger.debug("[AdamW] %d placement groups, %d total params",
-                         len(placement_to_params), len(params))
-
-        _placement_cache[placement_key] = dict(placement_to_params)
-
-    for group_params in _placement_cache[placement_key].values():
-        step_adamw_params(optimizer_state, group_params, group)

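A hedged usage sketch for the deleted `step_adamw` above: a plain dict stands in for Muon's `self.state`, and the group dict carries exactly the keys the function reads (`params`, `lr`, `adamw_betas`, `adamw_eps`, `weight_decay`). A CUDA device is required, since `torch._fused_adamw_` is a fused GPU op; the hyperparameter values are illustrative.

```python
import torch

layer = torch.nn.Linear(16, 16, device="cuda")
layer(torch.randn(4, 16, device="cuda")).sum().backward()

optimizer_state = {p: {} for p in layer.parameters()}  # per-param state dicts
group = {
    "params": list(layer.parameters()),
    "lr": 1e-3,
    "adamw_betas": (0.9, 0.95),  # illustrative values
    "adamw_eps": 1e-8,
    "weight_decay": 0.1,
}
step_adamw(optimizer_state, group)  # groups by placement, runs fused AdamW
```
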
build/torch210-cxx11-cu126-x86_64-linux/async_utils.py DELETED
@@ -1,77 +0,0 @@
-import logging
-from typing import Generator
-
-logger = logging.getLogger(__name__)
-
-
-class _Task:
-    """Internal: wraps a generator, advances one yield at a time."""
-
-    def __init__(self, generator: Generator[None, None, None], index: int):
-        self._generator = generator
-        self._index = index
-        self._steps_completed = 0
-        self.step()  # run to first yield
-
-    def step(self) -> bool:
-        try:
-            next(self._generator)
-            self._steps_completed += 1
-            logger.debug("pipeline[%d] completed stage %d", self._index,
-                         self._steps_completed)
-            return True
-        except StopIteration:
-            logger.debug("pipeline[%d] finished after %d stages", self._index,
-                         self._steps_completed)
-            return False
-
-    def close(self):
-        self._generator.close()
-
-
-def run_pipeline(
-    pipelines: Generator[Generator[None, None, None], None, None],
-    max_concurrent: int,
-) -> None:
-    """Run generator-based pipelines with bounded concurrency.
-
-    Each pipeline is a generator that yields at stage boundaries.
-    The runtime interleaves pipelines so communication and computation
-    overlap across chunks.
-    """
-    if max_concurrent <= 0:
-        raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-
-    have_new = True
-    task_index = 0
-    previous_tasks: list[_Task] = []
-
-    try:
-        while have_new or previous_tasks:
-            running_tasks: list[_Task] = []
-
-            # Admit one new pipeline per iteration (staggered admission).
-            # Admitting one at a time ensures that while chunk N does NS
-            # compute on the default stream, chunk N+1's NCCL all-to-all
-            # runs concurrently on the NCCL stream — creating real
-            # communication/computation overlap on the GPU.
-            if have_new and len(previous_tasks) < max_concurrent:
-                try:
-                    gen = next(pipelines)
-                    task = _Task(gen, task_index)
-                    task_index += 1
-                    running_tasks.append(task)
-                except StopIteration:
-                    have_new = False
-
-            # Advance every previously-yielded task by one step.
-            for task in previous_tasks:
-                if task.step():
-                    running_tasks.append(task)
-
-            previous_tasks = running_tasks
-    except BaseException:
-        # Clean up all in-flight generators to release GPU resources.
-        for task in previous_tasks:
-            task.close()
-        raise

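A CPU-only toy showing how `run_pipeline` above interleaves chunk generators: each stand-in chunk yields twice, exactly where the real `muon_chunk_pipeline()` yields after launching its async gather and scatter.

```python
def toy_chunk(i: int):
    print(f"chunk {i}: stage 1 (launch async gather)")
    yield
    print(f"chunk {i}: stage 2 (wait, compute, launch async scatter)")
    yield
    print(f"chunk {i}: stage 3 (wait, update param)")


# With max_concurrent=2, chunk 1's stage 1 runs while chunk 0 is mid-pipeline,
# which is what creates the communication/compute overlap on real GPUs.
run_pipeline((toy_chunk(i) for i in range(4)), max_concurrent=2)
```
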
build/torch210-cxx11-cu126-x86_64-linux/core.py DELETED
@@ -1,219 +0,0 @@
-import logging
-import math
-from dataclasses import dataclass
-from typing import List
-
-import torch
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import DTensor
-
-# torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
-# parameter FQNs. Activation checkpointing similarly inserts
-# "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
-# expert_keys, QK layer parsing) works regardless of wrapper nesting.
-_WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
-
-logger = logging.getLogger(__name__)
-
-
-def normalize_fqn(name: str) -> str:
-    """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
-    return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
-
-
-@dataclass
-class _muon_state:
-    worker_rank: int
-    process_group: ProcessGroup
-    rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
-    rank_numels: dict[int, int]  # local_rank -> numel
-    name: str
-    qk_clip_state: torch.Tensor | None = None
-
-
-def _batch_momentum(
-    grads: List[torch.Tensor],
-    momentum_bufs: List[torch.Tensor],
-    momentum: torch.Tensor,
-) -> None:
-    """Batched momentum update (no nesterov)."""
-    torch._foreach_mul_(momentum_bufs, momentum)
-    torch._foreach_add_(momentum_bufs, grads)
-
-
-def _batch_momentum_nesterov(
-    grads: List[torch.Tensor],
-    momentum_bufs: List[torch.Tensor],
-    momentum: torch.Tensor,
-) -> None:
-    """Batched momentum update with nesterov correction."""
-    torch._foreach_mul_(momentum_bufs, momentum)
-    torch._foreach_add_(momentum_bufs, grads)
-    nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
-    torch._foreach_add_(grads, nesterov_terms)
-
-
-_compiled_momentum: dict[bool, callable] = {}
-_use_momentum_compile = True
-
-
-def set_momentum_compile(enabled: bool):
-    """Toggle torch.compile for batched momentum."""
-    global _use_momentum_compile
-    _use_momentum_compile = enabled
-
-
-def batch_pre_ortho(
-    grads: List[torch.Tensor],
-    momentum_bufs: List[torch.Tensor],
-    momentum: torch.Tensor,
-    nesterov: bool,
-) -> None:
-    """Batched momentum update on lists of plain tensors.
-
-    Mirrors dion's ``muon_update_pre_orthogonalize``.
-    Inputs must be plain CUDA tensors (not DTensor).
-    Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
-
-    When compile is enabled, uses separately compiled functions for
-    nesterov=True/False to avoid graph breaks from the branch.
-    """
-    fn = _batch_momentum_nesterov if nesterov else _batch_momentum
-    if _use_momentum_compile:
-        if nesterov not in _compiled_momentum:
-            _compiled_momentum[nesterov] = torch.compile(fn)
-        fn = _compiled_momentum[nesterov]
-    fn(grads, momentum_bufs, momentum)
-
-
-def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
-    """Weight-decay + update on plain tensors.
-
-    Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
-    lookup per call × 256+ params = massive overhead. The pipeline path uses
-    batched _foreach_* ops instead; this function remains for base() and
-    distributed_muon().
-    """
-    p_data.mul_(1 - lr * weight_decay)
-    p_data.add_(u_data, alpha=-adjusted_lr)
-
-
-def update_p(p, u, lr, adjusted_lr, weight_decay):
-    """Apply weight decay and orthogonalized update to parameter.
-
-    Args:
-        p: Parameter (torch.nn.Parameter or DTensor).
-        u: Orthogonalized update tensor.
-        lr: Base learning rate.
-        adjusted_lr: Size-adjusted learning rate.
-        weight_decay: Weight decay coefficient.
-    """
-    # Unwrap Parameter -> underlying data tensor.
-    p_data = p.data if isinstance(p, torch.nn.Parameter) else p
-    # Unwrap DTensor -> local CUDA tensor for compiled kernel.
-    if isinstance(p_data, DTensor):
-        p_data = p_data._local_tensor
-    u_data = u._local_tensor if isinstance(u, DTensor) else u
-    _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
-
-
-def adjust_lr_for_muon(lr, param_shape):
-    """Scale learning rate based on parameter matrix dimensions.
-
-    Args:
-        lr: Base learning rate.
-        param_shape: Shape of the parameter tensor.
-
-    Returns:
-        Adjusted learning rate.
-    """
-    A, B = param_shape[:2]
-    # We adjust the learning rate and weight decay based on the size of the
-    # parameter matrix as described in the paper.
-    adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-    adjusted_lr = lr * adjusted_ratio
-    return adjusted_lr
-
-
-def _match_key(parts, key):
-    """Check if key matches as contiguous components in parts.
-
-    Single-component keys (e.g. "experts") match any single component.
-    Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
-    """
-    key_parts = key.split(".")
-    key_len = len(key_parts)
-    if key_len == 1:
-        return key in parts
-    return any(parts[i:i + key_len] == key_parts
-               for i in range(len(parts) - key_len + 1))
-
-
-def is_expert_param(name, expert_keys):
-    """Check if a parameter name matches any expert key (component-level)."""
-    if not expert_keys:
-        return False
-    parts = normalize_fqn(name).split(".")
-    return any(_match_key(parts, key) for key in expert_keys)
-
-
-def default_is_muon(name, x, expert_keys=None):
-    normalized = normalize_fqn(name)
-    parts = normalized.split(".")
-    skip_keys = [
-        "embed_tokens",
-        "lm_head",
-        "tok_embeddings",
-        "output",
-        "mhc_attn",
-        "mhc_ffn",
-        "lambda_proj",
-    ]
-    if any(key in parts for key in skip_keys):
-        logger.info(
-            "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
-            normalized, name, x.ndim)
-        return False
-    effective_ndim = x.ndim
-    is_expert = is_expert_param(name, expert_keys)
-    if is_expert:
-        effective_ndim -= 1
-    result = effective_ndim >= 2
-    logger.info(
-        "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
-        normalized, name, x.ndim, is_expert, effective_ndim,
-        "Muon" if result else "AdamW")
-    return result
-
-
-def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-    if is_muon_func is None:
-        is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-
-    muon_params, muon_names = [], []
-    non_muon_params, non_muon_names = [], []
-
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if is_muon_func(n, p):
-            muon_params.append(p)
-            muon_names.append(n)
-        else:
-            non_muon_params.append(p)
-            non_muon_names.append(n)
-
-    logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
-                expert_keys, len(muon_names), len(non_muon_names))
-
-    return [
-        {
-            "params": muon_params,
-            "names": muon_names,
-            "use_muon": True,
-        },
-        {
-            "params": non_muon_params,
-            "use_muon": False,
-        },
-    ]

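A quick illustration of the grouping helpers above (the grouping itself needs no GPU). The skip list matches on FQN components, so a module named `embed_tokens` lands in the AdamW group even though its weight is 2-D:

```python
import torch
from torch import nn


class Toy(nn.Module):

    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(100, 32)  # matches a skip_key → AdamW
        self.proj = nn.Linear(32, 32)  # 2-D weight → Muon; 1-D bias → AdamW


muon_group, adamw_group = get_default_muon_param_groups(Toy())
print(muon_group["names"])  # ['proj.weight']
print(len(adamw_group["params"]))  # 2: embedding weight + bias

# adjust_lr_for_muon(0.01, (32, 32)) = 0.01 * 0.2 * sqrt(32) ≈ 0.0113
print(adjust_lr_for_muon(0.01, (32, 32)))
```
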
build/torch210-cxx11-cu126-x86_64-linux/cpu_offload.py DELETED
@@ -1,206 +0,0 @@
-"""CPU offloading for optimizer states.
-
-Manages a pinned CPU memory pool and async CUDA streams to offload
-optimizer state tensors (momentum buffers, Adam moments) to CPU between
-optimizer steps, freeing GPU memory.
-
-All tracked tensors are packed into a single flat pinned CPU buffer
-(per dtype). D2H and H2D copies are performed per-tensor directly
-between individual GPU tensors and their slice of the CPU flat buffer
-— no GPU staging buffer is allocated, so there is **no temporary GPU
-memory spike** during offload or reload.
-
-Individual tensor storages are freed after offload via
-``untyped_storage().resize_(0)``, preserving tensor identity so
-downstream caches remain valid.
-"""
-
-import logging
-from collections import defaultdict
-
-import torch
-from torch.distributed.tensor import DTensor
-
-logger = logging.getLogger(__name__)
-
-
-class CPUOffloadPool:
-    """Pinned CPU memory pool for async optimizer state offloading.
-
-    Tracked tensors are grouped by dtype. Each group gets a single flat
-    pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
-    the flat buffer) to avoid allocating a GPU staging buffer.
-    """
-
-    def __init__(self):
-        self._managed: list[torch.Tensor] = []
-        self._storage_nbytes: dict[int, int] = {}  # id(t) → bytes
-
-        # Per-dtype group: populated on first offload.
-        # dtype → dict with keys:
-        #   "indices"  : list[int]             managed-list indices
-        #   "offsets"  : list[tuple[int,int]]  (start, numel) in flat buf
-        #   "total"    : int                   total numel
-        #   "cpu_flat" : Tensor                pinned CPU buffer
-        self._groups: dict[torch.dtype, dict] = {}
-
-        self._offload_stream: torch.cuda.Stream | None = None
-        self._device: torch.device | None = None
-        self._initialized: bool = False
-        self._logged: bool = False
-
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _local(t: torch.Tensor) -> torch.Tensor:
-        """Unwrap DTensor to its local CUDA tensor."""
-        return t._local_tensor if isinstance(t, DTensor) else t
-
-    def _ensure_stream(self):
-        if self._offload_stream is None:
-            self._offload_stream = torch.cuda.Stream(device=self._device)
-
-    # ------------------------------------------------------------------
-    def track(self, tensor: torch.Tensor):
-        """Register a GPU tensor for CPU offloading. Idempotent."""
-        tid = id(tensor)
-        if tid in self._storage_nbytes:
-            return
-        local = self._local(tensor)
-        if self._device is None:
-            self._device = local.device
-        storage = local.untyped_storage()
-        # Skip tensors with empty storage (e.g. empty FSDP shards)
-        if storage.size() == 0:
-            return
-        self._storage_nbytes[tid] = storage.size()
-        self._managed.append(tensor)
-
-    # ------------------------------------------------------------------
-    def _init_buffers(self):
-        """Build per-dtype flat buffers on first offload."""
-        # Group managed tensors by dtype.
-        dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
-        for idx, t in enumerate(self._managed):
-            local = self._local(t)
-            dtype_map[local.dtype].append((idx, local.numel()))
-
-        total_cpu_bytes = 0
-        for dtype, entries in dtype_map.items():
-            offsets: list[tuple[int, int]] = []
-            indices: list[int] = []
-            off = 0
-            for idx, n in entries:
-                indices.append(idx)
-                offsets.append((off, n))
-                off += n
-            cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
-            self._groups[dtype] = {
-                "indices": indices,
-                "offsets": offsets,
-                "total": off,
-                "cpu_flat": cpu_flat,
-            }
-            total_cpu_bytes += off * cpu_flat.element_size()
-
-        self._initialized = True
-        logger.info(
-            "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
-            "%.2f MB pinned CPU memory",
-            len(self._managed),
-            len(self._groups),
-            total_cpu_bytes / (1024**2),
-        )
-
-    # ------------------------------------------------------------------
-    def offload(self):
-        """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
-        if not self._managed:
-            return
-        if not self._initialized:
-            self._init_buffers()
-        self._ensure_stream()
-
-        # Offload stream waits for compute to finish.
-        compute_event = torch.cuda.current_stream(self._device).record_event()
-        self._offload_stream.wait_event(compute_event)
-
-        offloaded_bytes = 0
-
-        # Per-tensor D2H copies directly into CPU flat buffer slices.
-        # No GPU staging buffer → no temporary GPU memory spike.
-        with torch.cuda.stream(self._offload_stream):
-            for dtype, grp in self._groups.items():
-                indices = grp["indices"]
-                offsets = grp["offsets"]
-                cpu_flat = grp["cpu_flat"]
-
-                for i, mgd_idx in enumerate(indices):
-                    local = self._local(self._managed[mgd_idx])
-                    off, n = offsets[i]
-                    cpu_flat[off : off + n].copy_(local.reshape(-1), non_blocking=True)
-
-                offloaded_bytes += grp["total"] * cpu_flat.element_size()
-
-        # Wait for all D2H copies to land, then free GPU storage.
-        self._offload_stream.synchronize()
-        for t in self._managed:
-            storage = self._local(t).untyped_storage()
-            if storage.size() != 0:
-                storage.resize_(0)
-            else:
-                raise RuntimeError(
-                    f"Tensor storage is already freed (size=0) before offload. "
-                    f"This indicates a double-free or external interference. "
-                    f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                )
-
-        if not self._logged:
-            logger.info(
-                "[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
-                offloaded_bytes / (1024**2),
-            )
-
-    # ------------------------------------------------------------------
-    def reload(self):
-        """Per-tensor H2D from CPU flat buffer on the default stream.
-
-        Runs on the current (default) CUDA stream to avoid stream
-        interaction issues with the parallel Muon pipeline. Since
-        pinned CPU memory is the source, the copies overlap with
-        GPU idle time between steps.
-        """
-        if not self._managed or not self._initialized:
-            return
-
-        reloaded_bytes = 0
-
-        # Re-allocate all GPU storages first.
-        for t in self._managed:
-            local = self._local(t)
-            storage = local.untyped_storage()
-            if storage.size() != 0:
-                raise RuntimeError(
-                    f"Storage should have been freed (size=0) before reload, "
-                    f"but got size={storage.size()}. "
-                    f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                )
-            storage.resize_(self._storage_nbytes[id(t)])
-
-        # Per-tensor H2D copies from CPU flat buffer slices.
-        # non_blocking=True with pinned source allows DMA overlap.
-        for dtype, grp in self._groups.items():
-            indices = grp["indices"]
-            offsets = grp["offsets"]
-            cpu_flat = grp["cpu_flat"]
-
-            for i, mgd_idx in enumerate(indices):
-                local = self._local(self._managed[mgd_idx])
-                off, n = offsets[i]
-                local.reshape(-1).copy_(cpu_flat[off : off + n], non_blocking=True)
-
-            reloaded_bytes += grp["total"] * cpu_flat.element_size()
-
-        if not self._logged:
-            logger.info(
-                "[CPUOffload] Reloaded %.2f MB (CPU → GPU)", reloaded_bytes / (1024**2)
-            )

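A hedged round-trip sketch for `CPUOffloadPool` above (needs a CUDA device). Note that tensor identity survives the round trip; only the storage is released and restored:

```python
import torch

pool = CPUOffloadPool()
momentum = torch.randn(1024, 1024, device="cuda")  # e.g. a momentum buffer
pool.track(momentum)

pool.offload()  # async D2H into the pinned flat buffer, then storage freed
assert momentum.untyped_storage().size() == 0  # same tensor object, no storage

pool.reload()  # storage re-allocated, values copied back H2D
assert momentum.untyped_storage().size() > 0
```
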
build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py DELETED
@@ -1,232 +0,0 @@
1
- import torch
2
- import torch.distributed as dist
3
- from torch.distributed import ProcessGroup
4
- from torch.distributed.device_mesh import DeviceMesh
5
- from torch.distributed.tensor import DTensor
6
- from torch.distributed.tensor.placement_types import (Placement, Shard,
7
- _StridedShard)
8
-
9
-
10
- def _is_shard(placement: Placement) -> bool:
11
- """Check if a placement is a shard type (Shard or _StridedShard).
12
-
13
- In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
14
- ``placement.is_shard()`` returns False for _StridedShard. This helper
15
- handles both old and new hierarchies.
16
- """
17
- return isinstance(placement, (Shard, _StridedShard))
18
-
19
-
20
- def get_slices_of_dtensor(
21
- target: DTensor | torch.Tensor,
22
- local_rank: int,
23
- shard_mesh: DeviceMesh,
24
- shard_placements: tuple[Placement],
25
- ) -> tuple[slice | torch.Tensor, ...]:
26
- """
27
- Get per-dimension indices for a given rank's shard of the target tensor.
28
-
29
- Uses ``Shard.local_shard_size_and_offset`` and
30
- ``_StridedShard.local_shard_size_and_offset`` for correct handling of
31
- both contiguous and strided (non-contiguous) sharding.
32
-
33
- Args:
34
- target (DTensor | torch.Tensor): The target tensor (for its shape).
35
- local_rank (int): The local rank within the shard group.
36
- shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
37
- shard_placements (tuple[Placement]): The shard placements.
38
-
39
- Returns:
40
- A tuple of indices (one per tensor dim). Each element is either:
41
- - A ``slice`` (for contiguous or unsharded dims)
42
- - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
43
- """
44
-
45
- # find the global rank of the local rank in the shard mesh
46
- rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
47
-
48
- rank_coords = (shard_mesh.mesh == rank).nonzero()
49
-
50
- assert len(rank_coords) == 1
51
- rank_coords = tuple(rank_coords[0].tolist())
52
-
53
- assert len(rank_coords) == len(shard_placements)
54
-
55
- # Track per-shard-dim indices.
56
- # None means "not yet sharded on this dim".
57
- dim_indices: dict[int, torch.Tensor] = {}
58
-
59
- # Caution: Assuming replicate-to-shard of the shard mesh goes with
60
- # left-to-right sharding. This is ensured by the sorting logic of
61
- # construct_shard_mesh function.
62
- for mesh_dim_idx, (rank_coord, placement) in enumerate(
63
- zip(rank_coords, shard_placements)):
64
- assert _is_shard(placement)
65
-
66
- num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
67
- shard_dim = placement.dim
68
-
69
- # Current effective size on this dim (may already be sub-sharded)
70
- if shard_dim in dim_indices:
71
- curr_size = len(dim_indices[shard_dim])
72
- else:
73
- curr_size = target.size()[shard_dim]
74
-
75
- # Compute indices for this level of sharding
76
- if isinstance(placement, _StridedShard):
77
- _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
78
- placement,
79
- curr_size,
80
- num_chunks,
81
- rank_coord,
82
- return_first_offset=False)
83
- new_indices = torch.tensor(offsets, dtype=torch.long)
84
- else:
85
- shard_size, offset = Shard.local_shard_size_and_offset(
86
- curr_size, num_chunks, rank_coord)
87
- new_indices = torch.arange(offset,
88
- offset + shard_size,
89
- dtype=torch.long)
90
-
91
- # Compose with previous indices on this dim
92
- if shard_dim in dim_indices:
93
- dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
94
- else:
95
- dim_indices[shard_dim] = new_indices
96
-
97
- # Build result tuple
98
- result: list[slice | torch.Tensor] = []
99
- for d in range(len(target.size())):
100
- if d not in dim_indices:
101
- result.append(slice(None))
102
- else:
103
- indices = dim_indices[d]
104
- # Convert contiguous indices to slice for efficiency
105
- if len(indices) > 0:
106
- start = indices[0].item()
107
- expected = torch.arange(start,
108
- start + len(indices),
109
- dtype=torch.long)
110
- if torch.equal(indices, expected):
111
- result.append(slice(start, start + len(indices)))
112
- else:
113
- result.append(indices)
114
- else:
115
- result.append(slice(0, 0))
116
-
117
- return tuple(result)
118
-
119
-
120
- _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
121
- ProcessGroup]] = dict()
122
-
123
-
124
- def construct_shard_mesh(
125
- placements: tuple[Placement],
126
- mesh: DeviceMesh,
127
- ) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
128
- """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
129
-
130
- Given a DTensor's placements and device mesh, extracts the "shard group"
131
- — the set of ranks that together hold all shards of the same replica —
132
- and creates a ProcessGroup for all-to-all among them.
133
-
134
- Steps:
135
- 1. Sort placements: Replicate first, then Shard by (dim, granularity).
136
- 2. Permute the mesh tensor to match the sorted order.
137
- 3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
138
- 4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
139
-
140
- Example — 8 GPUs, mesh shape (2, 2, 2),
141
- placements ``[Shard(0), Replicate, _StridedShard(0)]``::
142
-
143
- Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
144
- Permutation: [1, 2, 0]
145
-
146
- Step 2 — Permute mesh dims by [1, 2, 0]:
147
- Original: Permuted:
148
- [[[0,1],[2,3]], [[[0,2],[1,3]],
149
- [[4,5],[6,7]]] [[4,6],[5,7]]]
150
-
151
- Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
152
- sub-mesh 0 = [[0,2],[1,3]] (replica group 0)
153
- sub-mesh 1 = [[4,6],[5,7]] (replica group 1)
154
- shard_placements = (_StridedShard(0), Shard(0))
155
-
156
- Step 4 — Rank 0 → ProcessGroup([0,1,4,5])
157
- Rank 2 → ProcessGroup([2,3,6,7])
158
-
159
- Returns:
160
- ``(shard_mesh, process_group, shard_placements)``
161
- """
162
- my_rank = dist.get_rank()
163
- assert mesh.mesh.device.type == 'cpu'
164
-
165
- # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
166
- # Reuses the mesh's existing ProcessGroup directly, avoiding the
167
- # overhead of dist.new_group(). The standard path below also handles
168
- # subset calls safely via use_local_synchronization=True, but this
169
- # fast path is still beneficial for the common 1D shard case.
170
- if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
171
- key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
172
- if key not in _ranks_to_dist_cache:
173
- _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
174
- return (*_ranks_to_dist_cache[key], tuple(placements))
175
-
176
- mesh_tensor = mesh.mesh.clone()
177
-
178
- # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
179
- # _StridedShard comes BEFORE regular Shard on the same dim so that
180
- # get_slices_of_dtensor applies the outer sharding first, matching
181
- # DTensor's left-to-right (outer-to-inner) composition order.
182
- def _sort_key(item):
183
- index, placement = item
184
- assert not placement.is_partial(), "Partial placement not supported"
185
- if placement.is_replicate():
186
- return (-1, 0, index)
187
- assert _is_shard(placement), f"Unsupported: {type(placement)}"
188
- split = (-1 / placement.split_factor if isinstance(
189
- placement, _StridedShard) else 0)
190
- return (placement.dim, split, index)
191
-
192
- indexed = sorted(enumerate(placements), key=_sort_key)
193
- perm, sorted_placements = zip(*indexed)
194
-
195
- # -- Step 2: Permute mesh to match sorted placement order. --------------
196
- sorted_mesh = mesh_tensor.permute(perm)
197
-
198
- # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
199
- # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
200
- num_rep = sum(1 for p in sorted_placements if p.is_replicate())
201
- if num_rep > 0:
202
- if num_rep > 1:
203
- sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
204
- shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
205
- else:
206
- shard_meshes = [sorted_mesh]
207
- shard_placements = sorted_placements[num_rep:]
208
- assert len(shard_placements) == len(set(shard_placements))
209
-
210
- # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
211
- # Each rank only creates the group it belongs to, using
212
- # use_local_synchronization=True so that only group members need to
213
- # coordinate. This avoids deadlocks when different PP stages call
214
- # construct_shard_mesh for different parameters.
215
- def _cache_key(t: torch.Tensor) -> tuple:
216
- return (*t.shape, *t.flatten().tolist())
217
-
218
- my_key = None
219
- for sm in shard_meshes:
220
- if (my_rank == sm).any().item():
221
- key = _cache_key(sm)
222
- assert my_key is None, "Rank appears in multiple shard groups"
223
- my_key = key
224
- if key not in _ranks_to_dist_cache:
225
- pg = dist.new_group(sm.flatten().tolist(),
226
- use_local_synchronization=True)
227
- _ranks_to_dist_cache[key] = (
228
- DeviceMesh(device_type="cuda", mesh=sm),
229
- pg,
230
- )
231
-
232
- return (*_ranks_to_dist_cache[my_key], shard_placements)
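A sanity check on the sort → permute → unbind sequence above: the docstring's 8-GPU example can be reproduced with plain tensor ops (a standalone sketch, no DTensor machinery involved):

import torch

# Mesh of 8 ranks with shape (2, 2, 2), as in the construct_shard_mesh docstring.
mesh = torch.arange(8).reshape(2, 2, 2)

# placements [Shard(0), Replicate, _StridedShard(0)] sort to
# [Replicate, _StridedShard(0), Shard(0)], i.e. permutation (1, 2, 0).
sorted_mesh = mesh.permute(1, 2, 0)

# Unbind the single replicate dim -> one shard sub-mesh per replica group.
shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
print(shard_meshes[0])  # tensor([[0, 4], [1, 5]])
print(shard_meshes[1])  # tensor([[2, 6], [3, 7]])

# Rank 0's all-to-all group is its flattened sub-mesh: ranks {0, 1, 4, 5}.
print(sorted(shard_meshes[0].flatten().tolist()))  # [0, 1, 4, 5]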
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py DELETED
@@ -1,122 +0,0 @@
1
- # MIT License
2
- #
3
- # Copyright (c) 2025 Tianyang Lin
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the "Software"), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
-
23
- import torch
24
- import triton
25
- import triton.language as tl
26
-
27
-
28
- def get_autotune_config():
29
- return [
30
- triton.Config(
31
- {
32
- 'BLOCK_SIZE_M': blk_m,
33
- 'BLOCK_SIZE_K': blk_k,
34
- 'GROUP_SIZE_M': grp_sz
35
- },
36
- num_stages=n_stages,
37
- num_warps=n_warps) for blk_m in [32, 64, 128]
38
- for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
- for n_warps in [4, 8]
40
- ]
41
-
42
-
43
- @triton.autotune(
44
- configs=get_autotune_config(),
45
- key=['M', 'K'],
46
- restore_value=['y'],
47
- )
48
- @triton.jit
49
- def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
50
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
51
- GROUP_SIZE_M: tl.constexpr):
52
- """
53
- Core kernel jit function of matmul_transpose that computes y = x @ x.T
54
- The code is a simple adaptation from the triton `matmul` tutorial:
55
- https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
56
- """
57
- pid = tl.program_id(axis=0)
58
- num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
59
- num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
60
- num_pid_in_group = GROUP_SIZE_M * num_pid_n
61
- group_id = pid // num_pid_in_group
62
- first_pid_m = group_id * GROUP_SIZE_M
63
- group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
64
- pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
65
- pid_n = (pid % num_pid_in_group) // group_size_m
66
- if pid_m > pid_n:
67
- return
68
-
69
- offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
- offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
71
- offs_k = tl.arange(0, BLOCK_SIZE_K)
72
- # we use a & b ptrs to denote different rows of x.
73
- a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
- b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
75
-
76
- accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
77
-
78
- for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
79
- a = tl.load(a_ptrs,
80
- mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
81
- other=0.0)
82
- b = tl.load(b_ptrs,
83
- mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
84
- other=0.0)
85
- accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
86
- a_ptrs += BLOCK_SIZE_K * stride_xk
87
- b_ptrs += BLOCK_SIZE_K * stride_xk
88
- # use dtype.element_ty to accommodate different input datatypes as in cpp templates
89
- # https://github.com/triton-lang/triton/issues/2252
90
- c = accumulator.to(x.dtype.element_ty)
91
-
92
- offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
- offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
94
- c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
95
- c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
96
- tl.store(c_ptrs, c, mask=c_mask)
97
-
98
- # transpose and copy
99
- if pid_m < pid_n:
100
- ct_ptrs = y + stride_ym * offs_cn[:,
101
- None] + stride_yn * offs_cm[None, :]
102
- ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
103
- tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
-
105
-
106
- @torch.library.custom_op("muon::matmul_transpose_assign",
107
- mutates_args=("d_out", ))
108
- def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
- """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
110
- d_in = d_in.contiguous()
111
- M, K = d_in.shape
112
- grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
113
- M, META['BLOCK_SIZE_M']), )
114
- with torch.cuda.device(d_in.device.index):
115
- mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
- d_out.stride(0), d_out.stride(1))
117
-
118
-
119
- @matmul_transpose_assign.register_fake
120
- def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
- """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
- pass
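For reference, a minimal usage sketch of the kernel above (assumes a CUDA device; inside the package the function is imported as `from .matmul_transpose_triton import matmul_transpose_assign`):

import torch
from matmul_transpose_triton import matmul_transpose_assign  # in-situ: relative import

x = torch.randn(512, 1024, device="cuda", dtype=torch.bfloat16)
y = torch.empty(512, 512, device="cuda", dtype=torch.bfloat16)

# y <- x @ x.T; only blocks with pid_m <= pid_n are computed, the rest mirrored.
matmul_transpose_assign(x, y)

# Loose tolerances: bf16 rounding plus nondeterministic accumulation order.
torch.testing.assert_close(y, x @ x.T, rtol=2e-2, atol=0.5)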
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/metadata.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "python-depends": []
3
- }
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/muon.py DELETED
@@ -1,1068 +0,0 @@
1
- import logging
2
- import types
3
- from collections import defaultdict
4
- from typing import Any
5
-
6
- import torch
7
- import torch.distributed as dist
8
- from torch.distributed.tensor import DTensor, Replicate, Shard
9
- from torch.profiler import record_function
10
-
11
- from .adamw import _placement_cache, _tensor_cache, step_adamw
12
- from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
- get_default_muon_param_groups, is_expert_param, update_p)
15
- from .cpu_offload import CPUOffloadPool
16
- from .distributed.utils import (_is_shard, construct_shard_mesh,
17
- get_slices_of_dtensor)
18
- from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
- _zeropower_via_newtonschulz5,
20
- zeropower_via_newtonschulz5,
21
- zeropower_via_newtonschulz5_batched)
22
- from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
- from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
-
28
- def _expand_expert_params(names, params, expert_keys):
29
- """Expand expert params by splitting on dim 0 (expert dimension).
30
-
31
- Params whose name matches any key in ``expert_keys`` are treated as
32
- expert-parallel tensors. Their outermost dimension is the expert
33
- dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
34
- ``nn.Parameter`` views so that in-place updates propagate back to
35
- the original storage.
36
-
37
- Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
38
- if they are expert params, their key must be added to ``expert_keys``.
39
-
40
- The grad must already be set on each expert param (e.g. after momentum).
41
-
42
- For DTensor expert params, placements that shard on dim 0 (expert dim)
43
- are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
44
- preserved: each 2D slice is wrapped as a DTensor on the corresponding
45
- submesh so the parallel pipeline handles the TP communication.
46
- """
47
- expanded_names = []
48
- expanded_params = []
49
-
50
- for n, p in zip(names, params):
51
- is_expert = is_expert_param(n, expert_keys)
52
- is_dtensor = isinstance(p.data, DTensor)
53
-
54
- if is_expert:
55
- if is_dtensor:
56
- logger.debug(
57
- "[expand_expert] %s: expert DTensor, shape=%s, "
58
- "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
- p.placements, p.device_mesh.mesh_dim_names,
60
- p.to_local().shape)
61
- else:
62
- logger.debug(
63
- "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
- p.data.shape)
65
-
66
- if not is_expert:
67
- assert p.data.ndim <= 2, (
68
- f"Param {n} has ndim={p.data.ndim} but does not match "
69
- f"expert_keys={expert_keys}. If this is an expert param, "
70
- f"add its key to expert_keys.")
71
- expanded_names.append(n)
72
- expanded_params.append(p)
73
- continue
74
-
75
- g = p.grad
76
- assert g is not None, (
77
- f"Expert param {n} must have grad set before expansion")
78
-
79
- tp_mesh = None
80
- tp_placements_2d = None
81
-
82
- if is_dtensor:
83
- local_data = p.to_local()
84
- local_grad = g.to_local() if isinstance(g, DTensor) else g
85
-
86
- # Find non-dim-0 shard placements (e.g. TP sharding).
87
- # After splitting on dim 0, Shard(k) becomes Shard(k-1).
88
- tp_dim_indices = []
89
- tp_placements_2d = []
90
- for i, pl in enumerate(p.placements):
91
- if _is_shard(pl) and pl.dim != 0:
92
- tp_dim_indices.append(i)
93
- tp_placements_2d.append(Shard(pl.dim - 1))
94
-
95
- if tp_dim_indices:
96
- tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
97
- for i in tp_dim_indices)
98
- if len(tp_dim_names) == 1:
99
- tp_mesh = p.device_mesh[tp_dim_names[0]]
100
- else:
101
- tp_mesh = p.device_mesh[tp_dim_names]
102
- else:
103
- local_data = p.data
104
- local_grad = g
105
-
106
- # Expand: split dim 0, reshape each slice to 2D.
107
- num_local_experts = local_data.shape[0]
108
- for i in range(num_local_experts):
109
- slice_data = local_data[i]
110
- slice_grad = local_grad[i]
111
-
112
- if tp_mesh is not None:
113
- # Wrap as DTensor on TP submesh so the pipeline handles
114
- # TP communication (gather/scatter across TP ranks).
115
- dt_data = DTensor.from_local(slice_data,
116
- device_mesh=tp_mesh,
117
- placements=tp_placements_2d)
118
- dt_grad = DTensor.from_local(slice_grad,
119
- device_mesh=tp_mesh,
120
- placements=tp_placements_2d)
121
- expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
122
- expert_param.grad = dt_grad
123
- else:
124
- expert_param = torch.nn.Parameter(slice_data,
125
- requires_grad=False)
126
- expert_param.grad = slice_grad
127
-
128
- expanded_names.append(f"{n}[{i}]")
129
- expanded_params.append(expert_param)
130
-
131
- p.grad = None # allow expert grad storage to be freed after pipeline
132
-
133
- return expanded_names, expanded_params
134
-
135
-
136
- class Muon(torch.optim.Optimizer):
137
- """
138
- Muon - MomentUm Orthogonalized by Newton-schulz
139
-
140
- Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
141
- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
142
- matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
143
- the advantage that it can be stably run in bfloat16 on the GPU.
144
-
145
- Some warnings:
146
- - We believe this optimizer is unlikely to work well for training with small batch size.
147
- - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
148
-
149
- Arguments:
150
-         params: Parameter groups to optimize. Each group must set the
151
-             "use_muon" key (see `get_default_muon_param_groups`).
152
- lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
153
- momentum: The momentum used by the internal SGD. (0.95 is a good default)
154
- nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
155
- ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
156
- weight_decay: The weight decay for Muon and AdamW.
157
- Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
158
- adamw_lr: The learning rate for the internal AdamW.
159
- adamw_betas: The betas for the internal AdamW.
160
- adamw_eps: The epsilon for the internal AdamW.
161
- none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
162
- debug: Whether to print debug information.
163
- clip_info : Configuration for QK clipping. Expected keys:
164
- - "q_indices" (list[int]): Indices of query heads to consider.
165
- - "k_indices" (list[int]): Indices of key heads to consider.
166
- - "head_dim" (int): Dimensionality of each attention head.
167
- - "threshold" (float): Threshold value; heads whose QK logits exceed
168
- this value will be scaled down.
169
- Default is:
170
- {
171
- "q_indices": [],
172
- "k_indices": [],
173
- "head_dim": 128,
174
- "threshold": 100
175
- }
176
-         warmup_step : How many all2all gather/compute operations are launched in advance
177
- before the corresponding all2all scatter steps begin.
178
- A higher warmup_step increases memory usage but can improve
179
- performance by overlapping communication.
180
- Parallel muon only.
181
- chunk_size : Batch size of parameters to process in each
182
- all2all gather/compute/scatter step.
183
- Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
- use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
- For testing purpose only.
186
- expert_keys: List of strings to identify expert-parallel parameters.
187
- If any key appears in a parameter's name, its outermost
188
- dimension is treated as the expert dimension and expanded
189
- into per-expert 2D params for Muon. For example,
190
- ``expert_keys=["experts"]`` matches any param whose name
191
- contains "experts". 3D+ params not matched by any key
192
- will raise an error.
193
- """
194
-
195
- def __init__(self,
196
- params,
197
- lr=1e-3,
198
- momentum=0.95,
199
- nesterov=True,
200
- ns_steps=5,
201
- weight_decay=0.1,
202
- adamw_betas=(0.9, 0.95),
203
- adamw_eps=1e-8,
204
- none_grad=True,
205
- debug=False,
206
- clip_config=None,
207
- warmup_step=5,
208
- chunk_size=-1,
209
- use_distributed_muon=False,
210
- expert_keys=None):
211
- defaults = dict(
212
- lr=lr,
213
- weight_decay=weight_decay,
214
- momentum=momentum,
215
- nesterov=nesterov,
216
- ns_steps=ns_steps,
217
- adamw_betas=adamw_betas,
218
- adamw_eps=adamw_eps,
219
- none_grad=none_grad,
220
- use_muon=True,
221
- )
222
- error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
223
- instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
224
-
225
- if isinstance(params, types.GeneratorType):
226
- raise ValueError(error_message.format(idx=0) + instruction_code)
227
- for _idx, param_group in enumerate(params):
228
- if param_group.get("use_muon", None) is None:
229
- raise ValueError(
230
- error_message.format(idx=_idx) + instruction_code)
231
- super().__init__(params, defaults)
232
-
233
- self.debug = debug
234
- self.clip_config = clip_config if clip_config is not None else {
235
- "q_indices": [],
236
- "k_indices": [],
237
- "head_dim": 128,
238
- "threshold": 100,
239
- }
240
- self.warmup_step = warmup_step
241
- self.chunk_size = chunk_size
242
- self.use_distributed_muon = use_distributed_muon
243
- self.expert_keys = expert_keys
244
- self.cpu_offload = False
245
- self._cpu_offload_pool: CPUOffloadPool | None = None
246
- self._offload_initialized = False
247
- self._parallel_cache: dict[tuple[str, ...], dict] = {}
248
- self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
249
-
250
- def _calc_flops(self, G, steps):
251
- assert len(G.shape) == 2
252
- M, N = G.shape
253
- if M > N:
254
- M, N = N, M
255
-
256
- return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
257
-
258
- def get_shard_mesh(self, p):
259
- """
260
- Get the shard mesh for a parameter p on the given rank.
261
- """
262
- assert isinstance(
263
- p, DTensor), "Parallel Muon only supports DTensor parameters."
264
-
265
- shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
266
- p.placements, p.device_mesh)
267
-
268
- return shard_mesh, shard_pg, shard_placements
269
-
270
- def init_state_and_assign_params(self, names, params, group, qk_logits):
271
- param_to_state = {}
272
- param_to_flops = {}
273
-
274
- total_flops = 0
275
- for p in params:
276
- g = p.grad
277
- if g is None:
278
- continue
279
- assert g.ndim == 2, "Muon only supports 2D parameters."
280
-
281
- flops = self._calc_flops(g, group["ns_steps"])
282
- param_to_flops[id(p)] = flops
283
- total_flops += flops
284
-
285
- if self.debug:
286
- logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
287
- total_flops / 1e12)
288
-
289
- paired = list(zip(names, params))
290
-
291
- paired_sorted = sorted(paired,
292
- key=lambda x: param_to_flops[id(x[1])],
293
- reverse=True)
294
-
295
- names_sorted, params_sorted = zip(*paired_sorted)
296
- ordered_names = list(names_sorted)
297
- ordered_params = list(params_sorted)
298
-
299
- round_robin = 0
300
- mesh = ordered_params[0].device_mesh
301
- placements = ordered_params[0].placements
302
-
303
- shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
304
- ordered_params[0])
305
- shard_mesh_flattened = shard_mesh.mesh.flatten()
306
- num_ranks = dist.get_world_size(group=shard_pg)
307
-
308
- for n, p in zip(ordered_names, ordered_params):
309
- if mesh != p.device_mesh:
310
- raise ValueError("All parameters must be on the same mesh.")
311
- if placements != p.placements:
312
- raise ValueError("All parameters must have same placements.")
313
-
314
- worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
315
- round_robin = (round_robin + 1) % len(shard_mesh_flattened)
316
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
317
-
318
- # Precompute per-rank indices and numels for all-to-all.
319
- rank_indices: dict[int, tuple] = {}
320
- rank_numels: dict[int, int] = {}
321
- for r in range(num_ranks):
322
- indices = get_slices_of_dtensor(p, r, shard_mesh,
323
- shard_placements)
324
- rank_indices[r] = indices
325
- numel = 1
326
- for idx, dim_size in zip(indices, p.shape):
327
- if isinstance(idx, slice):
328
- start, stop, step = idx.indices(dim_size)
329
- numel *= max(0, (stop - start + (step - 1)) // step)
330
- else:
331
- numel *= len(idx)
332
- rank_numels[r] = numel
333
-
334
- param_to_state[id(p)] = _muon_state(
335
- worker_rank=worker_rank,
336
- process_group=shard_pg,
337
- rank_indices=rank_indices,
338
- rank_numels=rank_numels,
339
- name=n,
340
- qk_clip_state=qk_clip_state,
341
- )
342
-
343
- return param_to_state, ordered_params
344
-
345
- def base(self, names, params, group, lr, weight_decay, qk_logits):
346
- # Momentum is already applied by _step_muon before this method.
347
- for n, p in zip(names, params):
348
- g = p.grad
349
- if g is None:
350
- continue
351
-
352
- u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
353
- steps=group["ns_steps"])
354
-
355
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
356
- update_p(p, u, lr, adjusted_lr, weight_decay)
357
-
358
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
359
-
360
- scales_full = compute_scales(
361
- p, qk_clip_state) if qk_clip_state is not None else None
362
- if scales_full is not None:
363
- qk_clip(p, scales_full, qk_clip_state)
364
-
365
- def distributed_muon(
366
- self,
367
- names: list[str],
368
- params: list[torch.nn.Parameter],
369
- group: dict[str, Any],
370
- lr: float,
371
- weight_decay: float,
372
- qk_logits: list[torch.Tensor | DTensor] | None,
373
- ):
374
- """Batched Distributed Muon — for testing/correctness verification only.
375
-
376
- Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
377
- the full grad, then slices back to local shards. This is simpler but
378
- slower than the parallel pipeline (all2all) path, so it serves as a
379
- reference implementation for verifying correctness.
380
- """
381
- with record_function("distributed_muon"):
382
- # Momentum is already applied by _step_muon before this method.
383
- ns_steps = group["ns_steps"]
384
-
385
- # Separate plain tensors (no communication) from DTensors.
386
- plain_names, plain_params = [], []
387
- dtensor_names, dtensor_params = [], []
388
- for n, p in zip(names, params):
389
- if p.grad is None:
390
- continue
391
- if isinstance(p.data, DTensor):
392
- dtensor_names.append(n)
393
- dtensor_params.append(p)
394
- else:
395
- plain_names.append(n)
396
- plain_params.append(p)
397
-
398
- # Process plain tensors per-param (no communication).
399
- for n, p in zip(plain_names, plain_params):
400
- u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
401
- steps=ns_steps)
402
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
403
- update_p(p, u, lr, adjusted_lr, weight_decay)
404
-
405
- qk_clip_state = get_qk_clip_info(self.clip_config, n,
406
- qk_logits)
407
- scales_full = compute_scales(
408
- p, qk_clip_state) if qk_clip_state is not None else None
409
- if scales_full is not None:
410
- qk_clip(p, scales_full, qk_clip_state)
411
-
412
- if not dtensor_params:
413
- return
414
-
415
- # Group DTensors by (placements, mesh) for batched all-gather.
416
- placement_groups: dict[tuple,
417
- tuple[list,
418
- list]] = defaultdict(lambda: ([], []))
419
- for n, p in zip(dtensor_names, dtensor_params):
420
- key = (p.placements, p.device_mesh)
421
- placement_groups[key][0].append(n)
422
- placement_groups[key][1].append(p)
423
-
424
- logger.info(
425
- "distributed_muon: %d placement groups, %d total dtensors",
426
- len(placement_groups), len(dtensor_params))
427
-
428
- for (placements, mesh), (grp_names,
429
- grp_params) in placement_groups.items():
430
- shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
431
- placements, mesh)
432
- rank = dist.get_rank(shard_pg)
433
- world_size = dist.get_world_size(shard_pg)
434
-
435
- logger.info(" group: %d params, placements=%s, world_size=%d",
436
- len(grp_params), placements, world_size)
437
-
438
- # Separate params that can be batched (all shard dims evenly
439
- # divisible) from those needing per-param full_tensor
440
- # (e.g. MoE gate weights with fewer rows than shard ranks).
441
- # all_gather_into_tensor requires equal buffer sizes across
442
- # ranks, so uneven splits must use DTensor full_tensor().
443
- batch_names, batch_params = [], []
444
- single_names, single_params = [], []
445
- for n, p in zip(grp_names, grp_params):
446
- even = all(p.shape[pl.dim] %
447
- shard_mesh.mesh.shape[dim_idx] == 0
448
- for dim_idx, pl in enumerate(shard_placements))
449
- if even:
450
- batch_names.append(n)
451
- batch_params.append(p)
452
- else:
453
- single_names.append(n)
454
- single_params.append(p)
455
-
456
- # Process uneven-split params per-param via full_tensor().
457
- for n, p in zip(single_names, single_params):
458
- with record_function("distributed_muon::newton_schulz"):
459
- g_full = p.grad.full_tensor().to(COMM_DTYPE)
460
- u_full = _zeropower_via_newtonschulz5(g_full,
461
- steps=ns_steps)
462
- del g_full
463
- with record_function("distributed_muon::update"):
464
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
465
- p._local_tensor.mul_(1 - lr * weight_decay)
466
- local_indices = get_slices_of_dtensor(
467
- p, rank, shard_mesh, shard_placements)
468
- u_local = u_full[local_indices]
469
- p._local_tensor.add_(u_local, alpha=-adjusted_lr)
470
- del u_full
471
-
472
- qk_clip_state = get_qk_clip_info(
473
- self.clip_config, n, qk_logits)
474
- scales_full = compute_scales(
475
- p, qk_clip_state
476
- ) if qk_clip_state is not None else None
477
- if scales_full is not None:
478
- ratio = p.shape[0] // scales_full.shape[0]
479
- idx0 = local_indices[0]
480
- if isinstance(idx0, slice):
481
- start = idx0.start or 0
482
- idx0 = torch.arange(start,
483
- idx0.stop,
484
- device=scales_full.device)
485
- row_scales = scales_full[idx0 // ratio]
486
- p._local_tensor.mul_(row_scales.view(-1, 1))
487
-
488
- if not batch_params:
489
- continue
490
-
491
- logger.info(" batched=%d, single=%d", len(batch_params),
492
- len(single_params))
493
-
494
- # Concat all local grad shards into a single flat buffer.
495
- with record_function("distributed_muon::gather"):
496
- grad_locals = [
497
- p.grad.to_local().to(COMM_DTYPE).flatten()
498
- for p in batch_params
499
- ]
500
- numels = [g.numel() for g in grad_locals]
501
- grad_concat = torch.cat(grad_locals)
502
- del grad_locals
503
-
504
- # Single all-gather (replaces N separate full_tensor).
505
- grad_gathered = torch.empty(
506
- grad_concat.numel() * world_size,
507
- dtype=COMM_DTYPE,
508
- device="cuda",
509
- )
510
- dist.all_gather_into_tensor(grad_gathered,
511
- grad_concat,
512
- group=shard_pg)
513
-
514
- total_numel = grad_concat.numel()
515
- del grad_concat
516
-
517
- # Precompute per-param offsets within the concat buffer.
518
- offsets = []
519
- off = 0
520
- for ne in numels:
521
- offsets.append(off)
522
- off += ne
523
-
524
- # Per-param: reconstruct full grad → NS → local update.
525
- for i, (n, p) in enumerate(zip(batch_names, batch_params)):
526
- with record_function("distributed_muon::newton_schulz"):
527
- g_full = torch.empty(p.shape,
528
- dtype=COMM_DTYPE,
529
- device="cuda")
530
- for r in range(world_size):
531
- r_start = r * total_numel + offsets[i]
532
- shard = grad_gathered[r_start:r_start + numels[i]]
533
- indices = get_slices_of_dtensor(
534
- p, r, shard_mesh, shard_placements)
535
- g_full[indices] = shard.reshape(
536
- g_full[indices].shape)
537
-
538
- u_full = _zeropower_via_newtonschulz5(g_full,
539
- steps=ns_steps)
540
- del g_full
541
-
542
- with record_function("distributed_muon::update"):
543
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
544
- p._local_tensor.mul_(1 - lr * weight_decay)
545
- local_indices = get_slices_of_dtensor(
546
- p, rank, shard_mesh, shard_placements)
547
- u_local = u_full[local_indices]
548
- p._local_tensor.add_(u_local, alpha=-adjusted_lr)
549
- del u_full
550
-
551
- qk_clip_state = get_qk_clip_info(
552
- self.clip_config, n, qk_logits)
553
- scales_full = compute_scales(
554
- p, qk_clip_state
555
- ) if qk_clip_state is not None else None
556
- if scales_full is not None:
557
- ratio = p.shape[0] // scales_full.shape[0]
558
- idx0 = local_indices[0]
559
- if isinstance(idx0, slice):
560
- start = idx0.start or 0
561
- idx0 = torch.arange(start,
562
- idx0.stop,
563
- device=scales_full.device)
564
- row_scales = scales_full[idx0 // ratio]
565
- p._local_tensor.mul_(row_scales.view(-1, 1))
566
-
567
- def _setup_parallel(self, names, params, group, qk_logits):
568
- """Compute (or retrieve cached) parallel pipeline metadata.
569
-
570
- Returns:
571
- (ordered_params, param_to_state, rank, chunk_size)
572
- """
573
- cache_key = tuple(names)
574
-
575
- if cache_key not in self._parallel_cache:
576
- # First call: compute metadata and populate cache.
577
- param_to_state, ordered_params = self.init_state_and_assign_params(
578
- names, params, group, qk_logits)
579
-
580
- shard_pg = param_to_state[id(ordered_params[0])].process_group
581
- rank = dist.get_rank(group=shard_pg)
582
-
583
- if self.chunk_size == -1:
584
- shard_ranks = dist.get_world_size(shard_pg)
585
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
586
- elif self.chunk_size > 0:
587
- chunk_size = self.chunk_size
588
- else:
589
- raise ValueError(
590
- "chunk_size must be -1 or a positive integer.")
591
-
592
- ordered_names = [
593
- param_to_state[id(p)].name for p in ordered_params
594
- ]
595
- name_to_state = {
596
- param_to_state[id(p)].name: param_to_state[id(p)]
597
- for p in ordered_params
598
- }
599
- self._parallel_cache[cache_key] = {
600
- 'ordered_names': ordered_names,
601
- 'name_to_state': name_to_state,
602
- 'rank': rank,
603
- 'chunk_size': chunk_size,
604
- }
605
- else:
606
- # Cached path: rebuild param_to_state with current id(p) keys.
607
- cache = self._parallel_cache[cache_key]
608
- rank = cache['rank']
609
- chunk_size = cache['chunk_size']
610
-
611
- name_to_param = dict(zip(names, params))
612
- ordered_params = [name_to_param[n] for n in cache['ordered_names']]
613
-
614
- param_to_state = {}
615
- for p, n in zip(ordered_params, cache['ordered_names']):
616
- cached_state = cache['name_to_state'][n]
617
- param_to_state[id(p)] = _muon_state(
618
- worker_rank=cached_state.worker_rank,
619
- process_group=cached_state.process_group,
620
- rank_indices=cached_state.rank_indices,
621
- rank_numels=cached_state.rank_numels,
622
- name=n,
623
- qk_clip_state=get_qk_clip_info(self.clip_config, n,
624
- qk_logits),
625
- )
626
-
627
- return ordered_params, param_to_state, rank, chunk_size
628
-
629
- def parallel(self,
630
- names,
631
- params,
632
- group,
633
- lr,
634
- weight_decay,
635
- qk_logits,
636
- prelaunch_gather=None):
637
- """
638
- Perform a parallel optimization step using Muon.
639
-
640
- Parameters are chunked and each chunk is processed by a
641
- :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
642
- interleaves multiple chunks so that communication and computation
643
- overlap across chunks (the same overlap previously achieved by the
644
- warmup + main-loop index scheduling).
645
-
646
- If ``prelaunch_gather`` is provided, it is passed to the first
647
- chunk's generator to skip re-launching the already in-flight
648
- A2A gather.
649
- """
650
-
651
- # Momentum is already applied by _step_muon before this method.
652
-
653
- ordered_params, param_to_state, rank, chunk_size = (
654
- self._setup_parallel(names, params, group, qk_logits))
655
-
656
- def pipelines():
657
- first = True
658
- for start in range(0, len(ordered_params), chunk_size):
659
- chunk = ordered_params[start:start + chunk_size]
660
- if chunk:
661
- kwargs = dict(
662
- params=chunk,
663
- param_to_state=param_to_state,
664
- rank=rank,
665
- ns_steps=group["ns_steps"],
666
- lr=lr,
667
- weight_decay=weight_decay,
668
- none_grad=group["none_grad"],
669
- )
670
- if first and prelaunch_gather is not None:
671
- kwargs['prelaunch_gather'] = prelaunch_gather
672
- first = False
673
- yield muon_chunk_pipeline(**kwargs)
674
-
675
- with record_function("muon::pipeline"):
676
- run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
677
-
678
- def _step_muon(self, group, qk_logits=None):
679
- params = group["params"]
680
- lr = group["lr"]
681
- weight_decay = group["weight_decay"]
682
- momentum = group["momentum"]
683
- names = group["names"]
684
-
685
- # Apply momentum to all params before routing/expansion.
686
- # Batched using _foreach_* ops (compiled, fullgraph=True).
687
- with record_function("muon::momentum"):
688
- active_params = [p for p in params if p.grad is not None]
689
- if active_params:
690
- # Ensure momentum buffers exist (avoid zeros_like when already present).
691
- for p in active_params:
692
- if "momentum_buffer" not in self.state[p]:
693
- self.state[p]["momentum_buffer"] = torch.zeros_like(
694
- p.grad)
695
-
696
- # Extract local tensors for compiled batch function.
697
- local_grads = [
698
- p.grad._local_tensor
699
- if isinstance(p.grad, DTensor) else p.grad
700
- for p in active_params
701
- ]
702
- local_bufs = [
703
- self.state[p]["momentum_buffer"]._local_tensor
704
- if isinstance(self.state[p]["momentum_buffer"], DTensor)
705
- else self.state[p]["momentum_buffer"]
706
- for p in active_params
707
- ]
708
-
709
- # Wrap momentum as tensor for torch.compile.
710
- batch_pre_ortho(local_grads, local_bufs,
711
- torch.tensor(momentum), group["nesterov"])
712
-
713
- # For non-nesterov, the result is the momentum buffer.
714
- if not group["nesterov"]:
715
- for p in active_params:
716
- p.grad = self.state[p]["momentum_buffer"]
717
-
718
- # Identify batched experts for deferred NS.
719
- # Detection is cheap (condition checks only); actual NS compute is
720
- # deferred so it can overlap with the first chunk's A2A gather.
721
- deferred_expert_work = []
722
- if self.expert_keys:
723
- batched_expert_indices = []
724
- for i, (n, p) in enumerate(zip(names, params)):
725
- if not (is_expert_param(n, self.expert_keys)
726
- and p.grad is not None):
727
- continue
728
- # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
729
- if isinstance(p.data, DTensor):
730
- has_tp = any(
731
- _is_shard(pl) and pl.dim != 0 for pl in p.placements)
732
- if has_tp:
733
- continue
734
- batched_expert_indices.append(i)
735
-
736
- if batched_expert_indices:
737
- # Save refs for deferred NS; free grads from param list.
738
- for i in batched_expert_indices:
739
- p = params[i]
740
- g = p.grad
741
- local_g = (g._local_tensor
742
- if isinstance(g, DTensor) else g)
743
- local_data = (p.data._local_tensor if isinstance(
744
- p.data, DTensor) else p.data)
745
- deferred_expert_work.append((local_data, local_g))
746
- p.grad = None
747
-
748
- # Remove batched experts from lists before expansion.
749
- keep = sorted(
750
- set(range(len(params))) - set(batched_expert_indices))
751
- names = [names[i] for i in keep]
752
- params = [params[i] for i in keep]
753
-
754
- def _run_deferred_expert_ns():
755
- """Execute deferred batched expert NS."""
756
- if not deferred_expert_work:
757
- return
758
- with record_function("muon::batched_expert_ns"):
759
- ns_steps = group["ns_steps"]
760
- for local_data, local_g in deferred_expert_work:
761
- u = zeropower_via_newtonschulz5_batched(
762
- local_g.to(COMM_DTYPE), steps=ns_steps)
763
- adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
764
- local_data.mul_(1 - lr * weight_decay)
765
- local_data.add_(u, alpha=-adjusted_lr)
766
-
767
- # Expand expert params by splitting on dim 0.
768
- logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
769
- len(params), self.expert_keys)
770
- if self.expert_keys:
771
- cache_key = tuple(id(p) for p in params)
772
- cache = self._expert_expand_cache.get(cache_key)
773
-
774
- if cache is None:
775
- # Cold path: full expansion + build cache metadata.
776
- exp_names, exp_params = _expand_expert_params(
777
- names, params, self.expert_keys)
778
-
779
- # Build per-expert-group info for hot-path grad updates.
780
- grad_info = []
781
- exp_idx = 0
782
- for orig_idx, (n, p) in enumerate(zip(names, params)):
783
- if not is_expert_param(n, self.expert_keys):
784
- exp_idx += 1
785
- continue
786
-
787
- is_dt = isinstance(p.data, DTensor)
788
- num_experts = (p.to_local() if is_dt else p.data).shape[0]
789
-
790
- # Detect TP mesh from the first expanded expert param.
791
- tp_mesh = None
792
- tp_pls = None
793
- sample = exp_params[exp_idx]
794
- if isinstance(sample.data, DTensor):
795
- tp_mesh = sample.data.device_mesh
796
- tp_pls = list(sample.data.placements)
797
-
798
- grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
799
- tp_mesh, tp_pls))
800
- exp_idx += num_experts
801
-
802
- self._expert_expand_cache[cache_key] = {
803
- 'names': exp_names,
804
- 'params': exp_params,
805
- 'grad_info': grad_info,
806
- }
807
- names, params = exp_names, exp_params
808
- else:
809
- # Hot path: reuse cached params, only update expert grads.
810
- for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
811
- tp_pls) in cache['grad_info']:
812
- p = params[orig_idx]
813
- g = p.grad
814
- local_grad = (g.to_local()
815
- if is_dt and isinstance(g, DTensor) else g)
816
- for i in range(num_experts):
817
- expert_p = cache['params'][exp_start + i]
818
- sg = local_grad[i]
819
- if tp_mesh is not None:
820
- expert_p.grad = DTensor.from_local(
821
- sg, device_mesh=tp_mesh, placements=tp_pls)
822
- else:
823
- expert_p.grad = sg
824
- p.grad = None
825
-
826
- names = cache['names']
827
- params = cache['params']
828
- else:
829
- names, params = _expand_expert_params(names, params,
830
- self.expert_keys)
831
- logger.debug("[_step_muon] after expand: %d params", len(params))
832
-
833
- param_dtensors = []
834
- name_dtensors = []
835
-
836
- param_tensors = []
837
- name_tensors = []
838
-
839
- # distributed_muon is a reference implementation for testing only.
840
- # The parallel pipeline (all2all) path below is the production path.
841
- if self.use_distributed_muon:
842
- _run_deferred_expert_ns()
843
- self.distributed_muon(names=names,
844
- params=params,
845
- group=group,
846
- lr=lr,
847
- weight_decay=weight_decay,
848
- qk_logits=qk_logits)
849
- return
850
-
851
- for n, p in zip(names, params):
852
- if p is None or p.grad is None:
853
- continue
854
- if isinstance(p.data, DTensor):
855
- if all(
856
- isinstance(placement, Replicate)
857
- for placement in p.placements):
858
- logger.debug(
859
- "[route] %s → base (DTensor all-Replicate), "
860
- "shape=%s, placements=%s", n, p.shape, p.placements)
861
- param_tensors.append(p)
862
- name_tensors.append(n)
863
- else:
864
- logger.debug(
865
- "[route] %s → parallel (DTensor), shape=%s, "
866
- "placements=%s, mesh=%s", n, p.shape, p.placements,
867
- p.device_mesh.mesh_dim_names)
868
- param_dtensors.append(p)
869
- name_dtensors.append(n)
870
- elif isinstance(p.data, torch.Tensor):
871
- logger.debug("[route] %s → base (plain tensor), shape=%s", n,
872
- p.data.shape)
873
- param_tensors.append(p)
874
- name_tensors.append(n)
875
- else:
876
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
877
-
878
- logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
879
- f"{len(param_tensors)} Tensors → base")
880
-
881
- def group_dtensors(dtensors, names):
882
- # To support different placements, we group parameters by placements
883
- # and run parallel Muon on each group.
884
-
885
- placement_to_params = defaultdict(lambda: ([], []))
886
-
887
- assert len(dtensors) == len(names)
888
- for p, n in zip(dtensors, names):
889
- placement_to_params[tuple([p.placements,
890
- p.device_mesh])][0].append(n)
891
- placement_to_params[tuple([p.placements,
892
- p.device_mesh])][1].append(p)
893
- return placement_to_params
894
-
895
- if len(param_dtensors) > 0:
896
- if not dist.is_initialized():
897
- raise RuntimeError(
898
- "Parallel Muon requires torch.distributed to be initialized."
899
- )
900
-
901
- dtensor_group = group_dtensors(param_dtensors, name_dtensors)
902
-
903
- # Pre-launch the first chunk's A2A gather so that the NCCL
904
- # communication overlaps with the (deferred) batched expert NS
905
- # compute on the default CUDA stream.
906
- prelaunch = None
907
- if deferred_expert_work:
908
- first_names, first_params = next(iter(dtensor_group.values()))
909
- ordered, pts, rnk, csz = self._setup_parallel(
910
- first_names, first_params, group, qk_logits)
911
- first_chunk = ordered[:csz]
912
- if first_chunk:
913
- prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
914
- group["none_grad"])
915
-
916
- _run_deferred_expert_ns()
917
-
918
- first_group = True
919
- for _, (names, params) in dtensor_group.items():
920
- pg = prelaunch if first_group else None
921
- first_group = False
922
- self.parallel(
923
- names,
924
- params,
925
- group,
926
- lr=lr,
927
- weight_decay=weight_decay,
928
- qk_logits=qk_logits,
929
- prelaunch_gather=pg,
930
- )
931
- else:
932
- _run_deferred_expert_ns()
933
-
934
- if len(param_tensors) > 0:
935
- self.base(
936
- name_tensors,
937
- param_tensors,
938
- group,
939
- lr=lr,
940
- weight_decay=weight_decay,
941
- qk_logits=qk_logits,
942
- )
943
-
944
- def _register_states_for_offload(self):
945
- """Register all optimizer state tensors with the CPU offload pool.
946
-
947
- Called once after the first step when states have been lazily created.
948
- Offloads all param states (momentum buffers for Muon, moment1/moment2
949
- for AdamW) to free GPU memory between steps.
950
- """
951
- pool = self._cpu_offload_pool
952
- tracked = 0
953
- for group in self.param_groups:
954
- for p in group["params"]:
955
- if p not in self.state:
956
- continue
957
- state = self.state[p]
958
- if group.get("use_muon", False):
959
- if "momentum_buffer" in state:
960
- pool.track(state["momentum_buffer"])
961
- tracked += 1
962
- else:
963
- if "moment1" in state:
964
- pool.track(state["moment1"])
965
- if "moment2" in state:
966
- pool.track(state["moment2"])
967
- tracked += 1
968
- logger.info("[CPUOffload] Registered %d param states for offload",
969
- tracked)
970
-
971
- @torch.no_grad
972
- def step(self, closure=None, qk_logits=None):
973
- """Perform a single optimization step.
974
-
975
- Args:
976
- closure (Callable, optional): A closure that reevaluates the model
977
- and returns the loss.
978
- qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
979
- to 1D tensors of shape (num_heads,), representing the maximum
980
- QK logits across all tokens, computed as
981
- (1 / sqrt(head_dim)) * (Q @ K^T).
982
- """
983
- loss = None
984
- if closure is not None:
985
- with torch.enable_grad():
986
- loss = closure()
987
-
988
- # H2D: reload optimizer states from CPU before computation.
989
- if self.cpu_offload and self._offload_initialized:
990
- self._cpu_offload_pool.reload()
991
-
992
- logger.debug("[Muon.step] expert_keys=%s, %d param groups",
993
- self.expert_keys, len(self.param_groups))
994
-
995
- for i, group in enumerate(self.param_groups):
996
- if group["use_muon"]:
997
- logger.debug("[Muon.step] group %d: use_muon=True, %d params",
998
- i, len(group["params"]))
999
- self._step_muon(group, qk_logits=qk_logits)
1000
- else:
1001
- logger.debug(
1002
- "[Muon.step] group %d: use_muon=False (AdamW), %d params",
1003
- i, len(group["params"]))
1004
- step_adamw(self.state, group)
1005
-
1006
- # D2H: offload optimizer states to CPU after computation.
1007
- if self.cpu_offload:
1008
- if not self._offload_initialized:
1009
- if self._cpu_offload_pool is None:
1010
- self._cpu_offload_pool = CPUOffloadPool()
1011
- self._register_states_for_offload()
1012
- self._offload_initialized = True
1013
- self._cpu_offload_pool.offload()
1014
-
1015
- return loss
1016
-
1017
- # ------------------------------------------------------------------
1018
- # CPU offload public helpers
1019
- # ------------------------------------------------------------------
1020
-
1021
- def turn_on_cpu_offload(self):
1022
- """Enable CPU offload for optimizer states."""
1023
- if self.cpu_offload:
1024
- return
1025
- logger.info("[Muon] turn_on_cpu_offload")
1026
- self.cpu_offload = True
1027
- if not self.state:
1028
- return
1029
- self._cpu_offload_pool = CPUOffloadPool()
1030
- self._offload_initialized = False
1031
- self._register_states_for_offload()
1032
- self._offload_initialized = True
1033
- self._cpu_offload_pool.offload()
1034
-
1035
- def turn_off_cpu_offload(self):
1036
- """Disable CPU offload and keep optimizer states resident on GPU."""
1037
- if not self.cpu_offload:
1038
- return
1039
- logger.info("[Muon] turn_off_cpu_offload")
1040
- if self._offload_initialized:
1041
- self._cpu_offload_pool.reload()
1042
- torch.cuda.current_stream().synchronize()
1043
- self._cpu_offload_pool = None
1044
- self._offload_initialized = False
1045
- self.cpu_offload = False
1046
-
1047
- # ------------------------------------------------------------------
1048
- # Checkpoint support for cpu_offload
1049
- # ------------------------------------------------------------------
1050
-
1051
- def state_dict(self) -> dict:
1052
- if self.cpu_offload:
1053
- raise RuntimeError(
1054
- "Muon.state_dict() requires turn_off_cpu_offload() before checkpoint save."
1055
- )
1056
- return super().state_dict()
1057
-
1058
- def load_state_dict(self, state_dict: dict) -> None:
1059
- if self.cpu_offload:
1060
- raise RuntimeError(
1061
- "Muon.load_state_dict() requires turn_off_cpu_offload() before checkpoint load."
1062
- )
1063
- super().load_state_dict(state_dict)
1064
-
1065
- # Invalidate adamw.py's module-level tensor caches so that
1066
- # the next step rebuilds them with the newly loaded state tensors.
1067
- _placement_cache.clear()
1068
- _tensor_cache.clear()
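The constructor's error message above already prescribes the intended entry point; expanded into a runnable sketch (`get_kernel` is the Hugging Face kernels loader; the hyperparameters are illustrative, not recommendations):

import torch
from kernels import get_kernel

optimizer_mod = get_kernel("motif-technologies/optimizer")

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 8)).cuda()

# Buckets 2D weights into a Muon group and {0, 1}-D params into an AdamW group,
# setting the required "use_muon" key on each.
params = optimizer_mod.muon.get_default_muon_param_groups(model)
optim = optimizer_mod.Muon(params, lr=0.02, momentum=0.95, weight_decay=0.1)

loss = model(torch.randn(16, 64, device="cuda")).square().mean()
loss.backward()
optim.step()
optim.zero_grad()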
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py DELETED
@@ -1,240 +0,0 @@
1
- from itertools import repeat
2
- from math import inf, sqrt
3
-
4
- import numpy as np
5
- import torch
6
-
7
- from .matmul_transpose_triton import matmul_transpose_assign
8
-
9
- COMM_DTYPE = torch.bfloat16
10
- DEFAULT_CHUNK_SIZE_RATIO = 4
11
-
12
-
13
- def _optimal_quintic(l, u, max_iter=1000):
14
- """
15
- Use the simplified Remez algorithm to find the optimal odd quintic approximant
16
- to the constant function x -> 1 over the interval [l, u].
17
-
18
- Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
19
- approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
20
- two interior equioscillation nodes q, r until convergence. Returns the
21
- closed-form equioscillating solution when l ≈ u.
22
-
23
- Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
24
- (NaN or inf). Raises RuntimeError if convergence is not reached within
25
- max_iter iterations.
26
- """
27
- assert 0 <= l <= u
28
- if 1 - 5e-6 <= l / u:
29
- return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
30
- q = (3 * l + u) / 4
31
- r = (l + 3 * u) / 4
32
- E = inf
33
- for _ in range(max_iter):
34
- old_E = E
35
- LHS = np.array(
36
- [
37
- [l, l**3, l**5, 1],
38
- [q, q**3, q**5, -1],
39
- [r, r**3, r**5, 1],
40
- [u, u**3, u**5, -1],
41
- ]
42
- )
43
- a, b, c, E = np.linalg.solve(LHS, np.ones(4))
44
- if not np.all(np.isfinite([a, b, c, E])):
45
- raise ValueError(
46
- f"_optimal_quintic: non-finite solve result a={a}, b={b}, c={c}, E={E}"
47
- )
48
- q, r = np.sqrt(
49
- (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) / (10 * c)
50
- )
51
- if not np.all(np.isfinite([q, r])):
52
- raise ValueError(f"_optimal_quintic: non-finite node update q={q}, r={r}")
53
- if abs(old_E - E) <= 1e-15:
54
- break
55
- else:
56
- raise RuntimeError(
57
- f"_optimal_quintic: did not converge after {max_iter} iterations"
58
- )
59
- return float(a), float(b), float(c)
60
-
61
-
62
- def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
63
- """
64
- Compute the Polar Express coefficient series for `num_iters` quintic iterations.
65
-
66
- Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
67
- compose to map singular values from [l, 1] toward 1. At each step:
68
- 1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
69
- prevents near-zero singular values from stalling by raising the effective
70
- lower bound; if it is active (cushion*u > l), the coefficients are
71
- rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
72
- 2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
73
- last iteration, providing numerical headroom at the cost of a slightly slower
74
- final convergence step.
75
- 3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).
76
-
77
- Returns a list of (a, b, c) tuples, one per iteration.
78
-
79
- Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
80
- Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
81
- """
82
- u = 1
83
- assert 0 <= l <= u
84
- safety_factor = 1 + safety_factor_eps
85
- coefficients = []
86
- for iter in range(num_iters):
87
- a, b, c = _optimal_quintic(max(l, cushion * u), u)
88
- if cushion * u > l:
89
- pl = a * l + b * l**3 + c * l**5
90
- pu = a * u + b * u**3 + c * u**5
91
- rescaler = 2 / (pl + pu)
92
- a *= rescaler
93
- b *= rescaler
94
- c *= rescaler
95
- if iter < num_iters - 1:
96
- a /= safety_factor
97
- b /= safety_factor**3
98
- c /= safety_factor**5
99
- coefficients.append((a, b, c))
100
- l = a * l + b * l**3 + c * l**5
101
- u = 2 - l
102
- return coefficients
103
-
104
-
105
- # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
106
- # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
107
- # approximant to x->1 over the current singular-value interval, computed once at
108
- # import time and reused across all optimizer steps.
109
- #
110
- # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
111
- # - Former: empirically tuned to maximize slope at zero; did not converge
112
- # singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
113
- # of the true polar factor UV^T.
114
- # - Polar Express: analytically optimal per step, adapting to the shrinking
115
- # singular-value interval [l, u] as iterations progress; converges all
116
- # singular values to 1, producing the exact polar factor UV^T.
117
- _coeffs_list = _optimal_composition(
118
- l=1e-3, num_iters=10, safety_factor_eps=1e-2, cushion=0.02
119
- )
120
-
121
-
122
- # This code is adapted from:
123
- # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
124
- # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
125
- # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
126
- @torch.no_grad()
127
- def _zeropower_via_newtonschulz5(G, steps):
128
- """
129
- Compute the polar factor of G via the Polar Express method.
130
-
131
- Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
132
- are the Polar Express coefficients from `_coeffs_list`. Each step is the
133
- optimal odd quintic approximant to x -> 1 over the current singular-value
134
- interval, minimizing the maximum approximation error (Remez / minimax criterion).
135
- The composition maps singular values from [l, 1] to near 1, producing the
136
- polar factor (orthogonal factor in the polar decomposition G = UP).
137
-
138
- `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
139
- cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
140
-
141
- Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
142
- Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
143
- """
144
- assert len(G.shape) == 2
145
- assert G.dtype == COMM_DTYPE
146
- X = G # no manual typecast
147
-
148
- if G.size(0) > G.size(1):
149
- X = X.T
150
-
151
- X = X / (X.norm() + 1e-7)
152
- hs = _coeffs_list[:steps] + list(
153
- repeat(_coeffs_list[-1], steps - len(_coeffs_list))
154
- )
155
- buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
156
- buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
157
- # Perform the NS iterations
158
- for a, b, c in hs:
159
- matmul_transpose_assign(X, buf1)
160
- matmul_transpose_assign(buf1, buf2)
161
- buf1.mul_(b).add_(buf2, alpha=c)
162
- X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
163
-
164
- if G.size(0) > G.size(1):
165
- X = X.T
166
-
167
- return X
168
-
169
-
170
- @torch.no_grad()
171
- def _zeropower_via_newtonschulz5_batched(G, steps):
172
- """Batched polar factor computation for 3D (E, out, in) tensors.
173
-
174
- Same algorithm as ``_zeropower_via_newtonschulz5`` but uses
175
- ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
176
- processing all E expert matrices in a single batched call.
177
- """
178
- assert len(G.shape) == 3
179
- assert G.dtype == COMM_DTYPE
180
- X = G
181
-
182
- if G.size(1) > G.size(2):
183
- X = X.transpose(-2, -1)
184
-
185
- # Per-expert Frobenius norm.
186
- X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
187
-
188
- hs = _coeffs_list[:steps] + list(
189
- repeat(_coeffs_list[-1], steps - len(_coeffs_list))
190
- )
191
- for a, b, c in hs:
192
- buf1 = torch.bmm(X, X.transpose(-2, -1))
193
- buf2 = torch.bmm(buf1, buf1.transpose(-2, -1))
194
- buf1.mul_(b).add_(buf2, alpha=c)
195
- X = torch.baddbmm(X, buf1, X, alpha=1.0, beta=a)
196
-
197
- if G.size(1) > G.size(2):
198
- X = X.transpose(-2, -1)
199
-
200
- return X
201
-
202
-
203
- _ns_per_shape: dict[tuple[int, ...], callable] = {}
204
- _use_compile = True
205
-
206
-
207
- def set_ns_compile(enabled: bool):
208
- """Toggle torch.compile for Newton-Schulz iteration."""
209
- global _use_compile
210
- _use_compile = enabled
211
-
212
-
213
- def zeropower_via_newtonschulz5(G, steps=5):
214
- if not _use_compile:
215
- return _zeropower_via_newtonschulz5(G, steps)
216
- key = G.shape
217
- if key not in _ns_per_shape:
218
- _ns_per_shape[key] = torch.compile(_zeropower_via_newtonschulz5,
219
- options={
220
- "triton.cudagraphs": True,
221
- "shape_padding": False
222
- })
223
- torch.compiler.cudagraph_mark_step_begin()
224
- return _ns_per_shape[key](G, steps).clone()
225
-
226
-
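
The `.clone()` and `cudagraph_mark_step_begin()` pair in the dispatcher above is load-bearing. A minimal sketch of why, using a stand-in function (the lambda below is illustrative, not part of this module):

    # With triton.cudagraphs, the compiled callable returns a tensor backed
    # by a CUDA-graph-owned static buffer that is overwritten on the next
    # replay. clone() copies the result out of that buffer so it can outlive
    # the step; cudagraph_mark_step_begin() tells the runtime a new
    # iteration (and thus a safe overwrite point) has begun.
    import torch

    fn = torch.compile(lambda x: x * 2, options={"triton.cudagraphs": True})
    torch.compiler.cudagraph_mark_step_begin()
    out = fn(torch.ones(4, device="cuda")).clone()  # safe to keep afterwards
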
227
- def zeropower_via_newtonschulz5_batched(G, steps=5):
228
- """Compile-cached batched Newton-Schulz for 3D expert tensors."""
229
- if not _use_compile:
230
- return _zeropower_via_newtonschulz5_batched(G, steps)
231
- key = G.shape
232
- if key not in _ns_per_shape:
233
- _ns_per_shape[key] = torch.compile(
234
- _zeropower_via_newtonschulz5_batched,
235
- options={
236
- "triton.cudagraphs": True,
237
- "shape_padding": False
238
- })
239
- torch.compiler.cudagraph_mark_step_begin()
240
- return _ns_per_shape[key](G, steps).clone()
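
The quintic update in `_zeropower_via_newtonschulz5` is easier to audit against a plain-matmul version. A minimal sketch, assuming a wide input (rows <= cols) and using the classic fixed Muon coefficient tuple rather than the Polar Express `_coeffs_list`:

    import torch

    def ns_reference(G: torch.Tensor, coeffs) -> torch.Tensor:
        # Plain-matmul stand-in for the Triton matmul_transpose_assign path.
        X = G / (G.norm() + 1e-7)        # bring singular values into (0, 1]
        for a, b, c in coeffs:
            A = X @ X.T                  # singular values become s^2
            X = a * X + (b * A + c * (A @ A)) @ X  # maps s -> a*s + b*s^3 + c*s^5
        return X

    G = torch.randn(64, 128)
    X = ns_reference(G, [(3.4445, -4.7750, 2.0315)] * 5)  # legacy fixed tuples
    print(torch.linalg.svdvals(X)[:4])   # should sit in a band around 1
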
build/torch210-cxx11-cu126-x86_64-linux/optimizer/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import sys
3
-
4
- import importlib.util
5
- from pathlib import Path
6
- from types import ModuleType
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is: once it is added to `sys.modules`,
10
- # it would also be picked up by other imports. So we build a module name
11
- # that is unique per file, by hex-encoding the hash of the absolute path
12
- # and registering the module under that name.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
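
Usage sketch of the shim above, with a placeholder path (not a file in this repo); the point is that the path-hash name keeps a reusable name like `optimizer` out of `sys.modules`:

    import importlib.util
    import sys
    from pathlib import Path

    def load_by_path(file_path: Path):
        # Same pattern as _import_from_path: the module is registered under a
        # path-derived name, so `import optimizer` elsewhere is unaffected.
        name = format(hash(file_path.resolve()) & (2**64 - 1), "x")
        spec = importlib.util.spec_from_file_location(name, file_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules[name] = module
        spec.loader.exec_module(module)
        return module

    # mod = load_by_path(Path("/tmp/example_pkg/__init__.py"))  # placeholder
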
build/torch210-cxx11-cu126-x86_64-linux/pipeline.py DELETED
@@ -1,468 +0,0 @@
1
- import logging
2
- from typing import Generator
3
-
4
- import torch
5
- import torch.distributed as dist
6
- from torch.distributed.tensor import DTensor
7
- from torch.profiler import record_function
8
-
9
- from .core import _muon_state, adjust_lr_for_muon
10
- from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
11
- from .qk_clip import compute_scales
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
- # ======================================================================
16
- # Stage helpers
17
- # ======================================================================
18
-
19
-
20
- def _launch_gather(
21
- params: list[DTensor],
22
- owned_params: list[DTensor],
23
- param_to_state: dict[int, _muon_state],
24
- rank: int,
25
- num_ranks: int,
26
- process_group: dist.ProcessGroup,
27
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
28
- """Allocate gather buffers, build send/recv, and launch async all-to-all.
29
-
30
- Returns:
31
- work: Async operation handle.
32
- recv_buf: Flat receive buffer (needed by ``_complete_gather``).
33
- gathered_grads: ``{id(p): empty_tensor}`` for owned params,
34
- ``None`` for non-owned.
35
- recv_counts: Per-source-rank element counts.
36
- """
37
- # Allocate gathered-grad buffers
38
- gathered_grads: dict[int, torch.Tensor | None] = {}
39
- for p in params:
40
- state = param_to_state[id(p)]
41
- if rank == state.worker_rank:
42
- gathered_grads[id(p)] = torch.empty(p.shape,
43
- dtype=COMM_DTYPE,
44
- device="cuda")
45
- else:
46
- gathered_grads[id(p)] = None
47
-
48
- # Build send buffer – batch grad copies via torch.cat
49
- # (1-2 fused kernels vs N individual narrow().copy_() calls).
50
- send_counts = [0] * num_ranks
51
- for p in params:
52
- state = param_to_state[id(p)]
53
- send_counts[state.worker_rank] += state.rank_numels[rank]
54
-
55
- total_send = sum(send_counts)
56
- if total_send > 0:
57
- # Group grad slices by destination rank in a single pass.
58
- dst_to_grads = [[] for _ in range(num_ranks)]
59
- for p in params:
60
- state = param_to_state[id(p)]
61
- n = state.rank_numels[rank]
62
- if n > 0:
63
- g = p.grad.to_local()
64
- dst_to_grads[state.worker_rank].append(g.reshape(-1))
65
-
66
- # Flatten in dst order and cat once.
67
- all_slices = []
68
- for dst in range(num_ranks):
69
- all_slices.extend(dst_to_grads[dst])
70
- send_buf = torch.cat(all_slices)
71
- if send_buf.dtype != COMM_DTYPE:
72
- send_buf = send_buf.to(COMM_DTYPE)
73
- else:
74
- send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
75
-
76
- # Build recv buffer
77
- recv_counts = [0] * num_ranks
78
- for src in range(num_ranks):
79
- total = 0
80
- for p in owned_params:
81
- state = param_to_state[id(p)]
82
- assert state.worker_rank == rank
83
- total += state.rank_numels[src]
84
- recv_counts[src] = total
85
-
86
- recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
87
-
88
- # Launch async all-to-all
89
- logger.debug(f"send_buf size: {send_buf.numel()}, "
90
- f"recv_buf size: {recv_buf.numel()}, "
91
- f"recv_counts: {recv_counts}, "
92
- f"send_counts: {send_counts}, "
93
- f"process_group: {str(process_group)}")
94
- work = dist.all_to_all_single(
95
- recv_buf,
96
- send_buf,
97
- output_split_sizes=recv_counts,
98
- input_split_sizes=send_counts,
99
- group=process_group,
100
- async_op=True,
101
- )
102
-
103
- return work, recv_buf, gathered_grads, recv_counts
104
-
105
-
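
The split-size bookkeeping above follows `all_to_all_single` semantics: `input_split_sizes[d]` elements of `send_buf` go to rank `d`, and `output_split_sizes[s]` elements of `recv_buf` arrive from rank `s`. A hedged two-rank sketch (requires an initialized process group, so illustrative only):

    import torch
    import torch.distributed as dist

    def demo(pg: dist.ProcessGroup) -> torch.Tensor:
        # Every rank sends 2 elements to rank 0 and 1 element to rank 1.
        rank = dist.get_rank(pg)
        send = torch.arange(3.0, device="cuda")
        out_splits = [2, 2] if rank == 0 else [1, 1]  # per-source counts
        recv = torch.empty(sum(out_splits), device="cuda")
        dist.all_to_all_single(recv, send,
                               output_split_sizes=out_splits,
                               input_split_sizes=[2, 1],
                               group=pg)
        return recv
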
106
- def _complete_gather(
107
- recv_buf: torch.Tensor,
108
- recv_counts: list[int],
109
- owned_params: list[DTensor],
110
- gathered_grads: dict[int, torch.Tensor | None],
111
- param_to_state: dict[int, _muon_state],
112
- rank: int,
113
- ) -> None:
114
- """Reconstruct gathered grads from the recv buffer (in-place)."""
115
- off = 0
116
- for src in range(len(recv_counts)):
117
- if recv_counts[src] == 0:
118
- continue
119
-
120
- block = recv_counts[src]
121
- inner_off = 0
122
- for p in owned_params:
123
- state = param_to_state[id(p)]
124
- assert state.worker_rank == rank
125
-
126
- indices = state.rank_indices[src]
127
-
128
- shard_view = gathered_grads[id(p)][indices]
129
- n = shard_view.numel()
130
- if n == 0:
131
- continue
132
-
133
- sg = recv_buf.narrow(0, off + inner_off, n)
134
- sg = sg.reshape(shard_view.shape)
135
- gathered_grads[id(p)][indices] = sg
136
-
137
- inner_off += n
138
- assert inner_off == block
139
- off += block
140
-
141
-
142
- def _compute_ns(
143
- owned_params: list[DTensor],
144
- gathered_grads: dict[int, torch.Tensor | None],
145
- ns_steps: int,
146
- ) -> dict[int, torch.Tensor | None]:
147
- """Run Newton-Schulz orthogonalization on owned parameters.
148
-
149
- Returns:
150
- computed_us: ``{id(p): orthogonalized_update}`` for owned params.
151
- """
152
- computed_us: dict[int, torch.Tensor | None] = {}
153
- for p in owned_params:
154
- u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
155
- gathered_grads[id(p)] = None # free gathered grad
156
- computed_us[id(p)] = u
157
- return computed_us
158
-
159
-
160
- def _launch_scatter(
161
- params: list[DTensor],
162
- owned_params: list[DTensor],
163
- param_to_state: dict[int, _muon_state],
164
- rank: int,
165
- num_ranks: int,
166
- process_group: dist.ProcessGroup,
167
- computed_us: dict[int, torch.Tensor | None],
168
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
169
- """Allocate scatter buffers, build send/recv, and launch async all-to-all.
170
-
171
- Returns:
172
- work: Async operation handle.
173
- recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
174
- scattered_us: Empty dict, populated by ``_complete_scatter`` with
175
- zero-copy views into ``recv_buf``.
176
- recv_counts: Per-source-rank element counts.
177
- """
178
- # scattered_us is populated by _complete_scatter with zero-copy views
179
- # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
180
- # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
181
- # so _update_params can iterate all params without KeyError.
182
- scattered_us: dict[int, torch.Tensor] = {}
183
- for p in params:
184
- if param_to_state[id(p)].rank_numels[rank] == 0:
185
- scattered_us[id(p)] = torch.empty_like(p.to_local(),
186
- dtype=COMM_DTYPE)
187
-
188
- # Build send buffer – batch via torch.cat
189
- # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
190
- send_counts = [0] * num_ranks
191
- if owned_params:
192
- for p in owned_params:
193
- state = param_to_state[id(p)]
194
- for dst_rank in range(num_ranks):
195
- send_counts[dst_rank] += state.rank_numels[dst_rank]
196
-
197
- total_send = sum(send_counts)
198
- if total_send > 0:
199
- # Cache u_full conversions to avoid redundant .to() per dst_rank.
200
- u_fulls = {}
201
- for p in owned_params:
202
- u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
203
-
204
- # Collect slices in dst order (matches all-to-all send layout).
205
- all_slices = []
206
- for dst_rank in range(num_ranks):
207
- for p in owned_params:
208
- state = param_to_state[id(p)]
209
- su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
210
- if su.numel() > 0:
211
- all_slices.append(su)
212
-
213
- send_buf = torch.cat(all_slices) if all_slices else torch.empty(
214
- 0, dtype=COMM_DTYPE, device="cuda")
215
- else:
216
- send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
217
-
218
- # Build recv buffer
219
- recv_counts = [0] * num_ranks
220
- for src in range(num_ranks):
221
- total = 0
222
- for p in params:
223
- state = param_to_state[id(p)]
224
- if state.worker_rank != src:
225
- continue
226
- total += state.rank_numels[rank]
227
- recv_counts[src] = total
228
-
229
- recv_total = sum(recv_counts)
230
- recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
231
-
232
- # Launch async all-to-all
233
- work = dist.all_to_all_single(
234
- recv_buf,
235
- send_buf,
236
- output_split_sizes=recv_counts,
237
- input_split_sizes=send_counts,
238
- group=process_group,
239
- async_op=True,
240
- )
241
-
242
- return work, recv_buf, scattered_us, recv_counts
243
-
244
-
245
- def _complete_scatter(
246
- recv_buf: torch.Tensor,
247
- recv_counts: list[int],
248
- params: list[DTensor],
249
- param_to_state: dict[int, _muon_state],
250
- rank: int,
251
- scattered_us: dict[int, torch.Tensor],
252
- ) -> None:
253
- """Populate scattered_us with zero-copy views into recv_buf.
254
-
255
- Instead of pre-allocating tensors and copying, we assign views directly
256
- from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
257
- The underlying storage of ``recv_buf`` is kept alive through the views
258
- until ``scattered_us`` is cleared after ``_update_params``.
259
- """
260
- off = 0
261
- for src in range(len(recv_counts)):
262
- block = recv_counts[src]
263
- if block == 0:
264
- continue
265
-
266
- inner_off = 0
267
- for p in params:
268
- state = param_to_state[id(p)]
269
- if state.worker_rank != src:
270
- continue
271
- n = state.rank_numels[rank]
272
- if n == 0:
273
- continue
274
-
275
- scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
276
- n).view_as(p.to_local())
277
-
278
- inner_off += n
279
-
280
- assert inner_off == block
281
- off += block
282
-
283
-
284
- def _update_params(
285
- params: list[DTensor],
286
- param_to_state: dict[int, _muon_state],
287
- rank: int,
288
- scattered_us: dict[int, torch.Tensor],
289
- lr: float,
290
- weight_decay: float,
291
- ) -> None:
292
- """Apply weight decay, Muon update, and optional QK clipping.
293
-
294
- Uses batched ``_foreach_mul_`` for weight decay and batched
295
- ``_foreach_add_`` for the Muon update, grouping parameters by
296
- adjusted_lr to minimize kernel launches while preserving float32
297
- precision for the alpha scaling.
298
- """
299
- if not params:
300
- return
301
-
302
- # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
303
- p_locals = [p._local_tensor for p in params]
304
- torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
305
-
306
- # Group params by adjusted_lr so _foreach_add_ can use a single
307
- # alpha per group (preserves float32 precision for alpha scaling).
308
- lr_groups: dict[float, tuple[list, list]] = {}
309
- for p in params:
310
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
311
- if adjusted_lr not in lr_groups:
312
- lr_groups[adjusted_lr] = ([], [])
313
- lr_groups[adjusted_lr][0].append(p._local_tensor)
314
- lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
315
-
316
- for adjusted_lr, (p_group, u_group) in lr_groups.items():
317
- torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
318
-
319
- # QK clipping – applied directly on the local tensor to
320
- # avoid DTensor sharding-propagation issues with _StridedShard.
321
- for p in params:
322
- state = param_to_state[id(p)]
323
- if state.qk_clip_state is None:
324
- continue
325
- scales_full = compute_scales(p, state.qk_clip_state)
326
- if scales_full is not None:
327
- ratio = p.shape[0] // scales_full.shape[0]
328
- idx0 = state.rank_indices[rank][0]
329
- if isinstance(idx0, slice):
330
- start = idx0.start or 0
331
- idx0 = torch.arange(start,
332
- idx0.stop,
333
- device=scales_full.device)
334
- row_scales = scales_full[idx0 // ratio]
335
- p._local_tensor.mul_(row_scales.view(-1, 1))
336
-
337
-
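
The adjusted-lr grouping in `_update_params` exists because `_foreach_add_` takes a single scalar `alpha` per call. A minimal sketch of the same grouping on plain tensors (shapes and lr values are illustrative):

    import math
    import torch

    params = [torch.randn(512, 512) for _ in range(4)]
    updates = [torch.randn_like(p) for p in params]
    lr = 0.02

    groups: dict[float, tuple[list, list]] = {}
    for p, u in zip(params, updates):
        adjusted = lr * 0.2 * math.sqrt(max(p.shape[:2]))  # adjust_lr_for_muon
        ps, us = groups.setdefault(adjusted, ([], []))
        ps.append(p)
        us.append(u)

    for adjusted, (ps, us) in groups.items():
        torch._foreach_add_(ps, us, alpha=-adjusted)  # one fused kernel per group
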
338
- # ======================================================================
339
- # Pre-launch helper for overlapping first chunk's gather with other work.
340
- # ======================================================================
341
-
342
-
343
- @torch.no_grad()
344
- def prelaunch_first_gather(
345
- params: list[DTensor],
346
- param_to_state: dict[int, _muon_state],
347
- rank: int,
348
- none_grad: bool,
349
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
350
- """Launch the first chunk's A2A gather early for overlap with other compute.
351
-
352
- Call this *before* expensive GPU work (e.g. batched expert NS) so that
353
- the NCCL all-to-all runs concurrently on the NCCL stream while the
354
- default stream executes compute.
355
-
356
- Returns the same 4-tuple that ``_launch_gather`` produces, which should
357
- be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
358
- """
359
- process_group = param_to_state[id(params[0])].process_group
360
- num_ranks = dist.get_world_size(group=process_group)
361
- owned_params = [
362
- p for p in params if param_to_state[id(p)].worker_rank == rank
363
- ]
364
-
365
- with record_function("muon::prelaunch_gather"):
366
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
367
- params, owned_params, param_to_state, rank, num_ranks,
368
- process_group)
369
-
370
- if none_grad:
371
- for p in params:
372
- p.grad = None
373
-
374
- return work, recv_buf, gathered_grads, recv_counts
375
-
376
-
377
- # ======================================================================
378
- # Main generator – thin orchestrator that wires stages together.
379
- # ======================================================================
380
-
381
-
382
- @torch.no_grad()
383
- def muon_chunk_pipeline(
384
- params: list[DTensor],
385
- param_to_state: dict[int, _muon_state],
386
- rank: int,
387
- ns_steps: int,
388
- lr: float,
389
- weight_decay: float,
390
- none_grad: bool,
391
- prelaunch_gather: tuple | None = None,
392
- ) -> Generator[None, None, None]:
393
- """Process one chunk of parameters through the full Muon pipeline.
394
-
395
- Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
396
-
397
- Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
398
- that communication and computation overlap across chunks. Async
399
- communication is launched via ``async_op=True`` and completed after
400
- the yield with ``work.wait()``.
401
-
402
- Overlap happens because :func:`run_pipeline` admits one new chunk
403
- per iteration (staggered admission). While chunk *N* does NS
404
- compute on the default CUDA stream, chunk *N+1*'s async all-to-all
405
- runs concurrently on the NCCL stream — no separate ``comm_stream``
406
- is required.
407
-
408
- If ``prelaunch_gather`` is provided, the gather was already launched
409
- by :func:`prelaunch_first_gather` and we skip launching it again.
410
-
411
- Yields exactly **2** times:
412
-
413
- 1. After launching async all-to-all gather (or immediately if pre-launched).
414
- 2. After launching async all-to-all scatter.
415
- """
416
- process_group = param_to_state[id(params[0])].process_group
417
- num_ranks = dist.get_world_size(group=process_group)
418
- owned_params = [
419
- p for p in params if param_to_state[id(p)].worker_rank == rank
420
- ]
421
-
422
- if prelaunch_gather is not None:
423
- # Gather was pre-launched; none_grad already handled by caller.
424
- work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
425
- else:
426
- # Normal path: launch async gather.
427
- with record_function("muon::launch_gather"):
428
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
429
- params, owned_params, param_to_state, rank, num_ranks,
430
- process_group)
431
-
432
- if none_grad:
433
- for p in params:
434
- p.grad = None
435
-
436
- yield # --- YIELD 1: other chunks can launch their gather ---
437
-
438
- with record_function("muon::wait_gather"):
439
- work.wait()
440
- _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
441
- param_to_state, rank)
442
- del recv_buf
443
-
444
- # Compute stage: Newton-Schulz orthogonalization.
445
- with record_function("muon::newton_schulz"):
446
- computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
447
- gathered_grads.clear()
448
-
449
- # Scatter stage: launch the async all-to-all scatter.
450
- with record_function("muon::launch_scatter"):
451
- work, recv_buf, scattered_us, recv_counts = _launch_scatter(
452
- params, owned_params, param_to_state, rank, num_ranks,
453
- process_group, computed_us)
454
- computed_us.clear()
455
-
456
- yield # --- YIELD 2: other chunks can launch their scatter ---
457
-
458
- with record_function("muon::wait_scatter"):
459
- work.wait()
460
- _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
461
- scattered_us)
462
- del recv_buf
463
-
464
- # Update stage: apply parameter updates.
465
- with record_function("muon::update_params"):
466
- _update_params(params, param_to_state, rank, scattered_us, lr,
467
- weight_decay)
468
- scattered_us.clear()
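
The two-yield protocol is the whole contract between `muon_chunk_pipeline` and its scheduler. A toy sketch, with print statements standing in for the NCCL and compute stages:

    from typing import Generator

    def toy_chunk(idx: int) -> Generator[None, None, None]:
        print(f"chunk {idx}: launch async gather")
        yield  # scheduler may admit/advance other chunks here
        print(f"chunk {idx}: wait gather -> Newton-Schulz -> launch scatter")
        yield
        print(f"chunk {idx}: wait scatter -> apply updates")

    # Driven by run_pipeline (see async_utils.py elsewhere in this diff),
    # chunk N+1's gather launches while chunk N is in its compute stage:
    # run_pipeline((toy_chunk(i) for i in range(3)), max_concurrent=2)
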
build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py DELETED
@@ -1,198 +0,0 @@
1
- import logging
2
- import math
3
- from dataclasses import dataclass
4
-
5
- import torch
6
- from torch.distributed.tensor import DTensor
7
-
8
- from .core import normalize_fqn
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- def parse_qk_layer(name: str) -> tuple[str | None, int]:
14
- """
15
- Parse a parameter name to check if it is a query/key projection layer
16
- and return (kind, layer_index).
17
-
18
- Supported kinds:
19
- MHA/GQA: 'wq', 'wk', 'q_proj', 'k_proj'
20
- MLA: 'wq_b' (Q up-proj), 'wkv_b' (KV up-proj)
21
-
22
- Returns:
23
- (kind, layer_idx) or (None, -1) if not matched.
24
-
25
- Example:
26
- 'model.3.attn.wq.weight' -> ('wq', 3)
27
- 'model.5.attn.wk.weight' -> ('wk', 5)
28
- 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
29
- 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
30
- 'model.1.attn.wq_b.weight' -> ('wq_b', 1)
31
- 'model.0.attn.wkv_b.weight' -> ('wkv_b', 0)
32
- 'model.4.attn.v_proj.weight' -> (None, -1)
33
- """
34
- parts = normalize_fqn(name).split('.')
35
- if len(parts) < 3:
36
- return None, -1
37
-
38
- kind = parts[-2]
39
-
40
- layer_idx = -1
41
- for part in reversed(parts):
42
- if part.isdigit():
43
- layer_idx = int(part)
44
- break
45
-
46
- if kind in ('wq', 'wk', 'q_proj', 'k_proj', 'wq_b', 'wkv_b'):
47
- return kind, layer_idx
48
-
49
- return None, -1
50
-
51
-
52
- @dataclass
53
- class QKClipInfo:
54
- """Per-parameter dynamic info computed from config + runtime logits."""
55
- kind: str | None # 'wq'/'q_proj'/'wq_b' or 'wk'/'k_proj'/'wkv_b' or None
56
- indices: list[int] # which heads to consider for clipping
57
- head_dim: int # from config (qk_head_dim for MLA wq_b)
58
- threshold: float # from config
59
- logit: torch.Tensor | None
60
-
61
- # MLA-specific fields
62
- is_mla: bool = False
63
- qk_nope_head_dim: int = 0
64
- qk_rope_head_dim: int = 0
65
- v_head_dim: int = 0
66
-
67
-
68
- def get_qk_clip_info(clip_config, n, qk_logits):
69
- """Extract QK clipping info for a named parameter.
70
-
71
- Args:
72
- clip_config: QK clipping configuration dict (or None).
73
- MHA/GQA keys: head_dim, threshold, q_indices, k_indices
74
- MLA extra keys: is_mla=True, qk_nope_head_dim, qk_rope_head_dim, v_head_dim
75
- n: Parameter name string.
76
- qk_logits: Dict mapping layer indices to logit tensors (or None).
77
-
78
- Returns:
79
- QKClipInfo instance with clipping configuration for this parameter.
80
- """
81
- if clip_config is None:
82
- return None
83
-
84
- head_dim = clip_config.get('head_dim')
85
- threshold = clip_config.get('threshold')
86
- kind, layer_idx = parse_qk_layer(n)
87
- is_mla = clip_config.get('is_mla', False)
88
-
89
- logit, indices = None, []
90
- if qk_logits is not None and kind is not None:
91
- logit = qk_logits[layer_idx]
92
- if isinstance(logit, DTensor):
93
- # In TP settings, qk_logits may be DTensor
94
- # We convert it to full tensor here for simplicity
95
- logit = logit.full_tensor()
96
-
97
- if kind in ('wq_b', 'wq', 'q_proj'):
98
- indices = clip_config.get('q_indices', []) or []
99
- elif kind in ('wkv_b', 'wk', 'k_proj'):
100
- indices = clip_config.get('k_indices', []) or []
101
-
102
- if is_mla:
103
- return QKClipInfo(
104
- kind=kind,
105
- indices=indices,
106
- head_dim=head_dim,
107
- threshold=threshold,
108
- logit=logit,
109
- is_mla=True,
110
- qk_nope_head_dim=clip_config['qk_nope_head_dim'],
111
- qk_rope_head_dim=clip_config['qk_rope_head_dim'],
112
- v_head_dim=clip_config['v_head_dim'],
113
- )
114
- else:
115
- return QKClipInfo(
116
- kind=kind,
117
- indices=indices,
118
- head_dim=head_dim,
119
- threshold=threshold,
120
- logit=logit,
121
- )
122
-
123
-
124
- def compute_scales(p, qk_clip_state):
125
- """Compute per-head scaling factors for QK clipping.
126
-
127
- Returns scales tensor (√γ per head) if any head exceeds threshold, else None.
128
- For MLA wkv_b, effective row stride is qk_nope_head_dim + v_head_dim.
129
- """
130
- kind = qk_clip_state.kind
131
- indices = qk_clip_state.indices
132
- head_dim = qk_clip_state.head_dim
133
- threshold = qk_clip_state.threshold
134
- logit = qk_clip_state.logit
135
-
136
- # Check if any head exceeds threshold before allocating.
137
- head_scales = {}
138
- for logit_idx, head_idx in enumerate(indices):
139
- v_ele = float(logit[logit_idx])
140
- if v_ele > threshold:
141
- new_scale = math.sqrt(threshold / v_ele)
142
- if head_idx not in head_scales or new_scale < head_scales[head_idx]:
143
- head_scales[head_idx] = new_scale
144
- logger.info(
145
- f"[{kind}] Head {head_idx} exceeded threshold "
146
- f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
147
- )
148
-
149
- if not head_scales:
150
- return None
151
-
152
- # For MLA wkv_b, each KV head spans qk_nope_head_dim + v_head_dim rows
153
- if qk_clip_state.is_mla and kind == 'wkv_b':
154
- effective_head_dim = qk_clip_state.qk_nope_head_dim + qk_clip_state.v_head_dim
155
- else:
156
- effective_head_dim = head_dim
157
-
158
- H_global = p.shape[0] // effective_head_dim
159
- scales_full = torch.ones(H_global, device=p.data.device)
160
- for head_idx, scale in head_scales.items():
161
- scales_full[head_idx] = scale
162
- return scales_full
163
-
164
-
165
- def qk_clip(p, scales, info):
166
- """Apply per-head scaling to a Q/K projection weight matrix.
167
-
168
- Args:
169
- p: Parameter (nn.Parameter or raw tensor).
170
- scales: [n_heads] tensor, each element = √γ_h.
171
- info: QKClipInfo with kind, head_dim, and MLA sub-head dimensions.
172
-
173
- MLA sub-region scaling per Algorithm 1 (MuonClip):
174
- wq_b: q_nope rows → √γ, q_pe rows → γ
175
- wkv_b: k_nope rows → √γ, v rows → unchanged
176
- """
177
- W = p.data if isinstance(p, torch.nn.Parameter) else p
178
-
179
- if not info.is_mla:
180
- # MHA/GQA: uniform √γ applied to all rows in each head
181
- W.view(-1, info.head_dim, W.shape[1]).mul_(scales.view(-1, 1, 1))
182
- return
183
-
184
- # MLA: vectorized sub-region scaling within each head
185
- if info.kind == 'wq_b':
186
- qk_nope = info.qk_nope_head_dim
187
- qk_head_dim = qk_nope + info.qk_rope_head_dim
188
- W_3d = W.view(-1, qk_head_dim, W.shape[1]) # [H, qk_head_dim, in_dim]
189
- W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1)) # q_nope → √γ
190
- W_3d[:, qk_nope:, :].mul_((scales * scales).view(-1, 1,
191
- 1)) # q_pe → γ
192
-
193
- elif info.kind == 'wkv_b':
194
- qk_nope = info.qk_nope_head_dim
195
- kv_stride = qk_nope + info.v_head_dim
196
- W_3d = W.view(-1, kv_stride, W.shape[1]) # [H, kv_stride, in_dim]
197
- W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1)) # k_nope → √γ
198
- # v rows: not touched (k_R shared rotary unchanged)
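
Worked numeric sketch of the MHA/GQA path above (the values are made up): with threshold 100 and one head's max logit at 144, that head's rows are scaled by √(100/144) = 5/6, so after both Q and K are scaled the post-clip logit is 144·(5/6)² = 100.

    import math
    import torch

    threshold, head_dim, n_heads = 100.0, 2, 4
    logits = [50.0, 144.0, 80.0, 99.0]            # per-head max QK logits

    scales = torch.ones(n_heads)
    for h, v in enumerate(logits):
        if v > threshold:
            scales[h] = math.sqrt(threshold / v)   # sqrt(gamma_h)

    W = torch.randn(n_heads * head_dim, 16)        # [n_heads*head_dim, in_dim]
    W.view(-1, head_dim, W.shape[1]).mul_(scales.view(-1, 1, 1))
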
build/torch210-cxx11-cu128-x86_64-linux/adamw.py DELETED
@@ -1,271 +0,0 @@
1
- import logging
2
- from collections import defaultdict
3
- from typing import cast
4
-
5
- import torch
6
- from torch.distributed.tensor import DTensor
7
- from torch.profiler import record_function
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- def fused_adamw(
13
- params: list[torch.Tensor],
14
- grads: list[torch.Tensor],
15
- exp_avgs: list[torch.Tensor],
16
- exp_avg_sqs: list[torch.Tensor],
17
- max_exp_avg_sqs: list[torch.Tensor],
18
- state_steps: list[torch.Tensor],
19
- amsgrad: bool,
20
- beta1: float,
21
- beta2: float,
22
- lr: float | torch.Tensor,
23
- weight_decay: float,
24
- eps: float,
25
- maximize: bool,
26
- ) -> None:
27
- if not params:
28
- return
29
-
30
- # We only move the lr across devices when it is a Tensor on a non-CPU
31
- # device; otherwise we treat it as a scalar.
32
- lr_dict: dict | None = ({
33
- lr.device: lr
34
- } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
35
- grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
36
- [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
37
- state_steps] # type: ignore[list-item]
38
- )
39
- for (device, _), (
40
- (
41
- device_params_,
42
- device_grads_,
43
- device_exp_avgs_,
44
- device_exp_avg_sqs_,
45
- device_max_exp_avg_sqs,
46
- device_state_steps_,
47
- ),
48
- _,
49
- ) in grouped_tensors.items():
50
- device_params = cast(list[torch.Tensor], device_params_)
51
- device_grads = cast(list[torch.Tensor], device_grads_)
52
- device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
53
- device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
54
- device_state_steps = cast(list[torch.Tensor], device_state_steps_)
55
-
56
- if lr_dict is not None and device not in lr_dict:
57
- lr_dict[device] = lr.to(
58
- device=device, non_blocking=True) # type: ignore[union-attr]
59
- lr = lr_dict[device]
60
- torch._foreach_add_(device_state_steps, 1)
61
- func = torch._fused_adamw_
62
- func(
63
- device_params,
64
- device_grads,
65
- device_exp_avgs,
66
- device_exp_avg_sqs,
67
- device_max_exp_avg_sqs, # type: ignore[arg-type]
68
- device_state_steps,
69
- amsgrad=amsgrad,
70
- lr=lr, # type: ignore[arg-type]
71
- beta1=beta1,
72
- beta2=beta2,
73
- weight_decay=weight_decay,
74
- eps=eps,
75
- maximize=maximize,
76
- )
77
-
78
-
79
- def _to_local(t):
80
- """Unwrap DTensor to local tensor for fused ops."""
81
- return t._local_tensor if isinstance(t, DTensor) else t
82
-
83
-
84
- # ---------------------------------------------------------------------------
85
- # Caches for eliminating per-step Python overhead.
86
- #
87
- # Placement grouping and tensor list assembly are identical every step
88
- # (params don't change placement, moment/step tensors are the same objects
89
- # after initialisation). We cache them keyed by id() of the param list
90
- # stored in param_groups (stable across steps).
91
- #
92
- # Only gradients change each step and must be collected fresh.
93
- # ---------------------------------------------------------------------------
94
-
95
- # id(group["params"]) → dict[placement_key, list[param]]
96
- _placement_cache: dict[int, dict[tuple, list]] = {}
97
-
98
- # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
- _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
-
101
-
102
- def _step_adamw_params_slow(optimizer_state, params, group):
103
- """Uncached fallback for the rare case where some params lack grads."""
104
- params_with_grads = []
105
- grads = []
106
- moment1 = []
107
- moment2 = []
108
- state_steps = []
109
-
110
- for p in params:
111
- g = p.grad
112
- if g is None:
113
- continue
114
- state = optimizer_state[p]
115
- params_with_grads.append(_to_local(p))
116
- grads.append(_to_local(g))
117
- if "step" not in state:
118
- state["step"] = torch.zeros((),
119
- dtype=torch.float32,
120
- device=p.device)
121
- state["moment1"] = torch.zeros_like(g)
122
- state["moment2"] = torch.zeros_like(g)
123
- moment1.append(_to_local(state["moment1"]))
124
- moment2.append(_to_local(state["moment2"]))
125
- if not isinstance(state["step"], torch.Tensor):
126
- state["step"] = torch.tensor(state["step"],
127
- dtype=torch.float32,
128
- device=p.device)
129
- state_steps.append(state["step"])
130
-
131
- if not params_with_grads:
132
- return
133
-
134
- lr = group["lr"]
135
- beta1, beta2 = group["adamw_betas"]
136
- eps = group["adamw_eps"]
137
- weight_decay = group["weight_decay"]
138
-
139
- fused_adamw(
140
- params_with_grads,
141
- grads,
142
- moment1,
143
- moment2,
144
- [],
145
- state_steps,
146
- amsgrad=False,
147
- beta1=beta1,
148
- beta2=beta2,
149
- lr=lr,
150
- weight_decay=weight_decay,
151
- eps=eps,
152
- maximize=False,
153
- )
154
-
155
-
156
- def step_adamw_params(optimizer_state, params, group):
157
- """Run fused AdamW on a list of parameters sharing the same placement.
158
-
159
- After the first call, cached tensor lists (params_local, moment1,
160
- moment2, state_steps) are reused — only gradients are collected fresh.
161
-
162
- Args:
163
- optimizer_state: The optimizer's state dict (self.state in Muon).
164
- params: List of parameters to update.
165
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
166
- """
167
- # Collect grads — the only thing that changes each step.
168
- with record_function("adamw::collect_grads"):
169
- grads = []
170
- for p in params:
171
- g = p.grad
172
- if g is None:
173
- # Rare: fall back to slow path that filters per-param.
174
- _step_adamw_params_slow(optimizer_state, params, group)
175
- return
176
- grads.append(_to_local(g))
177
-
178
- tensor_key = id(params)
179
- if tensor_key not in _tensor_cache:
180
- with record_function("adamw::init_tensor_cache"):
181
- params_local = []
182
- moment1 = []
183
- moment2 = []
184
- state_steps = []
185
-
186
- for p in params:
187
- state = optimizer_state[p]
188
- params_local.append(_to_local(p))
189
- if "step" not in state:
190
- state["step"] = torch.zeros((),
191
- dtype=torch.float32,
192
- device=p.device)
193
- state["moment1"] = torch.zeros_like(p.grad)
194
- state["moment2"] = torch.zeros_like(p.grad)
195
- moment1.append(_to_local(state["moment1"]))
196
- moment2.append(_to_local(state["moment2"]))
197
- if not isinstance(state["step"], torch.Tensor):
198
- state["step"] = torch.tensor(state["step"],
199
- dtype=torch.float32,
200
- device=p.device)
201
- state_steps.append(state["step"])
202
-
203
- _tensor_cache[tensor_key] = (params_local, moment1, moment2,
204
- state_steps)
205
-
206
- params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
207
-
208
- lr = group["lr"]
209
- beta1, beta2 = group["adamw_betas"]
210
- eps = group["adamw_eps"]
211
- weight_decay = group["weight_decay"]
212
-
213
- with record_function("adamw::fused_adamw"):
214
- fused_adamw(
215
- params_local,
216
- grads,
217
- moment1,
218
- moment2,
219
- [],
220
- state_steps,
221
- amsgrad=False,
222
- beta1=beta1,
223
- beta2=beta2,
224
- lr=lr,
225
- weight_decay=weight_decay,
226
- eps=eps,
227
- maximize=False,
228
- )
229
-
230
-
231
- def step_adamw(optimizer_state, group):
232
- """Dispatch AdamW step, grouping parameters by type and placement.
233
-
234
- Placement grouping is cached after the first call since params never
235
- change their placement between steps.
236
-
237
- Args:
238
- optimizer_state: The optimizer's state dict (self.state in Muon).
239
- group: Parameter group dict.
240
- """
241
- params = group["params"]
242
- placement_key = id(params)
243
-
244
- if placement_key not in _placement_cache:
245
- with record_function("adamw::group_by_placement"):
246
- placement_to_params: dict[tuple,
247
- list[torch.Tensor]] = defaultdict(list)
248
- for p in params:
249
- match p:
250
- case DTensor():
251
- logger.debug(
252
- "[AdamW] DTensor param: shape=%s, placements=%s, "
253
- "mesh=%s, grad=%s", p.shape, p.placements,
254
- p.device_mesh.mesh_dim_names,
255
- p.grad.shape if p.grad is not None else None)
256
- placement_to_params[tuple(
257
- [p.placements, p.device_mesh])].append(p)
258
- case torch.Tensor():
259
- logger.debug(
260
- "[AdamW] plain param: shape=%s, grad=%s", p.shape,
261
- p.grad.shape if p.grad is not None else None)
262
- placement_to_params[tuple([torch.Tensor,
263
- None])].append(p)
264
-
265
- logger.debug("[AdamW] %d placement groups, %d total params",
266
- len(placement_to_params), len(params))
267
-
268
- _placement_cache[placement_key] = dict(placement_to_params)
269
-
270
- for group_params in _placement_cache[placement_key].values():
271
- step_adamw_params(optimizer_state, group_params, group)
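
The caching strategy above leans on one invariant: the list object stored in `group["params"]` is stable across steps, so `id()` is a usable cache key and only gradients need re-collecting. A stripped-down sketch of the same idea (plain momentum SGD stands in for the fused AdamW call to keep it short):

    import torch

    _buf_cache: dict[int, list[torch.Tensor]] = {}

    @torch.no_grad()
    def cached_step(params: list[torch.Tensor], lr: float = 1e-3) -> None:
        key = id(params)                   # stable: same list object each step
        if key not in _buf_cache:
            # One-time: materialize momentum buffers in lockstep with params.
            _buf_cache[key] = [torch.zeros_like(p) for p in params]
        bufs = _buf_cache[key]
        grads = [p.grad for p in params]   # the only per-step collection
        torch._foreach_mul_(bufs, 0.9)
        torch._foreach_add_(bufs, grads)
        torch._foreach_add_(params, bufs, alpha=-lr)
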
build/torch210-cxx11-cu128-x86_64-linux/async_utils.py DELETED
@@ -1,77 +0,0 @@
1
- import logging
2
- from typing import Generator
3
-
4
- logger = logging.getLogger(__name__)
5
-
6
-
7
- class _Task:
8
- """Internal: wraps a generator, advances one yield at a time."""
9
-
10
- def __init__(self, generator: Generator[None, None, None], index: int):
11
- self._generator = generator
12
- self._index = index
13
- self._steps_completed = 0
14
- self.step() # run to first yield
15
-
16
- def step(self) -> bool:
17
- try:
18
- next(self._generator)
19
- self._steps_completed += 1
20
- logger.debug("pipeline[%d] completed stage %d", self._index,
21
- self._steps_completed)
22
- return True
23
- except StopIteration:
24
- logger.debug("pipeline[%d] finished after %d stages", self._index,
25
- self._steps_completed)
26
- return False
27
-
28
- def close(self):
29
- self._generator.close()
30
-
31
-
32
- def run_pipeline(
33
- pipelines: Generator[Generator[None, None, None], None, None],
34
- max_concurrent: int,
35
- ) -> None:
36
- """Run generator-based pipelines with bounded concurrency.
37
-
38
- Each pipeline is a generator that yields at stage boundaries.
39
- The runtime interleaves pipelines so communication and computation
40
- overlap across chunks.
41
- """
42
- if max_concurrent <= 0:
43
- raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
44
-
45
- have_new = True
46
- task_index = 0
47
- previous_tasks: list[_Task] = []
48
-
49
- try:
50
- while have_new or previous_tasks:
51
- running_tasks: list[_Task] = []
52
-
53
- # Admit one new pipeline per iteration (staggered admission).
54
- # Admitting one at a time ensures that while chunk N does NS
55
- # compute on the default stream, chunk N+1's NCCL all-to-all
56
- # runs concurrently on the NCCL stream — creating real
57
- # communication/computation overlap on the GPU.
58
- if have_new and len(previous_tasks) < max_concurrent:
59
- try:
60
- gen = next(pipelines)
61
- task = _Task(gen, task_index)
62
- task_index += 1
63
- running_tasks.append(task)
64
- except StopIteration:
65
- have_new = False
66
-
67
- # Advance every previously-yielded task by one step.
68
- for task in previous_tasks:
69
- if task.step():
70
- running_tasks.append(task)
71
-
72
- previous_tasks = running_tasks
73
- except BaseException:
74
- # Clean up all in-flight generators to release GPU resources.
75
- for task in previous_tasks:
76
- task.close()
77
- raise
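
Usage sketch: three 3-stage pipelines under `max_concurrent=2`. Admission runs each new pipeline to its first yield, so pipeline N+1's first stage overlaps pipeline N's second:

    def stage_gen(name: str):
        print(name, "stage 1")
        yield
        print(name, "stage 2")
        yield
        print(name, "stage 3")

    run_pipeline((stage_gen(f"p{i}") for i in range(3)), max_concurrent=2)
    # Illustrative interleaving: p0 s1 | p1 s1, p0 s2 | p1 s2, p0 s3 | ...
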
build/torch210-cxx11-cu128-x86_64-linux/core.py DELETED
@@ -1,219 +0,0 @@
1
- import logging
2
- import math
3
- from dataclasses import dataclass
4
- from typing import List
5
-
6
- import torch
7
- from torch.distributed import ProcessGroup
8
- from torch.distributed.tensor import DTensor
9
-
10
- # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
11
- # parameter FQNs. Activation checkpointing similarly inserts
12
- # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
13
- # expert_keys, QK layer parsing) works regardless of wrapper nesting.
14
- _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- def normalize_fqn(name: str) -> str:
20
- """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
21
- return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
22
-
23
-
24
- @dataclass
25
- class _muon_state:
26
- worker_rank: int
27
- process_group: ProcessGroup
28
- rank_indices: dict[int, tuple] # local_rank -> per-dim indices
29
- rank_numels: dict[int, int] # local_rank -> numel
30
- name: str
31
- qk_clip_state: object | None = None  # QKClipInfo (see qk_clip.py); loose type avoids a circular import
32
-
33
-
34
- def _batch_momentum(
35
- grads: List[torch.Tensor],
36
- momentum_bufs: List[torch.Tensor],
37
- momentum: torch.Tensor,
38
- ) -> None:
39
- """Batched momentum update (no nesterov)."""
40
- torch._foreach_mul_(momentum_bufs, momentum)
41
- torch._foreach_add_(momentum_bufs, grads)
42
-
43
-
44
- def _batch_momentum_nesterov(
45
- grads: List[torch.Tensor],
46
- momentum_bufs: List[torch.Tensor],
47
- momentum: torch.Tensor,
48
- ) -> None:
49
- """Batched momentum update with nesterov correction."""
50
- torch._foreach_mul_(momentum_bufs, momentum)
51
- torch._foreach_add_(momentum_bufs, grads)
52
- nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
- torch._foreach_add_(grads, nesterov_terms)
54
-
55
-
56
- _compiled_momentum: dict[bool, callable] = {}
57
- _use_momentum_compile = True
58
-
59
-
60
- def set_momentum_compile(enabled: bool):
61
- """Toggle torch.compile for batched momentum."""
62
- global _use_momentum_compile
63
- _use_momentum_compile = enabled
64
-
65
-
66
- def batch_pre_ortho(
67
- grads: List[torch.Tensor],
68
- momentum_bufs: List[torch.Tensor],
69
- momentum: torch.Tensor,
70
- nesterov: bool,
71
- ) -> None:
72
- """Batched momentum update on lists of plain tensors.
73
-
74
- Mirrors dion's ``muon_update_pre_orthogonalize``.
75
- Inputs must be plain CUDA tensors (not DTensor).
76
- Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
77
-
78
- When compile is enabled, uses separately compiled functions for
79
- nesterov=True/False to avoid graph breaks from the branch.
80
- """
81
- fn = _batch_momentum_nesterov if nesterov else _batch_momentum
82
- if _use_momentum_compile:
83
- if nesterov not in _compiled_momentum:
84
- _compiled_momentum[nesterov] = torch.compile(fn)
85
- fn = _compiled_momentum[nesterov]
86
- fn(grads, momentum_bufs, momentum)
87
-
88
-
89
- def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
- """Weight-decay + update on plain tensors.
91
-
92
- Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
- lookup per call × 256+ params = massive overhead. The pipeline path uses
94
- batched _foreach_* ops instead; this function remains for base() and
95
- distributed_muon().
96
- """
97
- p_data.mul_(1 - lr * weight_decay)
98
- p_data.add_(u_data, alpha=-adjusted_lr)
99
-
100
-
101
- def update_p(p, u, lr, adjusted_lr, weight_decay):
102
- """Apply weight decay and orthogonalized update to parameter.
103
-
104
- Args:
105
- p: Parameter (torch.nn.Parameter or DTensor).
106
- u: Orthogonalized update tensor.
107
- lr: Base learning rate.
108
- adjusted_lr: Size-adjusted learning rate.
109
- weight_decay: Weight decay coefficient.
110
- """
111
- # Unwrap Parameter -> underlying data tensor.
112
- p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
- # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
- if isinstance(p_data, DTensor):
115
- p_data = p_data._local_tensor
116
- u_data = u._local_tensor if isinstance(u, DTensor) else u
117
- _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
118
-
119
-
120
- def adjust_lr_for_muon(lr, param_shape):
121
- """Scale learning rate based on parameter matrix dimensions.
122
-
123
- Args:
124
- lr: Base learning rate.
125
- param_shape: Shape of the parameter tensor.
126
-
127
- Returns:
128
- Adjusted learning rate.
129
- """
130
- A, B = param_shape[:2]
131
- # Scale the learning rate by 0.2 * sqrt(max(A, B)), following the
132
- # matrix-size adjustment described in the Muon paper.
133
- adjusted_ratio = 0.2 * math.sqrt(max(A, B))
134
- adjusted_lr = lr * adjusted_ratio
135
- return adjusted_lr
136
-
137
-
138
- def _match_key(parts, key):
139
- """Check if key matches as contiguous components in parts.
140
-
141
- Single-component keys (e.g. "experts") match any single component.
142
- Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
- """
144
- key_parts = key.split(".")
145
- key_len = len(key_parts)
146
- if key_len == 1:
147
- return key in parts
148
- return any(parts[i:i + key_len] == key_parts
149
- for i in range(len(parts) - key_len + 1))
150
-
151
-
152
- def is_expert_param(name, expert_keys):
153
- """Check if a parameter name matches any expert key (component-level)."""
154
- if not expert_keys:
155
- return False
156
- parts = normalize_fqn(name).split(".")
157
- return any(_match_key(parts, key) for key in expert_keys)
158
-
159
-
160
- def default_is_muon(name, x, expert_keys=None):
161
- normalized = normalize_fqn(name)
162
- parts = normalized.split(".")
163
- skip_keys = [
164
- "embed_tokens",
165
- "lm_head",
166
- "tok_embeddings",
167
- "output",
168
- "mhc_attn",
169
- "mhc_ffn",
170
- "lambda_proj",
171
- ]
172
- if any(key in parts for key in skip_keys):
173
- logger.info(
174
- "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
175
- normalized, name, x.ndim)
176
- return False
177
- effective_ndim = x.ndim
178
- is_expert = is_expert_param(name, expert_keys)
179
- if is_expert:
180
- effective_ndim -= 1
181
- result = effective_ndim >= 2
182
- logger.info(
183
- "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
184
- normalized, name, x.ndim, is_expert, effective_ndim,
185
- "Muon" if result else "AdamW")
186
- return result
187
-
188
-
189
- def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
190
- if is_muon_func is None:
191
- is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
-
193
- muon_params, muon_names = [], []
194
- non_muon_params, non_muon_names = [], []
195
-
196
- for n, p in model.named_parameters():
197
- if not p.requires_grad:
198
- continue
199
- if is_muon_func(n, p):
200
- muon_params.append(p)
201
- muon_names.append(n)
202
- else:
203
- non_muon_params.append(p)
204
- non_muon_names.append(n)
205
-
206
- logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
- expert_keys, len(muon_names), len(non_muon_names))
208
-
209
- return [
210
- {
211
- "params": muon_params,
212
- "names": muon_names,
213
- "use_muon": True,
214
- },
215
- {
216
- "params": non_muon_params,
217
- "use_muon": False,
218
- },
219
- ]
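
Two quick checks against the helpers above (assuming the module is importable). `_match_key` requires multi-component keys to appear as a contiguous run, and `adjust_lr_for_muon` for a 4096×1024 matrix gives 0.2·√4096 = 12.8× the base lr:

    import math

    parts = "model.layers.3.experts.w1.weight".split(".")
    assert _match_key(parts, "experts")             # single component: membership
    assert _match_key(parts, "experts.w1")          # contiguous run: matches
    assert not _match_key(parts, "experts.weight")  # non-contiguous: rejected

    assert math.isclose(adjust_lr_for_muon(3e-4, (4096, 1024)), 3e-4 * 12.8)
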
build/torch210-cxx11-cu128-x86_64-linux/cpu_offload.py DELETED
@@ -1,206 +0,0 @@
1
- """CPU offloading for optimizer states.
2
-
3
- Manages a pinned CPU memory pool and async CUDA streams to offload
4
- optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
- optimizer steps, freeing GPU memory.
6
-
7
- All tracked tensors are packed into a single flat pinned CPU buffer
8
- (per dtype). D2H and H2D copies are performed per-tensor directly
9
- between individual GPU tensors and their slice of the CPU flat buffer
10
- — no GPU staging buffer is allocated, so there is **no temporary GPU
11
- memory spike** during offload or reload.
12
-
13
- Individual tensor storages are freed after offload via
14
- ``untyped_storage().resize_(0)``, preserving tensor identity so
15
- downstream caches remain valid.
16
- """
17
-
18
- import logging
19
- from collections import defaultdict
20
-
21
- import torch
22
- from torch.distributed.tensor import DTensor
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
-
27
- class CPUOffloadPool:
28
- """Pinned CPU memory pool for async optimizer state offloading.
29
-
30
- Tracked tensors are grouped by dtype. Each group gets a single flat
31
- pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
32
- the flat buffer) to avoid allocating a GPU staging buffer.
33
- """
34
-
35
- def __init__(self):
36
- self._managed: list[torch.Tensor] = []
37
- self._storage_nbytes: dict[int, int] = {} # id(t) → bytes
38
-
39
- # Per-dtype group: populated on first offload.
40
- # dtype → dict with keys:
41
- # "indices" : list[int] managed-list indices
42
- # "offsets" : list[tuple[int,int]] (start, numel) in flat buf
43
- # "total" : int total numel
44
- # "cpu_flat" : Tensor pinned CPU buffer
45
- self._groups: dict[torch.dtype, dict] = {}
46
-
47
- self._offload_stream: torch.cuda.Stream | None = None
48
- self._device: torch.device | None = None
49
- self._initialized: bool = False
50
- self._logged: bool = False
51
-
52
- # ------------------------------------------------------------------
53
- @staticmethod
54
- def _local(t: torch.Tensor) -> torch.Tensor:
55
- """Unwrap DTensor to its local CUDA tensor."""
56
- return t._local_tensor if isinstance(t, DTensor) else t
57
-
58
- def _ensure_stream(self):
59
- if self._offload_stream is None:
60
- self._offload_stream = torch.cuda.Stream(device=self._device)
61
-
62
- # ------------------------------------------------------------------
63
- def track(self, tensor: torch.Tensor):
64
- """Register a GPU tensor for CPU offloading. Idempotent."""
65
- tid = id(tensor)
66
- if tid in self._storage_nbytes:
67
- return
68
- local = self._local(tensor)
69
- if self._device is None:
70
- self._device = local.device
71
- storage = local.untyped_storage()
72
- # Skip tensors with empty storage (e.g. empty FSDP shards)
73
- if storage.size() == 0:
74
- return
75
- self._storage_nbytes[tid] = storage.size()
76
- self._managed.append(tensor)
77
-
78
- # ------------------------------------------------------------------
79
- def _init_buffers(self):
80
- """Build per-dtype flat buffers on first offload."""
81
- # Group managed tensors by dtype.
82
- dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
83
- for idx, t in enumerate(self._managed):
84
- local = self._local(t)
85
- dtype_map[local.dtype].append((idx, local.numel()))
86
-
87
- total_cpu_bytes = 0
88
- for dtype, entries in dtype_map.items():
89
- offsets: list[tuple[int, int]] = []
90
- indices: list[int] = []
91
- off = 0
92
- for idx, n in entries:
93
- indices.append(idx)
94
- offsets.append((off, n))
95
- off += n
96
- cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
97
- self._groups[dtype] = {
98
- "indices": indices,
99
- "offsets": offsets,
100
- "total": off,
101
- "cpu_flat": cpu_flat,
102
- }
103
- total_cpu_bytes += off * cpu_flat.element_size()
104
-
105
- self._initialized = True
106
- logger.info(
107
- "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
108
- "%.2f MB pinned CPU memory",
109
- len(self._managed),
110
- len(self._groups),
111
- total_cpu_bytes / (1024**2),
112
- )
113
-
114
- # ------------------------------------------------------------------
115
- def offload(self):
116
- """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
117
- if not self._managed:
118
- return
119
- if not self._initialized:
120
- self._init_buffers()
121
- self._ensure_stream()
122
-
123
- # Offload stream waits for compute to finish.
124
- compute_event = torch.cuda.current_stream(self._device).record_event()
125
- self._offload_stream.wait_event(compute_event)
126
-
127
- offloaded_bytes = 0
128
-
129
- # Per-tensor D2H copies directly into CPU flat buffer slices.
130
- # No GPU staging buffer → no temporary GPU memory spike.
131
- with torch.cuda.stream(self._offload_stream):
132
- for dtype, grp in self._groups.items():
133
- indices = grp["indices"]
134
- offsets = grp["offsets"]
135
- cpu_flat = grp["cpu_flat"]
136
-
137
- for i, mgd_idx in enumerate(indices):
138
- local = self._local(self._managed[mgd_idx])
139
- off, n = offsets[i]
140
- cpu_flat[off : off + n].copy_(local.reshape(-1), non_blocking=True)
141
-
142
- offloaded_bytes += grp["total"] * cpu_flat.element_size()
143
-
144
- # Wait for all D2H copies to land, then free GPU storage.
145
- self._offload_stream.synchronize()
146
- for t in self._managed:
147
- storage = self._local(t).untyped_storage()
148
- if storage.size() != 0:
149
- storage.resize_(0)
150
- else:
151
- raise RuntimeError(
152
- f"Tensor storage is already freed (size=0) before offload. "
153
- f"This indicates a double-free or external interference. "
154
- f"Tensor shape: {t.shape}, dtype: {t.dtype}"
155
- )
156
-
157
- if not self._logged:
158
- logger.info(
159
- "[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
160
- offloaded_bytes / (1024**2),
161
- )
162
-
163
- # ------------------------------------------------------------------
164
- def reload(self):
165
- """Per-tensor H2D from CPU flat buffer on the default stream.
166
-
167
- Runs on the current (default) CUDA stream to avoid stream
168
- interaction issues with the parallel Muon pipeline. Since
169
- pinned CPU memory is the source, the copies overlap with
170
- GPU idle time between steps.
171
- """
172
- if not self._managed or not self._initialized:
173
- return
174
-
175
- reloaded_bytes = 0
176
-
177
- # Re-allocate all GPU storages first.
178
- for t in self._managed:
179
- local = self._local(t)
180
- storage = local.untyped_storage()
181
- if storage.size() != 0:
182
- raise RuntimeError(
183
- f"Storage should have been freed (size=0) before reload, "
184
- f"but got size={storage.size()}. "
185
- f"Tensor shape: {t.shape}, dtype: {t.dtype}"
186
- )
187
- storage.resize_(self._storage_nbytes[id(t)])
188
-
189
- # Per-tensor H2D copies from CPU flat buffer slices.
190
- # non_blocking=True with pinned source allows DMA overlap.
191
- for dtype, grp in self._groups.items():
192
- indices = grp["indices"]
193
- offsets = grp["offsets"]
194
- cpu_flat = grp["cpu_flat"]
195
-
196
- for i, mgd_idx in enumerate(indices):
197
- local = self._local(self._managed[mgd_idx])
198
- off, n = offsets[i]
199
- local.reshape(-1).copy_(cpu_flat[off : off + n], non_blocking=True)
200
-
201
- reloaded_bytes += grp["total"] * cpu_flat.element_size()
202
-
203
- if not self._logged:
204
- logger.info(
205
- "[CPUOffload] Reloaded %.2f MB (CPU → GPU)", reloaded_bytes / (1024**2)
206
- )
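
The identity-preserving free/reload that the pool relies on, in isolation (a hedged sketch; the real code also coordinates streams and events as above):

    import torch

    t = torch.randn(1024, 1024, device="cuda")
    nbytes = t.untyped_storage().size()

    # D2H into pinned memory, then free the GPU storage without dropping `t`.
    cpu_flat = torch.empty(t.numel(), dtype=t.dtype, pin_memory=True)
    cpu_flat.copy_(t.reshape(-1), non_blocking=True)
    torch.cuda.synchronize()
    t.untyped_storage().resize_(0)        # GPU bytes released; `t` still exists

    # Later: re-allocate and reload. Caches keyed on `t` never went stale.
    t.untyped_storage().resize_(nbytes)
    t.reshape(-1).copy_(cpu_flat, non_blocking=True)
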
build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py DELETED
@@ -1,232 +0,0 @@
- import torch
- import torch.distributed as dist
- from torch.distributed import ProcessGroup
- from torch.distributed.device_mesh import DeviceMesh
- from torch.distributed.tensor import DTensor
- from torch.distributed.tensor.placement_types import (Placement, Shard,
-                                                       _StridedShard)
-
-
- def _is_shard(placement: Placement) -> bool:
-     """Check if a placement is a shard type (Shard or _StridedShard).
-
-     In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
-     ``placement.is_shard()`` returns False for _StridedShard. This helper
-     handles both old and new hierarchies.
-     """
-     return isinstance(placement, (Shard, _StridedShard))
-
-
- def get_slices_of_dtensor(
-     target: DTensor | torch.Tensor,
-     local_rank: int,
-     shard_mesh: DeviceMesh,
-     shard_placements: tuple[Placement],
- ) -> tuple[slice | torch.Tensor, ...]:
-     """
-     Get per-dimension indices for a given rank's shard of the target tensor.
-
-     Uses ``Shard.local_shard_size_and_offset`` and
-     ``_StridedShard.local_shard_size_and_offset`` for correct handling of
-     both contiguous and strided (non-contiguous) sharding.
-
-     Args:
-         target (DTensor | torch.Tensor): The target tensor (for its shape).
-         local_rank (int): The local rank within the shard group.
-         shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
-         shard_placements (tuple[Placement]): The shard placements.
-
-     Returns:
-         A tuple of indices (one per tensor dim). Each element is either:
-         - A ``slice`` (for contiguous or unsharded dims)
-         - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
-     """
-
-     # find the global rank of the local rank in the shard mesh
-     rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
-
-     rank_coords = (shard_mesh.mesh == rank).nonzero()
-
-     assert len(rank_coords) == 1
-     rank_coords = tuple(rank_coords[0].tolist())
-
-     assert len(rank_coords) == len(shard_placements)
-
-     # Track per-shard-dim indices.
-     # None means "not yet sharded on this dim".
-     dim_indices: dict[int, torch.Tensor] = {}
-
-     # Caution: Assuming replicate-to-shard of the shard mesh goes with
-     # left-to-right sharding. This is ensured by the sorting logic of
-     # the construct_shard_mesh function.
-     for mesh_dim_idx, (rank_coord, placement) in enumerate(
-             zip(rank_coords, shard_placements)):
-         assert _is_shard(placement)
-
-         num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
-         shard_dim = placement.dim
-
-         # Current effective size on this dim (may already be sub-sharded)
-         if shard_dim in dim_indices:
-             curr_size = len(dim_indices[shard_dim])
-         else:
-             curr_size = target.size()[shard_dim]
-
-         # Compute indices for this level of sharding
-         if isinstance(placement, _StridedShard):
-             _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
-                 placement,
-                 curr_size,
-                 num_chunks,
-                 rank_coord,
-                 return_first_offset=False)
-             new_indices = torch.tensor(offsets, dtype=torch.long)
-         else:
-             shard_size, offset = Shard.local_shard_size_and_offset(
-                 curr_size, num_chunks, rank_coord)
-             new_indices = torch.arange(offset,
-                                        offset + shard_size,
-                                        dtype=torch.long)
-
-         # Compose with previous indices on this dim
-         if shard_dim in dim_indices:
-             dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
-         else:
-             dim_indices[shard_dim] = new_indices
-
-     # Build result tuple
-     result: list[slice | torch.Tensor] = []
-     for d in range(len(target.size())):
-         if d not in dim_indices:
-             result.append(slice(None))
-         else:
-             indices = dim_indices[d]
-             # Convert contiguous indices to slice for efficiency
-             if len(indices) > 0:
-                 start = indices[0].item()
-                 expected = torch.arange(start,
-                                         start + len(indices),
-                                         dtype=torch.long)
-                 if torch.equal(indices, expected):
-                     result.append(slice(start, start + len(indices)))
-                 else:
-                     result.append(indices)
-             else:
-                 result.append(slice(0, 0))
-
-     return tuple(result)
-
-
- _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
-                                                   ProcessGroup]] = dict()
-
-
- def construct_shard_mesh(
-     placements: tuple[Placement],
-     mesh: DeviceMesh,
- ) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
-     """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
-
-     Given a DTensor's placements and device mesh, extracts the "shard group"
-     — the set of ranks that together hold all shards of the same replica —
-     and creates a ProcessGroup for all-to-all among them.
-
-     Steps:
-       1. Sort placements: Replicate first, then Shard by (dim, granularity).
-       2. Permute the mesh tensor to match the sorted order.
-       3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
-       4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
-
-     Example — 8 GPUs, mesh shape (2, 2, 2),
-     placements ``[Shard(0), Replicate, _StridedShard(0)]``::
-
-         Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
-                  Permutation: [1, 2, 0]
-
-         Step 2 — Permute mesh dims by [1, 2, 0]:
-             Original:          Permuted:
-             [[[0,1],[2,3]],    [[[0,2],[1,3]],
-              [[4,5],[6,7]]]     [[4,6],[5,7]]]
-
-         Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
-             sub-mesh 0 = [[0,2],[1,3]]   (replica group 0)
-             sub-mesh 1 = [[4,6],[5,7]]   (replica group 1)
-             shard_placements = (_StridedShard(0), Shard(0))
-
-         Step 4 — Rank 0 → ProcessGroup([0,1,4,5])
-                  Rank 2 → ProcessGroup([2,3,6,7])
-
-     Returns:
-         ``(shard_mesh, process_group, shard_placements)``
-     """
-     my_rank = dist.get_rank()
-     assert mesh.mesh.device.type == 'cpu'
-
-     # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
-     # Reuses the mesh's existing ProcessGroup directly, avoiding the
-     # overhead of dist.new_group(). The standard path below also handles
-     # subset calls safely via use_local_synchronization=True, but this
-     # fast path is still beneficial for the common 1D shard case.
-     if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
-         key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
-         if key not in _ranks_to_dist_cache:
-             _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
-         return (*_ranks_to_dist_cache[key], tuple(placements))
-
-     mesh_tensor = mesh.mesh.clone()
-
-     # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
-     # _StridedShard comes BEFORE regular Shard on the same dim so that
-     # get_slices_of_dtensor applies the outer sharding first, matching
-     # DTensor's left-to-right (outer-to-inner) composition order.
-     def _sort_key(item):
-         index, placement = item
-         assert not placement.is_partial(), "Partial placement not supported"
-         if placement.is_replicate():
-             return (-1, 0, index)
-         assert _is_shard(placement), f"Unsupported: {type(placement)}"
-         split = (-1 / placement.split_factor if isinstance(
-             placement, _StridedShard) else 0)
-         return (placement.dim, split, index)
-
-     indexed = sorted(enumerate(placements), key=_sort_key)
-     perm, sorted_placements = zip(*indexed)
-
-     # -- Step 2: Permute mesh to match sorted placement order. --------------
-     sorted_mesh = mesh_tensor.permute(perm)
-
-     # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
-     # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
-     num_rep = sum(1 for p in sorted_placements if p.is_replicate())
-     if num_rep > 0:
-         if num_rep > 1:
-             sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
-         shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
-     else:
-         shard_meshes = [sorted_mesh]
-     shard_placements = sorted_placements[num_rep:]
-     assert len(shard_placements) == len(set(shard_placements))
-
-     # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
-     # Each rank only creates the group it belongs to, using
-     # use_local_synchronization=True so that only group members need to
-     # coordinate. This avoids deadlocks when different PP stages call
-     # construct_shard_mesh for different parameters.
-     def _cache_key(t: torch.Tensor) -> tuple:
-         return (*t.shape, *t.flatten().tolist())
-
-     my_key = None
-     for sm in shard_meshes:
-         if (my_rank == sm).any().item():
-             key = _cache_key(sm)
-             assert my_key is None, "Rank appears in multiple shard groups"
-             my_key = key
-             if key not in _ranks_to_dist_cache:
-                 pg = dist.new_group(sm.flatten().tolist(),
-                                     use_local_synchronization=True)
-                 _ranks_to_dist_cache[key] = (
-                     DeviceMesh(device_type="cuda", mesh=sm),
-                     pg,
-                 )
-
-     return (*_ranks_to_dist_cache[my_key], shard_placements)
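To make the contract between the two helpers concrete, the sketch below checks that each rank's computed slice of the full tensor equals its local shard. It is a minimal sketch assuming a 1-D mesh of 4 GPUs launched under `torchrun`, and that this module is importable as `distributed.utils` (the setup code is illustrative, not part of this module)::

    import torch
    import torch.distributed as dist
    from torch.distributed.device_mesh import init_device_mesh
    from torch.distributed.tensor import Shard, distribute_tensor

    from distributed.utils import construct_shard_mesh, get_slices_of_dtensor

    torch.manual_seed(0)  # same global tensor on every rank
    mesh = init_device_mesh("cuda", (4,))
    weight = distribute_tensor(torch.randn(8, 6), mesh, [Shard(0)])

    # Fast path: a 1-D all-shard mesh reuses the mesh's existing ProcessGroup.
    shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
        weight.placements, weight.device_mesh)

    rank = dist.get_rank(shard_pg)
    slices = get_slices_of_dtensor(weight, rank, shard_mesh, shard_placements)
    # Each rank owns rows [2*rank, 2*rank + 2) of the 8-row tensor.
    assert torch.equal(weight.full_tensor()[slices], weight.to_local())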
build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py DELETED
@@ -1,122 +0,0 @@
- # MIT License
- #
- # Copyright (c) 2025 Tianyang Lin
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in all
- # copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
-
- import torch
- import triton
- import triton.language as tl
-
-
- def get_autotune_config():
-     return [
-         triton.Config(
-             {
-                 'BLOCK_SIZE_M': blk_m,
-                 'BLOCK_SIZE_K': blk_k,
-                 'GROUP_SIZE_M': grp_sz
-             },
-             num_stages=n_stages,
-             num_warps=n_warps) for blk_m in [32, 64, 128]
-         for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
-         for n_warps in [4, 8]
-     ]
-
-
- @triton.autotune(
-     configs=get_autotune_config(),
-     key=['M', 'K'],
-     restore_value=['y'],
- )
- @triton.jit
- def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
-                BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                GROUP_SIZE_M: tl.constexpr):
-     """
-     Core kernel jit function of matmul_transpose that computes y = x @ x.T
-     The code is a simple adaptation from the triton `matmul` tutorial:
-     https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
-     """
-     pid = tl.program_id(axis=0)
-     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-     num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
-     num_pid_in_group = GROUP_SIZE_M * num_pid_n
-     group_id = pid // num_pid_in_group
-     first_pid_m = group_id * GROUP_SIZE_M
-     group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-     pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-     pid_n = (pid % num_pid_in_group) // group_size_m
-     if pid_m > pid_n:
-         return
-
-     offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-     offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-     offs_k = tl.arange(0, BLOCK_SIZE_K)
-     # we use a & b ptrs to denote different rows of x.
-     a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-     b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-
-     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
-
-     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-         a = tl.load(a_ptrs,
-                     mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                     other=0.0)
-         b = tl.load(b_ptrs,
-                     mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                     other=0.0)
-         accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
-         a_ptrs += BLOCK_SIZE_K * stride_xk
-         b_ptrs += BLOCK_SIZE_K * stride_xk
-     # use dtype.element_ty to accommodate different input datatypes as in cpp templates
-     # https://github.com/triton-lang/triton/issues/2252
-     c = accumulator.to(x.dtype.element_ty)
-
-     offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-     offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-     c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
-     c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-     tl.store(c_ptrs, c, mask=c_mask)
-
-     # transpose and copy
-     if pid_m < pid_n:
-         ct_ptrs = y + stride_ym * offs_cn[:, None] + stride_yn * offs_cm[None, :]
-         ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-         tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
-
-
- @torch.library.custom_op("muon::matmul_transpose_assign",
-                          mutates_args=("d_out", ))
- def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
-     """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
-     d_in = d_in.contiguous()
-     M, K = d_in.shape
-     grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-         M, META['BLOCK_SIZE_M']), )
-     with torch.cuda.device(d_in.device.index):
-         mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
-                          d_out.stride(0), d_out.stride(1))
-
-
- @matmul_transpose_assign.register_fake
- def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
-     """FakeTensor impl: d_out is already allocated, mutation is declared."""
-     pass
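Because the custom op mutates a preallocated output rather than returning one, callers allocate `d_out` themselves: shape `(M, M)` for an `(M, K)` input. Only block pairs with `pid_m <= pid_n` are computed and the transpose is mirrored into the lower triangle, which is what makes this cheaper than a general matmul. A minimal calling sketch, assuming the module has been imported so the op is registered (shapes, dtype, and tolerances are illustrative)::

    import torch

    from matmul_transpose_triton import matmul_transpose_assign  # import path is an assumption

    x = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
    out = torch.empty(1024, 1024, device="cuda", dtype=torch.bfloat16)

    matmul_transpose_assign(x, out)  # out <- x @ x.T (symmetric result)
    torch.testing.assert_close(out, x @ x.T, rtol=1.6e-2, atol=1e-2)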
build/torch210-cxx11-cu128-x86_64-linux/metadata.json DELETED
@@ -1,3 +0,0 @@
- {
-   "python-depends": []
- }
build/torch210-cxx11-cu128-x86_64-linux/muon.py DELETED
@@ -1,1068 +0,0 @@
- import logging
- import types
- from collections import defaultdict
- from typing import Any
-
- import torch
- import torch.distributed as dist
- from torch.distributed.tensor import DTensor, Replicate, Shard
- from torch.profiler import record_function
-
- from .adamw import _placement_cache, _tensor_cache, step_adamw
- from .async_utils import run_pipeline
- from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
-                    get_default_muon_param_groups, is_expert_param, update_p)
- from .cpu_offload import CPUOffloadPool
- from .distributed.utils import (_is_shard, construct_shard_mesh,
-                                 get_slices_of_dtensor)
- from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
-                             _zeropower_via_newtonschulz5,
-                             zeropower_via_newtonschulz5,
-                             zeropower_via_newtonschulz5_batched)
- from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
- from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
-
- logger = logging.getLogger(__name__)
-
-
- def _expand_expert_params(names, params, expert_keys):
-     """Expand expert params by splitting on dim 0 (expert dimension).
-
-     Params whose name matches any key in ``expert_keys`` are treated as
-     expert-parallel tensors. Their outermost dimension is the expert
-     dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
-     ``nn.Parameter`` views so that in-place updates propagate back to
-     the original storage.
-
-     Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
-     if they are expert params, their key must be added to ``expert_keys``.
-
-     The grad must already be set on each expert param (e.g. after momentum).
-
-     For DTensor expert params, placements that shard on dim 0 (expert dim)
-     are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
-     preserved: each 2D slice is wrapped as a DTensor on the corresponding
-     submesh so the parallel pipeline handles the TP communication.
-     """
-     expanded_names = []
-     expanded_params = []
-
-     for n, p in zip(names, params):
-         is_expert = is_expert_param(n, expert_keys)
-         is_dtensor = isinstance(p.data, DTensor)
-
-         if is_expert:
-             if is_dtensor:
-                 logger.debug(
-                     "[expand_expert] %s: expert DTensor, shape=%s, "
-                     "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
-                     p.placements, p.device_mesh.mesh_dim_names,
-                     p.to_local().shape)
-             else:
-                 logger.debug(
-                     "[expand_expert] %s: expert plain tensor, shape=%s", n,
-                     p.data.shape)
-
-         if not is_expert:
-             assert p.data.ndim <= 2, (
-                 f"Param {n} has ndim={p.data.ndim} but does not match "
-                 f"expert_keys={expert_keys}. If this is an expert param, "
-                 f"add its key to expert_keys.")
-             expanded_names.append(n)
-             expanded_params.append(p)
-             continue
-
-         g = p.grad
-         assert g is not None, (
-             f"Expert param {n} must have grad set before expansion")
-
-         tp_mesh = None
-         tp_placements_2d = None
-
-         if is_dtensor:
-             local_data = p.to_local()
-             local_grad = g.to_local() if isinstance(g, DTensor) else g
-
-             # Find non-dim-0 shard placements (e.g. TP sharding).
-             # After splitting on dim 0, Shard(k) becomes Shard(k-1).
-             tp_dim_indices = []
-             tp_placements_2d = []
-             for i, pl in enumerate(p.placements):
-                 if _is_shard(pl) and pl.dim != 0:
-                     tp_dim_indices.append(i)
-                     tp_placements_2d.append(Shard(pl.dim - 1))
-
-             if tp_dim_indices:
-                 tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
-                                      for i in tp_dim_indices)
-                 if len(tp_dim_names) == 1:
-                     tp_mesh = p.device_mesh[tp_dim_names[0]]
-                 else:
-                     tp_mesh = p.device_mesh[tp_dim_names]
-         else:
-             local_data = p.data
-             local_grad = g
-
-         # Expand: split dim 0, reshape each slice to 2D.
-         num_local_experts = local_data.shape[0]
-         for i in range(num_local_experts):
-             slice_data = local_data[i]
-             slice_grad = local_grad[i]
-
-             if tp_mesh is not None:
-                 # Wrap as DTensor on TP submesh so the pipeline handles
-                 # TP communication (gather/scatter across TP ranks).
-                 dt_data = DTensor.from_local(slice_data,
-                                              device_mesh=tp_mesh,
-                                              placements=tp_placements_2d)
-                 dt_grad = DTensor.from_local(slice_grad,
-                                              device_mesh=tp_mesh,
-                                              placements=tp_placements_2d)
-                 expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
-                 expert_param.grad = dt_grad
-             else:
-                 expert_param = torch.nn.Parameter(slice_data,
-                                                   requires_grad=False)
-                 expert_param.grad = slice_grad
-
-             expanded_names.append(f"{n}[{i}]")
-             expanded_params.append(expert_param)
-
-         p.grad = None  # allow expert grad storage to be freed after pipeline
-
-     return expanded_names, expanded_params
-
-
- class Muon(torch.optim.Optimizer):
-     """
-     Muon - MomentUm Orthogonalized by Newton-schulz
-
-     Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-     processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-     matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-     the advantage that it can be stably run in bfloat16 on the GPU.
-
-     Some warnings:
-     - We believe this optimizer is unlikely to work well for training with small batch size.
-     - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-
-     Arguments:
-         params: Parameter groups to optimize. Each group must set the
-             "use_muon" key (see ``get_default_muon_param_groups``).
-         lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-         momentum: The momentum used by the internal SGD. (0.95 is a good default)
-         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-         weight_decay: The weight decay for Muon and AdamW.
-             Parameters that are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW instead.
-         adamw_betas: The betas for the internal AdamW.
-         adamw_eps: The epsilon for the internal AdamW.
-         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
-         debug: Whether to print debug information.
-         clip_config: Configuration for QK clipping. Expected keys:
-             - "q_indices" (list[int]): Indices of query heads to consider.
-             - "k_indices" (list[int]): Indices of key heads to consider.
-             - "head_dim" (int): Dimensionality of each attention head.
-             - "threshold" (float): Threshold value; heads whose QK logits exceed
-               this value will be scaled down.
-             Default is:
-                 {
-                     "q_indices": [],
-                     "k_indices": [],
-                     "head_dim": 128,
-                     "threshold": 100
-                 }
-         warmup_step: How many all2all gather/compute operations are launched in
-             advance before the corresponding all2all scatter steps begin.
-             A higher warmup_step increases memory usage but can improve
-             performance by overlapping communication.
-             Parallel Muon only.
-         chunk_size: Batch size of parameters to process in each
-             all2all gather/compute/scatter step.
-             Uses shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
-         use_distributed_muon: Use distributed Muon by Liu et al. (2024).
-             For testing purposes only.
-         expert_keys: List of strings to identify expert-parallel parameters.
-             If any key appears in a parameter's name, its outermost
-             dimension is treated as the expert dimension and expanded
-             into per-expert 2D params for Muon. For example,
-             ``expert_keys=["experts"]`` matches any param whose name
-             contains "experts". 3D+ params not matched by any key
-             will raise an error.
-     """
-
-     def __init__(self,
-                  params,
-                  lr=1e-3,
-                  momentum=0.95,
-                  nesterov=True,
-                  ns_steps=5,
-                  weight_decay=0.1,
-                  adamw_betas=(0.9, 0.95),
-                  adamw_eps=1e-8,
-                  none_grad=True,
-                  debug=False,
-                  clip_config=None,
-                  warmup_step=5,
-                  chunk_size=-1,
-                  use_distributed_muon=False,
-                  expert_keys=None):
-         defaults = dict(
-             lr=lr,
-             weight_decay=weight_decay,
-             momentum=momentum,
-             nesterov=nesterov,
-             ns_steps=ns_steps,
-             adamw_betas=adamw_betas,
-             adamw_eps=adamw_eps,
-             none_grad=none_grad,
-             use_muon=True,
-         )
-         error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
-         instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
-
-         if isinstance(params, types.GeneratorType):
-             raise ValueError(error_message.format(idx=0) + instruction_code)
-         for _idx, param_group in enumerate(params):
-             if param_group.get("use_muon", None) is None:
-                 raise ValueError(
-                     error_message.format(idx=_idx) + instruction_code)
-         super().__init__(params, defaults)
-
-         self.debug = debug
-         self.clip_config = clip_config if clip_config is not None else {
-             "q_indices": [],
-             "k_indices": [],
-             "head_dim": 128,
-             "threshold": 100,
-         }
-         self.warmup_step = warmup_step
-         self.chunk_size = chunk_size
-         self.use_distributed_muon = use_distributed_muon
-         self.expert_keys = expert_keys
-         self.cpu_offload = False
-         self._cpu_offload_pool: CPUOffloadPool | None = None
-         self._offload_initialized = False
-         self._parallel_cache: dict[tuple[str, ...], dict] = {}
-         self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
-
-     def _calc_flops(self, G, steps):
-         assert len(G.shape) == 2
-         M, N = G.shape
-         if M > N:
-             M, N = N, M
-
-         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-
-     def get_shard_mesh(self, p):
-         """
-         Get the shard mesh for a parameter p on the given rank.
-         """
-         assert isinstance(
-             p, DTensor), "Parallel Muon only supports DTensor parameters."
-
-         shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
-             p.placements, p.device_mesh)
-
-         return shard_mesh, shard_pg, shard_placements
-
-     def init_state_and_assign_params(self, names, params, group, qk_logits):
-         param_to_state = {}
-         param_to_flops = {}
-
-         total_flops = 0
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             assert g.ndim == 2, "Muon only supports 2D parameters."
-
-             flops = self._calc_flops(g, group["ns_steps"])
-             param_to_flops[id(p)] = flops
-             total_flops += flops
-
-         if self.debug:
-             logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
-                          total_flops / 1e12)
-
-         paired = list(zip(names, params))
-
-         paired_sorted = sorted(paired,
-                                key=lambda x: param_to_flops[id(x[1])],
-                                reverse=True)
-
-         names_sorted, params_sorted = zip(*paired_sorted)
-         ordered_names = list(names_sorted)
-         ordered_params = list(params_sorted)
-
-         round_robin = 0
-         mesh = ordered_params[0].device_mesh
-         placements = ordered_params[0].placements
-
-         shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
-             ordered_params[0])
-         shard_mesh_flattened = shard_mesh.mesh.flatten()
-         num_ranks = dist.get_world_size(group=shard_pg)
-
-         for n, p in zip(ordered_names, ordered_params):
-             if mesh != p.device_mesh:
-                 raise ValueError("All parameters must be on the same mesh.")
-             if placements != p.placements:
-                 raise ValueError("All parameters must have same placements.")
-
-             worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
-             round_robin = (round_robin + 1) % len(shard_mesh_flattened)
-             qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-
-             # Precompute per-rank indices and numels for all-to-all.
-             rank_indices: dict[int, tuple] = {}
-             rank_numels: dict[int, int] = {}
-             for r in range(num_ranks):
-                 indices = get_slices_of_dtensor(p, r, shard_mesh,
-                                                 shard_placements)
-                 rank_indices[r] = indices
-                 numel = 1
-                 for idx, dim_size in zip(indices, p.shape):
-                     if isinstance(idx, slice):
-                         start, stop, step = idx.indices(dim_size)
-                         numel *= max(0, (stop - start + (step - 1)) // step)
-                     else:
-                         numel *= len(idx)
-                 rank_numels[r] = numel
-
-             param_to_state[id(p)] = _muon_state(
-                 worker_rank=worker_rank,
-                 process_group=shard_pg,
-                 rank_indices=rank_indices,
-                 rank_numels=rank_numels,
-                 name=n,
-                 qk_clip_state=qk_clip_state,
-             )
-
-         return param_to_state, ordered_params
-
-     def base(self, names, params, group, lr, weight_decay, qk_logits):
-         # Momentum is already applied by _step_muon before this method.
-         for n, p in zip(names, params):
-             g = p.grad
-             if g is None:
-                 continue
-
-             u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
-                                             steps=group["ns_steps"])
-
-             adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-             update_p(p, u, lr, adjusted_lr, weight_decay)
-
-             qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
-
-             scales_full = compute_scales(
-                 p, qk_clip_state) if qk_clip_state is not None else None
-             if scales_full is not None:
-                 qk_clip(p, scales_full, qk_clip_state)
-
-     def distributed_muon(
-         self,
-         names: list[str],
-         params: list[torch.nn.Parameter],
-         group: dict[str, Any],
-         lr: float,
-         weight_decay: float,
-         qk_logits: list[torch.Tensor | DTensor] | None,
-     ):
-         """Batched Distributed Muon — for testing/correctness verification only.
-
-         Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
-         the full grad, then slices back to local shards. This is simpler but
-         slower than the parallel pipeline (all2all) path, so it serves as a
-         reference implementation for verifying correctness.
-         """
-         with record_function("distributed_muon"):
-             # Momentum is already applied by _step_muon before this method.
-             ns_steps = group["ns_steps"]
-
-             # Separate plain tensors (no communication) from DTensors.
-             plain_names, plain_params = [], []
-             dtensor_names, dtensor_params = [], []
-             for n, p in zip(names, params):
-                 if p.grad is None:
-                     continue
-                 if isinstance(p.data, DTensor):
-                     dtensor_names.append(n)
-                     dtensor_params.append(p)
-                 else:
-                     plain_names.append(n)
-                     plain_params.append(p)
-
-             # Process plain tensors per-param (no communication).
-             for n, p in zip(plain_names, plain_params):
-                 u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
-                                                  steps=ns_steps)
-                 adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-                 update_p(p, u, lr, adjusted_lr, weight_decay)
-
-                 qk_clip_state = get_qk_clip_info(self.clip_config, n,
-                                                  qk_logits)
-                 scales_full = compute_scales(
-                     p, qk_clip_state) if qk_clip_state is not None else None
-                 if scales_full is not None:
-                     qk_clip(p, scales_full, qk_clip_state)
-
-             if not dtensor_params:
-                 return
-
-             # Group DTensors by (placements, mesh) for batched all-gather.
-             placement_groups: dict[tuple,
-                                    tuple[list,
-                                          list]] = defaultdict(lambda: ([], []))
-             for n, p in zip(dtensor_names, dtensor_params):
-                 key = (p.placements, p.device_mesh)
-                 placement_groups[key][0].append(n)
-                 placement_groups[key][1].append(p)
-
-             logger.info(
-                 "distributed_muon: %d placement groups, %d total dtensors",
-                 len(placement_groups), len(dtensor_params))
-
-             for (placements, mesh), (grp_names,
-                                      grp_params) in placement_groups.items():
-                 shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
-                     placements, mesh)
-                 rank = dist.get_rank(shard_pg)
-                 world_size = dist.get_world_size(shard_pg)
-
-                 logger.info(" group: %d params, placements=%s, world_size=%d",
-                             len(grp_params), placements, world_size)
-
-                 # Separate params that can be batched (all shard dims evenly
-                 # divisible) from those needing per-param full_tensor
-                 # (e.g. MoE gate weights with fewer rows than shard ranks).
-                 # all_gather_into_tensor requires equal buffer sizes across
-                 # ranks, so uneven splits must use DTensor full_tensor().
-                 batch_names, batch_params = [], []
-                 single_names, single_params = [], []
-                 for n, p in zip(grp_names, grp_params):
-                     even = all(p.shape[pl.dim] %
-                                shard_mesh.mesh.shape[dim_idx] == 0
-                                for dim_idx, pl in enumerate(shard_placements))
-                     if even:
-                         batch_names.append(n)
-                         batch_params.append(p)
-                     else:
-                         single_names.append(n)
-                         single_params.append(p)
-
-                 # Process uneven-split params per-param via full_tensor().
-                 for n, p in zip(single_names, single_params):
-                     with record_function("distributed_muon::newton_schulz"):
-                         g_full = p.grad.full_tensor().to(COMM_DTYPE)
-                         u_full = _zeropower_via_newtonschulz5(g_full,
-                                                               steps=ns_steps)
-                         del g_full
-                     with record_function("distributed_muon::update"):
-                         adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-                         p._local_tensor.mul_(1 - lr * weight_decay)
-                         local_indices = get_slices_of_dtensor(
-                             p, rank, shard_mesh, shard_placements)
-                         u_local = u_full[local_indices]
-                         p._local_tensor.add_(u_local, alpha=-adjusted_lr)
-                         del u_full
-
-                         qk_clip_state = get_qk_clip_info(
-                             self.clip_config, n, qk_logits)
-                         scales_full = compute_scales(
-                             p, qk_clip_state
-                         ) if qk_clip_state is not None else None
-                         if scales_full is not None:
-                             ratio = p.shape[0] // scales_full.shape[0]
-                             idx0 = local_indices[0]
-                             if isinstance(idx0, slice):
-                                 start = idx0.start or 0
-                                 idx0 = torch.arange(start,
-                                                     idx0.stop,
-                                                     device=scales_full.device)
-                             row_scales = scales_full[idx0 // ratio]
-                             p._local_tensor.mul_(row_scales.view(-1, 1))
-
-                 if not batch_params:
-                     continue
-
-                 logger.info(" batched=%d, single=%d", len(batch_params),
-                             len(single_params))
-
-                 # Concat all local grad shards into a single flat buffer.
-                 with record_function("distributed_muon::gather"):
-                     grad_locals = [
-                         p.grad.to_local().to(COMM_DTYPE).flatten()
-                         for p in batch_params
-                     ]
-                     numels = [g.numel() for g in grad_locals]
-                     grad_concat = torch.cat(grad_locals)
-                     del grad_locals
-
-                     # Single all-gather (replaces N separate full_tensor).
-                     grad_gathered = torch.empty(
-                         grad_concat.numel() * world_size,
-                         dtype=COMM_DTYPE,
-                         device="cuda",
-                     )
-                     dist.all_gather_into_tensor(grad_gathered,
-                                                 grad_concat,
-                                                 group=shard_pg)
-
-                     total_numel = grad_concat.numel()
-                     del grad_concat
-
-                 # Precompute per-param offsets within the concat buffer.
-                 offsets = []
-                 off = 0
-                 for ne in numels:
-                     offsets.append(off)
-                     off += ne
-
-                 # Per-param: reconstruct full grad → NS → local update.
-                 for i, (n, p) in enumerate(zip(batch_names, batch_params)):
-                     with record_function("distributed_muon::newton_schulz"):
-                         g_full = torch.empty(p.shape,
-                                              dtype=COMM_DTYPE,
-                                              device="cuda")
-                         for r in range(world_size):
-                             r_start = r * total_numel + offsets[i]
-                             shard = grad_gathered[r_start:r_start + numels[i]]
-                             indices = get_slices_of_dtensor(
-                                 p, r, shard_mesh, shard_placements)
-                             g_full[indices] = shard.reshape(
-                                 g_full[indices].shape)
-
-                         u_full = _zeropower_via_newtonschulz5(g_full,
-                                                               steps=ns_steps)
-                         del g_full
-
-                     with record_function("distributed_muon::update"):
-                         adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-                         p._local_tensor.mul_(1 - lr * weight_decay)
-                         local_indices = get_slices_of_dtensor(
-                             p, rank, shard_mesh, shard_placements)
-                         u_local = u_full[local_indices]
-                         p._local_tensor.add_(u_local, alpha=-adjusted_lr)
-                         del u_full
-
-                         qk_clip_state = get_qk_clip_info(
-                             self.clip_config, n, qk_logits)
-                         scales_full = compute_scales(
-                             p, qk_clip_state
-                         ) if qk_clip_state is not None else None
-                         if scales_full is not None:
-                             ratio = p.shape[0] // scales_full.shape[0]
-                             idx0 = local_indices[0]
-                             if isinstance(idx0, slice):
-                                 start = idx0.start or 0
-                                 idx0 = torch.arange(start,
-                                                     idx0.stop,
-                                                     device=scales_full.device)
-                             row_scales = scales_full[idx0 // ratio]
-                             p._local_tensor.mul_(row_scales.view(-1, 1))
-
-     def _setup_parallel(self, names, params, group, qk_logits):
-         """Compute (or retrieve cached) parallel pipeline metadata.
-
-         Returns:
-             (ordered_params, param_to_state, rank, chunk_size)
-         """
-         cache_key = tuple(names)
-
-         if cache_key not in self._parallel_cache:
-             # First call: compute metadata and populate cache.
-             param_to_state, ordered_params = self.init_state_and_assign_params(
-                 names, params, group, qk_logits)
-
-             shard_pg = param_to_state[id(ordered_params[0])].process_group
-             rank = dist.get_rank(group=shard_pg)
-
-             if self.chunk_size == -1:
-                 shard_ranks = dist.get_world_size(shard_pg)
-                 chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
-             elif self.chunk_size > 0:
-                 chunk_size = self.chunk_size
-             else:
-                 raise ValueError(
-                     "chunk_size must be -1 or a positive integer.")
-
-             ordered_names = [
-                 param_to_state[id(p)].name for p in ordered_params
-             ]
-             name_to_state = {
-                 param_to_state[id(p)].name: param_to_state[id(p)]
-                 for p in ordered_params
-             }
-             self._parallel_cache[cache_key] = {
-                 'ordered_names': ordered_names,
-                 'name_to_state': name_to_state,
-                 'rank': rank,
-                 'chunk_size': chunk_size,
-             }
-         else:
-             # Cached path: rebuild param_to_state with current id(p) keys.
-             cache = self._parallel_cache[cache_key]
-             rank = cache['rank']
-             chunk_size = cache['chunk_size']
-
-             name_to_param = dict(zip(names, params))
-             ordered_params = [name_to_param[n] for n in cache['ordered_names']]
-
-             param_to_state = {}
-             for p, n in zip(ordered_params, cache['ordered_names']):
-                 cached_state = cache['name_to_state'][n]
-                 param_to_state[id(p)] = _muon_state(
-                     worker_rank=cached_state.worker_rank,
-                     process_group=cached_state.process_group,
-                     rank_indices=cached_state.rank_indices,
-                     rank_numels=cached_state.rank_numels,
-                     name=n,
-                     qk_clip_state=get_qk_clip_info(self.clip_config, n,
-                                                    qk_logits),
-                 )
-
-         return ordered_params, param_to_state, rank, chunk_size
-
-     def parallel(self,
-                  names,
-                  params,
-                  group,
-                  lr,
-                  weight_decay,
-                  qk_logits,
-                  prelaunch_gather=None):
-         """
-         Perform a parallel optimization step using Muon.
-
-         Parameters are chunked and each chunk is processed by a
-         :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
-         interleaves multiple chunks so that communication and computation
-         overlap across chunks (the same overlap previously achieved by the
-         warmup + main-loop index scheduling).
-
-         If ``prelaunch_gather`` is provided, it is passed to the first
-         chunk's generator to skip re-launching the already in-flight
-         A2A gather.
-         """
-
-         # Momentum is already applied by _step_muon before this method.
-
-         ordered_params, param_to_state, rank, chunk_size = (
-             self._setup_parallel(names, params, group, qk_logits))
-
-         def pipelines():
-             first = True
-             for start in range(0, len(ordered_params), chunk_size):
-                 chunk = ordered_params[start:start + chunk_size]
-                 if chunk:
-                     kwargs = dict(
-                         params=chunk,
-                         param_to_state=param_to_state,
-                         rank=rank,
-                         ns_steps=group["ns_steps"],
-                         lr=lr,
-                         weight_decay=weight_decay,
-                         none_grad=group["none_grad"],
-                     )
-                     if first and prelaunch_gather is not None:
-                         kwargs['prelaunch_gather'] = prelaunch_gather
-                     first = False
-                     yield muon_chunk_pipeline(**kwargs)
-
-         with record_function("muon::pipeline"):
-             run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
-
-     def _step_muon(self, group, qk_logits=None):
-         params = group["params"]
-         lr = group["lr"]
-         weight_decay = group["weight_decay"]
-         momentum = group["momentum"]
-         names = group["names"]
-
-         # Apply momentum to all params before routing/expansion.
-         # Batched using _foreach_* ops (compiled, fullgraph=True).
-         with record_function("muon::momentum"):
-             active_params = [p for p in params if p.grad is not None]
-             if active_params:
-                 # Ensure momentum buffers exist (avoid zeros_like when already present).
-                 for p in active_params:
-                     if "momentum_buffer" not in self.state[p]:
-                         self.state[p]["momentum_buffer"] = torch.zeros_like(
-                             p.grad)
-
-                 # Extract local tensors for compiled batch function.
-                 local_grads = [
-                     p.grad._local_tensor
-                     if isinstance(p.grad, DTensor) else p.grad
-                     for p in active_params
-                 ]
-                 local_bufs = [
-                     self.state[p]["momentum_buffer"]._local_tensor
-                     if isinstance(self.state[p]["momentum_buffer"], DTensor)
-                     else self.state[p]["momentum_buffer"]
-                     for p in active_params
-                 ]
-
-                 # Wrap momentum as tensor for torch.compile.
-                 batch_pre_ortho(local_grads, local_bufs,
-                                 torch.tensor(momentum), group["nesterov"])
-
-                 # For non-nesterov, the result is the momentum buffer.
-                 if not group["nesterov"]:
-                     for p in active_params:
-                         p.grad = self.state[p]["momentum_buffer"]
-
-         # Identify batched experts for deferred NS.
-         # Detection is cheap (condition checks only); actual NS compute is
-         # deferred so it can overlap with the first chunk's A2A gather.
-         deferred_expert_work = []
-         if self.expert_keys:
-             batched_expert_indices = []
-             for i, (n, p) in enumerate(zip(names, params)):
-                 if not (is_expert_param(n, self.expert_keys)
-                         and p.grad is not None):
-                     continue
-                 # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
-                 if isinstance(p.data, DTensor):
-                     has_tp = any(
-                         _is_shard(pl) and pl.dim != 0 for pl in p.placements)
-                     if has_tp:
-                         continue
-                 batched_expert_indices.append(i)
-
-             if batched_expert_indices:
-                 # Save refs for deferred NS; free grads from param list.
-                 for i in batched_expert_indices:
-                     p = params[i]
-                     g = p.grad
-                     local_g = (g._local_tensor
-                                if isinstance(g, DTensor) else g)
-                     local_data = (p.data._local_tensor if isinstance(
-                         p.data, DTensor) else p.data)
-                     deferred_expert_work.append((local_data, local_g))
-                     p.grad = None
-
-                 # Remove batched experts from lists before expansion.
-                 keep = sorted(
-                     set(range(len(params))) - set(batched_expert_indices))
-                 names = [names[i] for i in keep]
-                 params = [params[i] for i in keep]
-
-         def _run_deferred_expert_ns():
-             """Execute deferred batched expert NS."""
-             if not deferred_expert_work:
-                 return
-             with record_function("muon::batched_expert_ns"):
-                 ns_steps = group["ns_steps"]
-                 for local_data, local_g in deferred_expert_work:
-                     u = zeropower_via_newtonschulz5_batched(
-                         local_g.to(COMM_DTYPE), steps=ns_steps)
-                     adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
-                     local_data.mul_(1 - lr * weight_decay)
-                     local_data.add_(u, alpha=-adjusted_lr)
-
-         # Expand expert params by splitting on dim 0.
-         logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
-                      len(params), self.expert_keys)
-         if self.expert_keys:
-             cache_key = tuple(id(p) for p in params)
-             cache = self._expert_expand_cache.get(cache_key)
-
-             if cache is None:
-                 # Cold path: full expansion + build cache metadata.
-                 exp_names, exp_params = _expand_expert_params(
-                     names, params, self.expert_keys)
-
-                 # Build per-expert-group info for hot-path grad updates.
-                 grad_info = []
-                 exp_idx = 0
-                 for orig_idx, (n, p) in enumerate(zip(names, params)):
-                     if not is_expert_param(n, self.expert_keys):
-                         exp_idx += 1
-                         continue
-
-                     is_dt = isinstance(p.data, DTensor)
-                     num_experts = (p.to_local() if is_dt else p.data).shape[0]
-
-                     # Detect TP mesh from the first expanded expert param.
-                     tp_mesh = None
-                     tp_pls = None
-                     sample = exp_params[exp_idx]
-                     if isinstance(sample.data, DTensor):
-                         tp_mesh = sample.data.device_mesh
-                         tp_pls = list(sample.data.placements)
-
-                     grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
-                                       tp_mesh, tp_pls))
-                     exp_idx += num_experts
-
-                 self._expert_expand_cache[cache_key] = {
-                     'names': exp_names,
-                     'params': exp_params,
-                     'grad_info': grad_info,
-                 }
-                 names, params = exp_names, exp_params
-             else:
-                 # Hot path: reuse cached params, only update expert grads.
-                 for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
-                      tp_pls) in cache['grad_info']:
-                     p = params[orig_idx]
-                     g = p.grad
-                     local_grad = (g.to_local()
-                                   if is_dt and isinstance(g, DTensor) else g)
-                     for i in range(num_experts):
-                         expert_p = cache['params'][exp_start + i]
-                         sg = local_grad[i]
-                         if tp_mesh is not None:
-                             expert_p.grad = DTensor.from_local(
-                                 sg, device_mesh=tp_mesh, placements=tp_pls)
-                         else:
-                             expert_p.grad = sg
-                     p.grad = None
-
-                 names = cache['names']
-                 params = cache['params']
-         else:
-             names, params = _expand_expert_params(names, params,
-                                                   self.expert_keys)
-         logger.debug("[_step_muon] after expand: %d params", len(params))
-
-         param_dtensors = []
-         name_dtensors = []
-
-         param_tensors = []
-         name_tensors = []
-
-         # distributed_muon is a reference implementation for testing only.
-         # The parallel pipeline (all2all) path below is the production path.
-         if self.use_distributed_muon:
-             _run_deferred_expert_ns()
-             self.distributed_muon(names=names,
-                                   params=params,
-                                   group=group,
-                                   lr=lr,
-                                   weight_decay=weight_decay,
-                                   qk_logits=qk_logits)
-             return
-
-         for n, p in zip(names, params):
-             if p is None or p.grad is None:
-                 continue
-             if isinstance(p.data, DTensor):
-                 if all(
-                         isinstance(placement, Replicate)
-                         for placement in p.placements):
-                     logger.debug(
-                         "[route] %s → base (DTensor all-Replicate), "
-                         "shape=%s, placements=%s", n, p.shape, p.placements)
-                     param_tensors.append(p)
-                     name_tensors.append(n)
-                 else:
-                     logger.debug(
-                         "[route] %s → parallel (DTensor), shape=%s, "
-                         "placements=%s, mesh=%s", n, p.shape, p.placements,
-                         p.device_mesh.mesh_dim_names)
-                     param_dtensors.append(p)
-                     name_dtensors.append(n)
-             elif isinstance(p.data, torch.Tensor):
-                 logger.debug("[route] %s → base (plain tensor), shape=%s", n,
-                              p.data.shape)
-                 param_tensors.append(p)
-                 name_tensors.append(n)
-             else:
-                 raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-
-         logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
-                      f"{len(param_tensors)} Tensors → base")
-
-         def group_dtensors(dtensors, names):
-             # To support different placements, we group parameters by placements
-             # and run parallel Muon on each group.
-
-             placement_to_params = defaultdict(lambda: ([], []))
-
-             assert len(dtensors) == len(names)
-             for p, n in zip(dtensors, names):
-                 placement_to_params[tuple([p.placements,
-                                            p.device_mesh])][0].append(n)
-                 placement_to_params[tuple([p.placements,
-                                            p.device_mesh])][1].append(p)
-             return placement_to_params
-
-         if len(param_dtensors) > 0:
-             if not dist.is_initialized():
-                 raise RuntimeError(
-                     "Parallel Muon requires torch.distributed to be initialized."
-                 )
-
-             dtensor_group = group_dtensors(param_dtensors, name_dtensors)
-
-             # Pre-launch the first chunk's A2A gather so that the NCCL
-             # communication overlaps with the (deferred) batched expert NS
-             # compute on the default CUDA stream.
-             prelaunch = None
-             if deferred_expert_work:
-                 first_names, first_params = next(iter(dtensor_group.values()))
-                 ordered, pts, rnk, csz = self._setup_parallel(
-                     first_names, first_params, group, qk_logits)
-                 first_chunk = ordered[:csz]
-                 if first_chunk:
-                     prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
-                                                        group["none_grad"])
-
-             _run_deferred_expert_ns()
-
-             first_group = True
-             for _, (names, params) in dtensor_group.items():
-                 pg = prelaunch if first_group else None
-                 first_group = False
-                 self.parallel(
-                     names,
-                     params,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     qk_logits=qk_logits,
-                     prelaunch_gather=pg,
-                 )
-         else:
-             _run_deferred_expert_ns()
-
-         if len(param_tensors) > 0:
-             self.base(
-                 name_tensors,
-                 param_tensors,
-                 group,
-                 lr=lr,
-                 weight_decay=weight_decay,
-                 qk_logits=qk_logits,
-             )
-
-     def _register_states_for_offload(self):
-         """Register all optimizer state tensors with the CPU offload pool.
-
-         Called once after the first step when states have been lazily created.
-         Offloads all param states (momentum buffers for Muon, moment1/moment2
-         for AdamW) to free GPU memory between steps.
-         """
-         pool = self._cpu_offload_pool
-         tracked = 0
-         for group in self.param_groups:
-             for p in group["params"]:
-                 if p not in self.state:
-                     continue
-                 state = self.state[p]
-                 if group.get("use_muon", False):
-                     if "momentum_buffer" in state:
-                         pool.track(state["momentum_buffer"])
-                         tracked += 1
-                 else:
-                     if "moment1" in state:
-                         pool.track(state["moment1"])
-                     if "moment2" in state:
-                         pool.track(state["moment2"])
-                     tracked += 1
-         logger.info("[CPUOffload] Registered %d param states for offload",
-                     tracked)
-
-     @torch.no_grad
-     def step(self, closure=None, qk_logits=None):
-         """Perform a single optimization step.
-
-         Args:
-             closure (Callable, optional): A closure that reevaluates the model
-                 and returns the loss.
-             qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
-                 to 1D tensors of shape (num_heads,), representing the maximum
-                 QK logits across all tokens, computed as
-                 (1 / sqrt(head_dim)) * (Q @ K^T).
-         """
-         loss = None
-         if closure is not None:
-             with torch.enable_grad():
-                 loss = closure()
-
-         # H2D: reload optimizer states from CPU before computation.
-         if self.cpu_offload and self._offload_initialized:
-             self._cpu_offload_pool.reload()
-
-         logger.debug("[Muon.step] expert_keys=%s, %d param groups",
-                      self.expert_keys, len(self.param_groups))
-
-         for i, group in enumerate(self.param_groups):
-             if group["use_muon"]:
-                 logger.debug("[Muon.step] group %d: use_muon=True, %d params",
-                              i, len(group["params"]))
-                 self._step_muon(group, qk_logits=qk_logits)
-             else:
-                 logger.debug(
-                     "[Muon.step] group %d: use_muon=False (AdamW), %d params",
-                     i, len(group["params"]))
-                 step_adamw(self.state, group)
-
-         # D2H: offload optimizer states to CPU after computation.
-         if self.cpu_offload:
-             if not self._offload_initialized:
-                 if self._cpu_offload_pool is None:
-                     self._cpu_offload_pool = CPUOffloadPool()
-                 self._register_states_for_offload()
-                 self._offload_initialized = True
-             self._cpu_offload_pool.offload()
-
-         return loss
-
-     # ------------------------------------------------------------------
-     # CPU offload public helpers
-     # ------------------------------------------------------------------
-
-     def turn_on_cpu_offload(self):
-         """Enable CPU offload for optimizer states."""
-         if self.cpu_offload:
-             return
-         logger.info("[Muon] turn_on_cpu_offload")
-         self.cpu_offload = True
-         if not self.state:
-             return
-         self._cpu_offload_pool = CPUOffloadPool()
-         self._offload_initialized = False
-         self._register_states_for_offload()
-         self._offload_initialized = True
-         self._cpu_offload_pool.offload()
-
-     def turn_off_cpu_offload(self):
-         """Disable CPU offload and keep optimizer states resident on GPU."""
-         if not self.cpu_offload:
-             return
-         logger.info("[Muon] turn_off_cpu_offload")
-         if self._offload_initialized:
-             self._cpu_offload_pool.reload()
-             torch.cuda.current_stream().synchronize()
-         self._cpu_offload_pool = None
-         self._offload_initialized = False
-         self.cpu_offload = False
-
-     # ------------------------------------------------------------------
-     # Checkpoint support for cpu_offload
-     # ------------------------------------------------------------------
-
-     def state_dict(self) -> dict:
-         if self.cpu_offload:
-             raise RuntimeError(
-                 "Muon.state_dict() requires turn_off_cpu_offload() before checkpoint save."
-             )
-         return super().state_dict()
-
-     def load_state_dict(self, state_dict: dict) -> None:
-         if self.cpu_offload:
-             raise RuntimeError(
-                 "Muon.load_state_dict() requires turn_off_cpu_offload() before checkpoint load."
-             )
-         super().load_state_dict(state_dict)
-
-         # Invalidate adamw.py's module-level tensor caches so that
-         # the next step rebuilds them with the newly loaded state tensors.
-         _placement_cache.clear()
-         _tensor_cache.clear()
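For orientation, a minimal single-process sketch of driving this optimizer end to end, following the snippet embedded in `instruction_code` above. The toy model, the import paths, and the hyperparameters are illustrative assumptions; plain (non-DTensor) parameters take the `base` path, so no process group is needed::

    import torch
    import torch.nn as nn

    from muon import Muon, get_default_muon_param_groups  # import path is an assumption

    model = nn.Sequential(nn.Linear(256, 512), nn.ReLU(),
                          nn.Linear(512, 256)).cuda()
    params = get_default_muon_param_groups(model)  # tags each group with "use_muon"
    opt = Muon(params, lr=0.02, momentum=0.95)

    x = torch.randn(32, 256, device="cuda")
    model(x).square().mean().backward()
    opt.step()
    opt.zero_grad()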
build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py DELETED
@@ -1,240 +0,0 @@
- from itertools import repeat
- from math import inf, sqrt
-
- import numpy as np
- import torch
-
- from .matmul_transpose_triton import matmul_transpose_assign
-
- COMM_DTYPE = torch.bfloat16
- DEFAULT_CHUNK_SIZE_RATIO = 4
-
-
- def _optimal_quintic(l, u, max_iter=1000):
-     """
-     Use the simplified Remez algorithm to find the optimal odd quintic approximant
-     to the constant function x -> 1 over the interval [l, u].
-
-     Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
-     approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
-     two interior equioscillation nodes q, r until convergence. Returns the
-     closed-form equioscillating solution when l ≈ u.
-
-     Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
-     (NaN or inf). Raises RuntimeError if convergence is not reached within
-     max_iter iterations.
-     """
-     assert 0 <= l <= u
-     if 1 - 5e-6 <= l / u:
-         return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
-     q = (3 * l + u) / 4
-     r = (l + 3 * u) / 4
-     E = inf
-     for _ in range(max_iter):
-         old_E = E
-         LHS = np.array(
-             [
-                 [l, l**3, l**5, 1],
-                 [q, q**3, q**5, -1],
-                 [r, r**3, r**5, 1],
-                 [u, u**3, u**5, -1],
-             ]
-         )
-         a, b, c, E = np.linalg.solve(LHS, np.ones(4))
-         if not np.all(np.isfinite([a, b, c, E])):
-             raise ValueError(
-                 f"_optimal_quintic: non-finite solve result a={a}, b={b}, c={c}, E={E}"
-             )
-         q, r = np.sqrt(
-             (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) / (10 * c)
-         )
-         if not np.all(np.isfinite([q, r])):
-             raise ValueError(f"_optimal_quintic: non-finite node update q={q}, r={r}")
-         if abs(old_E - E) <= 1e-15:
-             break
-     else:
-         raise RuntimeError(
-             f"_optimal_quintic: did not converge after {max_iter} iterations"
-         )
-     return float(a), float(b), float(c)
-
-
- def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
-     """
-     Compute the Polar Express coefficient series for `num_iters` quintic iterations.
-
-     Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
-     compose to map singular values from [l, 1] toward 1. At each step:
-       1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
-          prevents near-zero singular values from stalling by raising the effective
-          lower bound; if it is active (cushion*u > l), the coefficients are
-          rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
-       2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
-          last iteration, providing numerical headroom at the cost of a slightly slower
-          final convergence step.
-       3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).
-
-     Returns a list of (a, b, c) tuples, one per iteration.
-
-     Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
-     Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
-     """
-     u = 1
-     assert 0 <= l <= u
-     safety_factor = 1 + safety_factor_eps
-     coefficients = []
-     for iter in range(num_iters):
-         a, b, c = _optimal_quintic(max(l, cushion * u), u)
-         if cushion * u > l:
-             pl = a * l + b * l**3 + c * l**5
-             pu = a * u + b * u**3 + c * u**5
-             rescaler = 2 / (pl + pu)
-             a *= rescaler
-             b *= rescaler
-             c *= rescaler
-         if iter < num_iters - 1:
-             a /= safety_factor
-             b /= safety_factor**3
-             c /= safety_factor**5
-         coefficients.append((a, b, c))
-         l = a * l + b * l**3 + c * l**5
-         u = 2 - l
-     return coefficients
-
-
- # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
- # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
- # approximant to x->1 over the current singular-value interval, computed once at
- # import time and reused across all optimizer steps.
- #
- # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
- #   - Former: empirically tuned to maximize slope at zero; did not converge
- #     singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
- #     of the true polar factor UV^T.
- #   - Polar Express: analytically optimal per step, adapting to the shrinking
- #     singular-value interval [l, u] as iterations progress; converges all
- #     singular values to 1, producing the exact polar factor UV^T.
- _coeffs_list = _optimal_composition(
-     l=1e-3, num_iters=10, safety_factor_eps=1e-2, cushion=0.02
- )
-
-
- # This code is adapted from:
- # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
- # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
- # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
- @torch.no_grad()
- def _zeropower_via_newtonschulz5(G, steps):
-     """
-     Compute the polar factor of G via the Polar Express method.
-
-     Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
-     are the Polar Express coefficients from `_coeffs_list`. Each step is the
-     optimal odd quintic approximant to x -> 1 over the current singular-value
-     interval, minimizing the maximum approximation error (Remez / minimax criterion).
-     The composition maps singular values from [l, 1] to near 1, producing the
-     polar factor (orthogonal factor in the polar decomposition G = UP).
-
-     `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
-     cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
-
-     Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
-     Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
-     """
-     assert len(G.shape) == 2
-     assert G.dtype == COMM_DTYPE
-     X = G  # no manual typecast
-
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     X = X / (X.norm() + 1e-7)
-     hs = _coeffs_list[:steps] + list(
-         repeat(_coeffs_list[-1], steps - len(_coeffs_list))
-     )
-     buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-     buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-     # Perform the NS iterations
-     for a, b, c in hs:
-         matmul_transpose_assign(X, buf1)
-         matmul_transpose_assign(buf1, buf2)
-         buf1.mul_(b).add_(buf2, alpha=c)
-         X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     return X
-
-
- @torch.no_grad()
- def _zeropower_via_newtonschulz5_batched(G, steps):
-     """Batched polar factor computation for 3D (E, out, in) tensors.
-
-     Same algorithm as ``_zeropower_via_newtonschulz5`` but uses
-     ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
-     processing all E expert matrices in a single batched call.
-     """
-     assert len(G.shape) == 3
-     assert G.dtype == COMM_DTYPE
-     X = G
-
-     if G.size(1) > G.size(2):
-         X = X.transpose(-2, -1)
-
-     # Per-expert Frobenius norm.
-     X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
-
-     hs = _coeffs_list[:steps] + list(
-         repeat(_coeffs_list[-1], steps - len(_coeffs_list))
-     )
-     for a, b, c in hs:
-         buf1 = torch.bmm(X, X.transpose(-2, -1))
-         buf2 = torch.bmm(buf1, buf1.transpose(-2, -1))
-         buf1.mul_(b).add_(buf2, alpha=c)
-         X = torch.baddbmm(X, buf1, X, alpha=1.0, beta=a)
-
-     if G.size(1) > G.size(2):
-         X = X.transpose(-2, -1)
-
-     return X
-
-
- _ns_per_shape: dict[tuple[int, ...], callable] = {}
- _use_compile = True
-
-
- def set_ns_compile(enabled: bool):
-     """Toggle torch.compile for Newton-Schulz iteration."""
-     global _use_compile
-     _use_compile = enabled
-
-
- def zeropower_via_newtonschulz5(G, steps=5):
-     if not _use_compile:
-         return _zeropower_via_newtonschulz5(G, steps)
-     key = G.shape
-     if key not in _ns_per_shape:
-         _ns_per_shape[key] = torch.compile(_zeropower_via_newtonschulz5,
-                                            options={
-                                                "triton.cudagraphs": True,
-                                                "shape_padding": False
-                                            })
-     torch.compiler.cudagraph_mark_step_begin()
-     return _ns_per_shape[key](G, steps).clone()
-
-
- def zeropower_via_newtonschulz5_batched(G, steps=5):
-     """Compile-cached batched Newton-Schulz for 3D expert tensors."""
-     if not _use_compile:
-         return _zeropower_via_newtonschulz5_batched(G, steps)
-     key = G.shape
-     if key not in _ns_per_shape:
-         _ns_per_shape[key] = torch.compile(
-             _zeropower_via_newtonschulz5_batched,
-             options={
-                 "triton.cudagraphs": True,
-                 "shape_padding": False
-             })
-     torch.compiler.cudagraph_mark_step_begin()
-     return _ns_per_shape[key](G, steps).clone()
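
Editor's note: the quintic recurrence above is easy to sanity-check without the Triton kernel or the coefficient solver. A minimal sketch using plain matmuls, with the classic fixed NS tuple from KellerJordan/Muon standing in for `_coeffs_list` (the deleted module instead derives a per-step Polar Express schedule at import time):

import torch

def ns_quintic_sketch(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
    # Illustrative fixed coefficients (the "former hardcoded" tuple discussed
    # in the comment block above); with these, singular values settle into a
    # band around 1 rather than converging to it exactly.
    a, b, c = 3.4445, -4.7750, 2.0315
    X = G / (G.norm() + 1e-7)
    if X.size(0) > X.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T                            # buf1 in the kernelized version
        X = a * X + (b * A + c * (A @ A)) @ X  # X <- aX + bX^3 + cX^5 (odd quintic)
    if G.size(0) > G.size(1):
        X = X.T
    return X

G = torch.randn(64, 256, dtype=torch.float64)
print(torch.linalg.svdvals(ns_quintic_sketch(G))[:4])  # values near 1

Swapping in the Polar Express schedule drives the singular values to 1 rather than a band around it, which is exactly the contrast the coefficient comment block above describes.
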
build/torch210-cxx11-cu128-x86_64-linux/optimizer/__init__.py DELETED
@@ -1,26 +0,0 @@
- import ctypes
- import sys
-
- import importlib.util
- from pathlib import Path
- from types import ModuleType
-
- def _import_from_path(file_path: Path) -> ModuleType:
-     # We cannot use the module name as-is: after adding it to `sys.modules`,
-     # it would also be used for other imports. So, we make a module name that
-     # depends on the path for it to be unique, using the hex-encoded hash of
-     # the path.
-     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-     module_name = path_hash
-     spec = importlib.util.spec_from_file_location(module_name, file_path)
-     if spec is None:
-         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-     module = importlib.util.module_from_spec(spec)
-     if module is None:
-         raise ImportError(f"Cannot load module {module_name} from spec")
-     sys.modules[module_name] = module
-     spec.loader.exec_module(module)  # type: ignore
-     return module
-
-
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
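
Editor's note: the path-hashed module name above exists so that several identically named `__init__.py` files (one per build variant) can coexist in one process under `sys.modules`. A runnable sketch of the same idea, using temporary files and a hypothetical `load` helper:

import importlib.util
import sys
import tempfile
from pathlib import Path

# Two on-disk copies of "the same" module; a fixed sys.modules name would make
# the second import silently resolve to the first, which is why the shim
# derives the name from the path instead.
tmp = Path(tempfile.mkdtemp())
for variant, value in (("a", 1), ("b", 2)):
    (tmp / f"{variant}.py").write_text(f"VALUE = {value}\n")

def load(path: Path):
    name = f"mod_{hash(path.absolute()) & 0xFFFFFFFF:x}"  # path-derived name
    spec = importlib.util.spec_from_file_location(name, path)
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    spec.loader.exec_module(mod)
    return mod

print(load(tmp / "a.py").VALUE, load(tmp / "b.py").VALUE)  # 1 2
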
build/torch210-cxx11-cu128-x86_64-linux/pipeline.py DELETED
@@ -1,468 +0,0 @@
- import logging
- from typing import Generator
-
- import torch
- import torch.distributed as dist
- from torch.distributed.tensor import DTensor
- from torch.profiler import record_function
-
- from .core import _muon_state, adjust_lr_for_muon
- from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
- from .qk_clip import compute_scales
-
- logger = logging.getLogger(__name__)
-
- # ======================================================================
- # Stage helpers
- # ======================================================================
-
-
- def _launch_gather(
-     params: list[DTensor],
-     owned_params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     num_ranks: int,
-     process_group: dist.ProcessGroup,
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-     """Allocate gather buffers, build send/recv, and launch async all-to-all.
-
-     Returns:
-         work: Async operation handle.
-         recv_buf: Flat receive buffer (needed by ``_complete_gather``).
-         gathered_grads: ``{id(p): empty_tensor}`` for owned params,
-             ``None`` for non-owned.
-         recv_counts: Per-source-rank element counts.
-     """
-     # Allocate gathered-grad buffers
-     gathered_grads: dict[int, torch.Tensor | None] = {}
-     for p in params:
-         state = param_to_state[id(p)]
-         if rank == state.worker_rank:
-             gathered_grads[id(p)] = torch.empty(p.shape,
-                                                 dtype=COMM_DTYPE,
-                                                 device="cuda")
-         else:
-             gathered_grads[id(p)] = None
-
-     # Build send buffer – batch grad copies via torch.cat
-     # (1-2 fused kernels vs N individual narrow().copy_() calls).
-     send_counts = [0] * num_ranks
-     for p in params:
-         state = param_to_state[id(p)]
-         send_counts[state.worker_rank] += state.rank_numels[rank]
-
-     total_send = sum(send_counts)
-     if total_send > 0:
-         # Group grad slices by destination rank in a single pass.
-         dst_to_grads = [[] for _ in range(num_ranks)]
-         for p in params:
-             state = param_to_state[id(p)]
-             n = state.rank_numels[rank]
-             if n > 0:
-                 g = p.grad.to_local()
-                 dst_to_grads[state.worker_rank].append(g.reshape(-1))
-
-         # Flatten in dst order and cat once.
-         all_slices = []
-         for dst in range(num_ranks):
-             all_slices.extend(dst_to_grads[dst])
-         send_buf = torch.cat(all_slices)
-         if send_buf.dtype != COMM_DTYPE:
-             send_buf = send_buf.to(COMM_DTYPE)
-     else:
-         send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-     # Build recv buffer
-     recv_counts = [0] * num_ranks
-     for src in range(num_ranks):
-         total = 0
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             assert state.worker_rank == rank
-             total += state.rank_numels[src]
-         recv_counts[src] = total
-
-     recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
-
-     # Launch async all-to-all
-     logger.debug(f"send_buf size: {send_buf.numel()}, "
-                  f"recv_buf size: {recv_buf.numel()}, "
-                  f"recv_counts: {recv_counts}, "
-                  f"send_counts: {send_counts}, "
-                  f"process_group: {str(process_group)}")
-     work = dist.all_to_all_single(
-         recv_buf,
-         send_buf,
-         output_split_sizes=recv_counts,
-         input_split_sizes=send_counts,
-         group=process_group,
-         async_op=True,
-     )
-
-     return work, recv_buf, gathered_grads, recv_counts
-
-
- def _complete_gather(
-     recv_buf: torch.Tensor,
-     recv_counts: list[int],
-     owned_params: list[DTensor],
-     gathered_grads: dict[int, torch.Tensor | None],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
- ) -> None:
-     """Reconstruct gathered grads from the recv buffer (in-place)."""
-     off = 0
-     for src in range(len(recv_counts)):
-         if recv_counts[src] == 0:
-             continue
-
-         block = recv_counts[src]
-         inner_off = 0
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             assert state.worker_rank == rank
-
-             indices = state.rank_indices[src]
-
-             shard_view = gathered_grads[id(p)][indices]
-             n = shard_view.numel()
-             if n == 0:
-                 continue
-
-             sg = recv_buf.narrow(0, off + inner_off, n)
-             sg = sg.reshape(shard_view.shape)
-             gathered_grads[id(p)][indices] = sg
-
-             inner_off += n
-         assert inner_off == block
-         off += block
-
-
- def _compute_ns(
-     owned_params: list[DTensor],
-     gathered_grads: dict[int, torch.Tensor | None],
-     ns_steps: int,
- ) -> dict[int, torch.Tensor | None]:
-     """Run Newton-Schulz orthogonalization on owned parameters.
-
-     Returns:
-         computed_us: ``{id(p): orthogonalized_update}`` for owned params.
-     """
-     computed_us: dict[int, torch.Tensor | None] = {}
-     for p in owned_params:
-         u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
-         gathered_grads[id(p)] = None  # free gathered grad
-         computed_us[id(p)] = u
-     return computed_us
-
-
- def _launch_scatter(
-     params: list[DTensor],
-     owned_params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     num_ranks: int,
-     process_group: dist.ProcessGroup,
-     computed_us: dict[int, torch.Tensor | None],
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
-     """Allocate scatter buffers, build send/recv, and launch async all-to-all.
-
-     Returns:
-         work: Async operation handle.
-         recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
-         scattered_us: Empty dict, populated by ``_complete_scatter`` with
-             zero-copy views into ``recv_buf``.
-         recv_counts: Per-source-rank element counts.
-     """
-     # scattered_us is populated by _complete_scatter with zero-copy views
-     # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
-     # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
-     # so _update_params can iterate all params without KeyError.
-     scattered_us: dict[int, torch.Tensor] = {}
-     for p in params:
-         if param_to_state[id(p)].rank_numels[rank] == 0:
-             scattered_us[id(p)] = torch.empty_like(p.to_local(),
-                                                    dtype=COMM_DTYPE)
-
-     # Build send buffer – batch via torch.cat
-     # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
-     send_counts = [0] * num_ranks
-     if owned_params:
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             for dst_rank in range(num_ranks):
-                 send_counts[dst_rank] += state.rank_numels[dst_rank]
-
-     total_send = sum(send_counts)
-     if total_send > 0:
-         # Cache u_full conversions to avoid redundant .to() per dst_rank.
-         u_fulls = {}
-         for p in owned_params:
-             u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
-
-         # Collect slices in dst order (matches all-to-all send layout).
-         all_slices = []
-         for dst_rank in range(num_ranks):
-             for p in owned_params:
-                 state = param_to_state[id(p)]
-                 su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
-                 if su.numel() > 0:
-                     all_slices.append(su)
-
-         send_buf = torch.cat(all_slices) if all_slices else torch.empty(
-             0, dtype=COMM_DTYPE, device="cuda")
-     else:
-         send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-     # Build recv buffer
-     recv_counts = [0] * num_ranks
-     for src in range(num_ranks):
-         total = 0
-         for p in params:
-             state = param_to_state[id(p)]
-             if state.worker_rank != src:
-                 continue
-             total += state.rank_numels[rank]
-         recv_counts[src] = total
-
-     recv_total = sum(recv_counts)
-     recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-     # Launch async all-to-all
-     work = dist.all_to_all_single(
-         recv_buf,
-         send_buf,
-         output_split_sizes=recv_counts,
-         input_split_sizes=send_counts,
-         group=process_group,
-         async_op=True,
-     )
-
-     return work, recv_buf, scattered_us, recv_counts
-
-
- def _complete_scatter(
-     recv_buf: torch.Tensor,
-     recv_counts: list[int],
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     scattered_us: dict[int, torch.Tensor],
- ) -> None:
-     """Populate scattered_us with zero-copy views into recv_buf.
-
-     Instead of pre-allocating tensors and copying, we assign views directly
-     from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
-     The underlying storage of ``recv_buf`` is kept alive through the views
-     until ``scattered_us`` is cleared after ``_update_params``.
-     """
-     off = 0
-     for src in range(len(recv_counts)):
-         block = recv_counts[src]
-         if block == 0:
-             continue
-
-         inner_off = 0
-         for p in params:
-             state = param_to_state[id(p)]
-             if state.worker_rank != src:
-                 continue
-             n = state.rank_numels[rank]
-             if n == 0:
-                 continue
-
-             scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
-                                                   n).view_as(p.to_local())
-
-             inner_off += n
-
-         assert inner_off == block
-         off += block
-
-
- def _update_params(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     scattered_us: dict[int, torch.Tensor],
-     lr: float,
-     weight_decay: float,
- ) -> None:
-     """Apply weight decay, Muon update, and optional QK clipping.
-
-     Uses batched ``_foreach_mul_`` for weight decay and batched
-     ``_foreach_add_`` for the Muon update, grouping parameters by
-     adjusted_lr to minimize kernel launches while preserving float32
-     precision for the alpha scaling.
-     """
-     if not params:
-         return
-
-     # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
-     p_locals = [p._local_tensor for p in params]
-     torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
-
-     # Group params by adjusted_lr so _foreach_add_ can use a single
-     # alpha per group (preserves float32 precision for alpha scaling).
-     lr_groups: dict[float, tuple[list, list]] = {}
-     for p in params:
-         adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-         if adjusted_lr not in lr_groups:
-             lr_groups[adjusted_lr] = ([], [])
-         lr_groups[adjusted_lr][0].append(p._local_tensor)
-         lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
-
-     for adjusted_lr, (p_group, u_group) in lr_groups.items():
-         torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
-
-     # QK clipping – applied directly on the local tensor to
-     # avoid DTensor sharding-propagation issues with _StridedShard.
-     for p in params:
-         state = param_to_state[id(p)]
-         if state.qk_clip_state is None:
-             continue
-         scales_full = compute_scales(p, state.qk_clip_state)
-         if scales_full is not None:
-             ratio = p.shape[0] // scales_full.shape[0]
-             idx0 = state.rank_indices[rank][0]
-             if isinstance(idx0, slice):
-                 start = idx0.start or 0
-                 idx0 = torch.arange(start,
-                                     idx0.stop,
-                                     device=scales_full.device)
-             row_scales = scales_full[idx0 // ratio]
-             p._local_tensor.mul_(row_scales.view(-1, 1))
-
-
- # ======================================================================
- # Pre-launch helper for overlapping first chunk's gather with other work.
- # ======================================================================
-
-
- @torch.no_grad()
- def prelaunch_first_gather(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     none_grad: bool,
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-     """Launch the first chunk's A2A gather early for overlap with other compute.
-
-     Call this *before* expensive GPU work (e.g. batched expert NS) so that
-     the NCCL all-to-all runs concurrently on the NCCL stream while the
-     default stream executes compute.
-
-     Returns the same 4-tuple that ``_launch_gather`` produces, which should
-     be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
-     """
-     process_group = param_to_state[id(params[0])].process_group
-     num_ranks = dist.get_world_size(group=process_group)
-     owned_params = [
-         p for p in params if param_to_state[id(p)].worker_rank == rank
-     ]
-
-     with record_function("muon::prelaunch_gather"):
-         work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-             params, owned_params, param_to_state, rank, num_ranks,
-             process_group)
-
-     if none_grad:
-         for p in params:
-             p.grad = None
-
-     return work, recv_buf, gathered_grads, recv_counts
-
-
- # ======================================================================
- # Main generator – thin orchestrator that wires stages together.
- # ======================================================================
-
-
- @torch.no_grad()
- def muon_chunk_pipeline(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     ns_steps: int,
-     lr: float,
-     weight_decay: float,
-     none_grad: bool,
-     prelaunch_gather: tuple | None = None,
- ) -> Generator[None, None, None]:
-     """Process one chunk of parameters through the full Muon pipeline.
-
-     Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
-
-     Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
-     that communication and computation overlap across chunks. Async
-     communication is launched via ``async_op=True`` and completed after
-     the yield with ``work.wait()``.
-
-     Overlap happens because :func:`run_pipeline` admits one new chunk
-     per iteration (staggered admission). While chunk *N* does NS
-     compute on the default CUDA stream, chunk *N+1*'s async all-to-all
-     runs concurrently on the NCCL stream — no separate ``comm_stream``
-     is required.
-
-     If ``prelaunch_gather`` is provided, the gather was already launched
-     by :func:`prelaunch_first_gather` and we skip launching it again.
-
-     Yields exactly **2** times:
-
-     1. After launching async all-to-all gather (or immediately if pre-launched).
-     2. After launching async all-to-all scatter.
-     """
-     process_group = param_to_state[id(params[0])].process_group
-     num_ranks = dist.get_world_size(group=process_group)
-     owned_params = [
-         p for p in params if param_to_state[id(p)].worker_rank == rank
-     ]
-
-     if prelaunch_gather is not None:
-         # Gather was pre-launched; none_grad already handled by caller.
-         work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
-     else:
-         # Normal path: launch async gather.
-         with record_function("muon::launch_gather"):
-             work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-                 params, owned_params, param_to_state, rank, num_ranks,
-                 process_group)
-
-         if none_grad:
-             for p in params:
-                 p.grad = None
-
-     yield  # --- YIELD 1: other chunks can launch their gather ---
-
-     with record_function("muon::wait_gather"):
-         work.wait()
-         _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
-                          param_to_state, rank)
-         del recv_buf
-
-     # Stage 3: Newton-Schulz orthogonalization.
-     with record_function("muon::newton_schulz"):
-         computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
-         gathered_grads.clear()
-
-     # Stages 4-5: launch async scatter.
-     with record_function("muon::launch_scatter"):
-         work, recv_buf, scattered_us, recv_counts = _launch_scatter(
-             params, owned_params, param_to_state, rank, num_ranks,
-             process_group, computed_us)
-         computed_us.clear()
-
-     yield  # --- YIELD 2: other chunks can launch their scatter ---
-
-     with record_function("muon::wait_scatter"):
-         work.wait()
-         _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
-                           scattered_us)
-         del recv_buf
-
-     # Stage 6: apply parameter updates.
-     with record_function("muon::update_params"):
-         _update_params(params, param_to_state, rank, scattered_us, lr,
-                        weight_decay)
-         scattered_us.clear()
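
Editor's note: the launch / yield / wait() shape used throughout this file can be exercised standalone. A minimal sketch with a single-process gloo group (assuming gloo's CPU all_to_all support), where a scalar reduction stands in for the Newton-Schulz compute that would overlap the transfer:

import os
import torch
import torch.distributed as dist

def main() -> None:
    # Single-process world so the sketch runs without torchrun.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=0, world_size=1)

    send_buf = torch.arange(8, dtype=torch.float32)
    recv_buf = torch.empty(8)
    work = dist.all_to_all_single(recv_buf, send_buf,
                                  output_split_sizes=[8],
                                  input_split_sizes=[8],
                                  async_op=True)     # launch, don't block
    overlap = send_buf.square().sum()                # stand-in for NS compute
    work.wait()                                      # complete after the yield
    assert torch.equal(recv_buf, send_buf) and overlap >= 0
    dist.destroy_process_group()

if __name__ == "__main__":
    main()
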
build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py DELETED
@@ -1,198 +0,0 @@
- import logging
- import math
- from dataclasses import dataclass
-
- import torch
- from torch.distributed.tensor import DTensor
-
- from .core import normalize_fqn
-
- logger = logging.getLogger(__name__)
-
-
- def parse_qk_layer(name: str) -> tuple[str | None, int]:
-     """
-     Parse a parameter name to check if it is a query/key projection layer
-     and return (kind, layer_index).
-
-     Supported kinds:
-         MHA/GQA: 'wq', 'wk', 'q_proj', 'k_proj'
-         MLA: 'wq_b' (Q up-proj), 'wkv_b' (KV up-proj)
-
-     Returns:
-         (kind, layer_idx) or (None, -1) if not matched.
-
-     Example:
-         'model.3.attn.wq.weight' -> ('wq', 3)
-         'model.5.attn.wk.weight' -> ('wk', 5)
-         'model.2.attn.q_proj.weight' -> ('q_proj', 2)
-         'model.7.attn.k_proj.weight' -> ('k_proj', 7)
-         'model.1.attn.wq_b.weight' -> ('wq_b', 1)
-         'model.0.attn.wkv_b.weight' -> ('wkv_b', 0)
-         'model.4.attn.v_proj.weight' -> (None, -1)
-     """
-     parts = normalize_fqn(name).split('.')
-     if len(parts) < 3:
-         return None, -1
-
-     kind = parts[-2]
-
-     layer_idx = -1
-     for part in reversed(parts):
-         if part.isdigit():
-             layer_idx = int(part)
-             break
-
-     if kind in ('wq', 'wk', 'q_proj', 'k_proj', 'wq_b', 'wkv_b'):
-         return kind, layer_idx
-
-     return None, -1
-
-
- @dataclass
- class QKClipInfo:
-     """Per-parameter dynamic info computed from config + runtime logits."""
-     kind: str | None  # 'wq'/'q_proj'/'wq_b' or 'wk'/'k_proj'/'wkv_b' or None
-     indices: list[int]  # which heads to consider for clipping
-     head_dim: int  # from config (qk_head_dim for MLA wq_b)
-     threshold: float  # from config
-     logit: torch.Tensor | None
-
-     # MLA-specific fields
-     is_mla: bool = False
-     qk_nope_head_dim: int = 0
-     qk_rope_head_dim: int = 0
-     v_head_dim: int = 0
-
-
- def get_qk_clip_info(clip_config, n, qk_logits):
-     """Extract QK clipping info for a named parameter.
-
-     Args:
-         clip_config: QK clipping configuration dict (or None).
-             MHA/GQA keys: head_dim, threshold, q_indices, k_indices
-             MLA extra keys: is_mla=True, qk_nope_head_dim, qk_rope_head_dim, v_head_dim
-         n: Parameter name string.
-         qk_logits: Dict mapping layer indices to logit tensors (or None).
-
-     Returns:
-         QKClipInfo instance with clipping configuration for this parameter.
-     """
-     if clip_config is None:
-         return None
-
-     head_dim = clip_config.get('head_dim')
-     threshold = clip_config.get('threshold')
-     kind, layer_idx = parse_qk_layer(n)
-     is_mla = clip_config.get('is_mla', False)
-
-     logit, indices = None, []
-     if qk_logits is not None and kind is not None:
-         logit = qk_logits[layer_idx]
-         if isinstance(logit, DTensor):
-             # In TP settings, qk_logits may be DTensor
-             # We convert it to full tensor here for simplicity
-             logit = logit.full_tensor()
-
-         if kind in ('wq_b', 'wq', 'q_proj'):
-             indices = clip_config.get('q_indices', []) or []
-         elif kind in ('wkv_b', 'wk', 'k_proj'):
-             indices = clip_config.get('k_indices', []) or []
-
-     if is_mla:
-         return QKClipInfo(
-             kind=kind,
-             indices=indices,
-             head_dim=head_dim,
-             threshold=threshold,
-             logit=logit,
-             is_mla=True,
-             qk_nope_head_dim=clip_config['qk_nope_head_dim'],
-             qk_rope_head_dim=clip_config['qk_rope_head_dim'],
-             v_head_dim=clip_config['v_head_dim'],
-         )
-     else:
-         return QKClipInfo(
-             kind=kind,
-             indices=indices,
-             head_dim=head_dim,
-             threshold=threshold,
-             logit=logit,
-         )
-
-
- def compute_scales(p, qk_clip_state):
-     """Compute per-head scaling factors for QK clipping.
-
-     Returns scales tensor (√γ per head) if any head exceeds threshold, else None.
-     For MLA wkv_b, effective row stride is qk_nope_head_dim + v_head_dim.
-     """
-     kind = qk_clip_state.kind
-     indices = qk_clip_state.indices
-     head_dim = qk_clip_state.head_dim
-     threshold = qk_clip_state.threshold
-     logit = qk_clip_state.logit
-
-     # Check if any head exceeds threshold before allocating.
-     head_scales = {}
-     for logit_idx, head_idx in enumerate(indices):
-         v_ele = float(logit[logit_idx])
-         if v_ele > threshold:
-             new_scale = math.sqrt(threshold / v_ele)
-             if head_idx not in head_scales or new_scale < head_scales[head_idx]:
-                 head_scales[head_idx] = new_scale
-                 logger.info(
-                     f"[{kind}] Head {head_idx} exceeded threshold "
-                     f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                 )
-
-     if not head_scales:
-         return None
-
-     # For MLA wkv_b, each KV head spans qk_nope_head_dim + v_head_dim rows
-     if qk_clip_state.is_mla and kind == 'wkv_b':
-         effective_head_dim = qk_clip_state.qk_nope_head_dim + qk_clip_state.v_head_dim
-     else:
-         effective_head_dim = head_dim
-
-     H_global = p.shape[0] // effective_head_dim
-     scales_full = torch.ones(H_global, device=p.data.device)
-     for head_idx, scale in head_scales.items():
-         scales_full[head_idx] = scale
-     return scales_full
-
-
- def qk_clip(p, scales, info):
-     """Apply per-head scaling to a Q/K projection weight matrix.
-
-     Args:
-         p: Parameter (nn.Parameter or raw tensor).
-         scales: [n_heads] tensor, each element = √γ_h.
-         info: QKClipInfo with kind, head_dim, and MLA sub-head dimensions.
-
-     MLA sub-region scaling per Algorithm 1 (MuonClip):
-         wq_b: q_nope rows → √γ, q_pe rows → γ
-         wkv_b: k_nope rows → √γ, v rows → unchanged
-     """
-     W = p.data if isinstance(p, torch.nn.Parameter) else p
-
-     if not info.is_mla:
-         # MHA/GQA: uniform √γ applied to all rows in each head
-         W.view(-1, info.head_dim, W.shape[1]).mul_(scales.view(-1, 1, 1))
-         return
-
-     # MLA: vectorized sub-region scaling within each head
-     if info.kind == 'wq_b':
-         qk_nope = info.qk_nope_head_dim
-         qk_head_dim = qk_nope + info.qk_rope_head_dim
-         W_3d = W.view(-1, qk_head_dim, W.shape[1])  # [H, qk_head_dim, in_dim]
-         W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1))  # q_nope → √γ
-         W_3d[:, qk_nope:, :].mul_((scales * scales).view(-1, 1, 1))  # q_pe → γ
-
-     elif info.kind == 'wkv_b':
-         qk_nope = info.qk_nope_head_dim
-         kv_stride = qk_nope + info.v_head_dim
-         W_3d = W.view(-1, kv_stride, W.shape[1])  # [H, kv_stride, in_dim]
-         W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1))  # k_nope → √γ
-         # v rows: not touched (k_R shared rotary unchanged)
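
Editor's note: the per-head scale is the square root of the clip ratio because it is applied on both the query side and the key side, so the q·k logit shrinks by the full ratio. A worked example with hypothetical numbers:

import math

threshold, observed_logit = 100.0, 150.0
gamma = threshold / observed_logit    # target shrink factor for the q.k product
s = math.sqrt(gamma)                  # per-side scale (sqrt(gamma)) from compute_scales
print(round(s, 4), round(s * s * observed_logit, 6))
# 0.8165 100.0 (the logit is clipped back to the threshold)
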
build/torch210-cxx11-cu130-x86_64-linux/adamw.py DELETED
@@ -1,271 +0,0 @@
- import logging
- from collections import defaultdict
- from typing import cast
-
- import torch
- from torch.distributed.tensor import DTensor
- from torch.profiler import record_function
-
- logger = logging.getLogger(__name__)
-
-
- def fused_adamw(
-     params: list[torch.Tensor],
-     grads: list[torch.Tensor],
-     exp_avgs: list[torch.Tensor],
-     exp_avg_sqs: list[torch.Tensor],
-     max_exp_avg_sqs: list[torch.Tensor],
-     state_steps: list[torch.Tensor],
-     amsgrad: bool,
-     beta1: float,
-     beta2: float,
-     lr: float | torch.Tensor,
-     weight_decay: float,
-     eps: float,
-     maximize: bool,
- ) -> None:
-     if not params:
-         return
-
-     # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
-     # treating it as a scalar.
-     lr_dict: dict | None = ({
-         lr.device: lr
-     } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-     grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-         [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-          state_steps]  # type: ignore[list-item]
-     )
-     for (device, _), (
-         (
-             device_params_,
-             device_grads_,
-             device_exp_avgs_,
-             device_exp_avg_sqs_,
-             device_max_exp_avg_sqs,
-             device_state_steps_,
-         ),
-         _,
-     ) in grouped_tensors.items():
-         device_params = cast(list[torch.Tensor], device_params_)
-         device_grads = cast(list[torch.Tensor], device_grads_)
-         device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-         device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-         device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-
-         if lr_dict is not None and device not in lr_dict:
-             lr_dict[device] = lr.to(
-                 device=device, non_blocking=True)  # type: ignore[union-attr]
-             lr = lr_dict[device]
-         torch._foreach_add_(device_state_steps, 1)
-         func = torch._fused_adamw_
-         func(
-             device_params,
-             device_grads,
-             device_exp_avgs,
-             device_exp_avg_sqs,
-             device_max_exp_avg_sqs,  # type: ignore[arg-type]
-             device_state_steps,
-             amsgrad=amsgrad,
-             lr=lr,  # type: ignore[arg-type]
-             beta1=beta1,
-             beta2=beta2,
-             weight_decay=weight_decay,
-             eps=eps,
-             maximize=maximize,
-         )
-
-
- def _to_local(t):
-     """Unwrap DTensor to local tensor for fused ops."""
-     return t._local_tensor if isinstance(t, DTensor) else t
-
-
- # ---------------------------------------------------------------------------
- # Caches for eliminating per-step Python overhead.
- #
- # Placement grouping and tensor list assembly are identical every step
- # (params don't change placement, moment/step tensors are the same objects
- # after initialisation). We cache them keyed by id() of the param list
- # stored in param_groups (stable across steps).
- #
- # Only gradients change each step and must be collected fresh.
- # ---------------------------------------------------------------------------
-
- # id(group["params"]) → dict[placement_key, list[param]]
- _placement_cache: dict[int, dict[tuple, list]] = {}
-
- # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
- _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
-
-
- def _step_adamw_params_slow(optimizer_state, params, group):
-     """Uncached fallback for the rare case where some params lack grads."""
-     params_with_grads = []
-     grads = []
-     moment1 = []
-     moment2 = []
-     state_steps = []
-
-     for p in params:
-         g = p.grad
-         if g is None:
-             continue
-         state = optimizer_state[p]
-         params_with_grads.append(_to_local(p))
-         grads.append(_to_local(g))
-         if "step" not in state:
-             state["step"] = torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device)
-             state["moment1"] = torch.zeros_like(g)
-             state["moment2"] = torch.zeros_like(g)
-         moment1.append(_to_local(state["moment1"]))
-         moment2.append(_to_local(state["moment2"]))
-         if not isinstance(state["step"], torch.Tensor):
-             state["step"] = torch.tensor(state["step"],
-                                          dtype=torch.float32,
-                                          device=p.device)
-         state_steps.append(state["step"])
-
-     if not params_with_grads:
-         return
-
-     lr = group["lr"]
-     beta1, beta2 = group["adamw_betas"]
-     eps = group["adamw_eps"]
-     weight_decay = group["weight_decay"]
-
-     fused_adamw(
-         params_with_grads,
-         grads,
-         moment1,
-         moment2,
-         [],
-         state_steps,
-         amsgrad=False,
-         beta1=beta1,
-         beta2=beta2,
-         lr=lr,
-         weight_decay=weight_decay,
-         eps=eps,
-         maximize=False,
-     )
-
-
- def step_adamw_params(optimizer_state, params, group):
-     """Run fused AdamW on a list of parameters sharing the same placement.
-
-     After the first call, cached tensor lists (params_local, moment1,
-     moment2, state_steps) are reused — only gradients are collected fresh.
-
-     Args:
-         optimizer_state: The optimizer's state dict (self.state in Muon).
-         params: List of parameters to update.
-         group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-     """
-     # Collect grads — the only thing that changes each step.
-     with record_function("adamw::collect_grads"):
-         grads = []
-         for p in params:
-             g = p.grad
-             if g is None:
-                 # Rare: fall back to slow path that filters per-param.
-                 _step_adamw_params_slow(optimizer_state, params, group)
-                 return
-             grads.append(_to_local(g))
-
-     tensor_key = id(params)
-     if tensor_key not in _tensor_cache:
-         with record_function("adamw::init_tensor_cache"):
-             params_local = []
-             moment1 = []
-             moment2 = []
-             state_steps = []
-
-             for p in params:
-                 state = optimizer_state[p]
-                 params_local.append(_to_local(p))
-                 if "step" not in state:
-                     state["step"] = torch.zeros((),
-                                                 dtype=torch.float32,
-                                                 device=p.device)
-                     state["moment1"] = torch.zeros_like(p.grad)
-                     state["moment2"] = torch.zeros_like(p.grad)
-                 moment1.append(_to_local(state["moment1"]))
-                 moment2.append(_to_local(state["moment2"]))
-                 if not isinstance(state["step"], torch.Tensor):
-                     state["step"] = torch.tensor(state["step"],
-                                                  dtype=torch.float32,
-                                                  device=p.device)
-                 state_steps.append(state["step"])
-
-             _tensor_cache[tensor_key] = (params_local, moment1, moment2,
-                                          state_steps)
-
-     params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
-
-     lr = group["lr"]
-     beta1, beta2 = group["adamw_betas"]
-     eps = group["adamw_eps"]
-     weight_decay = group["weight_decay"]
-
-     with record_function("adamw::fused_adamw"):
-         fused_adamw(
-             params_local,
-             grads,
-             moment1,
-             moment2,
-             [],
-             state_steps,
-             amsgrad=False,
-             beta1=beta1,
-             beta2=beta2,
-             lr=lr,
-             weight_decay=weight_decay,
-             eps=eps,
-             maximize=False,
-         )
-
-
- def step_adamw(optimizer_state, group):
-     """Dispatch AdamW step, grouping parameters by type and placement.
-
-     Placement grouping is cached after the first call since params never
-     change their placement between steps.
-
-     Args:
-         optimizer_state: The optimizer's state dict (self.state in Muon).
-         group: Parameter group dict.
-     """
-     params = group["params"]
-     placement_key = id(params)
-
-     if placement_key not in _placement_cache:
-         with record_function("adamw::group_by_placement"):
-             placement_to_params: dict[tuple,
-                                       list[torch.Tensor]] = defaultdict(list)
-             for p in params:
-                 match p:
-                     case DTensor():
-                         logger.debug(
-                             "[AdamW] DTensor param: shape=%s, placements=%s, "
-                             "mesh=%s, grad=%s", p.shape, p.placements,
-                             p.device_mesh.mesh_dim_names,
-                             p.grad.shape if p.grad is not None else None)
-                         placement_to_params[tuple(
-                             [p.placements, p.device_mesh])].append(p)
-                     case torch.Tensor():
-                         logger.debug(
-                             "[AdamW] plain param: shape=%s, grad=%s", p.shape,
-                             p.grad.shape if p.grad is not None else None)
-                         placement_to_params[tuple([torch.Tensor,
-                                                    None])].append(p)
-
-             logger.debug("[AdamW] %d placement groups, %d total params",
-                          len(placement_to_params), len(params))
-
-             _placement_cache[placement_key] = dict(placement_to_params)
-
-     for group_params in _placement_cache[placement_key].values():
-         step_adamw_params(optimizer_state, group_params, group)
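
Editor's note: the hand-rolled path above exists to cache tensor-list assembly across steps; the fused kernel itself is also reachable through the stock optimizer. A minimal sketch (fused=True dispatches to the same torch._fused_adamw_ on CUDA; the CPU fallback keeps it runnable anywhere):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(4, 4, device=device)
# fused=True requires floating-point params on a supported device, hence the
# conditional; on CPU this falls back to the default (non-fused) path.
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.1,
                        fused=(device == "cuda"))
model(torch.randn(2, 4, device=device)).sum().backward()
opt.step()
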
build/torch210-cxx11-cu130-x86_64-linux/async_utils.py DELETED
@@ -1,77 +0,0 @@
- import logging
- from typing import Generator
-
- logger = logging.getLogger(__name__)
-
-
- class _Task:
-     """Internal: wraps a generator, advances one yield at a time."""
-
-     def __init__(self, generator: Generator[None, None, None], index: int):
-         self._generator = generator
-         self._index = index
-         self._steps_completed = 0
-         self.step()  # run to first yield
-
-     def step(self) -> bool:
-         try:
-             next(self._generator)
-             self._steps_completed += 1
-             logger.debug("pipeline[%d] completed stage %d", self._index,
-                          self._steps_completed)
-             return True
-         except StopIteration:
-             logger.debug("pipeline[%d] finished after %d stages", self._index,
-                          self._steps_completed)
-             return False
-
-     def close(self):
-         self._generator.close()
-
-
- def run_pipeline(
-     pipelines: Generator[Generator[None, None, None], None, None],
-     max_concurrent: int,
- ) -> None:
-     """Run generator-based pipelines with bounded concurrency.
-
-     Each pipeline is a generator that yields at stage boundaries.
-     The runtime interleaves pipelines so communication and computation
-     overlap across chunks.
-     """
-     if max_concurrent <= 0:
-         raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-
-     have_new = True
-     task_index = 0
-     previous_tasks: list[_Task] = []
-
-     try:
-         while have_new or previous_tasks:
-             running_tasks: list[_Task] = []
-
-             # Admit one new pipeline per iteration (staggered admission).
-             # Admitting one at a time ensures that while chunk N does NS
-             # compute on the default stream, chunk N+1's NCCL all-to-all
-             # runs concurrently on the NCCL stream — creating real
-             # communication/computation overlap on the GPU.
-             if have_new and len(previous_tasks) < max_concurrent:
-                 try:
-                     gen = next(pipelines)
-                     task = _Task(gen, task_index)
-                     task_index += 1
-                     running_tasks.append(task)
-                 except StopIteration:
-                     have_new = False
-
-             # Advance every previously-yielded task by one step.
-             for task in previous_tasks:
-                 if task.step():
-                     running_tasks.append(task)
-
-             previous_tasks = running_tasks
-     except BaseException:
-         # Clean up all in-flight generators to release GPU resources.
-         for task in previous_tasks:
-             task.close()
-         raise
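
Editor's note: a toy driver makes the staggered admission visible (assuming `run_pipeline` above is importable; prints stand in for the NCCL and Newton-Schulz stages):

def chunk(tag: str):
    print(f"{tag}: launch gather")
    yield
    print(f"{tag}: NS compute + launch scatter")
    yield
    print(f"{tag}: update params")

run_pipeline((chunk(f"chunk{i}") for i in range(3)), max_concurrent=2)
# Interleaved output: chunk1 launches its gather while chunk0 is mid-compute.
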
build/torch210-cxx11-cu130-x86_64-linux/core.py DELETED
@@ -1,219 +0,0 @@
- import logging
- import math
- from dataclasses import dataclass
- from typing import List
-
- import torch
- from torch.distributed import ProcessGroup
- from torch.distributed.tensor import DTensor
-
- # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
- # parameter FQNs. Activation checkpointing similarly inserts
- # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
- # expert_keys, QK layer parsing) works regardless of wrapper nesting.
- _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
-
- logger = logging.getLogger(__name__)
-
-
- def normalize_fqn(name: str) -> str:
-     """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
-     return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
-
-
- @dataclass
- class _muon_state:
-     worker_rank: int
-     process_group: ProcessGroup
-     rank_indices: dict[int, tuple]  # local_rank -> per-dim indices
-     rank_numels: dict[int, int]  # local_rank -> numel
-     name: str
-     qk_clip_state: torch.Tensor | None = None
-
-
- def _batch_momentum(
-     grads: List[torch.Tensor],
-     momentum_bufs: List[torch.Tensor],
-     momentum: torch.Tensor,
- ) -> None:
-     """Batched momentum update (no nesterov)."""
-     torch._foreach_mul_(momentum_bufs, momentum)
-     torch._foreach_add_(momentum_bufs, grads)
-
-
- def _batch_momentum_nesterov(
-     grads: List[torch.Tensor],
-     momentum_bufs: List[torch.Tensor],
-     momentum: torch.Tensor,
- ) -> None:
-     """Batched momentum update with nesterov correction."""
-     torch._foreach_mul_(momentum_bufs, momentum)
-     torch._foreach_add_(momentum_bufs, grads)
-     nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
-     torch._foreach_add_(grads, nesterov_terms)
-
-
- _compiled_momentum: dict[bool, callable] = {}
- _use_momentum_compile = True
-
-
- def set_momentum_compile(enabled: bool):
-     """Toggle torch.compile for batched momentum."""
-     global _use_momentum_compile
-     _use_momentum_compile = enabled
-
-
- def batch_pre_ortho(
-     grads: List[torch.Tensor],
-     momentum_bufs: List[torch.Tensor],
-     momentum: torch.Tensor,
-     nesterov: bool,
- ) -> None:
-     """Batched momentum update on lists of plain tensors.
-
-     Mirrors dion's ``muon_update_pre_orthogonalize``.
-     Inputs must be plain CUDA tensors (not DTensor).
-     Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
-
-     When compile is enabled, uses separately compiled functions for
-     nesterov=True/False to avoid graph breaks from the branch.
-     """
-     fn = _batch_momentum_nesterov if nesterov else _batch_momentum
-     if _use_momentum_compile:
-         if nesterov not in _compiled_momentum:
-             _compiled_momentum[nesterov] = torch.compile(fn)
-         fn = _compiled_momentum[nesterov]
-     fn(grads, momentum_bufs, momentum)
-
-
- def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
-     """Weight-decay + update on plain tensors.
-
-     Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
-     lookup per call × 256+ params = massive overhead. The pipeline path uses
-     batched _foreach_* ops instead; this function remains for base() and
-     distributed_muon().
-     """
-     p_data.mul_(1 - lr * weight_decay)
-     p_data.add_(u_data, alpha=-adjusted_lr)
-
-
- def update_p(p, u, lr, adjusted_lr, weight_decay):
-     """Apply weight decay and orthogonalized update to parameter.
-
-     Args:
-         p: Parameter (torch.nn.Parameter or DTensor).
-         u: Orthogonalized update tensor.
-         lr: Base learning rate.
-         adjusted_lr: Size-adjusted learning rate.
-         weight_decay: Weight decay coefficient.
-     """
-     # Unwrap Parameter -> underlying data tensor.
-     p_data = p.data if isinstance(p, torch.nn.Parameter) else p
-     # Unwrap DTensor -> local CUDA tensor for compiled kernel.
-     if isinstance(p_data, DTensor):
-         p_data = p_data._local_tensor
-     u_data = u._local_tensor if isinstance(u, DTensor) else u
-     _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
-
-
- def adjust_lr_for_muon(lr, param_shape):
-     """Scale learning rate based on parameter matrix dimensions.
-
-     Args:
-         lr: Base learning rate.
-         param_shape: Shape of the parameter tensor.
-
-     Returns:
-         Adjusted learning rate.
-     """
-     A, B = param_shape[:2]
-     # We adjust the learning rate and weight decay based on the size of the parameter matrix
-     # as described in the paper
-     adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-     adjusted_lr = lr * adjusted_ratio
-     return adjusted_lr
-
-
- def _match_key(parts, key):
-     """Check if key matches as contiguous components in parts.
-
-     Single-component keys (e.g. "experts") match any single component.
-     Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
-     """
-     key_parts = key.split(".")
-     key_len = len(key_parts)
-     if key_len == 1:
-         return key in parts
-     return any(parts[i:i + key_len] == key_parts
-                for i in range(len(parts) - key_len + 1))
-
-
- def is_expert_param(name, expert_keys):
-     """Check if a parameter name matches any expert key (component-level)."""
-     if not expert_keys:
-         return False
-     parts = normalize_fqn(name).split(".")
-     return any(_match_key(parts, key) for key in expert_keys)
-
-
- def default_is_muon(name, x, expert_keys=None):
-     normalized = normalize_fqn(name)
-     parts = normalized.split(".")
-     skip_keys = [
-         "embed_tokens",
-         "lm_head",
-         "tok_embeddings",
-         "output",
-         "mhc_attn",
-         "mhc_ffn",
-         "lambda_proj",
-     ]
-     if any(key in parts for key in skip_keys):
-         logger.info(
-             "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
-             normalized, name, x.ndim)
-         return False
-     effective_ndim = x.ndim
-     is_expert = is_expert_param(name, expert_keys)
-     if is_expert:
-         effective_ndim -= 1
-     result = effective_ndim >= 2
-     logger.info(
-         "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
-         normalized, name, x.ndim, is_expert, effective_ndim,
-         "Muon" if result else "AdamW")
-     return result
-
-
- def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-     if is_muon_func is None:
-         is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-
-     muon_params, muon_names = [], []
-     non_muon_params, non_muon_names = [], []
-
-     for n, p in model.named_parameters():
-         if not p.requires_grad:
-             continue
-         if is_muon_func(n, p):
-             muon_params.append(p)
-             muon_names.append(n)
-         else:
-             non_muon_params.append(p)
-             non_muon_names.append(n)
-
-     logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
-                 expert_keys, len(muon_names), len(non_muon_names))
-
-     return [
-         {
-             "params": muon_params,
-             "names": muon_names,
-             "use_muon": True,
-         },
-         {
-             "params": non_muon_params,
-             "use_muon": False,
-         },
-     ]
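
Editor's note: for concreteness, the 0.2 * sqrt(max(A, B)) rule in adjust_lr_for_muon at two typical transformer shapes:

import math

lr = 3e-4
for A, B in ((1024, 1024), (4096, 1024)):
    print((A, B), lr * 0.2 * math.sqrt(max(A, B)))
# (1024, 1024) 0.00192   (0.2 * 32 = 6.4x the base LR)
# (4096, 1024) 0.00384   (0.2 * 64 = 12.8x the base LR)
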
build/torch210-cxx11-cu130-x86_64-linux/cpu_offload.py DELETED
@@ -1,206 +0,0 @@
- """CPU offloading for optimizer states.
-
- Manages a pinned CPU memory pool and async CUDA streams to offload
- optimizer state tensors (momentum buffers, Adam moments) to CPU between
- optimizer steps, freeing GPU memory.
-
- All tracked tensors are packed into a single flat pinned CPU buffer
- (per dtype). D2H and H2D copies are performed per-tensor directly
- between individual GPU tensors and their slice of the CPU flat buffer
- — no GPU staging buffer is allocated, so there is **no temporary GPU
- memory spike** during offload or reload.
-
- Individual tensor storages are freed after offload via
- ``untyped_storage().resize_(0)``, preserving tensor identity so
- downstream caches remain valid.
- """
-
- import logging
- from collections import defaultdict
-
- import torch
- from torch.distributed.tensor import DTensor
-
- logger = logging.getLogger(__name__)
-
-
- class CPUOffloadPool:
-     """Pinned CPU memory pool for async optimizer state offloading.
-
-     Tracked tensors are grouped by dtype. Each group gets a single flat
-     pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
-     the flat buffer) to avoid allocating a GPU staging buffer.
-     """
-
-     def __init__(self):
-         self._managed: list[torch.Tensor] = []
-         self._storage_nbytes: dict[int, int] = {}  # id(t) → bytes
-
-         # Per-dtype group: populated on first offload.
-         # dtype → dict with keys:
-         #   "indices" : list[int]             managed-list indices
-         #   "offsets" : list[tuple[int,int]]  (start, numel) in flat buf
-         #   "total"   : int                   total numel
-         #   "cpu_flat": Tensor                pinned CPU buffer
-         self._groups: dict[torch.dtype, dict] = {}
-
-         self._offload_stream: torch.cuda.Stream | None = None
-         self._device: torch.device | None = None
-         self._initialized: bool = False
-         self._logged: bool = False
-
-     # ------------------------------------------------------------------
-     @staticmethod
-     def _local(t: torch.Tensor) -> torch.Tensor:
-         """Unwrap DTensor to its local CUDA tensor."""
-         return t._local_tensor if isinstance(t, DTensor) else t
-
-     def _ensure_stream(self):
-         if self._offload_stream is None:
-             self._offload_stream = torch.cuda.Stream(device=self._device)
-
-     # ------------------------------------------------------------------
-     def track(self, tensor: torch.Tensor):
-         """Register a GPU tensor for CPU offloading. Idempotent."""
-         tid = id(tensor)
-         if tid in self._storage_nbytes:
-             return
-         local = self._local(tensor)
-         if self._device is None:
-             self._device = local.device
-         storage = local.untyped_storage()
-         # Skip tensors with empty storage (e.g. empty FSDP shards)
-         if storage.size() == 0:
-             return
-         self._storage_nbytes[tid] = storage.size()
-         self._managed.append(tensor)
-
-     # ------------------------------------------------------------------
-     def _init_buffers(self):
-         """Build per-dtype flat buffers on first offload."""
-         # Group managed tensors by dtype.
-         dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
-         for idx, t in enumerate(self._managed):
-             local = self._local(t)
-             dtype_map[local.dtype].append((idx, local.numel()))
-
-         total_cpu_bytes = 0
-         for dtype, entries in dtype_map.items():
-             offsets: list[tuple[int, int]] = []
-             indices: list[int] = []
-             off = 0
-             for idx, n in entries:
-                 indices.append(idx)
-                 offsets.append((off, n))
-                 off += n
-             cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
-             self._groups[dtype] = {
-                 "indices": indices,
-                 "offsets": offsets,
-                 "total": off,
-                 "cpu_flat": cpu_flat,
-             }
-             total_cpu_bytes += off * cpu_flat.element_size()
-
-         self._initialized = True
-         logger.info(
-             "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
-             "%.2f MB pinned CPU memory",
-             len(self._managed),
-             len(self._groups),
-             total_cpu_bytes / (1024**2),
-         )
-
-     # ------------------------------------------------------------------
-     def offload(self):
-         """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
-         if not self._managed:
-             return
-         if not self._initialized:
-             self._init_buffers()
-         self._ensure_stream()
-
-         # Offload stream waits for compute to finish.
-         compute_event = torch.cuda.current_stream(self._device).record_event()
-         self._offload_stream.wait_event(compute_event)
-
-         offloaded_bytes = 0
-
-         # Per-tensor D2H copies directly into CPU flat buffer slices.
-         # No GPU staging buffer → no temporary GPU memory spike.
-         with torch.cuda.stream(self._offload_stream):
-             for dtype, grp in self._groups.items():
-                 indices = grp["indices"]
-                 offsets = grp["offsets"]
-                 cpu_flat = grp["cpu_flat"]
-
-                 for i, mgd_idx in enumerate(indices):
-                     local = self._local(self._managed[mgd_idx])
-                     off, n = offsets[i]
-                     cpu_flat[off : off + n].copy_(local.reshape(-1), non_blocking=True)
-
-                 offloaded_bytes += grp["total"] * cpu_flat.element_size()
-
-         # Wait for all D2H copies to land, then free GPU storage.
-         self._offload_stream.synchronize()
-         for t in self._managed:
-             storage = self._local(t).untyped_storage()
-             if storage.size() != 0:
-                 storage.resize_(0)
-             else:
-                 raise RuntimeError(
-                     f"Tensor storage is already freed (size=0) before offload. "
-                     f"This indicates a double-free or external interference. "
-                     f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                 )
-
-         if not self._logged:
-             logger.info(
-                 "[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
-                 offloaded_bytes / (1024**2),
-             )
-
-     # ------------------------------------------------------------------
-     def reload(self):
-         """Per-tensor H2D from CPU flat buffer on the default stream.
-
-         Runs on the current (default) CUDA stream to avoid stream
-         interaction issues with the parallel Muon pipeline. Since
-         pinned CPU memory is the source, the copies overlap with
-         GPU idle time between steps.
-         """
-         if not self._managed or not self._initialized:
-             return
-
-         reloaded_bytes = 0
-
-         # Re-allocate all GPU storages first.
-         for t in self._managed:
-             local = self._local(t)
-             storage = local.untyped_storage()
-             if storage.size() != 0:
-                 raise RuntimeError(
-                     f"Storage should have been freed (size=0) before reload, "
-                     f"but got size={storage.size()}. "
-                     f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                 )
-             storage.resize_(self._storage_nbytes[id(t)])
-
-         # Per-tensor H2D copies from CPU flat buffer slices.
- # Per-tensor H2D copies from CPU flat buffer slices.
190
- # non_blocking=True with pinned source allows DMA overlap.
191
- for dtype, grp in self._groups.items():
192
- indices = grp["indices"]
193
- offsets = grp["offsets"]
194
- cpu_flat = grp["cpu_flat"]
195
-
196
- for i, mgd_idx in enumerate(indices):
197
- local = self._local(self._managed[mgd_idx])
198
- off, n = offsets[i]
199
- local.reshape(-1).copy_(cpu_flat[off : off + n], non_blocking=True)
200
-
201
- reloaded_bytes += grp["total"] * cpu_flat.element_size()
202
-
203
- if not self._logged:
204
- logger.info(
205
- "[CPUOffload] Reloaded %.2f MB (CPU → GPU)", reloaded_bytes / (1024**2)
206
- )
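
The free-then-restore trick the module docstring describes can be exercised in isolation: copy into a pinned CPU buffer, shrink the GPU storage to zero bytes while keeping the tensor object alive, then resize and copy back. A minimal sketch of that pattern, assuming a CUDA device (`t` stands in for a tracked optimizer state tensor; this snippet is not part of the deleted file):

```python
import torch

t = torch.randn(1024, device="cuda")          # stand-in for a tracked state tensor
nbytes = t.untyped_storage().size()           # remember the storage size in bytes

# D2H into a pinned buffer; non_blocking is effective because the target is pinned.
cpu_buf = torch.empty(t.numel(), dtype=t.dtype, device="cpu", pin_memory=True)
cpu_buf.copy_(t.reshape(-1), non_blocking=True)
torch.cuda.synchronize()                      # make sure the copy has landed

# Free the GPU allocation but keep the Python tensor object valid.
t.untyped_storage().resize_(0)

# Later (reload): re-allocate the storage and copy back from the pinned buffer.
t.untyped_storage().resize_(nbytes)
t.reshape(-1).copy_(cpu_buf, non_blocking=True)
```

Because only the storage is resized, tensor identity is preserved, which is what keeps downstream caches keyed on these tensors valid across offload/reload cycles.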
build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py DELETED
@@ -1,232 +0,0 @@
1
- import torch
2
- import torch.distributed as dist
3
- from torch.distributed import ProcessGroup
4
- from torch.distributed.device_mesh import DeviceMesh
5
- from torch.distributed.tensor import DTensor
6
- from torch.distributed.tensor.placement_types import (Placement, Shard,
7
- _StridedShard)
8
-
9
-
10
- def _is_shard(placement: Placement) -> bool:
11
- """Check if a placement is a shard type (Shard or _StridedShard).
12
-
13
- In PyTorch 2.10+, _StridedShard no longer inherits from Shard, so
14
- ``placement.is_shard()`` returns False for _StridedShard. This helper
15
- handles both old and new hierarchies.
16
- """
17
- return isinstance(placement, (Shard, _StridedShard))
18
-
19
-
20
- def get_slices_of_dtensor(
21
- target: DTensor | torch.Tensor,
22
- local_rank: int,
23
- shard_mesh: DeviceMesh,
24
- shard_placements: tuple[Placement],
25
- ) -> tuple[slice | torch.Tensor, ...]:
26
- """
27
- Get per-dimension indices for a given rank's shard of the target tensor.
28
-
29
- Uses ``Shard.local_shard_size_and_offset`` and
30
- ``_StridedShard.local_shard_size_and_offset`` for correct handling of
31
- both contiguous and strided (non-contiguous) sharding.
32
-
33
- Args:
34
- target (DTensor | torch.Tensor): The target tensor (for its shape).
35
- local_rank (int): The local rank within the shard group.
36
- shard_mesh (DeviceMesh): The shard mesh (only shard dimensions).
37
- shard_placements (tuple[Placement, ...]): The shard placements.
38
-
39
- Returns:
40
- A tuple of indices (one per tensor dim). Each element is either:
41
- - A ``slice`` (for contiguous or unsharded dims)
42
- - A 1-D ``torch.LongTensor`` of indices (for strided sharding)
43
- """
44
-
45
- # find the global rank of the local rank in the shard mesh
46
- rank = sorted(shard_mesh.mesh.flatten().tolist())[local_rank]
47
-
48
- rank_coords = (shard_mesh.mesh == rank).nonzero()
49
-
50
- assert len(rank_coords) == 1
51
- rank_coords = tuple(rank_coords[0].tolist())
52
-
53
- assert len(rank_coords) == len(shard_placements)
54
-
55
- # Track per-shard-dim indices.
56
- # None means "not yet sharded on this dim".
57
- dim_indices: dict[int, torch.Tensor] = {}
58
-
59
- # Caution: Assuming replicate-to-shard of the shard mesh goes with
60
- # left-to-right sharding. This is ensured by the sorting logic of
61
- # construct_shard_mesh function.
62
- for mesh_dim_idx, (rank_coord, placement) in enumerate(
63
- zip(rank_coords, shard_placements)):
64
- assert _is_shard(placement)
65
-
66
- num_chunks = shard_mesh.mesh.shape[mesh_dim_idx]
67
- shard_dim = placement.dim
68
-
69
- # Current effective size on this dim (may already be sub-sharded)
70
- if shard_dim in dim_indices:
71
- curr_size = len(dim_indices[shard_dim])
72
- else:
73
- curr_size = target.size()[shard_dim]
74
-
75
- # Compute indices for this level of sharding
76
- if isinstance(placement, _StridedShard):
77
- _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
78
- placement,
79
- curr_size,
80
- num_chunks,
81
- rank_coord,
82
- return_first_offset=False)
83
- new_indices = torch.tensor(offsets, dtype=torch.long)
84
- else:
85
- shard_size, offset = Shard.local_shard_size_and_offset(
86
- curr_size, num_chunks, rank_coord)
87
- new_indices = torch.arange(offset,
88
- offset + shard_size,
89
- dtype=torch.long)
90
-
91
- # Compose with previous indices on this dim
92
- if shard_dim in dim_indices:
93
- dim_indices[shard_dim] = dim_indices[shard_dim][new_indices]
94
- else:
95
- dim_indices[shard_dim] = new_indices
96
-
97
- # Build result tuple
98
- result: list[slice | torch.Tensor] = []
99
- for d in range(len(target.size())):
100
- if d not in dim_indices:
101
- result.append(slice(None))
102
- else:
103
- indices = dim_indices[d]
104
- # Convert contiguous indices to slice for efficiency
105
- if len(indices) > 0:
106
- start = indices[0].item()
107
- expected = torch.arange(start,
108
- start + len(indices),
109
- dtype=torch.long)
110
- if torch.equal(indices, expected):
111
- result.append(slice(start, start + len(indices)))
112
- else:
113
- result.append(indices)
114
- else:
115
- result.append(slice(0, 0))
116
-
117
- return tuple(result)
118
-
119
-
120
- _ranks_to_dist_cache: dict[tuple[int, ...], tuple[DeviceMesh,
121
- ProcessGroup]] = dict()
122
-
123
-
124
- def construct_shard_mesh(
125
- placements: tuple[Placement, ...],
126
- mesh: DeviceMesh,
127
- ) -> tuple[DeviceMesh, ProcessGroup, tuple[Placement, ...]]:
128
- """Construct shard sub-mesh and ProcessGroup for all-to-all communication.
129
-
130
- Given a DTensor's placements and device mesh, extracts the "shard group"
131
- — the set of ranks that together hold all shards of the same replica —
132
- and creates a ProcessGroup for all-to-all among them.
133
-
134
- Steps:
135
- 1. Sort placements: Replicate first, then Shard by (dim, granularity).
136
- 2. Permute the mesh tensor to match the sorted order.
137
- 3. Collapse Replicate dims → list of shard sub-meshes (one per replica).
138
- 4. Create/retrieve a cached ProcessGroup for the current rank's sub-mesh.
139
-
140
- Example — 8 GPUs, mesh shape (2, 2, 2),
141
- placements ``[Shard(0), Replicate, _StridedShard(0)]``::
142
-
143
- Step 1 — Sort: [Replicate, _StridedShard(0), Shard(0)]
144
- Permutation: [1, 2, 0]
145
-
146
- Step 2 — Permute mesh dims by [1, 2, 0]:
147
- Original: Permuted:
148
- [[[0,1],[2,3]], [[[0,4],[1,5]],
149
- [[4,5],[6,7]]] [[2,6],[3,7]]]
150
-
151
- Step 3 — Unbind replicate dim (dim 0), giving 2 shard sub-meshes:
152
- sub-mesh 0 = [[0,4],[1,5]] (replica group 0)
153
- sub-mesh 1 = [[2,6],[3,7]] (replica group 1)
154
- shard_placements = (_StridedShard(0), Shard(0))
155
-
156
- Step 4 — Rank 0 → ProcessGroup([0,1,4,5])
157
- Rank 2 → ProcessGroup([2,3,6,7])
158
-
159
- Returns:
160
- ``(shard_mesh, process_group, shard_placements)``
161
- """
162
- my_rank = dist.get_rank()
163
- assert mesh.mesh.device.type == 'cpu'
164
-
165
- # -- Fast path: 1D all-shard mesh → reuse existing PG. ----------------
166
- # Reuses the mesh's existing ProcessGroup directly, avoiding the
167
- # overhead of dist.new_group(). The standard path below also handles
168
- # subset calls safely via use_local_synchronization=True, but this
169
- # fast path is still beneficial for the common 1D shard case.
170
- if mesh.ndim == 1 and len(placements) == 1 and _is_shard(placements[0]):
171
- key = (*mesh.mesh.shape, *mesh.mesh.flatten().tolist())
172
- if key not in _ranks_to_dist_cache:
173
- _ranks_to_dist_cache[key] = (mesh, mesh.get_group())
174
- return (*_ranks_to_dist_cache[key], tuple(placements))
175
-
176
- mesh_tensor = mesh.mesh.clone()
177
-
178
- # -- Step 1: Sort placements (Replicate first, then Shard by dim). ------
179
- # _StridedShard comes BEFORE regular Shard on the same dim so that
180
- # get_slices_of_dtensor applies the outer sharding first, matching
181
- # DTensor's left-to-right (outer-to-inner) composition order.
182
- def _sort_key(item):
183
- index, placement = item
184
- assert not placement.is_partial(), "Partial placement not supported"
185
- if placement.is_replicate():
186
- return (-1, 0, index)
187
- assert _is_shard(placement), f"Unsupported: {type(placement)}"
188
- split = (-1 / placement.split_factor if isinstance(
189
- placement, _StridedShard) else 0)
190
- return (placement.dim, split, index)
191
-
192
- indexed = sorted(enumerate(placements), key=_sort_key)
193
- perm, sorted_placements = zip(*indexed)
194
-
195
- # -- Step 2: Permute mesh to match sorted placement order. --------------
196
- sorted_mesh = mesh_tensor.permute(perm)
197
-
198
- # -- Step 3: Collapse replicate dims → list of shard sub-meshes. --------
199
- # E.g. mesh (2, 3, 4, 4) with [R, R, S(0), S(1)] → 6 sub-meshes of (4, 4)
200
- num_rep = sum(1 for p in sorted_placements if p.is_replicate())
201
- if num_rep > 0:
202
- if num_rep > 1:
203
- sorted_mesh = sorted_mesh.flatten(0, num_rep - 1)
204
- shard_meshes = list(torch.unbind(sorted_mesh, dim=0))
205
- else:
206
- shard_meshes = [sorted_mesh]
207
- shard_placements = sorted_placements[num_rep:]
208
- assert len(shard_placements) == len(set(shard_placements))
209
-
210
- # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
211
- # Each rank only creates the group it belongs to, using
212
- # use_local_synchronization=True so that only group members need to
213
- # coordinate. This avoids deadlocks when different PP stages call
214
- # construct_shard_mesh for different parameters.
215
- def _cache_key(t: torch.Tensor) -> tuple:
216
- return (*t.shape, *t.flatten().tolist())
217
-
218
- my_key = None
219
- for sm in shard_meshes:
220
- if (my_rank == sm).any().item():
221
- key = _cache_key(sm)
222
- assert my_key is None, "Rank appears in multiple shard groups"
223
- my_key = key
224
- if key not in _ranks_to_dist_cache:
225
- pg = dist.new_group(sm.flatten().tolist(),
226
- use_local_synchronization=True)
227
- _ranks_to_dist_cache[key] = (
228
- DeviceMesh(device_type="cuda", mesh=sm),
229
- pg,
230
- )
231
-
232
- return (*_ranks_to_dist_cache[my_key], shard_placements)
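
The mesh bookkeeping in Steps 2 and 3 of `construct_shard_mesh` is plain tensor manipulation and can be checked without initializing any process groups. A standalone sketch reproducing the docstring's 8-GPU example (illustrative only; not part of the deleted file):

```python
import torch

mesh = torch.arange(8).reshape(2, 2, 2)       # ranks 0..7, mesh shape (2, 2, 2)

# Step 2: sorted placements [Replicate, _StridedShard(0), Shard(0)] -> perm [1, 2, 0]
permuted = mesh.permute(1, 2, 0)

# Step 3: unbind the replicate dim -> one shard sub-mesh per replica
sub_meshes = torch.unbind(permuted, dim=0)
print(sub_meshes[0].flatten().tolist())       # [0, 4, 1, 5] -> replica group {0, 1, 4, 5}
print(sub_meshes[1].flatten().tolist())       # [2, 6, 3, 7] -> replica group {2, 3, 6, 7}
```

The resulting groups match Step 4 of the docstring: rank 0 joins ProcessGroup({0, 1, 4, 5}) and rank 2 joins ProcessGroup({2, 3, 6, 7}).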
build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py DELETED
@@ -1,122 +0,0 @@
1
- # MIT License
2
- #
3
- # Copyright (c) 2025 Tianyang Lin
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the "Software"), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
-
23
- import torch
24
- import triton
25
- import triton.language as tl
26
-
27
-
28
- def get_autotune_config():
29
- return [
30
- triton.Config(
31
- {
32
- 'BLOCK_SIZE_M': blk_m,
33
- 'BLOCK_SIZE_K': blk_k,
34
- 'GROUP_SIZE_M': grp_sz
35
- },
36
- num_stages=n_stages,
37
- num_warps=n_warps) for blk_m in [32, 64, 128]
38
- for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
- for n_warps in [4, 8]
40
- ]
41
-
42
-
43
- @triton.autotune(
44
- configs=get_autotune_config(),
45
- key=['M', 'K'],
46
- restore_value=['y'],
47
- )
48
- @triton.jit
49
- def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
50
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
51
- GROUP_SIZE_M: tl.constexpr):
52
- """
53
- Core JIT kernel of matmul_transpose; computes y = x @ x.T.
55
- The code is adapted from the Triton `matmul` tutorial:
55
- https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
56
- """
57
- pid = tl.program_id(axis=0)
58
- num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
59
- num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
60
- num_pid_in_group = GROUP_SIZE_M * num_pid_n
61
- group_id = pid // num_pid_in_group
62
- first_pid_m = group_id * GROUP_SIZE_M
63
- group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
64
- pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
65
- pid_n = (pid % num_pid_in_group) // group_size_m
66
- if pid_m > pid_n:
67
- return
68
-
69
- offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
- offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
71
- offs_k = tl.arange(0, BLOCK_SIZE_K)
72
- # we use a & b ptrs to denote different rows of x.
73
- a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
- b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
75
-
76
- accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
77
-
78
- for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
79
- a = tl.load(a_ptrs,
80
- mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
81
- other=0.0)
82
- b = tl.load(b_ptrs,
83
- mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
84
- other=0.0)
85
- accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
86
- a_ptrs += BLOCK_SIZE_K * stride_xk
87
- b_ptrs += BLOCK_SIZE_K * stride_xk
88
- # use dtype.element_ty to accommodate different input datatypes as in cpp templates
89
- # https://github.com/triton-lang/triton/issues/2252
90
- c = accumulator.to(x.dtype.element_ty)
91
-
92
- offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
- offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
94
- c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
95
- c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
96
- tl.store(c_ptrs, c, mask=c_mask)
97
-
98
- # transpose and copy
99
- if pid_m < pid_n:
100
- ct_ptrs = y + stride_ym * offs_cn[:,
101
- None] + stride_yn * offs_cm[None, :]
102
- ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
103
- tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
-
105
-
106
- @torch.library.custom_op("muon::matmul_transpose_assign",
107
- mutates_args=("d_out", ))
108
- def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
- """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
110
- d_in = d_in.contiguous()
111
- M, K = d_in.shape
112
- grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
113
- M, META['BLOCK_SIZE_M']), )
114
- with torch.cuda.device(d_in.device.index):
115
- mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
- d_out.stride(0), d_out.stride(1))
117
-
118
-
119
- @matmul_transpose_assign.register_fake
120
- def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
- """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
- pass
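
Since the kernel computes only the upper-triangular blocks and mirrors them (the `pid_m > pid_n` early exit), a correctness check against the dense reference is a useful companion. A minimal sketch, assuming a CUDA device with Triton available and the functions above in scope (shapes and tolerances are illustrative):

```python
import torch

x = torch.randn(512, 256, device="cuda", dtype=torch.bfloat16).contiguous()
y = torch.empty(512, 512, device="cuda", dtype=torch.bfloat16)

matmul_transpose_assign(x, y)                  # Triton kernel: y = x @ x.T
torch.testing.assert_close(y, x @ x.T, rtol=1.6e-2, atol=1e-2)
```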
build/torch210-cxx11-cu130-x86_64-linux/metadata.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "python-depends": []
3
- }
build/torch210-cxx11-cu130-x86_64-linux/muon.py DELETED
@@ -1,1068 +0,0 @@
1
- import logging
2
- import types
3
- from collections import defaultdict
4
- from typing import Any
5
-
6
- import torch
7
- import torch.distributed as dist
8
- from torch.distributed.tensor import DTensor, Replicate, Shard
9
- from torch.profiler import record_function
10
-
11
- from .adamw import _placement_cache, _tensor_cache, step_adamw
12
- from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
- get_default_muon_param_groups, is_expert_param, update_p)
15
- from .cpu_offload import CPUOffloadPool
16
- from .distributed.utils import (_is_shard, construct_shard_mesh,
17
- get_slices_of_dtensor)
18
- from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
- _zeropower_via_newtonschulz5,
20
- zeropower_via_newtonschulz5,
21
- zeropower_via_newtonschulz5_batched)
22
- from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
- from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
-
28
- def _expand_expert_params(names, params, expert_keys):
29
- """Expand expert params by splitting on dim 0 (expert dimension).
30
-
31
- Params whose name matches any key in ``expert_keys`` are treated as
32
- expert-parallel tensors. Their outermost dimension is the expert
33
- dimension: an ``(E, out, in)`` tensor becomes ``E`` separate 2D
34
- ``nn.Parameter`` views so that in-place updates propagate back to
35
- the original storage.
36
-
37
- Non-expert params with ``ndim > 2`` trigger an ``AssertionError`` —
38
- if they are expert params, their key must be added to ``expert_keys``.
39
-
40
- The grad must already be set on each expert param (e.g. after momentum).
41
-
42
- For DTensor expert params, placements that shard on dim 0 (expert dim)
43
- are consumed by the split. Non-dim-0 shard placements (e.g. TP) are
44
- preserved: each 2D slice is wrapped as a DTensor on the corresponding
45
- submesh so the parallel pipeline handles the TP communication.
46
- """
47
- expanded_names = []
48
- expanded_params = []
49
-
50
- for n, p in zip(names, params):
51
- is_expert = is_expert_param(n, expert_keys)
52
- is_dtensor = isinstance(p.data, DTensor)
53
-
54
- if is_expert:
55
- if is_dtensor:
56
- logger.debug(
57
- "[expand_expert] %s: expert DTensor, shape=%s, "
58
- "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
- p.placements, p.device_mesh.mesh_dim_names,
60
- p.to_local().shape)
61
- else:
62
- logger.debug(
63
- "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
- p.data.shape)
65
-
66
- if not is_expert:
67
- assert p.data.ndim <= 2, (
68
- f"Param {n} has ndim={p.data.ndim} but does not match "
69
- f"expert_keys={expert_keys}. If this is an expert param, "
70
- f"add its key to expert_keys.")
71
- expanded_names.append(n)
72
- expanded_params.append(p)
73
- continue
74
-
75
- g = p.grad
76
- assert g is not None, (
77
- f"Expert param {n} must have grad set before expansion")
78
-
79
- tp_mesh = None
80
- tp_placements_2d = None
81
-
82
- if is_dtensor:
83
- local_data = p.to_local()
84
- local_grad = g.to_local() if isinstance(g, DTensor) else g
85
-
86
- # Find non-dim-0 shard placements (e.g. TP sharding).
87
- # After splitting on dim 0, Shard(k) becomes Shard(k-1).
88
- tp_dim_indices = []
89
- tp_placements_2d = []
90
- for i, pl in enumerate(p.placements):
91
- if _is_shard(pl) and pl.dim != 0:
92
- tp_dim_indices.append(i)
93
- tp_placements_2d.append(Shard(pl.dim - 1))
94
-
95
- if tp_dim_indices:
96
- tp_dim_names = tuple(p.device_mesh.mesh_dim_names[i]
97
- for i in tp_dim_indices)
98
- if len(tp_dim_names) == 1:
99
- tp_mesh = p.device_mesh[tp_dim_names[0]]
100
- else:
101
- tp_mesh = p.device_mesh[tp_dim_names]
102
- else:
103
- local_data = p.data
104
- local_grad = g
105
-
106
- # Expand: split dim 0, reshape each slice to 2D.
107
- num_local_experts = local_data.shape[0]
108
- for i in range(num_local_experts):
109
- slice_data = local_data[i]
110
- slice_grad = local_grad[i]
111
-
112
- if tp_mesh is not None:
113
- # Wrap as DTensor on TP submesh so the pipeline handles
114
- # TP communication (gather/scatter across TP ranks).
115
- dt_data = DTensor.from_local(slice_data,
116
- device_mesh=tp_mesh,
117
- placements=tp_placements_2d)
118
- dt_grad = DTensor.from_local(slice_grad,
119
- device_mesh=tp_mesh,
120
- placements=tp_placements_2d)
121
- expert_param = torch.nn.Parameter(dt_data, requires_grad=False)
122
- expert_param.grad = dt_grad
123
- else:
124
- expert_param = torch.nn.Parameter(slice_data,
125
- requires_grad=False)
126
- expert_param.grad = slice_grad
127
-
128
- expanded_names.append(f"{n}[{i}]")
129
- expanded_params.append(expert_param)
130
-
131
- p.grad = None # allow expert grad storage to be freed after pipeline
132
-
133
- return expanded_names, expanded_params
134
-
135
-
136
- class Muon(torch.optim.Optimizer):
137
- """
138
- Muon - MomentUm Orthogonalized by Newton-schulz
139
-
140
- Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
141
- processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
142
- matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
143
- the advantage that it can be stably run in bfloat16 on the GPU.
144
-
145
- Some warnings:
146
- - We believe this optimizer is unlikely to work well for training with small batch size.
147
- - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
148
-
149
- Arguments:
150
- params: The parameter groups to be optimized. Each group must set the
152
- 'use_muon' key (see ``get_default_muon_param_groups``).
152
- lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
153
- momentum: The momentum used by the internal SGD. (0.95 is a good default)
154
- nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
155
- ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
156
- weight_decay: The weight decay for Muon and AdamW.
157
- Parameters that are {0, 1}-D or detected as the embed or lm_head are optimized by AdamW instead.
159
- adamw_betas: The betas for the internal AdamW.
160
- adamw_eps: The epsilon for the internal AdamW.
161
- none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
162
- debug: Whether to print debug information.
163
- clip_config : Configuration for QK clipping. Expected keys:
164
- - "q_indices" (list[int]): Indices of query heads to consider.
165
- - "k_indices" (list[int]): Indices of key heads to consider.
166
- - "head_dim" (int): Dimensionality of each attention head.
167
- - "threshold" (float): Threshold value; heads whose QK logits exceed
168
- this value will be scaled down.
169
- Default is:
170
- {
171
- "q_indices": [],
172
- "k_indices": [],
173
- "head_dim": 128,
174
- "threshold": 100
175
- }
176
- warmup_step : How many all2all gather/compute operations are launched in advance
177
- before the corresponding all2all scatter steps begin.
178
- A higher warmup_step increases memory usage but can improve
179
- performance by overlapping communication.
180
- Parallel muon only.
181
- chunk_size : Batch size of parameters to process in each
182
- all2all gather/compute/scatter step.
183
- Uses shard_ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
- use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
- For testing purposes only.
186
- expert_keys: List of strings to identify expert-parallel parameters.
187
- If any key appears in a parameter's name, its outermost
188
- dimension is treated as the expert dimension and expanded
189
- into per-expert 2D params for Muon. For example,
190
- ``expert_keys=["experts"]`` matches any param whose name
191
- contains "experts". 3D+ params not matched by any key
192
- will raise an error.
193
- """
194
-
195
- def __init__(self,
196
- params,
197
- lr=1e-3,
198
- momentum=0.95,
199
- nesterov=True,
200
- ns_steps=5,
201
- weight_decay=0.1,
202
- adamw_betas=(0.9, 0.95),
203
- adamw_eps=1e-8,
204
- none_grad=True,
205
- debug=False,
206
- clip_config=None,
207
- warmup_step=5,
208
- chunk_size=-1,
209
- use_distributed_muon=False,
210
- expert_keys=None):
211
- defaults = dict(
212
- lr=lr,
213
- weight_decay=weight_decay,
214
- momentum=momentum,
215
- nesterov=nesterov,
216
- ns_steps=ns_steps,
217
- adamw_betas=adamw_betas,
218
- adamw_eps=adamw_eps,
219
- none_grad=none_grad,
220
- use_muon=True,
221
- )
222
- error_message = "The key 'use_muon' is not set in parameter group {idx}. Without this key, all parameters in the group are assumed to use Muon optimization, which may lead to unexpected behavior."
223
- instruction_code = "\n\nPlease follow this code snippet:\n```\noptimizer = get_kernel('motif-technologies/optimizer')\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)\n```"
224
-
225
- if isinstance(params, types.GeneratorType):
226
- raise ValueError(error_message.format(idx=0) + instruction_code)
227
- for _idx, param_group in enumerate(params):
228
- if param_group.get("use_muon", None) is None:
229
- raise ValueError(
230
- error_message.format(idx=_idx) + instruction_code)
231
- super().__init__(params, defaults)
232
-
233
- self.debug = debug
234
- self.clip_config = clip_config if clip_config is not None else {
235
- "q_indices": [],
236
- "k_indices": [],
237
- "head_dim": 128,
238
- "threshold": 100,
239
- }
240
- self.warmup_step = warmup_step
241
- self.chunk_size = chunk_size
242
- self.use_distributed_muon = use_distributed_muon
243
- self.expert_keys = expert_keys
244
- self.cpu_offload = False
245
- self._cpu_offload_pool: CPUOffloadPool | None = None
246
- self._offload_initialized = False
247
- self._parallel_cache: dict[tuple[str, ...], dict] = {}
248
- self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
249
-
250
- def _calc_flops(self, G, steps):
251
- assert len(G.shape) == 2
252
- M, N = G.shape
253
- if M > N:
254
- M, N = N, M
255
-
256
- return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
257
-
258
- def get_shard_mesh(self, p):
259
- """
260
- Get the shard mesh for a parameter p on the given rank.
261
- """
262
- assert isinstance(
263
- p, DTensor), "Parallel Muon only supports DTensor parameters."
264
-
265
- shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
266
- p.placements, p.device_mesh)
267
-
268
- return shard_mesh, shard_pg, shard_placements
269
-
270
- def init_state_and_assign_params(self, names, params, group, qk_logits):
271
- param_to_state = {}
272
- param_to_flops = {}
273
-
274
- total_flops = 0
275
- for p in params:
276
- g = p.grad
277
- if g is None:
278
- continue
279
- assert g.ndim == 2, "Muon only supports 2D parameters."
280
-
281
- flops = self._calc_flops(g, group["ns_steps"])
282
- param_to_flops[id(p)] = flops
283
- total_flops += flops
284
-
285
- if self.debug:
286
- logger.debug("Total TFLOPs for Muon: %.2f TFLOPs",
287
- total_flops / 1e12)
288
-
289
- paired = list(zip(names, params))
290
-
291
- paired_sorted = sorted(paired,
292
- key=lambda x: param_to_flops[id(x[1])],
293
- reverse=True)
294
-
295
- names_sorted, params_sorted = zip(*paired_sorted)
296
- ordered_names = list(names_sorted)
297
- ordered_params = list(params_sorted)
298
-
299
- round_robin = 0
300
- mesh = ordered_params[0].device_mesh
301
- placements = ordered_params[0].placements
302
-
303
- shard_mesh, shard_pg, shard_placements = self.get_shard_mesh(
304
- ordered_params[0])
305
- shard_mesh_flattened = shard_mesh.mesh.flatten()
306
- num_ranks = dist.get_world_size(group=shard_pg)
307
-
308
- for n, p in zip(ordered_names, ordered_params):
309
- if mesh != p.device_mesh:
310
- raise ValueError("All parameters must be on the same mesh.")
311
- if placements != p.placements:
312
- raise ValueError("All parameters must have same placements.")
313
-
314
- worker_rank = shard_mesh_flattened[round_robin].item() % num_ranks
315
- round_robin = (round_robin + 1) % len(shard_mesh_flattened)
316
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
317
-
318
- # Precompute per-rank indices and numels for all-to-all.
319
- rank_indices: dict[int, tuple] = {}
320
- rank_numels: dict[int, int] = {}
321
- for r in range(num_ranks):
322
- indices = get_slices_of_dtensor(p, r, shard_mesh,
323
- shard_placements)
324
- rank_indices[r] = indices
325
- numel = 1
326
- for idx, dim_size in zip(indices, p.shape):
327
- if isinstance(idx, slice):
328
- start, stop, step = idx.indices(dim_size)
329
- numel *= max(0, (stop - start + (step - 1)) // step)
330
- else:
331
- numel *= len(idx)
332
- rank_numels[r] = numel
333
-
334
- param_to_state[id(p)] = _muon_state(
335
- worker_rank=worker_rank,
336
- process_group=shard_pg,
337
- rank_indices=rank_indices,
338
- rank_numels=rank_numels,
339
- name=n,
340
- qk_clip_state=qk_clip_state,
341
- )
342
-
343
- return param_to_state, ordered_params
344
-
345
- def base(self, names, params, group, lr, weight_decay, qk_logits):
346
- # Momentum is already applied by _step_muon before this method.
347
- for n, p in zip(names, params):
348
- g = p.grad
349
- if g is None:
350
- continue
351
-
352
- u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
353
- steps=group["ns_steps"])
354
-
355
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
356
- update_p(p, u, lr, adjusted_lr, weight_decay)
357
-
358
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
359
-
360
- scales_full = compute_scales(
361
- p, qk_clip_state) if qk_clip_state is not None else None
362
- if scales_full is not None:
363
- qk_clip(p, scales_full, qk_clip_state)
364
-
365
- def distributed_muon(
366
- self,
367
- names: list[str],
368
- params: list[torch.nn.Parameter],
369
- group: dict[str, Any],
370
- lr: float,
371
- weight_decay: float,
372
- qk_logits: list[torch.Tensor | DTensor] | None,
373
- ):
374
- """Batched Distributed Muon — for testing/correctness verification only.
375
-
376
- Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
377
- the full grad, then slices back to local shards. This is simpler but
378
- slower than the parallel pipeline (all2all) path, so it serves as a
379
- reference implementation for verifying correctness.
380
- """
381
- with record_function("distributed_muon"):
382
- # Momentum is already applied by _step_muon before this method.
383
- ns_steps = group["ns_steps"]
384
-
385
- # Separate plain tensors (no communication) from DTensors.
386
- plain_names, plain_params = [], []
387
- dtensor_names, dtensor_params = [], []
388
- for n, p in zip(names, params):
389
- if p.grad is None:
390
- continue
391
- if isinstance(p.data, DTensor):
392
- dtensor_names.append(n)
393
- dtensor_params.append(p)
394
- else:
395
- plain_names.append(n)
396
- plain_params.append(p)
397
-
398
- # Process plain tensors per-param (no communication).
399
- for n, p in zip(plain_names, plain_params):
400
- u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
401
- steps=ns_steps)
402
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
403
- update_p(p, u, lr, adjusted_lr, weight_decay)
404
-
405
- qk_clip_state = get_qk_clip_info(self.clip_config, n,
406
- qk_logits)
407
- scales_full = compute_scales(
408
- p, qk_clip_state) if qk_clip_state is not None else None
409
- if scales_full is not None:
410
- qk_clip(p, scales_full, qk_clip_state)
411
-
412
- if not dtensor_params:
413
- return
414
-
415
- # Group DTensors by (placements, mesh) for batched all-gather.
416
- placement_groups: dict[tuple,
417
- tuple[list,
418
- list]] = defaultdict(lambda: ([], []))
419
- for n, p in zip(dtensor_names, dtensor_params):
420
- key = (p.placements, p.device_mesh)
421
- placement_groups[key][0].append(n)
422
- placement_groups[key][1].append(p)
423
-
424
- logger.info(
425
- "distributed_muon: %d placement groups, %d total dtensors",
426
- len(placement_groups), len(dtensor_params))
427
-
428
- for (placements, mesh), (grp_names,
429
- grp_params) in placement_groups.items():
430
- shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
431
- placements, mesh)
432
- rank = dist.get_rank(shard_pg)
433
- world_size = dist.get_world_size(shard_pg)
434
-
435
- logger.info(" group: %d params, placements=%s, world_size=%d",
436
- len(grp_params), placements, world_size)
437
-
438
- # Separate params that can be batched (all shard dims evenly
439
- # divisible) from those needing per-param full_tensor
440
- # (e.g. MoE gate weights with fewer rows than shard ranks).
441
- # all_gather_into_tensor requires equal buffer sizes across
442
- # ranks, so uneven splits must use DTensor full_tensor().
443
- batch_names, batch_params = [], []
444
- single_names, single_params = [], []
445
- for n, p in zip(grp_names, grp_params):
446
- even = all(p.shape[pl.dim] %
447
- shard_mesh.mesh.shape[dim_idx] == 0
448
- for dim_idx, pl in enumerate(shard_placements))
449
- if even:
450
- batch_names.append(n)
451
- batch_params.append(p)
452
- else:
453
- single_names.append(n)
454
- single_params.append(p)
455
-
456
- # Process uneven-split params per-param via full_tensor().
457
- for n, p in zip(single_names, single_params):
458
- with record_function("distributed_muon::newton_schulz"):
459
- g_full = p.grad.full_tensor().to(COMM_DTYPE)
460
- u_full = _zeropower_via_newtonschulz5(g_full,
461
- steps=ns_steps)
462
- del g_full
463
- with record_function("distributed_muon::update"):
464
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
465
- p._local_tensor.mul_(1 - lr * weight_decay)
466
- local_indices = get_slices_of_dtensor(
467
- p, rank, shard_mesh, shard_placements)
468
- u_local = u_full[local_indices]
469
- p._local_tensor.add_(u_local, alpha=-adjusted_lr)
470
- del u_full
471
-
472
- qk_clip_state = get_qk_clip_info(
473
- self.clip_config, n, qk_logits)
474
- scales_full = compute_scales(
475
- p, qk_clip_state
476
- ) if qk_clip_state is not None else None
477
- if scales_full is not None:
478
- ratio = p.shape[0] // scales_full.shape[0]
479
- idx0 = local_indices[0]
480
- if isinstance(idx0, slice):
481
- start = idx0.start or 0
482
- idx0 = torch.arange(start,
483
- idx0.stop,
484
- device=scales_full.device)
485
- row_scales = scales_full[idx0 // ratio]
486
- p._local_tensor.mul_(row_scales.view(-1, 1))
487
-
488
- if not batch_params:
489
- continue
490
-
491
- logger.info(" batched=%d, single=%d", len(batch_params),
492
- len(single_params))
493
-
494
- # Concat all local grad shards into a single flat buffer.
495
- with record_function("distributed_muon::gather"):
496
- grad_locals = [
497
- p.grad.to_local().to(COMM_DTYPE).flatten()
498
- for p in batch_params
499
- ]
500
- numels = [g.numel() for g in grad_locals]
501
- grad_concat = torch.cat(grad_locals)
502
- del grad_locals
503
-
504
- # Single all-gather (replaces N separate full_tensor).
505
- grad_gathered = torch.empty(
506
- grad_concat.numel() * world_size,
507
- dtype=COMM_DTYPE,
508
- device="cuda",
509
- )
510
- dist.all_gather_into_tensor(grad_gathered,
511
- grad_concat,
512
- group=shard_pg)
513
-
514
- total_numel = grad_concat.numel()
515
- del grad_concat
516
-
517
- # Precompute per-param offsets within the concat buffer.
518
- offsets = []
519
- off = 0
520
- for ne in numels:
521
- offsets.append(off)
522
- off += ne
523
-
524
- # Per-param: reconstruct full grad → NS → local update.
525
- for i, (n, p) in enumerate(zip(batch_names, batch_params)):
526
- with record_function("distributed_muon::newton_schulz"):
527
- g_full = torch.empty(p.shape,
528
- dtype=COMM_DTYPE,
529
- device="cuda")
530
- for r in range(world_size):
531
- r_start = r * total_numel + offsets[i]
532
- shard = grad_gathered[r_start:r_start + numels[i]]
533
- indices = get_slices_of_dtensor(
534
- p, r, shard_mesh, shard_placements)
535
- g_full[indices] = shard.reshape(
536
- g_full[indices].shape)
537
-
538
- u_full = _zeropower_via_newtonschulz5(g_full,
539
- steps=ns_steps)
540
- del g_full
541
-
542
- with record_function("distributed_muon::update"):
543
- adjusted_lr = adjust_lr_for_muon(lr, p.shape)
544
- p._local_tensor.mul_(1 - lr * weight_decay)
545
- local_indices = get_slices_of_dtensor(
546
- p, rank, shard_mesh, shard_placements)
547
- u_local = u_full[local_indices]
548
- p._local_tensor.add_(u_local, alpha=-adjusted_lr)
549
- del u_full
550
-
551
- qk_clip_state = get_qk_clip_info(
552
- self.clip_config, n, qk_logits)
553
- scales_full = compute_scales(
554
- p, qk_clip_state
555
- ) if qk_clip_state is not None else None
556
- if scales_full is not None:
557
- ratio = p.shape[0] // scales_full.shape[0]
558
- idx0 = local_indices[0]
559
- if isinstance(idx0, slice):
560
- start = idx0.start or 0
561
- idx0 = torch.arange(start,
562
- idx0.stop,
563
- device=scales_full.device)
564
- row_scales = scales_full[idx0 // ratio]
565
- p._local_tensor.mul_(row_scales.view(-1, 1))
566
-
567
- def _setup_parallel(self, names, params, group, qk_logits):
568
- """Compute (or retrieve cached) parallel pipeline metadata.
569
-
570
- Returns:
571
- (ordered_params, param_to_state, rank, chunk_size)
572
- """
573
- cache_key = tuple(names)
574
-
575
- if cache_key not in self._parallel_cache:
576
- # First call: compute metadata and populate cache.
577
- param_to_state, ordered_params = self.init_state_and_assign_params(
578
- names, params, group, qk_logits)
579
-
580
- shard_pg = param_to_state[id(ordered_params[0])].process_group
581
- rank = dist.get_rank(group=shard_pg)
582
-
583
- if self.chunk_size == -1:
584
- shard_ranks = dist.get_world_size(shard_pg)
585
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
586
- elif self.chunk_size > 0:
587
- chunk_size = self.chunk_size
588
- else:
589
- raise ValueError(
590
- "chunk_size must be -1 or a positive integer.")
591
-
592
- ordered_names = [
593
- param_to_state[id(p)].name for p in ordered_params
594
- ]
595
- name_to_state = {
596
- param_to_state[id(p)].name: param_to_state[id(p)]
597
- for p in ordered_params
598
- }
599
- self._parallel_cache[cache_key] = {
600
- 'ordered_names': ordered_names,
601
- 'name_to_state': name_to_state,
602
- 'rank': rank,
603
- 'chunk_size': chunk_size,
604
- }
605
- else:
606
- # Cached path: rebuild param_to_state with current id(p) keys.
607
- cache = self._parallel_cache[cache_key]
608
- rank = cache['rank']
609
- chunk_size = cache['chunk_size']
610
-
611
- name_to_param = dict(zip(names, params))
612
- ordered_params = [name_to_param[n] for n in cache['ordered_names']]
613
-
614
- param_to_state = {}
615
- for p, n in zip(ordered_params, cache['ordered_names']):
616
- cached_state = cache['name_to_state'][n]
617
- param_to_state[id(p)] = _muon_state(
618
- worker_rank=cached_state.worker_rank,
619
- process_group=cached_state.process_group,
620
- rank_indices=cached_state.rank_indices,
621
- rank_numels=cached_state.rank_numels,
622
- name=n,
623
- qk_clip_state=get_qk_clip_info(self.clip_config, n,
624
- qk_logits),
625
- )
626
-
627
- return ordered_params, param_to_state, rank, chunk_size
628
-
629
- def parallel(self,
630
- names,
631
- params,
632
- group,
633
- lr,
634
- weight_decay,
635
- qk_logits,
636
- prelaunch_gather=None):
637
- """
638
- Perform a parallel optimization step using Muon.
639
-
640
- Parameters are chunked and each chunk is processed by a
641
- :func:`muon_chunk_pipeline` generator. :func:`run_pipeline`
642
- interleaves multiple chunks so that communication and computation
643
- overlap across chunks (the same overlap previously achieved by the
644
- warmup + main-loop index scheduling).
645
-
646
- If ``prelaunch_gather`` is provided, it is passed to the first
647
- chunk's generator to skip re-launching the already in-flight
648
- A2A gather.
649
- """
650
-
651
- # Momentum is already applied by _step_muon before this method.
652
-
653
- ordered_params, param_to_state, rank, chunk_size = (
654
- self._setup_parallel(names, params, group, qk_logits))
655
-
656
- def pipelines():
657
- first = True
658
- for start in range(0, len(ordered_params), chunk_size):
659
- chunk = ordered_params[start:start + chunk_size]
660
- if chunk:
661
- kwargs = dict(
662
- params=chunk,
663
- param_to_state=param_to_state,
664
- rank=rank,
665
- ns_steps=group["ns_steps"],
666
- lr=lr,
667
- weight_decay=weight_decay,
668
- none_grad=group["none_grad"],
669
- )
670
- if first and prelaunch_gather is not None:
671
- kwargs['prelaunch_gather'] = prelaunch_gather
672
- first = False
673
- yield muon_chunk_pipeline(**kwargs)
674
-
675
- with record_function("muon::pipeline"):
676
- run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
677
-
678
- def _step_muon(self, group, qk_logits=None):
679
- params = group["params"]
680
- lr = group["lr"]
681
- weight_decay = group["weight_decay"]
682
- momentum = group["momentum"]
683
- names = group["names"]
684
-
685
- # Apply momentum to all params before routing/expansion.
686
- # Batched using _foreach_* ops (compiled, fullgraph=True).
687
- with record_function("muon::momentum"):
688
- active_params = [p for p in params if p.grad is not None]
689
- if active_params:
690
- # Ensure momentum buffers exist (avoid zeros_like when already present).
691
- for p in active_params:
692
- if "momentum_buffer" not in self.state[p]:
693
- self.state[p]["momentum_buffer"] = torch.zeros_like(
694
- p.grad)
695
-
696
- # Extract local tensors for compiled batch function.
697
- local_grads = [
698
- p.grad._local_tensor
699
- if isinstance(p.grad, DTensor) else p.grad
700
- for p in active_params
701
- ]
702
- local_bufs = [
703
- self.state[p]["momentum_buffer"]._local_tensor
704
- if isinstance(self.state[p]["momentum_buffer"], DTensor)
705
- else self.state[p]["momentum_buffer"]
706
- for p in active_params
707
- ]
708
-
709
- # Wrap momentum as tensor for torch.compile.
710
- batch_pre_ortho(local_grads, local_bufs,
711
- torch.tensor(momentum), group["nesterov"])
712
-
713
- # For non-nesterov, the result is the momentum buffer.
714
- if not group["nesterov"]:
715
- for p in active_params:
716
- p.grad = self.state[p]["momentum_buffer"]
717
-
718
- # Identify batched experts for deferred NS.
719
- # Detection is cheap (condition checks only); actual NS compute is
720
- # deferred so it can overlap with the first chunk's A2A gather.
721
- deferred_expert_work = []
722
- if self.expert_keys:
723
- batched_expert_indices = []
724
- for i, (n, p) in enumerate(zip(names, params)):
725
- if not (is_expert_param(n, self.expert_keys)
726
- and p.grad is not None):
727
- continue
728
- # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
729
- if isinstance(p.data, DTensor):
730
- has_tp = any(
731
- _is_shard(pl) and pl.dim != 0 for pl in p.placements)
732
- if has_tp:
733
- continue
734
- batched_expert_indices.append(i)
735
-
736
- if batched_expert_indices:
737
- # Save refs for deferred NS; free grads from param list.
738
- for i in batched_expert_indices:
739
- p = params[i]
740
- g = p.grad
741
- local_g = (g._local_tensor
742
- if isinstance(g, DTensor) else g)
743
- local_data = (p.data._local_tensor if isinstance(
744
- p.data, DTensor) else p.data)
745
- deferred_expert_work.append((local_data, local_g))
746
- p.grad = None
747
-
748
- # Remove batched experts from lists before expansion.
749
- keep = sorted(
750
- set(range(len(params))) - set(batched_expert_indices))
751
- names = [names[i] for i in keep]
752
- params = [params[i] for i in keep]
753
-
754
- def _run_deferred_expert_ns():
755
- """Execute deferred batched expert NS."""
756
- if not deferred_expert_work:
757
- return
758
- with record_function("muon::batched_expert_ns"):
759
- ns_steps = group["ns_steps"]
760
- for local_data, local_g in deferred_expert_work:
761
- u = zeropower_via_newtonschulz5_batched(
762
- local_g.to(COMM_DTYPE), steps=ns_steps)
763
- adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
764
- local_data.mul_(1 - lr * weight_decay)
765
- local_data.add_(u, alpha=-adjusted_lr)
766
-
767
- # Expand expert params by splitting on dim 0.
768
- logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
769
- len(params), self.expert_keys)
770
- if self.expert_keys:
771
- cache_key = tuple(id(p) for p in params)
772
- cache = self._expert_expand_cache.get(cache_key)
773
-
774
- if cache is None:
775
- # Cold path: full expansion + build cache metadata.
776
- exp_names, exp_params = _expand_expert_params(
777
- names, params, self.expert_keys)
778
-
779
- # Build per-expert-group info for hot-path grad updates.
780
- grad_info = []
781
- exp_idx = 0
782
- for orig_idx, (n, p) in enumerate(zip(names, params)):
783
- if not is_expert_param(n, self.expert_keys):
784
- exp_idx += 1
785
- continue
786
-
787
- is_dt = isinstance(p.data, DTensor)
788
- num_experts = (p.to_local() if is_dt else p.data).shape[0]
789
-
790
- # Detect TP mesh from the first expanded expert param.
791
- tp_mesh = None
792
- tp_pls = None
793
- sample = exp_params[exp_idx]
794
- if isinstance(sample.data, DTensor):
795
- tp_mesh = sample.data.device_mesh
796
- tp_pls = list(sample.data.placements)
797
-
798
- grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
799
- tp_mesh, tp_pls))
800
- exp_idx += num_experts
801
-
802
- self._expert_expand_cache[cache_key] = {
803
- 'names': exp_names,
804
- 'params': exp_params,
805
- 'grad_info': grad_info,
806
- }
807
- names, params = exp_names, exp_params
808
- else:
809
- # Hot path: reuse cached params, only update expert grads.
810
- for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
811
- tp_pls) in cache['grad_info']:
812
- p = params[orig_idx]
813
- g = p.grad
814
- local_grad = (g.to_local()
815
- if is_dt and isinstance(g, DTensor) else g)
816
- for i in range(num_experts):
817
- expert_p = cache['params'][exp_start + i]
818
- sg = local_grad[i]
819
- if tp_mesh is not None:
820
- expert_p.grad = DTensor.from_local(
821
- sg, device_mesh=tp_mesh, placements=tp_pls)
822
- else:
823
- expert_p.grad = sg
824
- p.grad = None
825
-
826
- names = cache['names']
827
- params = cache['params']
828
- else:
829
- names, params = _expand_expert_params(names, params,
830
- self.expert_keys)
831
- logger.debug("[_step_muon] after expand: %d params", len(params))
832
-
833
- param_dtensors = []
834
- name_dtensors = []
835
-
836
- param_tensors = []
837
- name_tensors = []
838
-
839
- # distributed_muon is a reference implementation for testing only.
840
- # The parallel pipeline (all2all) path below is the production path.
841
- if self.use_distributed_muon:
842
- _run_deferred_expert_ns()
843
- self.distributed_muon(names=names,
844
- params=params,
845
- group=group,
846
- lr=lr,
847
- weight_decay=weight_decay,
848
- qk_logits=qk_logits)
849
- return
850
-
851
- for n, p in zip(names, params):
852
- if p is None or p.grad is None:
853
- continue
854
- if isinstance(p.data, DTensor):
855
- if all(
856
- isinstance(placement, Replicate)
857
- for placement in p.placements):
858
- logger.debug(
859
- "[route] %s → base (DTensor all-Replicate), "
860
- "shape=%s, placements=%s", n, p.shape, p.placements)
861
- param_tensors.append(p)
862
- name_tensors.append(n)
863
- else:
864
- logger.debug(
865
- "[route] %s → parallel (DTensor), shape=%s, "
866
- "placements=%s, mesh=%s", n, p.shape, p.placements,
867
- p.device_mesh.mesh_dim_names)
868
- param_dtensors.append(p)
869
- name_dtensors.append(n)
870
- elif isinstance(p.data, torch.Tensor):
871
- logger.debug("[route] %s → base (plain tensor), shape=%s", n,
872
- p.data.shape)
873
- param_tensors.append(p)
874
- name_tensors.append(n)
875
- else:
876
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
877
-
878
- logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
879
- f"{len(param_tensors)} Tensors → base")
880
-
881
- def group_dtensors(dtensors, names):
882
- # To support different placements, we group parameters by placements
883
- # and run parallel Muon on each group.
884
-
885
- placement_to_params = defaultdict(lambda: ([], []))
886
-
887
- assert len(dtensors) == len(names)
888
- for p, n in zip(dtensors, names):
889
- placement_to_params[tuple([p.placements,
890
- p.device_mesh])][0].append(n)
891
- placement_to_params[tuple([p.placements,
892
- p.device_mesh])][1].append(p)
893
- return placement_to_params
894
-
895
- if len(param_dtensors) > 0:
896
- if not dist.is_initialized():
897
- raise RuntimeError(
898
- "Parallel Muon requires torch.distributed to be initialized."
899
- )
900
-
901
- dtensor_group = group_dtensors(param_dtensors, name_dtensors)
902
-
903
- # Pre-launch the first chunk's A2A gather so that the NCCL
904
- # communication overlaps with the (deferred) batched expert NS
905
- # compute on the default CUDA stream.
906
- prelaunch = None
907
- if deferred_expert_work:
908
- first_names, first_params = next(iter(dtensor_group.values()))
909
- ordered, pts, rnk, csz = self._setup_parallel(
910
- first_names, first_params, group, qk_logits)
911
- first_chunk = ordered[:csz]
912
- if first_chunk:
913
- prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
914
- group["none_grad"])
915
-
916
- _run_deferred_expert_ns()
917
-
918
-             first_group = True
-             for _, (names, params) in dtensor_group.items():
-                 pg = prelaunch if first_group else None
-                 first_group = False
-                 self.parallel(
-                     names,
-                     params,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     qk_logits=qk_logits,
-                     prelaunch_gather=pg,
-                 )
-         else:
-             _run_deferred_expert_ns()
-
-         if len(param_tensors) > 0:
-             self.base(
-                 name_tensors,
-                 param_tensors,
-                 group,
-                 lr=lr,
-                 weight_decay=weight_decay,
-                 qk_logits=qk_logits,
-             )
-
-     def _register_states_for_offload(self):
-         """Register all optimizer state tensors with the CPU offload pool.
-
-         Called once after the first step when states have been lazily created.
-         Offloads all param states (momentum buffers for Muon, moment1/moment2
-         for AdamW) to free GPU memory between steps.
-         """
-         pool = self._cpu_offload_pool
-         tracked = 0
-         for group in self.param_groups:
-             for p in group["params"]:
-                 if p not in self.state:
-                     continue
-                 state = self.state[p]
-                 if group.get("use_muon", False):
-                     if "momentum_buffer" in state:
-                         pool.track(state["momentum_buffer"])
-                         tracked += 1
-                 else:
-                     if "moment1" in state:
-                         pool.track(state["moment1"])
-                     if "moment2" in state:
-                         pool.track(state["moment2"])
-                     tracked += 1
-         logger.info("[CPUOffload] Registered %d param states for offload",
-                     tracked)
-
-     @torch.no_grad
-     def step(self, closure=None, qk_logits=None):
-         """Perform a single optimization step.
-
-         Args:
-             closure (Callable, optional): A closure that reevaluates the model
-                 and returns the loss.
-             qk_logits (dict[int, Tensor], optional): A dictionary mapping layer
-                 indices to 1D tensors of shape (num_heads,), representing the
-                 maximum QK logits across all tokens, computed as
-                 (1 / sqrt(head_dim)) * (Q @ K^T).
-         """
-         loss = None
-         if closure is not None:
-             with torch.enable_grad():
-                 loss = closure()
-
-         # H2D: reload optimizer states from CPU before computation.
-         if self.cpu_offload and self._offload_initialized:
-             self._cpu_offload_pool.reload()
-
-         logger.debug("[Muon.step] expert_keys=%s, %d param groups",
-                      self.expert_keys, len(self.param_groups))
-
-         for i, group in enumerate(self.param_groups):
-             if group["use_muon"]:
-                 logger.debug("[Muon.step] group %d: use_muon=True, %d params",
-                              i, len(group["params"]))
-                 self._step_muon(group, qk_logits=qk_logits)
-             else:
-                 logger.debug(
-                     "[Muon.step] group %d: use_muon=False (AdamW), %d params",
-                     i, len(group["params"]))
-                 step_adamw(self.state, group)
-
-         # D2H: offload optimizer states to CPU after computation.
-         if self.cpu_offload:
-             if not self._offload_initialized:
-                 if self._cpu_offload_pool is None:
-                     self._cpu_offload_pool = CPUOffloadPool()
-                 self._register_states_for_offload()
-                 self._offload_initialized = True
-             self._cpu_offload_pool.offload()
-
-         return loss
-
-     # ------------------------------------------------------------------
-     # CPU offload public helpers
-     # ------------------------------------------------------------------
-
-     def turn_on_cpu_offload(self):
-         """Enable CPU offload for optimizer states."""
-         if self.cpu_offload:
-             return
-         logger.info("[Muon] turn_on_cpu_offload")
-         self.cpu_offload = True
-         if not self.state:
-             return
-         self._cpu_offload_pool = CPUOffloadPool()
-         self._offload_initialized = False
-         self._register_states_for_offload()
-         self._offload_initialized = True
-         self._cpu_offload_pool.offload()
-
-     def turn_off_cpu_offload(self):
-         """Disable CPU offload and keep optimizer states resident on GPU."""
-         if not self.cpu_offload:
-             return
-         logger.info("[Muon] turn_off_cpu_offload")
-         if self._offload_initialized:
-             self._cpu_offload_pool.reload()
-             torch.cuda.current_stream().synchronize()
-         self._cpu_offload_pool = None
-         self._offload_initialized = False
-         self.cpu_offload = False
-
-     # ------------------------------------------------------------------
-     # Checkpoint support for cpu_offload
-     # ------------------------------------------------------------------
-
-     def state_dict(self) -> dict:
-         if self.cpu_offload:
-             raise RuntimeError(
-                 "Muon.state_dict() requires turn_off_cpu_offload() before checkpoint save."
-             )
-         return super().state_dict()
-
-     def load_state_dict(self, state_dict: dict) -> None:
-         if self.cpu_offload:
-             raise RuntimeError(
-                 "Muon.load_state_dict() requires turn_off_cpu_offload() before checkpoint load."
-             )
-         super().load_state_dict(state_dict)
-
-         # Invalidate adamw.py's module-level tensor caches so that
-         # the next step rebuilds them with the newly loaded state tensors.
-         _placement_cache.clear()
-         _tensor_cache.clear()
 
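The `state_dict()` / `load_state_dict()` guards above imply a specific checkpoint sequence when offload is active. A minimal sketch, assuming `optimizer` is a Muon instance with CPU offload enabled (the path and step count are illustrative):

optimizer.turn_off_cpu_offload()   # reload states to GPU, drop the pool
torch.save({"optim": optimizer.state_dict()}, "step_1000.pt")
optimizer.turn_on_cpu_offload()    # re-register states and offload again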
build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py DELETED
@@ -1,240 +0,0 @@
- from itertools import repeat
- from math import inf, sqrt
-
- import numpy as np
- import torch
-
- from .matmul_transpose_triton import matmul_transpose_assign
-
- COMM_DTYPE = torch.bfloat16
- DEFAULT_CHUNK_SIZE_RATIO = 4
-
-
- def _optimal_quintic(l, u, max_iter=1000):
-     """
-     Use the simplified Remez algorithm to find the optimal odd quintic approximant
-     to the constant function x -> 1 over the interval [l, u].
-
-     Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
-     approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
-     two interior equioscillation nodes q, r until convergence. Returns the
-     closed-form equioscillating solution when l ≈ u.
-
-     Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
-     (NaN or inf). Raises RuntimeError if convergence is not reached within
-     max_iter iterations.
-     """
-     assert 0 <= l <= u
-     if 1 - 5e-6 <= l / u:
-         return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
-     q = (3 * l + u) / 4
-     r = (l + 3 * u) / 4
-     E = inf
-     for _ in range(max_iter):
-         old_E = E
-         LHS = np.array(
-             [
-                 [l, l**3, l**5, 1],
-                 [q, q**3, q**5, -1],
-                 [r, r**3, r**5, 1],
-                 [u, u**3, u**5, -1],
-             ]
-         )
-         a, b, c, E = np.linalg.solve(LHS, np.ones(4))
-         if not np.all(np.isfinite([a, b, c, E])):
-             raise ValueError(
-                 f"_optimal_quintic: non-finite solve result a={a}, b={b}, c={c}, E={E}"
-             )
-         q, r = np.sqrt(
-             (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) / (10 * c)
-         )
-         if not np.all(np.isfinite([q, r])):
-             raise ValueError(f"_optimal_quintic: non-finite node update q={q}, r={r}")
-         if abs(old_E - E) <= 1e-15:
-             break
-     else:
-         raise RuntimeError(
-             f"_optimal_quintic: did not converge after {max_iter} iterations"
-         )
-     return float(a), float(b), float(c)
-
-
- def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
-     """
-     Compute the Polar Express coefficient series for `num_iters` quintic iterations.
-
-     Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
-     compose to map singular values from [l, 1] toward 1. At each step:
-       1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
-          prevents near-zero singular values from stalling by raising the effective
-          lower bound; if it is active (cushion*u > l), the coefficients are
-          rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
-       2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
-          last iteration, providing numerical headroom at the cost of a slightly slower
-          final convergence step.
-       3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).
-
-     Returns a list of (a, b, c) tuples, one per iteration.
-
-     Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
-     Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
-     """
-     u = 1
-     assert 0 <= l <= u
-     safety_factor = 1 + safety_factor_eps
-     coefficients = []
-     for iter in range(num_iters):
-         a, b, c = _optimal_quintic(max(l, cushion * u), u)
-         if cushion * u > l:
-             pl = a * l + b * l**3 + c * l**5
-             pu = a * u + b * u**3 + c * u**5
-             rescaler = 2 / (pl + pu)
-             a *= rescaler
-             b *= rescaler
-             c *= rescaler
-         if iter < num_iters - 1:
-             a /= safety_factor
-             b /= safety_factor**3
-             c /= safety_factor**5
-         coefficients.append((a, b, c))
-         l = a * l + b * l**3 + c * l**5
-         u = 2 - l
-     return coefficients
-
-
- # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
- # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
- # approximant to x->1 over the current singular-value interval, computed once at
- # import time and reused across all optimizer steps.
- #
- # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
- #   - Former: empirically tuned to maximize slope at zero; did not converge
- #     singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
- #     of the true polar factor UV^T.
- #   - Polar Express: analytically optimal per step, adapting to the shrinking
- #     singular-value interval [l, u] as iterations progress; converges all
- #     singular values to 1, producing the exact polar factor UV^T.
- _coeffs_list = _optimal_composition(
-     l=1e-3, num_iters=10, safety_factor_eps=1e-2, cushion=0.02
- )
-
-
- # This code is adapted from:
- #   KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
- #   NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
- #   matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
- @torch.no_grad()
- def _zeropower_via_newtonschulz5(G, steps):
-     """
-     Compute the polar factor of G via the Polar Express method.
-
-     Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
-     are the Polar Express coefficients from `_coeffs_list`. Each step is the
-     optimal odd quintic approximant to x -> 1 over the current singular-value
-     interval, minimizing the maximum approximation error (Remez / minimax
-     criterion). The composition maps singular values from [l, 1] to near 1,
-     producing the polar factor (orthogonal factor in the polar decomposition
-     G = UP).
-
-     `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
-     cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
-
-     Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
-     Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
-     """
-     assert len(G.shape) == 2
-     assert G.dtype == COMM_DTYPE
-     X = G  # no manual typecast
-
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     X = X / (X.norm() + 1e-7)
-     hs = _coeffs_list[:steps] + list(
-         repeat(_coeffs_list[-1], steps - len(_coeffs_list))
-     )
-     buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-     buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
-     # Perform the NS iterations
-     for a, b, c in hs:
-         matmul_transpose_assign(X, buf1)
-         matmul_transpose_assign(buf1, buf2)
-         buf1.mul_(b).add_(buf2, alpha=c)
-         X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
-
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     return X
-
-
- @torch.no_grad()
- def _zeropower_via_newtonschulz5_batched(G, steps):
-     """Batched polar factor computation for 3D (E, out, in) tensors.
-
-     Same algorithm as ``_zeropower_via_newtonschulz5`` but uses
-     ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
-     processing all E expert matrices in a single batched call.
-     """
-     assert len(G.shape) == 3
-     assert G.dtype == COMM_DTYPE
-     X = G
-
-     if G.size(1) > G.size(2):
-         X = X.transpose(-2, -1)
-
-     # Per-expert Frobenius norm.
-     X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
-
-     hs = _coeffs_list[:steps] + list(
-         repeat(_coeffs_list[-1], steps - len(_coeffs_list))
-     )
-     for a, b, c in hs:
-         buf1 = torch.bmm(X, X.transpose(-2, -1))
-         buf2 = torch.bmm(buf1, buf1.transpose(-2, -1))
-         buf1.mul_(b).add_(buf2, alpha=c)
-         X = torch.baddbmm(X, buf1, X, alpha=1.0, beta=a)
-
-     if G.size(1) > G.size(2):
-         X = X.transpose(-2, -1)
-
-     return X
-
-
- _ns_per_shape: dict[tuple[int, ...], callable] = {}
- _use_compile = True
-
-
- def set_ns_compile(enabled: bool):
-     """Toggle torch.compile for Newton-Schulz iteration."""
-     global _use_compile
-     _use_compile = enabled
-
-
- def zeropower_via_newtonschulz5(G, steps=5):
-     if not _use_compile:
-         return _zeropower_via_newtonschulz5(G, steps)
-     key = G.shape
-     if key not in _ns_per_shape:
-         _ns_per_shape[key] = torch.compile(_zeropower_via_newtonschulz5,
-                                            options={
-                                                "triton.cudagraphs": True,
-                                                "shape_padding": False
-                                            })
-     torch.compiler.cudagraph_mark_step_begin()
-     return _ns_per_shape[key](G, steps).clone()
-
-
- def zeropower_via_newtonschulz5_batched(G, steps=5):
-     """Compile-cached batched Newton-Schulz for 3D expert tensors."""
-     if not _use_compile:
-         return _zeropower_via_newtonschulz5_batched(G, steps)
-     key = G.shape
-     if key not in _ns_per_shape:
-         _ns_per_shape[key] = torch.compile(
-             _zeropower_via_newtonschulz5_batched,
-             options={
-                 "triton.cudagraphs": True,
-                 "shape_padding": False
-             })
-     torch.compiler.cudagraph_mark_step_begin()
-     return _ns_per_shape[key](G, steps).clone()
 
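For orientation, a minimal sketch of calling the orthogonalizer directly. The shape is illustrative; the 2D bfloat16 requirement comes from the assertions above, and even the non-compiled path needs a CUDA device plus this package's Triton `matmul_transpose_assign` kernel:

import torch

set_ns_compile(False)   # one-off call; skip the per-shape torch.compile cache
G = torch.randn(1024, 4096, device="cuda", dtype=torch.bfloat16)
U = zeropower_via_newtonschulz5(G, steps=5)
# U approximates the polar factor of G, so U @ U.T is close to the identity.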
build/torch210-cxx11-cu130-x86_64-linux/optimizer/__init__.py DELETED
@@ -1,26 +0,0 @@
- import ctypes
- import sys
-
- import importlib.util  # bare "import importlib" does not expose importlib.util
- from pathlib import Path
- from types import ModuleType
-
- def _import_from_path(file_path: Path) -> ModuleType:
-     # We cannot use the module name as-is: after adding it to `sys.modules`,
-     # it would also be used for other imports. So, we make a module name that
-     # depends on the path for it to be unique, using the hex-encoded hash of
-     # the path.
-     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-     module_name = path_hash
-     spec = importlib.util.spec_from_file_location(module_name, file_path)
-     if spec is None:
-         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-     module = importlib.util.module_from_spec(spec)
-     if module is None:
-         raise ImportError(f"Cannot load module {module_name} from spec")
-     sys.modules[module_name] = module
-     spec.loader.exec_module(module)  # type: ignore
-     return module
-
-
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
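The net effect is that importing the `optimizer` subpackage re-exports whatever the parent package's `__init__.py` defines, loaded under a path-hashed module name so `sys.modules` stays unpolluted. A hedged sketch of a consumer (whether `Muon` is among the parent's exports is an assumption here):

# Hypothetical consumer; names resolve to symbols defined in ../__init__.py.
from optimizer import Muon  # assumption: the parent package exports Muon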
build/torch210-cxx11-cu130-x86_64-linux/pipeline.py DELETED
@@ -1,468 +0,0 @@
- import logging
- from typing import Generator
-
- import torch
- import torch.distributed as dist
- from torch.distributed.tensor import DTensor
- from torch.profiler import record_function
-
- from .core import _muon_state, adjust_lr_for_muon
- from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
- from .qk_clip import compute_scales
-
- logger = logging.getLogger(__name__)
-
- # ======================================================================
- # Stage helpers
- # ======================================================================
-
-
- def _launch_gather(
-     params: list[DTensor],
-     owned_params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     num_ranks: int,
-     process_group: dist.ProcessGroup,
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-     """Allocate gather buffers, build send/recv, and launch async all-to-all.
-
-     Returns:
-         work: Async operation handle.
-         recv_buf: Flat receive buffer (needed by ``_complete_gather``).
-         gathered_grads: ``{id(p): empty_tensor}`` for owned params,
-             ``None`` for non-owned.
-         recv_counts: Per-source-rank element counts.
-     """
-     # Allocate gathered-grad buffers
-     gathered_grads: dict[int, torch.Tensor | None] = {}
-     for p in params:
-         state = param_to_state[id(p)]
-         if rank == state.worker_rank:
-             gathered_grads[id(p)] = torch.empty(p.shape,
-                                                 dtype=COMM_DTYPE,
-                                                 device="cuda")
-         else:
-             gathered_grads[id(p)] = None
-
-     # Build send buffer – batch grad copies via torch.cat
-     # (1-2 fused kernels vs N individual narrow().copy_() calls).
-     send_counts = [0] * num_ranks
-     for p in params:
-         state = param_to_state[id(p)]
-         send_counts[state.worker_rank] += state.rank_numels[rank]
-
-     total_send = sum(send_counts)
-     if total_send > 0:
-         # Group grad slices by destination rank in a single pass.
-         dst_to_grads = [[] for _ in range(num_ranks)]
-         for p in params:
-             state = param_to_state[id(p)]
-             n = state.rank_numels[rank]
-             if n > 0:
-                 g = p.grad.to_local()
-                 dst_to_grads[state.worker_rank].append(g.reshape(-1))
-
-         # Flatten in dst order and cat once.
-         all_slices = []
-         for dst in range(num_ranks):
-             all_slices.extend(dst_to_grads[dst])
-         send_buf = torch.cat(all_slices)
-         if send_buf.dtype != COMM_DTYPE:
-             send_buf = send_buf.to(COMM_DTYPE)
-     else:
-         send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-     # Build recv buffer
-     recv_counts = [0] * num_ranks
-     for src in range(num_ranks):
-         total = 0
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             assert state.worker_rank == rank
-             total += state.rank_numels[src]
-         recv_counts[src] = total
-
-     recv_buf = torch.empty(sum(recv_counts), dtype=COMM_DTYPE, device="cuda")
-
-     # Launch async all-to-all
-     logger.debug(f"send_buf size: {send_buf.numel()}, "
-                  f"recv_buf size: {recv_buf.numel()}, "
-                  f"recv_counts: {recv_counts}, "
-                  f"send_counts: {send_counts}, "
-                  f"process_group: {str(process_group)}")
-     work = dist.all_to_all_single(
-         recv_buf,
-         send_buf,
-         output_split_sizes=recv_counts,
-         input_split_sizes=send_counts,
-         group=process_group,
-         async_op=True,
-     )
-
-     return work, recv_buf, gathered_grads, recv_counts
-
-
- def _complete_gather(
-     recv_buf: torch.Tensor,
-     recv_counts: list[int],
-     owned_params: list[DTensor],
-     gathered_grads: dict[int, torch.Tensor | None],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
- ) -> None:
-     """Reconstruct gathered grads from the recv buffer (in-place)."""
-     off = 0
-     for src in range(len(recv_counts)):
-         if recv_counts[src] == 0:
-             continue
-
-         block = recv_counts[src]
-         inner_off = 0
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             assert state.worker_rank == rank
-
-             indices = state.rank_indices[src]
-
-             shard_view = gathered_grads[id(p)][indices]
-             n = shard_view.numel()
-             if n == 0:
-                 continue
-
-             sg = recv_buf.narrow(0, off + inner_off, n)
-             sg = sg.reshape(shard_view.shape)
-             gathered_grads[id(p)][indices] = sg
-
-             inner_off += n
-         assert inner_off == block
-         off += block
-
-
- def _compute_ns(
-     owned_params: list[DTensor],
-     gathered_grads: dict[int, torch.Tensor | None],
-     ns_steps: int,
- ) -> dict[int, torch.Tensor | None]:
-     """Run Newton-Schulz orthogonalization on owned parameters.
-
-     Returns:
-         computed_us: ``{id(p): orthogonalized_update}`` for owned params.
-     """
-     computed_us: dict[int, torch.Tensor | None] = {}
-     for p in owned_params:
-         u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
-         gathered_grads[id(p)] = None  # free gathered grad
-         computed_us[id(p)] = u
-     return computed_us
-
-
- def _launch_scatter(
-     params: list[DTensor],
-     owned_params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     num_ranks: int,
-     process_group: dist.ProcessGroup,
-     computed_us: dict[int, torch.Tensor | None],
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor], list[int]]:
-     """Allocate scatter buffers, build send/recv, and launch async all-to-all.
-
-     Returns:
-         work: Async operation handle.
-         recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
-         scattered_us: Empty dict, populated by ``_complete_scatter`` with
-             zero-copy views into ``recv_buf``.
-         recv_counts: Per-source-rank element counts.
-     """
-     # scattered_us is populated by _complete_scatter with zero-copy views
-     # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
-     # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
-     # so _update_params can iterate all params without KeyError.
-     scattered_us: dict[int, torch.Tensor] = {}
-     for p in params:
-         if param_to_state[id(p)].rank_numels[rank] == 0:
-             scattered_us[id(p)] = torch.empty_like(p.to_local(),
-                                                    dtype=COMM_DTYPE)
-
-     # Build send buffer – batch via torch.cat
-     # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
-     send_counts = [0] * num_ranks
-     if owned_params:
-         for p in owned_params:
-             state = param_to_state[id(p)]
-             for dst_rank in range(num_ranks):
-                 send_counts[dst_rank] += state.rank_numels[dst_rank]
-
-     total_send = sum(send_counts)
-     if total_send > 0:
-         # Cache u_full conversions to avoid redundant .to() per dst_rank.
-         u_fulls = {}
-         for p in owned_params:
-             u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
-
-         # Collect slices in dst order (matches all-to-all send layout).
-         all_slices = []
-         for dst_rank in range(num_ranks):
-             for p in owned_params:
-                 state = param_to_state[id(p)]
-                 su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
-                 if su.numel() > 0:
-                     all_slices.append(su)
-
-         send_buf = torch.cat(all_slices) if all_slices else torch.empty(
-             0, dtype=COMM_DTYPE, device="cuda")
-     else:
-         send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
-
-     # Build recv buffer
-     recv_counts = [0] * num_ranks
-     for src in range(num_ranks):
-         total = 0
-         for p in params:
-             state = param_to_state[id(p)]
-             if state.worker_rank != src:
-                 continue
-             total += state.rank_numels[rank]
-         recv_counts[src] = total
-
-     recv_total = sum(recv_counts)
-     recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
-
-     # Launch async all-to-all
-     work = dist.all_to_all_single(
-         recv_buf,
-         send_buf,
-         output_split_sizes=recv_counts,
-         input_split_sizes=send_counts,
-         group=process_group,
-         async_op=True,
-     )
-
-     return work, recv_buf, scattered_us, recv_counts
-
-
- def _complete_scatter(
-     recv_buf: torch.Tensor,
-     recv_counts: list[int],
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     scattered_us: dict[int, torch.Tensor],
- ) -> None:
-     """Populate scattered_us with zero-copy views into recv_buf.
-
-     Instead of pre-allocating tensors and copying, we assign views directly
-     from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
-     The underlying storage of ``recv_buf`` is kept alive through the views
-     until ``scattered_us`` is cleared after ``_update_params``.
-     """
-     off = 0
-     for src in range(len(recv_counts)):
-         block = recv_counts[src]
-         if block == 0:
-             continue
-
-         inner_off = 0
-         for p in params:
-             state = param_to_state[id(p)]
-             if state.worker_rank != src:
-                 continue
-             n = state.rank_numels[rank]
-             if n == 0:
-                 continue
-
-             scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
-                                                   n).view_as(p.to_local())
-
-             inner_off += n
-
-         assert inner_off == block
-         off += block
-
-
- def _update_params(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     scattered_us: dict[int, torch.Tensor],
-     lr: float,
-     weight_decay: float,
- ) -> None:
-     """Apply weight decay, Muon update, and optional QK clipping.
-
-     Uses batched ``_foreach_mul_`` for weight decay and batched
-     ``_foreach_add_`` for the Muon update, grouping parameters by
-     adjusted_lr to minimize kernel launches while preserving float32
-     precision for the alpha scaling.
-     """
-     if not params:
-         return
-
-     # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
-     p_locals = [p._local_tensor for p in params]
-     torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
-
-     # Group params by adjusted_lr so _foreach_add_ can use a single
-     # alpha per group (preserves float32 precision for alpha scaling).
-     lr_groups: dict[float, tuple[list, list]] = {}
-     for p in params:
-         adjusted_lr = adjust_lr_for_muon(lr, p.shape)
-         if adjusted_lr not in lr_groups:
-             lr_groups[adjusted_lr] = ([], [])
-         lr_groups[adjusted_lr][0].append(p._local_tensor)
-         lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
-
-     for adjusted_lr, (p_group, u_group) in lr_groups.items():
-         torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
-
-     # QK clipping – applied directly on the local tensor to
-     # avoid DTensor sharding-propagation issues with _StridedShard.
-     for p in params:
-         state = param_to_state[id(p)]
-         if state.qk_clip_state is None:
-             continue
-         scales_full = compute_scales(p, state.qk_clip_state)
-         if scales_full is not None:
-             ratio = p.shape[0] // scales_full.shape[0]
-             idx0 = state.rank_indices[rank][0]
-             if isinstance(idx0, slice):
-                 start = idx0.start or 0
-                 idx0 = torch.arange(start,
-                                     idx0.stop,
-                                     device=scales_full.device)
-             row_scales = scales_full[idx0 // ratio]
-             p._local_tensor.mul_(row_scales.view(-1, 1))
-
-
- # ======================================================================
- # Pre-launch helper for overlapping first chunk's gather with other work.
- # ======================================================================
-
-
- @torch.no_grad()
- def prelaunch_first_gather(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     none_grad: bool,
- ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
-     """Launch the first chunk's A2A gather early for overlap with other compute.
-
-     Call this *before* expensive GPU work (e.g. batched expert NS) so that
-     the NCCL all-to-all runs concurrently on the NCCL stream while the
-     default stream executes compute.
-
-     Returns the same 4-tuple that ``_launch_gather`` produces, which should
-     be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
-     """
-     process_group = param_to_state[id(params[0])].process_group
-     num_ranks = dist.get_world_size(group=process_group)
-     owned_params = [
-         p for p in params if param_to_state[id(p)].worker_rank == rank
-     ]
-
-     with record_function("muon::prelaunch_gather"):
-         work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-             params, owned_params, param_to_state, rank, num_ranks,
-             process_group)
-
-     if none_grad:
-         for p in params:
-             p.grad = None
-
-     return work, recv_buf, gathered_grads, recv_counts
-
-
- # ======================================================================
- # Main generator – thin orchestrator that wires stages together.
- # ======================================================================
-
-
- @torch.no_grad()
- def muon_chunk_pipeline(
-     params: list[DTensor],
-     param_to_state: dict[int, _muon_state],
-     rank: int,
-     ns_steps: int,
-     lr: float,
-     weight_decay: float,
-     none_grad: bool,
-     prelaunch_gather: tuple | None = None,
- ) -> Generator[None, None, None]:
-     """Process one chunk of parameters through the full Muon pipeline.
-
-     Stages: gather -> compute (Newton-Schulz) -> scatter -> update.
-
-     Each ``yield`` lets :func:`run_pipeline` interleave other chunks so
-     that communication and computation overlap across chunks. Async
-     communication is launched via ``async_op=True`` and completed after
-     the yield with ``work.wait()``.
-
-     Overlap happens because :func:`run_pipeline` admits one new chunk
-     per iteration (staggered admission). While chunk *N* does NS
-     compute on the default CUDA stream, chunk *N+1*'s async all-to-all
-     runs concurrently on the NCCL stream — no separate ``comm_stream``
-     is required.
-
-     If ``prelaunch_gather`` is provided, the gather was already launched
-     by :func:`prelaunch_first_gather` and we skip launching it again.
-
-     Yields exactly **2** times:
-
-     1. After launching async all-to-all gather (or immediately if pre-launched).
-     2. After launching async all-to-all scatter.
-     """
-     process_group = param_to_state[id(params[0])].process_group
-     num_ranks = dist.get_world_size(group=process_group)
-     owned_params = [
-         p for p in params if param_to_state[id(p)].worker_rank == rank
-     ]
-
-     if prelaunch_gather is not None:
-         # Gather was pre-launched; none_grad already handled by caller.
-         work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
-     else:
-         # Normal path: launch async gather.
-         with record_function("muon::launch_gather"):
-             work, recv_buf, gathered_grads, recv_counts = _launch_gather(
-                 params, owned_params, param_to_state, rank, num_ranks,
-                 process_group)
-
-         if none_grad:
-             for p in params:
-                 p.grad = None
-
-     yield  # --- YIELD 1: other chunks can launch their gather ---
-
-     with record_function("muon::wait_gather"):
-         work.wait()
-         _complete_gather(recv_buf, recv_counts, owned_params, gathered_grads,
-                          param_to_state, rank)
-         del recv_buf
-
-     # Stage 3: Newton-Schulz orthogonalization.
-     with record_function("muon::newton_schulz"):
-         computed_us = _compute_ns(owned_params, gathered_grads, ns_steps)
-         gathered_grads.clear()
-
-     # Stages 4-5: launch async scatter.
-     with record_function("muon::launch_scatter"):
-         work, recv_buf, scattered_us, recv_counts = _launch_scatter(
-             params, owned_params, param_to_state, rank, num_ranks,
-             process_group, computed_us)
-         computed_us.clear()
-
-     yield  # --- YIELD 2: other chunks can launch their scatter ---
-
-     with record_function("muon::wait_scatter"):
-         work.wait()
-         _complete_scatter(recv_buf, recv_counts, params, param_to_state, rank,
-                           scattered_us)
-         del recv_buf
-
-     # Stage 6: apply parameter updates.
-     with record_function("muon::update_params"):
-         _update_params(params, param_to_state, rank, scattered_us, lr,
-                        weight_decay)
-         scattered_us.clear()
 
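A hedged sketch of how a caller drives the two-yield pipeline. `chunks_of_params` and the fully populated `param_to_state` map are assumed to be built by the optimizer in muon.py, and `run_pipeline` is the scheduler from async_utils.py:

def gens():
    for chunk in chunks_of_params:          # hypothetical pre-built chunks
        yield muon_chunk_pipeline(chunk, param_to_state, rank,
                                  ns_steps=5, lr=2e-2, weight_decay=0.1,
                                  none_grad=True)

run_pipeline(gens(), max_concurrent=4)      # staggered admission overlaps A2A with NS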
build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py DELETED
@@ -1,198 +0,0 @@
- import logging
- import math
- from dataclasses import dataclass
-
- import torch
- from torch.distributed.tensor import DTensor
-
- from .core import normalize_fqn
-
- logger = logging.getLogger(__name__)
-
-
- def parse_qk_layer(name: str) -> tuple[str | None, int]:
-     """
-     Parse a parameter name to check if it is a query/key projection layer
-     and return (kind, layer_index).
-
-     Supported kinds:
-         MHA/GQA: 'wq', 'wk', 'q_proj', 'k_proj'
-         MLA: 'wq_b' (Q up-proj), 'wkv_b' (KV up-proj)
-
-     Returns:
-         (kind, layer_idx) or (None, -1) if not matched.
-
-     Example:
-         'model.3.attn.wq.weight' -> ('wq', 3)
-         'model.5.attn.wk.weight' -> ('wk', 5)
-         'model.2.attn.q_proj.weight' -> ('q_proj', 2)
-         'model.7.attn.k_proj.weight' -> ('k_proj', 7)
-         'model.1.attn.wq_b.weight' -> ('wq_b', 1)
-         'model.0.attn.wkv_b.weight' -> ('wkv_b', 0)
-         'model.4.attn.v_proj.weight' -> (None, -1)
-     """
-     parts = normalize_fqn(name).split('.')
-     if len(parts) < 3:
-         return None, -1
-
-     kind = parts[-2]
-
-     layer_idx = -1
-     for part in reversed(parts):
-         if part.isdigit():
-             layer_idx = int(part)
-             break
-
-     if kind in ('wq', 'wk', 'q_proj', 'k_proj', 'wq_b', 'wkv_b'):
-         return kind, layer_idx
-
-     return None, -1
-
-
- @dataclass
- class QKClipInfo:
-     """Per-parameter dynamic info computed from config + runtime logits."""
-     kind: str | None  # 'wq'/'q_proj'/'wq_b' or 'wk'/'k_proj'/'wkv_b' or None
-     indices: list[int]  # which heads to consider for clipping
-     head_dim: int  # from config (qk_head_dim for MLA wq_b)
-     threshold: float  # from config
-     logit: torch.Tensor | None
-
-     # MLA-specific fields
-     is_mla: bool = False
-     qk_nope_head_dim: int = 0
-     qk_rope_head_dim: int = 0
-     v_head_dim: int = 0
-
-
- def get_qk_clip_info(clip_config, n, qk_logits):
-     """Extract QK clipping info for a named parameter.
-
-     Args:
-         clip_config: QK clipping configuration dict (or None).
-             MHA/GQA keys: head_dim, threshold, q_indices, k_indices
-             MLA extra keys: is_mla=True, qk_nope_head_dim, qk_rope_head_dim, v_head_dim
-         n: Parameter name string.
-         qk_logits: Dict mapping layer indices to logit tensors (or None).
-
-     Returns:
-         QKClipInfo instance with clipping configuration for this parameter.
-     """
-     if clip_config is None:
-         return None
-
-     head_dim = clip_config.get('head_dim')
-     threshold = clip_config.get('threshold')
-     kind, layer_idx = parse_qk_layer(n)
-     is_mla = clip_config.get('is_mla', False)
-
-     logit, indices = None, []
-     if qk_logits is not None and kind is not None:
-         logit = qk_logits[layer_idx]
-         if isinstance(logit, DTensor):
-             # In TP settings, qk_logits may be DTensor.
-             # We convert it to a full tensor here for simplicity.
-             logit = logit.full_tensor()
-
-         if kind in ('wq_b', 'wq', 'q_proj'):
-             indices = clip_config.get('q_indices', []) or []
-         elif kind in ('wkv_b', 'wk', 'k_proj'):
-             indices = clip_config.get('k_indices', []) or []
-
-     if is_mla:
-         return QKClipInfo(
-             kind=kind,
-             indices=indices,
-             head_dim=head_dim,
-             threshold=threshold,
-             logit=logit,
-             is_mla=True,
-             qk_nope_head_dim=clip_config['qk_nope_head_dim'],
-             qk_rope_head_dim=clip_config['qk_rope_head_dim'],
-             v_head_dim=clip_config['v_head_dim'],
-         )
-     else:
-         return QKClipInfo(
-             kind=kind,
-             indices=indices,
-             head_dim=head_dim,
-             threshold=threshold,
-             logit=logit,
-         )
-
-
- def compute_scales(p, qk_clip_state):
-     """Compute per-head scaling factors for QK clipping.
-
-     Returns scales tensor (√γ per head) if any head exceeds threshold, else None.
-     For MLA wkv_b, effective row stride is qk_nope_head_dim + v_head_dim.
-     """
-     kind = qk_clip_state.kind
-     indices = qk_clip_state.indices
-     head_dim = qk_clip_state.head_dim
-     threshold = qk_clip_state.threshold
-     logit = qk_clip_state.logit
-
-     # Check if any head exceeds threshold before allocating.
-     head_scales = {}
-     for logit_idx, head_idx in enumerate(indices):
-         v_ele = float(logit[logit_idx])
-         if v_ele > threshold:
-             new_scale = math.sqrt(threshold / v_ele)
-             if head_idx not in head_scales or new_scale < head_scales[head_idx]:
-                 head_scales[head_idx] = new_scale
-                 logger.info(
-                     f"[{kind}] Head {head_idx} exceeded threshold "
-                     f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                 )
-
-     if not head_scales:
-         return None
-
-     # For MLA wkv_b, each KV head spans qk_nope_head_dim + v_head_dim rows
-     if qk_clip_state.is_mla and kind == 'wkv_b':
-         effective_head_dim = qk_clip_state.qk_nope_head_dim + qk_clip_state.v_head_dim
-     else:
-         effective_head_dim = head_dim
-
-     H_global = p.shape[0] // effective_head_dim
-     scales_full = torch.ones(H_global, device=p.data.device)
-     for head_idx, scale in head_scales.items():
-         scales_full[head_idx] = scale
-     return scales_full
-
-
- def qk_clip(p, scales, info):
-     """Apply per-head scaling to a Q/K projection weight matrix.
-
-     Args:
-         p: Parameter (nn.Parameter or raw tensor).
-         scales: [n_heads] tensor, each element = √γ_h.
-         info: QKClipInfo with kind, head_dim, and MLA sub-head dimensions.
-
-     MLA sub-region scaling per Algorithm 1 (MuonClip):
-         wq_b: q_nope rows → √γ, q_pe rows → γ
-         wkv_b: k_nope rows → √γ, v rows → unchanged
-     """
-     W = p.data if isinstance(p, torch.nn.Parameter) else p
-
-     if not info.is_mla:
-         # MHA/GQA: uniform √γ applied to all rows in each head
-         W.view(-1, info.head_dim, W.shape[1]).mul_(scales.view(-1, 1, 1))
-         return
-
-     # MLA: vectorized sub-region scaling within each head
-     if info.kind == 'wq_b':
-         qk_nope = info.qk_nope_head_dim
-         qk_head_dim = qk_nope + info.qk_rope_head_dim
-         W_3d = W.view(-1, qk_head_dim, W.shape[1])  # [H, qk_head_dim, in_dim]
-         W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1))  # q_nope → √γ
-         W_3d[:, qk_nope:, :].mul_((scales * scales).view(-1, 1, 1))  # q_pe → γ
-
-     elif info.kind == 'wkv_b':
-         qk_nope = info.qk_nope_head_dim
-         kv_stride = qk_nope + info.v_head_dim
-         W_3d = W.view(-1, kv_stride, W.shape[1])  # [H, kv_stride, in_dim]
-         W_3d[:, :qk_nope, :].mul_(scales.view(-1, 1, 1))  # k_nope → √γ
-         # v rows: not touched (k_R shared rotary unchanged)
 
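A worked example of the clipping rule in compute_scales (numbers are illustrative): a head whose max QK logit is 144 against a threshold of 100 gets γ = 100/144, and each of Q and K is scaled by √γ, so the product Q @ K^T comes down by exactly γ:

import math

v_ele, threshold = 144.0, 100.0
scale = math.sqrt(threshold / v_ele)            # √γ ≈ 0.8333 applied per side
assert abs(scale * scale * v_ele - threshold) < 1e-9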
build/torch210-cxx11-rocm70-x86_64-linux/adamw.py DELETED
@@ -1,271 +0,0 @@
- import logging
- from collections import defaultdict
- from typing import cast
-
- import torch
- from torch.distributed.tensor import DTensor
- from torch.profiler import record_function
-
- logger = logging.getLogger(__name__)
-
-
- def fused_adamw(
-     params: list[torch.Tensor],
-     grads: list[torch.Tensor],
-     exp_avgs: list[torch.Tensor],
-     exp_avg_sqs: list[torch.Tensor],
-     max_exp_avg_sqs: list[torch.Tensor],
-     state_steps: list[torch.Tensor],
-     amsgrad: bool,
-     beta1: float,
-     beta2: float,
-     lr: float | torch.Tensor,
-     weight_decay: float,
-     eps: float,
-     maximize: bool,
- ) -> None:
-     if not params:
-         return
-
-     # We only shuffle the lr around when it is a Tensor on a non-CPU device;
-     # otherwise we prefer treating it as a scalar.
-     lr_dict: dict | None = ({
-         lr.device: lr
-     } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else None)
-     grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
-         [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
-          state_steps]  # type: ignore[list-item]
-     )
-     for (device, _), (
-         (
-             device_params_,
-             device_grads_,
-             device_exp_avgs_,
-             device_exp_avg_sqs_,
-             device_max_exp_avg_sqs,
-             device_state_steps_,
-         ),
-         _,
-     ) in grouped_tensors.items():
-         device_params = cast(list[torch.Tensor], device_params_)
-         device_grads = cast(list[torch.Tensor], device_grads_)
-         device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
-         device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
-         device_state_steps = cast(list[torch.Tensor], device_state_steps_)
-
-         if lr_dict is not None and device not in lr_dict:
-             lr_dict[device] = lr.to(
-                 device=device, non_blocking=True)  # type: ignore[union-attr]
-             lr = lr_dict[device]
-         torch._foreach_add_(device_state_steps, 1)
-         func = torch._fused_adamw_
-         func(
-             device_params,
-             device_grads,
-             device_exp_avgs,
-             device_exp_avg_sqs,
-             device_max_exp_avg_sqs,  # type: ignore[arg-type]
-             device_state_steps,
-             amsgrad=amsgrad,
-             lr=lr,  # type: ignore[arg-type]
-             beta1=beta1,
-             beta2=beta2,
-             weight_decay=weight_decay,
-             eps=eps,
-             maximize=maximize,
-         )
-
-
- def _to_local(t):
-     """Unwrap DTensor to local tensor for fused ops."""
-     return t._local_tensor if isinstance(t, DTensor) else t
-
-
- # ---------------------------------------------------------------------------
- # Caches for eliminating per-step Python overhead.
- #
- # Placement grouping and tensor list assembly are identical every step
- # (params don't change placement, moment/step tensors are the same objects
- # after initialisation). We cache them keyed by id() of the param list
- # stored in param_groups (stable across steps).
- #
- # Only gradients change each step and must be collected fresh.
- # ---------------------------------------------------------------------------
-
- # id(group["params"]) → dict[placement_key, list[param]]
- _placement_cache: dict[int, dict[tuple, list]] = {}
-
- # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
- _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
-
-
- def _step_adamw_params_slow(optimizer_state, params, group):
-     """Uncached fallback for the rare case where some params lack grads."""
-     params_with_grads = []
-     grads = []
-     moment1 = []
-     moment2 = []
-     state_steps = []
-
-     for p in params:
-         g = p.grad
-         if g is None:
-             continue
-         state = optimizer_state[p]
-         params_with_grads.append(_to_local(p))
-         grads.append(_to_local(g))
-         if "step" not in state:
-             state["step"] = torch.zeros((),
-                                         dtype=torch.float32,
-                                         device=p.device)
-             state["moment1"] = torch.zeros_like(g)
-             state["moment2"] = torch.zeros_like(g)
-         moment1.append(_to_local(state["moment1"]))
-         moment2.append(_to_local(state["moment2"]))
-         if not isinstance(state["step"], torch.Tensor):
-             state["step"] = torch.tensor(state["step"],
-                                          dtype=torch.float32,
-                                          device=p.device)
-         state_steps.append(state["step"])
-
-     if not params_with_grads:
-         return
-
-     lr = group["lr"]
-     beta1, beta2 = group["adamw_betas"]
-     eps = group["adamw_eps"]
-     weight_decay = group["weight_decay"]
-
-     fused_adamw(
-         params_with_grads,
-         grads,
-         moment1,
-         moment2,
-         [],
-         state_steps,
-         amsgrad=False,
-         beta1=beta1,
-         beta2=beta2,
-         lr=lr,
-         weight_decay=weight_decay,
-         eps=eps,
-         maximize=False,
-     )
-
-
- def step_adamw_params(optimizer_state, params, group):
-     """Run fused AdamW on a list of parameters sharing the same placement.
-
-     After the first call, cached tensor lists (params_local, moment1,
-     moment2, state_steps) are reused — only gradients are collected fresh.
-
-     Args:
-         optimizer_state: The optimizer's state dict (self.state in Muon).
-         params: List of parameters to update.
-         group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
-     """
-     # Collect grads — the only thing that changes each step.
-     with record_function("adamw::collect_grads"):
-         grads = []
-         for p in params:
-             g = p.grad
-             if g is None:
-                 # Rare: fall back to slow path that filters per-param.
-                 _step_adamw_params_slow(optimizer_state, params, group)
-                 return
-             grads.append(_to_local(g))
-
-     tensor_key = id(params)
-     if tensor_key not in _tensor_cache:
-         with record_function("adamw::init_tensor_cache"):
-             params_local = []
-             moment1 = []
-             moment2 = []
-             state_steps = []
-
-             for p in params:
-                 state = optimizer_state[p]
-                 params_local.append(_to_local(p))
-                 if "step" not in state:
-                     state["step"] = torch.zeros((),
-                                                 dtype=torch.float32,
-                                                 device=p.device)
-                     state["moment1"] = torch.zeros_like(p.grad)
-                     state["moment2"] = torch.zeros_like(p.grad)
-                 moment1.append(_to_local(state["moment1"]))
-                 moment2.append(_to_local(state["moment2"]))
-                 if not isinstance(state["step"], torch.Tensor):
-                     state["step"] = torch.tensor(state["step"],
-                                                  dtype=torch.float32,
-                                                  device=p.device)
-                 state_steps.append(state["step"])
-
-             _tensor_cache[tensor_key] = (params_local, moment1, moment2,
-                                          state_steps)
-
-     params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
-
-     lr = group["lr"]
-     beta1, beta2 = group["adamw_betas"]
-     eps = group["adamw_eps"]
-     weight_decay = group["weight_decay"]
-
-     with record_function("adamw::fused_adamw"):
-         fused_adamw(
-             params_local,
-             grads,
-             moment1,
-             moment2,
-             [],
-             state_steps,
-             amsgrad=False,
-             beta1=beta1,
-             beta2=beta2,
-             lr=lr,
-             weight_decay=weight_decay,
-             eps=eps,
-             maximize=False,
-         )
-
-
- def step_adamw(optimizer_state, group):
-     """Dispatch AdamW step, grouping parameters by type and placement.
-
-     Placement grouping is cached after the first call since params never
-     change their placement between steps.
-
-     Args:
-         optimizer_state: The optimizer's state dict (self.state in Muon).
-         group: Parameter group dict.
-     """
-     params = group["params"]
-     placement_key = id(params)
-
-     if placement_key not in _placement_cache:
-         with record_function("adamw::group_by_placement"):
-             placement_to_params: dict[tuple,
-                                       list[torch.Tensor]] = defaultdict(list)
-             for p in params:
-                 match p:
-                     case DTensor():
-                         logger.debug(
-                             "[AdamW] DTensor param: shape=%s, placements=%s, "
-                             "mesh=%s, grad=%s", p.shape, p.placements,
-                             p.device_mesh.mesh_dim_names,
-                             p.grad.shape if p.grad is not None else None)
-                         placement_to_params[tuple(
-                             [p.placements, p.device_mesh])].append(p)
-                     case torch.Tensor():
-                         logger.debug(
-                             "[AdamW] plain param: shape=%s, grad=%s", p.shape,
-                             p.grad.shape if p.grad is not None else None)
-                         placement_to_params[tuple([torch.Tensor,
-                                                    None])].append(p)
-
-             logger.debug("[AdamW] %d placement groups, %d total params",
-                          len(placement_to_params), len(params))
-
-             _placement_cache[placement_key] = dict(placement_to_params)
-
-     for group_params in _placement_cache[placement_key].values():
-         step_adamw_params(optimizer_state, group_params, group)
 
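A hedged sketch of driving this path standalone: the group keys mirror what step_adamw reads above, while `model`, `batch`, and the ndim-based split are illustrative stand-ins.

group = {
    "params": [p for p in model.parameters() if p.ndim < 2],  # illustrative split
    "lr": 3e-4,
    "adamw_betas": (0.9, 0.95),
    "adamw_eps": 1e-8,
    "weight_decay": 0.1,
}
state = {p: {} for p in group["params"]}   # stands in for Muon's self.state

loss = model(batch).mean()                 # placeholder forward pass
loss.backward()                            # populates .grad for every param
step_adamw(state, group)                   # groups by placement once, then runs fused_adamw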
build/torch210-cxx11-rocm70-x86_64-linux/async_utils.py DELETED
@@ -1,77 +0,0 @@
- import logging
- from typing import Generator
-
- logger = logging.getLogger(__name__)
-
-
- class _Task:
-     """Internal: wraps a generator, advances one yield at a time."""
-
-     def __init__(self, generator: Generator[None, None, None], index: int):
-         self._generator = generator
-         self._index = index
-         self._steps_completed = 0
-         self.step()  # run to first yield
-
-     def step(self) -> bool:
-         try:
-             next(self._generator)
-             self._steps_completed += 1
-             logger.debug("pipeline[%d] completed stage %d", self._index,
-                          self._steps_completed)
-             return True
-         except StopIteration:
-             logger.debug("pipeline[%d] finished after %d stages", self._index,
-                          self._steps_completed)
-             return False
-
-     def close(self):
-         self._generator.close()
-
-
- def run_pipeline(
-     pipelines: Generator[Generator[None, None, None], None, None],
-     max_concurrent: int,
- ) -> None:
-     """Run generator-based pipelines with bounded concurrency.
-
-     Each pipeline is a generator that yields at stage boundaries.
-     The runtime interleaves pipelines so communication and computation
-     overlap across chunks.
-     """
-     if max_concurrent <= 0:
-         raise ValueError(f"max_concurrent must be > 0, got {max_concurrent}")
-
-     have_new = True
-     task_index = 0
-     previous_tasks: list[_Task] = []
-
-     try:
-         while have_new or previous_tasks:
-             running_tasks: list[_Task] = []
-
-             # Admit one new pipeline per iteration (staggered admission).
-             # Admitting one at a time ensures that while chunk N does NS
-             # compute on the default stream, chunk N+1's NCCL all-to-all
-             # runs concurrently on the NCCL stream — creating real
-             # communication/computation overlap on the GPU.
-             if have_new and len(previous_tasks) < max_concurrent:
-                 try:
-                     gen = next(pipelines)
-                     task = _Task(gen, task_index)
-                     task_index += 1
-                     running_tasks.append(task)
-                 except StopIteration:
-                     have_new = False
-
-             # Advance every previously-yielded task by one step.
-             for task in previous_tasks:
-                 if task.step():
-                     running_tasks.append(task)
-
-             previous_tasks = running_tasks
-     except BaseException:
-         # Clean up all in-flight generators to release GPU resources.
-         for task in previous_tasks:
-             task.close()
-         raise
 
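The scheduling order is easiest to see with toy generators. This CPU-only sketch prints how one chunk's second stage runs in the same sweep that admits the next chunk:

def stage_pair(tag):
    print(f"{tag}: launch gather")
    yield
    print(f"{tag}: launch scatter")
    yield
    print(f"{tag}: update params")

run_pipeline((stage_pair(f"chunk{i}") for i in range(3)), max_concurrent=2)
# chunk0's scatter launch lands in the same sweep that admits chunk1,
# which is the comm/compute overlap the docstring describes.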
build/torch210-cxx11-rocm70-x86_64-linux/core.py DELETED
@@ -1,219 +0,0 @@
1
- import logging
2
- import math
3
- from dataclasses import dataclass
4
- from typing import List
5
-
6
- import torch
7
- from torch.distributed import ProcessGroup
8
- from torch.distributed.tensor import DTensor
9
-
10
- # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
11
- # parameter FQNs. Activation checkpointing similarly inserts
12
- # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
13
- # expert_keys, QK layer parsing) works regardless of wrapper nesting.
14
- _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- def normalize_fqn(name: str) -> str:
20
- """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
21
- return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
22
-
23
-
24
- @dataclass
25
- class _muon_state:
26
- worker_rank: int
27
- process_group: ProcessGroup
28
- rank_indices: dict[int, tuple] # local_rank -> per-dim indices
29
- rank_numels: dict[int, int] # local_rank -> numel
30
- name: str
31
- qk_clip_state: torch.Tensor | None = None
32
-
33
-
34
- def _batch_momentum(
35
- grads: List[torch.Tensor],
36
- momentum_bufs: List[torch.Tensor],
37
- momentum: torch.Tensor,
38
- ) -> None:
39
- """Batched momentum update (no nesterov)."""
40
- torch._foreach_mul_(momentum_bufs, momentum)
41
- torch._foreach_add_(momentum_bufs, grads)
42
-
43
-
44
- def _batch_momentum_nesterov(
45
- grads: List[torch.Tensor],
46
- momentum_bufs: List[torch.Tensor],
47
- momentum: torch.Tensor,
48
- ) -> None:
49
- """Batched momentum update with nesterov correction."""
50
- torch._foreach_mul_(momentum_bufs, momentum)
51
- torch._foreach_add_(momentum_bufs, grads)
52
- nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
- torch._foreach_add_(grads, nesterov_terms)
54
-
55
-
56
- _compiled_momentum: dict[bool, callable] = {}
57
- _use_momentum_compile = True
58
-
59
-
60
- def set_momentum_compile(enabled: bool):
61
- """Toggle torch.compile for batched momentum."""
62
- global _use_momentum_compile
63
- _use_momentum_compile = enabled
64
-
65
-
66
- def batch_pre_ortho(
67
- grads: List[torch.Tensor],
68
- momentum_bufs: List[torch.Tensor],
69
- momentum: torch.Tensor,
70
- nesterov: bool,
71
- ) -> None:
72
- """Batched momentum update on lists of plain tensors.
73
-
74
- Mirrors dion's ``muon_update_pre_orthogonalize``.
75
- Inputs must be plain CUDA tensors (not DTensor).
76
- Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
77
-
78
- When compile is enabled, uses separately compiled functions for
79
- nesterov=True/False to avoid graph breaks from the branch.
80
- """
81
- fn = _batch_momentum_nesterov if nesterov else _batch_momentum
82
- if _use_momentum_compile:
83
- if nesterov not in _compiled_momentum:
84
- _compiled_momentum[nesterov] = torch.compile(fn)
85
- fn = _compiled_momentum[nesterov]
86
- fn(grads, momentum_bufs, momentum)
87
-
88
-
89
- def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
- """Weight-decay + update on plain tensors.
91
-
92
- Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
- lookup per call × 256+ params = massive overhead. The pipeline path uses
94
- batched _foreach_* ops instead; this function remains for base() and
95
- distributed_muon().
96
- """
97
- p_data.mul_(1 - lr * weight_decay)
98
- p_data.add_(u_data, alpha=-adjusted_lr)
99
-
100
-
101
- def update_p(p, u, lr, adjusted_lr, weight_decay):
102
- """Apply weight decay and orthogonalized update to parameter.
103
-
104
- Args:
105
- p: Parameter (torch.nn.Parameter or DTensor).
106
- u: Orthogonalized update tensor.
107
- lr: Base learning rate.
108
- adjusted_lr: Size-adjusted learning rate.
109
- weight_decay: Weight decay coefficient.
110
- """
111
- # Unwrap Parameter -> underlying data tensor.
112
- p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
- # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
- if isinstance(p_data, DTensor):
115
- p_data = p_data._local_tensor
116
- u_data = u._local_tensor if isinstance(u, DTensor) else u
117
- _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
118
-
119
-
120
- def adjust_lr_for_muon(lr, param_shape):
121
- """Scale learning rate based on parameter matrix dimensions.
122
-
123
- Args:
124
- lr: Base learning rate.
125
- param_shape: Shape of the parameter tensor.
126
-
127
- Returns:
128
- Adjusted learning rate.
129
- """
130
- A, B = param_shape[:2]
131
- # We adjust the learning rate and weight decay based on the size of the parameter matrix
132
- # as described in the paper
133
- adjusted_ratio = 0.2 * math.sqrt(max(A, B))
134
- adjusted_lr = lr * adjusted_ratio
135
- return adjusted_lr
136
-
137
-
138
- def _match_key(parts, key):
139
- """Check if key matches as contiguous components in parts.
140
-
141
- Single-component keys (e.g. "experts") match any single component.
142
- Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
- """
144
- key_parts = key.split(".")
145
- key_len = len(key_parts)
146
- if key_len == 1:
147
- return key in parts
148
- return any(parts[i:i + key_len] == key_parts
149
- for i in range(len(parts) - key_len + 1))
150
-
151
-
152
- def is_expert_param(name, expert_keys):
153
- """Check if a parameter name matches any expert key (component-level)."""
154
- if not expert_keys:
155
- return False
156
- parts = normalize_fqn(name).split(".")
157
- return any(_match_key(parts, key) for key in expert_keys)
158
-
159
-
160
- def default_is_muon(name, x, expert_keys=None):
-     normalized = normalize_fqn(name)
-     parts = normalized.split(".")
-     skip_keys = [
-         "embed_tokens",
-         "lm_head",
-         "tok_embeddings",
-         "output",
-         "mhc_attn",
-         "mhc_ffn",
-         "lambda_proj",
-     ]
-     if any(key in parts for key in skip_keys):
-         logger.info(
-             "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
-             normalized, name, x.ndim)
-         return False
-     effective_ndim = x.ndim
-     is_expert = is_expert_param(name, expert_keys)
-     if is_expert:
-         effective_ndim -= 1
-     result = effective_ndim >= 2
-     logger.info(
-         "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
-         normalized, name, x.ndim, is_expert, effective_ndim,
-         "Muon" if result else "AdamW")
-     return result
-
-
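The effective-ndim adjustment exists because stacked expert weights carry a leading expert dimension: a 3-D tensor of shape (num_experts, d_in, d_out) is really a batch of 2-D matrices, so it counts as effective_ndim = 2 and is routed to Muon, while a genuine 1-D tensor (bias, norm weight) still falls through to AdamW.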
- def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
-     if is_muon_func is None:
-         is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
-
-     muon_params, muon_names = [], []
-     non_muon_params, non_muon_names = [], []
-
-     for n, p in model.named_parameters():
-         if not p.requires_grad:
-             continue
-         if is_muon_func(n, p):
-             muon_params.append(p)
-             muon_names.append(n)
-         else:
-             non_muon_params.append(p)
-             non_muon_names.append(n)
-
-     logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
-                 expert_keys, len(muon_names), len(non_muon_names))
-
-     return [
-         {
-             "params": muon_params,
-             "names": muon_names,
-             "use_muon": True,
-         },
-         {
-             "params": non_muon_params,
-             "use_muon": False,
-         },
-     ]
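A hedged usage sketch; the `Muon` constructor signature below is an assumption for illustration, the point being only that the returned groups carry a per-group `use_muon` flag:

# Hypothetical wiring: route 2-D+ weights to Muon, the rest to AdamW.
groups = get_default_muon_param_groups(model, expert_keys=["experts"])
optimizer = Muon(groups, lr=3e-4, weight_decay=0.1)  # assumed signature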
build/torch210-cxx11-rocm70-x86_64-linux/cpu_offload.py DELETED
@@ -1,206 +0,0 @@
- """CPU offloading for optimizer states.
-
- Manages a pinned CPU memory pool and async CUDA streams to offload
- optimizer state tensors (momentum buffers, Adam moments) to CPU between
- optimizer steps, freeing GPU memory.
-
- All tracked tensors are packed into a single flat pinned CPU buffer
- (per dtype). D2H and H2D copies are performed per-tensor, directly
- between individual GPU tensors and their slice of the CPU flat buffer;
- no GPU staging buffer is allocated, so there is **no temporary GPU
- memory spike** during offload or reload.
-
- Individual tensor storages are freed after offload via
- ``untyped_storage().resize_(0)``, preserving tensor identity so
- downstream caches remain valid.
- """
-
- import logging
- from collections import defaultdict
-
- import torch
- from torch.distributed.tensor import DTensor
-
- logger = logging.getLogger(__name__)
-
-
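The storage-resize trick the docstring describes is standard PyTorch: resizing the untyped storage to zero releases the device allocation while the Python tensor object, its shape, and its identity survive. A minimal demonstration (assumes a CUDA device; after re-allocating, the memory is uninitialized until data is copied back in):

import torch

t = torch.randn(1024, 1024, device="cuda")
nbytes = t.untyped_storage().size()

t.untyped_storage().resize_(0)   # frees the GPU allocation
assert t.untyped_storage().size() == 0
assert t.shape == (1024, 1024)   # tensor object and metadata survive

t.untyped_storage().resize_(nbytes)  # re-allocate; contents are garbage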
- class CPUOffloadPool:
-     """Pinned CPU memory pool for async optimizer state offloading.
-
-     Tracked tensors are grouped by dtype. Each group gets a single flat
-     pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
-     the flat buffer) to avoid allocating a GPU staging buffer.
-     """
-
-     def __init__(self):
-         self._managed: list[torch.Tensor] = []
-         self._storage_nbytes: dict[int, int] = {}  # id(t) → bytes
-
-         # Per-dtype group: populated on first offload.
-         # dtype → dict with keys:
-         #   "indices"  : list[int]             managed-list indices
-         #   "offsets"  : list[tuple[int,int]]  (start, numel) in flat buf
-         #   "total"    : int                   total numel
-         #   "cpu_flat" : Tensor                pinned CPU buffer
-         self._groups: dict[torch.dtype, dict] = {}
-
-         self._offload_stream: torch.cuda.Stream | None = None
-         self._device: torch.device | None = None
-         self._initialized: bool = False
-         self._logged: bool = False
-
-     # ------------------------------------------------------------------
-     @staticmethod
-     def _local(t: torch.Tensor) -> torch.Tensor:
-         """Unwrap DTensor to its local CUDA tensor."""
-         return t._local_tensor if isinstance(t, DTensor) else t
-
-     def _ensure_stream(self):
-         if self._offload_stream is None:
-             self._offload_stream = torch.cuda.Stream(device=self._device)
-
-     # ------------------------------------------------------------------
-     def track(self, tensor: torch.Tensor):
-         """Register a GPU tensor for CPU offloading. Idempotent."""
-         tid = id(tensor)
-         if tid in self._storage_nbytes:
-             return
-         local = self._local(tensor)
-         if self._device is None:
-             self._device = local.device
-         storage = local.untyped_storage()
-         # Skip tensors with empty storage (e.g. empty FSDP shards).
-         if storage.size() == 0:
-             return
-         self._storage_nbytes[tid] = storage.size()
-         self._managed.append(tensor)
-
-     # ------------------------------------------------------------------
-     def _init_buffers(self):
-         """Build per-dtype flat buffers on first offload."""
-         # Group managed tensors by dtype.
-         dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
-         for idx, t in enumerate(self._managed):
-             local = self._local(t)
-             dtype_map[local.dtype].append((idx, local.numel()))
-
-         total_cpu_bytes = 0
-         for dtype, entries in dtype_map.items():
-             offsets: list[tuple[int, int]] = []
-             indices: list[int] = []
-             off = 0
-             for idx, n in entries:
-                 indices.append(idx)
-                 offsets.append((off, n))
-                 off += n
-             cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
-             self._groups[dtype] = {
-                 "indices": indices,
-                 "offsets": offsets,
-                 "total": off,
-                 "cpu_flat": cpu_flat,
-             }
-             total_cpu_bytes += off * cpu_flat.element_size()
-
-         self._initialized = True
-         logger.info(
-             "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
-             "%.2f MB pinned CPU memory",
-             len(self._managed),
-             len(self._groups),
-             total_cpu_bytes / (1024**2),
-         )
-
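As a concrete layout example: tracking three fp32 tensors of 1M, 2M, and 4M elements produces a single fp32 group with offsets [(0, 1M), (1M, 2M), (3M, 4M)], total 7M elements, i.e. 28 MB of pinned CPU memory at 4 bytes per element.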
-     # ------------------------------------------------------------------
-     def offload(self):
-         """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
-         if not self._managed:
-             return
-         if not self._initialized:
-             self._init_buffers()
-         self._ensure_stream()
-
-         # Offload stream waits for compute to finish.
-         compute_event = torch.cuda.current_stream(self._device).record_event()
-         self._offload_stream.wait_event(compute_event)
-
-         offloaded_bytes = 0
-
-         # Per-tensor D2H copies directly into CPU flat buffer slices.
-         # No GPU staging buffer → no temporary GPU memory spike.
-         with torch.cuda.stream(self._offload_stream):
-             for dtype, grp in self._groups.items():
-                 indices = grp["indices"]
-                 offsets = grp["offsets"]
-                 cpu_flat = grp["cpu_flat"]
-
-                 for i, mgd_idx in enumerate(indices):
-                     local = self._local(self._managed[mgd_idx])
-                     off, n = offsets[i]
-                     cpu_flat[off : off + n].copy_(local.reshape(-1), non_blocking=True)
-
-                 offloaded_bytes += grp["total"] * cpu_flat.element_size()
-
-         # Wait for all D2H copies to land, then free GPU storage.
-         self._offload_stream.synchronize()
-         for t in self._managed:
-             storage = self._local(t).untyped_storage()
-             if storage.size() != 0:
-                 storage.resize_(0)
-             else:
-                 raise RuntimeError(
-                     f"Tensor storage is already freed (size=0) before offload. "
-                     f"This indicates a double-free or external interference. "
-                     f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                 )
-
-         if not self._logged:
-             logger.info(
-                 "[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
-                 offloaded_bytes / (1024**2),
-             )
-
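The record-event / wait-event pair above is the standard way to order work across CUDA streams without blocking the host. A condensed, self-contained sketch of that handshake (tensor names are illustrative):

import torch

gpu_t = torch.randn(1 << 20, device="cuda")
cpu_buf = torch.empty(1 << 20, pin_memory=True)

side = torch.cuda.Stream()
evt = torch.cuda.current_stream().record_event()  # mark compute progress
side.wait_event(evt)              # side stream starts only after compute
with torch.cuda.stream(side):
    # Async D2H: pinned destination lets the copy run via DMA.
    cpu_buf.copy_(gpu_t.reshape(-1), non_blocking=True)
side.synchronize()                # copy has landed; safe to free gpu_t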
-     # ------------------------------------------------------------------
-     def reload(self):
-         """Per-tensor H2D from CPU flat buffer on the default stream.
-
-         Runs on the current (default) CUDA stream to avoid stream
-         interaction issues with the parallel Muon pipeline. Since
-         pinned CPU memory is the source, the copies overlap with
-         GPU idle time between steps.
-         """
-         if not self._managed or not self._initialized:
-             return
-
-         reloaded_bytes = 0
-
-         # Re-allocate all GPU storages first.
-         for t in self._managed:
-             local = self._local(t)
-             storage = local.untyped_storage()
-             if storage.size() != 0:
-                 raise RuntimeError(
-                     f"Storage should have been freed (size=0) before reload, "
-                     f"but got size={storage.size()}. "
-                     f"Tensor shape: {t.shape}, dtype: {t.dtype}"
-                 )
-             storage.resize_(self._storage_nbytes[id(t)])
-
-         # Per-tensor H2D copies from CPU flat buffer slices.
-         # non_blocking=True with pinned source allows DMA overlap.
-         for dtype, grp in self._groups.items():
-             indices = grp["indices"]
-             offsets = grp["offsets"]
-             cpu_flat = grp["cpu_flat"]
-
-             for i, mgd_idx in enumerate(indices):
-                 local = self._local(self._managed[mgd_idx])
-                 off, n = offsets[i]
-                 local.reshape(-1).copy_(cpu_flat[off : off + n], non_blocking=True)
-
-             reloaded_bytes += grp["total"] * cpu_flat.element_size()
-
-         if not self._logged:
-             logger.info(
-                 "[CPUOffload] Reloaded %.2f MB (CPU → GPU)", reloaded_bytes / (1024**2)
-             )
-             # Fix: without this, _logged stays False forever and both the
-             # offload and reload messages repeat every step; set it after
-             # the first full offload/reload cycle so later steps stay quiet.
-             self._logged = True
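Finally, a hedged end-to-end sketch of how the pool is meant to be driven; the training-loop names are illustrative and the real integration in this repo may differ:

pool = CPUOffloadPool()
for buf in optimizer_state_tensors:  # e.g. momentum buffers, Adam moments
    pool.track(buf)

for step in range(num_steps):
    pool.reload()     # H2D; no-op before the first offload
    optimizer.step()  # states must be resident on GPU here
    pool.offload()    # async D2H, then free GPU storage until next step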