Kernels
wyldecat Claude Opus 4.6 committed on
Commit
81f49fe
·
1 Parent(s): 0f37d63

Update tests for MoE and parallel optimizations [skip-build]

Browse files

- Add MoE test cases (test_muon_moe.py)
- Update parallel test configurations
- Test utility updates

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

test/test_muon.py CHANGED
@@ -7,7 +7,9 @@ import pytest
7
  import torch
8
  import torch.distributed as dist
9
  from optimizer.muon import Muon, get_default_muon_param_groups
10
- from torch.distributed.tensor import DTensor, Replicate
 
 
11
  from torch.profiler import ProfilerActivity, profile
12
 
13
  from .utils import (ParallelDims, assert_params_equal, parallelize_motif,
@@ -23,7 +25,6 @@ def apply_muon_step(
23
  grads: list[torch.Tensor],
24
  warmup_step: int,
25
  chunk_size: int,
26
- small_param_numel_threshold: int,
27
  qk_logits: dict[int, torch.Tensor] | None = None,
28
  use_distributed_muon: bool = False,
29
  measure_perf: bool = False,
@@ -67,7 +68,6 @@ def apply_muon_step(
67
  none_grad=False,
68
  warmup_step=warmup_step,
69
  chunk_size=chunk_size,
70
- small_param_numel_threshold=small_param_numel_threshold,
71
  use_distributed_muon=use_distributed_muon,
72
  )
73
 
@@ -119,43 +119,45 @@ def apply_muon_step(
119
  def sequential_muon_result(
120
  skip_verify, # from conftest.py
121
  inputs # from conftest.py
122
- ) -> dict[bool, torch.nn.Module]:
123
- """Run Muon optimizer to sequential model for baseline results."""
 
 
 
124
  if skip_verify:
125
  logger.info("Skipping verification tests as per user request")
126
  return None
127
 
128
  model, grads, qk_logits = inputs
 
129
 
130
- result = apply_muon_step(
131
- model=copy.deepcopy(model).cuda(),
132
- parallel_dims=None,
133
- grads=grads,
134
- warmup_step=-1,
135
- chunk_size=-1,
136
- small_param_numel_threshold=-1,
137
- qk_logits=None,
138
- )[0].cpu()
139
-
140
- result_qk_clip = apply_muon_step(
141
- model=copy.deepcopy(model).cuda(),
142
- parallel_dims=None,
143
- grads=grads,
144
- warmup_step=-1,
145
- chunk_size=-1,
146
- small_param_numel_threshold=-1,
147
- qk_logits=qk_logits,
148
- )[0].cpu()
149
 
150
- return {
151
- False: result,
152
- True: result_qk_clip,
153
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
 
156
  OVERLAP_STEPS = [5]
157
  CHUNK_SIZES = [2]
158
- SMALL_PARAM_NUMEL_THRESHOLDS = [65536, 1_000_000_000]
159
 
160
 
161
  @pytest.mark.parametrize("parallel_dims", [
@@ -170,17 +172,16 @@ SMALL_PARAM_NUMEL_THRESHOLDS = [65536, 1_000_000_000]
170
  @pytest.mark.parametrize("use_distributed_muon", [False])
171
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
172
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
173
- @pytest.mark.parametrize("small_param_numel_threshold",
174
- SMALL_PARAM_NUMEL_THRESHOLDS)
175
  def test_parallel_muon(
176
  request,
177
- sequential_muon_result: dict[bool, torch.nn.Module],
178
  parallel_dims: ParallelDims,
179
  apply_qk_clip: bool,
180
  use_distributed_muon: bool,
181
  warmup_step: int,
182
  chunk_size: int,
183
- small_param_numel_threshold: int,
184
  inputs: tuple[torch.nn.Module, list[torch.Tensor],
185
  dict[int, torch.Tensor]], # from conftest.py
186
  measure_perf, # from conftest.py
@@ -191,6 +192,8 @@ def test_parallel_muon(
191
  if use_distributed_muon and warmup_step != OVERLAP_STEPS[0]:
192
  pytest.skip("Distributed Muon does not effected by warmup step")
193
 
 
 
194
  model, grads, qk_logits = inputs
195
 
196
  if not apply_qk_clip:
@@ -212,7 +215,6 @@ def test_parallel_muon(
212
  grads=grads,
213
  warmup_step=warmup_step,
214
  chunk_size=chunk_size,
215
- small_param_numel_threshold=small_param_numel_threshold,
216
  qk_logits=qk_logits,
217
  use_distributed_muon=use_distributed_muon,
218
  measure_perf=measure_perf,
@@ -236,5 +238,66 @@ def test_parallel_muon(
236
  elif measure_perf:
237
  logger.info("Skipping correctness check as timing is enabled")
238
  else:
 
 
239
  assert_params_equal(parallelized_model,
240
- sequential_muon_result[apply_qk_clip])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import torch
8
  import torch.distributed as dist
9
  from optimizer.muon import Muon, get_default_muon_param_groups
10
+ from optimizer.newton_schulz import set_ns_compile
11
+ from torch.distributed.tensor import (DTensor, Replicate, Shard,
12
+ distribute_tensor)
13
  from torch.profiler import ProfilerActivity, profile
14
 
15
  from .utils import (ParallelDims, assert_params_equal, parallelize_motif,
 
25
  grads: list[torch.Tensor],
26
  warmup_step: int,
27
  chunk_size: int,
 
28
  qk_logits: dict[int, torch.Tensor] | None = None,
29
  use_distributed_muon: bool = False,
30
  measure_perf: bool = False,
 
68
  none_grad=False,
69
  warmup_step=warmup_step,
70
  chunk_size=chunk_size,
 
71
  use_distributed_muon=use_distributed_muon,
72
  )
73
 
 
119
  def sequential_muon_result(
120
  skip_verify, # from conftest.py
121
  inputs # from conftest.py
122
+ ) -> dict[tuple[bool, bool], torch.nn.Module]:
123
+ """Run Muon optimizer to sequential model for baseline results.
124
+
125
+ Returns dict keyed by ``(apply_qk_clip, use_compile)``.
126
+ """
127
  if skip_verify:
128
  logger.info("Skipping verification tests as per user request")
129
  return None
130
 
131
  model, grads, qk_logits = inputs
132
+ results: dict[tuple[bool, bool], torch.nn.Module] = {}
133
 
134
+ for use_compile in [False, True]:
135
+ set_ns_compile(use_compile)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ results[(False, use_compile)] = apply_muon_step(
138
+ model=copy.deepcopy(model).cuda(),
139
+ parallel_dims=None,
140
+ grads=grads,
141
+ warmup_step=-1,
142
+ chunk_size=-1,
143
+ qk_logits=None,
144
+ )[0].cpu()
145
+
146
+ results[(True, use_compile)] = apply_muon_step(
147
+ model=copy.deepcopy(model).cuda(),
148
+ parallel_dims=None,
149
+ grads=grads,
150
+ warmup_step=-1,
151
+ chunk_size=-1,
152
+ qk_logits=qk_logits,
153
+ )[0].cpu()
154
+
155
+ set_ns_compile(True) # restore default
156
+ return results
157
 
158
 
159
  OVERLAP_STEPS = [5]
160
  CHUNK_SIZES = [2]
 
161
 
162
 
163
  @pytest.mark.parametrize("parallel_dims", [
 
172
  @pytest.mark.parametrize("use_distributed_muon", [False])
173
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
174
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
175
+ @pytest.mark.parametrize("use_compile", [False, True])
 
176
  def test_parallel_muon(
177
  request,
178
+ sequential_muon_result: dict[tuple[bool, bool], torch.nn.Module],
179
  parallel_dims: ParallelDims,
180
  apply_qk_clip: bool,
181
  use_distributed_muon: bool,
182
  warmup_step: int,
183
  chunk_size: int,
184
+ use_compile: bool,
185
  inputs: tuple[torch.nn.Module, list[torch.Tensor],
186
  dict[int, torch.Tensor]], # from conftest.py
187
  measure_perf, # from conftest.py
 
192
  if use_distributed_muon and warmup_step != OVERLAP_STEPS[0]:
193
  pytest.skip("Distributed Muon does not effected by warmup step")
194
 
195
+ set_ns_compile(use_compile)
196
+
197
  model, grads, qk_logits = inputs
198
 
199
  if not apply_qk_clip:
 
215
  grads=grads,
216
  warmup_step=warmup_step,
217
  chunk_size=chunk_size,
 
218
  qk_logits=qk_logits,
219
  use_distributed_muon=use_distributed_muon,
220
  measure_perf=measure_perf,
 
238
  elif measure_perf:
239
  logger.info("Skipping correctness check as timing is enabled")
240
  else:
241
+ atol = 1e-5 if use_compile else 0
242
+ rtol = 1e-2 if use_compile else 0
243
  assert_params_equal(parallelized_model,
244
+ sequential_muon_result[(apply_qk_clip,
245
+ use_compile)],
246
+ atol=atol,
247
+ rtol=rtol)
248
+
249
+
250
+ def test_parallel_muon_empty_shard(init_dist):
251
+ """Regression: parallel Muon must handle chunks where some ranks have
252
+ empty local shards (dim-0 < world_size).
253
+
254
+ With 8-way Shard(0) and dim-0 of size 4, ranks 4-7 get 0-element local
255
+ shards. Previously ``_launch_gather`` hit ``assert total_send > 0``.
256
+ """
257
+ rank = dist.get_rank()
258
+ world_size = dist.get_world_size()
259
+ mesh = dist.init_device_mesh("cuda", (world_size, ),
260
+ mesh_dim_names=("dp", ))
261
+
262
+ set_ns_compile(False)
263
+
264
+ # dim-0 = 4 < 8 ranks → ranks 4-7 have empty local shards with Shard(0)
265
+ small_dim = 4
266
+ num_params = 4
267
+ torch.manual_seed(42)
268
+
269
+ muon_params = []
270
+ muon_names = []
271
+ for i in range(num_params):
272
+ full = torch.randn(small_dim, 64, device="cuda")
273
+ dt = distribute_tensor(full, mesh, [Shard(0)])
274
+ p = torch.nn.Parameter(dt)
275
+ grad_full = torch.randn(small_dim, 64, device="cuda")
276
+ p.grad = distribute_tensor(grad_full, mesh, [Shard(0)])
277
+ muon_params.append(p)
278
+ muon_names.append(f"layer.{i}.weight")
279
+
280
+ param_groups = [{
281
+ "params": muon_params,
282
+ "names": muon_names,
283
+ "use_muon": True,
284
+ "lr": 0.02,
285
+ "weight_decay": 0.01,
286
+ "momentum": 0.95,
287
+ "nesterov": True,
288
+ "ns_steps": 5,
289
+ "none_grad": False,
290
+ }]
291
+
292
+ optim = Muon(params=param_groups, chunk_size=1, warmup_step=0)
293
+ # Must not raise AssertionError: total_send > 0
294
+ optim.step()
295
+
296
+ # Run a second step to verify cached path also works
297
+ for p in muon_params:
298
+ grad_full = torch.randn(small_dim, 64, device="cuda")
299
+ p.grad = distribute_tensor(grad_full, mesh, [Shard(0)])
300
+ optim.step()
301
+
302
+ set_ns_compile(True)
303
+ logger.info("test_parallel_muon_empty_shard PASSED (rank %d)", rank)
test/test_muon_moe.py CHANGED
@@ -45,7 +45,6 @@ def apply_muon_step_moe(
45
  grads: list[torch.Tensor],
46
  warmup_step: int,
47
  chunk_size: int,
48
- small_param_numel_threshold: int,
49
  use_distributed_muon: bool = False,
50
  measure_perf: bool = False,
51
  do_profile: bool = False,
@@ -63,7 +62,6 @@ def apply_muon_step_moe(
63
  none_grad=False,
64
  warmup_step=warmup_step,
65
  chunk_size=chunk_size,
66
- small_param_numel_threshold=small_param_numel_threshold,
67
  use_distributed_muon=use_distributed_muon,
68
  expert_keys=["experts"],
69
  )
@@ -73,6 +71,10 @@ def apply_muon_step_moe(
73
 
74
  optim.step()
75
 
 
 
 
 
76
  timing_result: tuple[float, float] | None = None
77
 
78
  if measure_perf:
@@ -133,7 +135,6 @@ def sequential_moe_result(
133
  grads=grads,
134
  warmup_step=-1,
135
  chunk_size=-1,
136
- small_param_numel_threshold=-1,
137
  )
138
  result = result.cpu()
139
 
@@ -142,25 +143,26 @@ def sequential_moe_result(
142
 
143
  OVERLAP_STEPS = [5]
144
  CHUNK_SIZES = [2]
145
- SMALL_PARAM_NUMEL_THRESHOLDS = [65536, 1_000_000_000]
146
 
147
 
148
- @pytest.mark.parametrize("parallel_dims", [
149
- pytest.param(ParallelDims(8, 1, 1), id="base"),
150
- pytest.param(ParallelDims(1, 8, 1), id="fsdp"),
151
- pytest.param(ParallelDims(2, 4, 1), id="hsdp"),
152
- pytest.param(ParallelDims(2, 2, 2), id="hsdp+tp"),
153
- pytest.param(ParallelDims(1, 2, 4), id="fsdp+tp"),
154
- pytest.param(ParallelDims(1, 1, 1, ep_degree=8), id="ep"),
155
- pytest.param(ParallelDims(1, 4, 1, ep_degree=2), id="ep+fsdp"),
156
- pytest.param(ParallelDims(1, 2, 1, ep_degree=4), id="ep4+fsdp"),
157
- pytest.param(ParallelDims(2, 2, 1, ep_degree=2), id="ep+hsdp"),
158
- ])
 
 
 
 
159
  @pytest.mark.parametrize("use_distributed_muon", [False])
160
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
161
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
162
- @pytest.mark.parametrize("small_param_numel_threshold",
163
- SMALL_PARAM_NUMEL_THRESHOLDS)
164
  def test_parallel_muon_moe(
165
  request,
166
  sequential_moe_result: torch.nn.Module | None,
@@ -168,7 +170,6 @@ def test_parallel_muon_moe(
168
  use_distributed_muon: bool,
169
  warmup_step: int,
170
  chunk_size: int,
171
- small_param_numel_threshold: int,
172
  moe_inputs: tuple[torch.nn.Module, list[torch.Tensor]],
173
  measure_perf,
174
  do_profile,
@@ -186,7 +187,6 @@ def test_parallel_muon_moe(
186
  grads=grads,
187
  warmup_step=warmup_step,
188
  chunk_size=chunk_size,
189
- small_param_numel_threshold=small_param_numel_threshold,
190
  use_distributed_muon=use_distributed_muon,
191
  measure_perf=measure_perf,
192
  do_profile=do_profile,
@@ -231,7 +231,6 @@ def sequential_moe_result_few_experts(
231
  grads=grads,
232
  warmup_step=-1,
233
  chunk_size=-1,
234
- small_param_numel_threshold=-1,
235
  )
236
  result = result.cpu()
237
 
@@ -239,14 +238,12 @@ def sequential_moe_result_few_experts(
239
 
240
 
241
  @pytest.mark.parametrize("parallel_dims", [
242
- pytest.param(ParallelDims(1, 4, 1, ep_degree=2), id="ep+fsdp"),
243
- pytest.param(ParallelDims(2, 2, 1, ep_degree=2), id="ep+hsdp"),
244
  ])
245
  @pytest.mark.parametrize("use_distributed_muon", [False])
246
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
247
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
248
- @pytest.mark.parametrize("small_param_numel_threshold",
249
- SMALL_PARAM_NUMEL_THRESHOLDS)
250
  def test_parallel_muon_moe_few_experts(
251
  request,
252
  sequential_moe_result_few_experts: torch.nn.Module | None,
@@ -254,7 +251,6 @@ def test_parallel_muon_moe_few_experts(
254
  use_distributed_muon: bool,
255
  warmup_step: int,
256
  chunk_size: int,
257
- small_param_numel_threshold: int,
258
  moe_inputs_few_experts: tuple[torch.nn.Module, list[torch.Tensor]],
259
  measure_perf,
260
  do_profile,
@@ -271,7 +267,6 @@ def test_parallel_muon_moe_few_experts(
271
  grads=grads,
272
  warmup_step=warmup_step,
273
  chunk_size=chunk_size,
274
- small_param_numel_threshold=small_param_numel_threshold,
275
  use_distributed_muon=use_distributed_muon,
276
  measure_perf=measure_perf,
277
  do_profile=do_profile,
 
45
  grads: list[torch.Tensor],
46
  warmup_step: int,
47
  chunk_size: int,
 
48
  use_distributed_muon: bool = False,
49
  measure_perf: bool = False,
50
  do_profile: bool = False,
 
62
  none_grad=False,
63
  warmup_step=warmup_step,
64
  chunk_size=chunk_size,
 
65
  use_distributed_muon=use_distributed_muon,
66
  expert_keys=["experts"],
67
  )
 
71
 
72
  optim.step()
73
 
74
+ # Second step to exercise expert expand cache hot path.
75
+ _restore_grads(model, saved_grads)
76
+ optim.step()
77
+
78
  timing_result: tuple[float, float] | None = None
79
 
80
  if measure_perf:
 
135
  grads=grads,
136
  warmup_step=-1,
137
  chunk_size=-1,
 
138
  )
139
  result = result.cpu()
140
 
 
143
 
144
  OVERLAP_STEPS = [5]
145
  CHUNK_SIZES = [2]
 
146
 
147
 
148
+ @pytest.mark.parametrize(
149
+ "parallel_dims",
150
+ [
151
+ # --- No EP (non-expert only) ---
152
+ pytest.param(ParallelDims(8, 1, 1), id="dp8"),
153
+ pytest.param(ParallelDims(1, 8, 1), id="fsdp8"),
154
+ pytest.param(ParallelDims(2, 4, 1), id="hsdp2x4"),
155
+ # --- EP configs ---
156
+ # naming: fsdp{dp_shard}_ep{ep} where dp_shard = dp_shard_mod_ep * ep
157
+ # dp_shard_mod_ep (= expert FSDP) = dp_shard_degree in our ParallelDims
158
+ pytest.param(ParallelDims(1, 1, 1, ep_degree=8), id="fsdp8_ep8"),
159
+ pytest.param(ParallelDims(1, 4, 1, ep_degree=2), id="fsdp8_ep2"),
160
+ pytest.param(ParallelDims(1, 2, 1, ep_degree=4), id="fsdp8_ep4"),
161
+ pytest.param(ParallelDims(2, 2, 1, ep_degree=2), id="hsdp_ep2"),
162
+ ])
163
  @pytest.mark.parametrize("use_distributed_muon", [False])
164
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
165
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
 
 
166
  def test_parallel_muon_moe(
167
  request,
168
  sequential_moe_result: torch.nn.Module | None,
 
170
  use_distributed_muon: bool,
171
  warmup_step: int,
172
  chunk_size: int,
 
173
  moe_inputs: tuple[torch.nn.Module, list[torch.Tensor]],
174
  measure_perf,
175
  do_profile,
 
187
  grads=grads,
188
  warmup_step=warmup_step,
189
  chunk_size=chunk_size,
 
190
  use_distributed_muon=use_distributed_muon,
191
  measure_perf=measure_perf,
192
  do_profile=do_profile,
 
231
  grads=grads,
232
  warmup_step=-1,
233
  chunk_size=-1,
 
234
  )
235
  result = result.cpu()
236
 
 
238
 
239
 
240
  @pytest.mark.parametrize("parallel_dims", [
241
+ pytest.param(ParallelDims(1, 4, 1, ep_degree=2), id="fsdp8_ep2"),
242
+ pytest.param(ParallelDims(2, 2, 1, ep_degree=2), id="hsdp_ep2"),
243
  ])
244
  @pytest.mark.parametrize("use_distributed_muon", [False])
245
  @pytest.mark.parametrize("warmup_step", OVERLAP_STEPS)
246
  @pytest.mark.parametrize("chunk_size", CHUNK_SIZES)
 
 
247
  def test_parallel_muon_moe_few_experts(
248
  request,
249
  sequential_moe_result_few_experts: torch.nn.Module | None,
 
251
  use_distributed_muon: bool,
252
  warmup_step: int,
253
  chunk_size: int,
 
254
  moe_inputs_few_experts: tuple[torch.nn.Module, list[torch.Tensor]],
255
  measure_perf,
256
  do_profile,
 
267
  grads=grads,
268
  warmup_step=warmup_step,
269
  chunk_size=chunk_size,
 
270
  use_distributed_muon=use_distributed_muon,
271
  measure_perf=measure_perf,
272
  do_profile=do_profile,
test/test_normalize_fqn.py CHANGED
@@ -1,6 +1,5 @@
1
  """Unit tests for FQN normalization (no GPU / distributed required)."""
2
 
3
-
4
  from optimizer.core import default_is_muon, is_expert_param, normalize_fqn
5
  from optimizer.qk_clip import parse_qk_layer
6
 
 
1
  """Unit tests for FQN normalization (no GPU / distributed required)."""
2
 
 
3
  from optimizer.core import default_is_muon, is_expert_param, normalize_fqn
4
  from optimizer.qk_clip import parse_qk_layer
5
 
test/utils.py CHANGED
@@ -259,12 +259,16 @@ def parallelize_qk_logits(
259
 
260
 
261
  def assert_params_equal(actual: torch.nn.Module,
262
- expected: torch.nn.Module) -> None:
 
 
263
  """Asserts that the parameters of two models are equal.
264
 
265
  Args:
266
  actual (torch.nn.Module): The actual model.
267
  expected (torch.nn.Module): The expected model.
 
 
268
  Returns:
269
  None
270
  """
@@ -279,4 +283,4 @@ def assert_params_equal(actual: torch.nn.Module,
279
  p = get_full_param(p.cuda())
280
  s = get_full_param(s.cuda())
281
 
282
- torch.testing.assert_close(p, s, atol=0, rtol=0)
 
259
 
260
 
261
  def assert_params_equal(actual: torch.nn.Module,
262
+ expected: torch.nn.Module,
263
+ atol: float = 0,
264
+ rtol: float = 0) -> None:
265
  """Asserts that the parameters of two models are equal.
266
 
267
  Args:
268
  actual (torch.nn.Module): The actual model.
269
  expected (torch.nn.Module): The expected model.
270
+ atol: Absolute tolerance.
271
+ rtol: Relative tolerance.
272
  Returns:
273
  None
274
  """
 
283
  p = get_full_param(p.cuda())
284
  s = get_full_param(s.cuda())
285
 
286
+ torch.testing.assert_close(p, s, atol=atol, rtol=rtol)