Add uneven shard correctness test [skip-build]

Test parallel Muon with parameter dimensions that are not divisible by the
shard count (dim = 33, 19, 11 with 8 ranks), and verify the result against
the sequential baseline.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show

test/test_muon.py +90 -0

test/test_muon.py CHANGED Viewed

@@ -301,3 +301,93 @@ def test_parallel_muon_empty_shard(init_dist):
     set_ns_compile(True)
     logger.info("test_parallel_muon_empty_shard PASSED (rank %d)", rank)

     set_ns_compile(True)
     logger.info("test_parallel_muon_empty_shard PASSED (rank %d)", rank)
+@pytest.mark.parametrize("uneven_dim", [
+    pytest.param(33, id="33"),
+    pytest.param(19, id="19"),
+    pytest.param(11, id="11"),
+])
+def test_parallel_muon_uneven_shard(init_dist, uneven_dim):
+    """Check parallel Muon against the sequential path on uneven shards.
+
+    Three parameters of shape ``(uneven_dim, 64)`` are sharded along dim 0
+    over a 1-D mesh that spans every rank, stepped once with the parallel
+    optimizer, gathered, and compared with an unsharded copy stepped once
+    with a second optimizer.  The two results must be exactly equal
+    (``atol=0, rtol=0``).
+
+    ``Shard(0)`` follows ``torch.chunk`` semantics: each shard holds
+    ``ceil(dim / world_size)`` rows until the rows run out, so trailing
+    ranks receive a short or empty shard.  With 8 ranks, dim=33 yields six
+    ranks with 5 rows, one rank with 3 rows and one rank with 0 rows.
+    This is intended to exercise the remainder handling in
+    ``get_slices_of_dtensor`` and the all-to-all pipeline.
+
+    Args:
+        init_dist: Fixture requested for its setup; not referenced in the
+            body (presumably it initializes the process group -- confirm
+            against the fixture definition).
+        uneven_dim: Size of the sharded (first) dimension of every
+            parameter.
+    """
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    mesh = dist.init_device_mesh("cuda", (world_size, ),
+                                 mesh_dim_names=("dp", ))
+    other_dim = 64
+    num_params = 3
+    muon_names = [f"layer.{i}.weight" for i in range(num_params)]
+    # Hyperparameters shared by both optimizers, so the two runs can only
+    # differ in how the parameters are laid out across ranks.
+    hparams = {
+        "use_muon": True,
+        "lr": 0.02,
+        "weight_decay": 0.01,
+        "momentum": 0.95,
+        "nesterov": True,
+        "ns_steps": 5,
+        "none_grad": False,
+    }
+    set_ns_compile(False)
+    try:
+        torch.manual_seed(42)
+        # --- Build sharded params + grads ---
+        muon_params = []
+        full_params_snapshot = []
+        full_grads = []
+        for _ in range(num_params):
+            full = torch.randn(uneven_dim, other_dim, device="cuda")
+            # Snapshot the unsharded values before the optimizer touches
+            # them; the sequential baseline starts from these copies.
+            full_params_snapshot.append(full.clone())
+            p = torch.nn.Parameter(distribute_tensor(full, mesh, [Shard(0)]))
+            grad_full = torch.randn(uneven_dim, other_dim, device="cuda")
+            full_grads.append(grad_full.clone())
+            p.grad = distribute_tensor(grad_full, mesh, [Shard(0)])
+            muon_params.append(p)
+        # --- Parallel path (all2all pipeline) ---
+        param_groups_par = [{
+            "params": muon_params,
+            "names": muon_names,
+            **hparams,
+        }]
+        optim_par = Muon(params=param_groups_par, chunk_size=1,
+                         warmup_step=0)
+        optim_par.step()
+        # --- Sequential baseline (base path, no sharding) ---
+        seq_params = []
+        for full, grad_full in zip(full_params_snapshot, full_grads):
+            p = torch.nn.Parameter(full.clone())
+            p.grad = grad_full.clone()
+            seq_params.append(p)
+        param_groups_seq = [{
+            "params": seq_params,
+            "names": list(muon_names),
+            **hparams,
+        }]
+        optim_seq = Muon(params=param_groups_seq)
+        optim_seq.step()
+        # --- Compare: parallel result (gathered) must match sequential ---
+        for par_p, seq_p in zip(muon_params, seq_params):
+            par_full = par_p.data.full_tensor()
+            torch.testing.assert_close(par_full, seq_p.data, atol=0, rtol=0)
+    finally:
+        # Re-enable the setting even when the test fails, so a failure
+        # here does not leak the disabled state into later tests.
+        set_ns_compile(True)
+    logger.info("test_parallel_muon_uneven_shard (dim=%d) PASSED (rank %d)",
+                uneven_dim, rank)