Use component-level matching for expert_keys to avoid shared_experts collision
Substring matching (`key in name`) causes "experts" to match
"shared_experts". Switch to dot-split component exact matching
(`key in name.split(".")`) in default_is_muon() and
_expand_expert_params(). Also applies to skip_keys for consistency.
[skip-build]
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- test/test_normalize_fqn.py +37 -3
- torch-ext/optimizer/core.py +3 -4
- torch-ext/optimizer/muon.py +2 -2
test/test_normalize_fqn.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""Unit tests for FQN normalization (no GPU / distributed required)."""
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
-
from optimizer.core import normalize_fqn
|
| 5 |
from optimizer.qk_clip import parse_qk_layer
|
| 6 |
|
| 7 |
|
|
@@ -34,10 +34,12 @@ class TestNormalizeFqn:
|
|
| 34 |
class TestParseQkLayerWithWrappers:
|
| 35 |
|
| 36 |
def test_plain_name(self):
|
| 37 |
-
assert parse_qk_layer("model.layers.3.attn.q_proj.weight") == (
|
|
|
|
| 38 |
|
| 39 |
def test_orig_mod(self):
|
| 40 |
-
assert parse_qk_layer("model._orig_mod.layers.3.attn.wq.weight") == (
|
|
|
|
| 41 |
|
| 42 |
def test_checkpoint_wrapped(self):
|
| 43 |
name = "model.layers.5._checkpoint_wrapped_module.self_attn.k_proj.weight"
|
|
@@ -50,3 +52,35 @@ class TestParseQkLayerWithWrappers:
|
|
| 50 |
def test_non_qk_still_none(self):
|
| 51 |
name = "model._orig_mod.layers.2.attn.v_proj.weight"
|
| 52 |
assert parse_qk_layer(name) == (None, -1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Unit tests for FQN normalization (no GPU / distributed required)."""
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
+
from optimizer.core import default_is_muon, normalize_fqn
|
| 5 |
from optimizer.qk_clip import parse_qk_layer
|
| 6 |
|
| 7 |
|
|
|
|
| 34 |
class TestParseQkLayerWithWrappers:
|
| 35 |
|
| 36 |
def test_plain_name(self):
    """An unwrapped FQN yields the projection name and layer index."""
    fqn = "model.layers.3.attn.q_proj.weight"
    assert parse_qk_layer(fqn) == ("q_proj", 3)
|
| 39 |
|
| 40 |
def test_orig_mod(self):
    """The torch.compile `_orig_mod` wrapper is stripped before parsing."""
    fqn = "model._orig_mod.layers.3.attn.wq.weight"
    assert parse_qk_layer(fqn) == ("wq", 3)
|
| 43 |
|
| 44 |
def test_checkpoint_wrapped(self):
|
| 45 |
name = "model.layers.5._checkpoint_wrapped_module.self_attn.k_proj.weight"
|
|
|
|
| 52 |
def test_non_qk_still_none(self):
    """A non-q/k projection stays unrecognized even under wrappers."""
    fqn = "model._orig_mod.layers.2.attn.v_proj.weight"
    assert parse_qk_layer(fqn) == (None, -1)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class TestExpertKeyMatching:
|
| 58 |
+
"""Verify expert_keys uses component-level matching, not substring."""
|
| 59 |
+
|
| 60 |
+
class FakeParam:
|
| 61 |
+
|
| 62 |
+
def __init__(self, ndim):
|
| 63 |
+
self.ndim = ndim
|
| 64 |
+
|
| 65 |
+
def test_experts_matches(self):
|
| 66 |
+
name = "model.layers.0.moe.experts.w1.weight"
|
| 67 |
+
assert default_is_muon(name,
|
| 68 |
+
self.FakeParam(3),
|
| 69 |
+
expert_keys=["experts"])
|
| 70 |
+
|
| 71 |
+
def test_shared_experts_does_not_match(self):
|
| 72 |
+
name = "model.layers.0.moe.shared_experts.w1.weight"
|
| 73 |
+
# shared_experts has ndim=2, which is muon-eligible on its own.
|
| 74 |
+
# But it must NOT be recognized as expert (ndim-1 would make it 1D → False).
|
| 75 |
+
assert default_is_muon(name,
|
| 76 |
+
self.FakeParam(2),
|
| 77 |
+
expert_keys=["experts"])
|
| 78 |
+
|
| 79 |
+
def test_shared_experts_3d_not_treated_as_expert(self):
|
| 80 |
+
# 3D shared_experts: if wrongly matched as expert, ndim-1=2 → True (same result).
|
| 81 |
+
# Verify by checking that a 2D shared_experts is NOT downgraded to 1D.
|
| 82 |
+
name = "model.layers.0.moe.shared_experts.gate_proj.weight"
|
| 83 |
+
# 2D param: if expert-matched → ndim-1=1 → False. Must stay True.
|
| 84 |
+
assert default_is_muon(name,
|
| 85 |
+
self.FakeParam(2),
|
| 86 |
+
expert_keys=["experts"])
|
torch-ext/optimizer/core.py
CHANGED
|
@@ -6,7 +6,6 @@ import torch.distributed as dist
|
|
| 6 |
from torch.distributed import ProcessGroup
|
| 7 |
from torch.distributed.tensor import DTensor
|
| 8 |
|
| 9 |
-
|
| 10 |
# torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
|
| 11 |
# parameter FQNs. Activation checkpointing similarly inserts
|
| 12 |
# "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
|
|
@@ -90,12 +89,12 @@ def adjust_lr_for_muon(lr, param_shape):
|
|
| 90 |
|
| 91 |
|
| 92 |
def default_is_muon(name, x, expert_keys=None):
|
| 93 |
-
|
| 94 |
skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
|
| 95 |
-
if any(key in
|
| 96 |
return False
|
| 97 |
effective_ndim = x.ndim
|
| 98 |
-
if expert_keys and any(key in
|
| 99 |
effective_ndim -= 1
|
| 100 |
return effective_ndim >= 2
|
| 101 |
|
|
|
|
| 6 |
from torch.distributed import ProcessGroup
|
| 7 |
from torch.distributed.tensor import DTensor
|
| 8 |
|
|
|
|
| 9 |
# torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
|
| 10 |
# parameter FQNs. Activation checkpointing similarly inserts
|
| 11 |
# "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
def default_is_muon(name, x, expert_keys=None):
    """Return True when parameter ``x`` (with FQN ``name``) should use Muon.

    Embedding/output parameters are never muon-eligible. Stacked expert
    weights (matched via ``expert_keys``) carry a leading expert dimension,
    so one dim is discounted before the >= 2-D eligibility check. Matching
    is done on exact dot-separated FQN components (after wrapper-prefix
    normalization), so "experts" does not match "shared_experts".
    """
    components = normalize_fqn(name).split(".")
    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
    for key in skip_keys:
        if key in components:
            return False
    ndim = x.ndim
    if expert_keys:
        if any(key in components for key in expert_keys):
            # Leading dim indexes experts, not matrix rows.
            ndim -= 1
    return ndim >= 2
|
| 100 |
|
torch-ext/optimizer/muon.py
CHANGED
|
@@ -46,8 +46,8 @@ def _expand_expert_params(names, params, expert_keys):
|
|
| 46 |
expanded_params = []
|
| 47 |
|
| 48 |
for n, p in zip(names, params):
|
| 49 |
-
is_expert = expert_keys and any(
|
| 50 |
-
|
| 51 |
is_dtensor = isinstance(p.data, DTensor)
|
| 52 |
|
| 53 |
if not is_expert:
|
|
|
|
| 46 |
expanded_params = []
|
| 47 |
|
| 48 |
for n, p in zip(names, params):
|
| 49 |
+
is_expert = expert_keys and any(key in normalize_fqn(n).split(".")
|
| 50 |
+
for key in expert_keys)
|
| 51 |
is_dtensor = isinstance(p.data, DTensor)
|
| 52 |
|
| 53 |
if not is_expert:
|