Support multi-component expert_keys (e.g. "experts.w1")

Single-component keys match any single FQN component as before.
Multi-component keys (containing dots) now match as a contiguous
run of whole FQN components, so "experts.w1" matches
"moe.experts.w1.weight" but not "moe.shared_experts.w1.weight".

[skip-build]

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2)

test/test_normalize_fqn.py +14 -1
torch-ext/optimizer/core.py +15 -1

test/test_normalize_fqn.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Unit tests for FQN normalization (no GPU / distributed required)."""
 import pytest
-from optimizer.core import default_is_muon, normalize_fqn
 from optimizer.qk_clip import parse_qk_layer
@@ -84,3 +84,16 @@ class TestExpertKeyMatching:
         assert default_is_muon(name,
                                self.FakeParam(2),
                                expert_keys=["experts"])

 """Unit tests for FQN normalization (no GPU / distributed required)."""
 import pytest
+from optimizer.core import default_is_muon, is_expert_param, normalize_fqn
 from optimizer.qk_clip import parse_qk_layer
         assert default_is_muon(name,
                                self.FakeParam(2),
                                expert_keys=["experts"])
+    def test_multi_component_key_matches(self):
+        name = "model.layers.0.moe.experts.w1.weight"
+        assert is_expert_param(name, expert_keys=["experts.w1"])
+    def test_multi_component_key_no_false_positive(self):
+        # "experts.w2" should not match "experts.w1"
+        name = "model.layers.0.moe.experts.w1.weight"
+        assert not is_expert_param(name, expert_keys=["experts.w2"])
+    def test_multi_component_key_shared_experts(self):
+        name = "model.layers.0.moe.shared_experts.w1.weight"
+        assert not is_expert_param(name, expert_keys=["experts.w1"])

torch-ext/optimizer/core.py CHANGED Viewed

@@ -91,12 +91,26 @@ def adjust_lr_for_muon(lr, param_shape):
     return adjusted_lr
 def is_expert_param(name, expert_keys):
     """Tell whether ``name`` contains one of ``expert_keys`` as a component.

     The name is passed through ``normalize_fqn`` and split on "."; a key
     matches when it equals one of the resulting components. Returns False
     without normalizing when ``expert_keys`` is empty or None.
     """
     if expert_keys:
         components = normalize_fqn(name).split(".")
         for candidate in expert_keys:
             if candidate in components:
                 return True
     return False
 def default_is_muon(name, x, expert_keys=None):

     return adjusted_lr
def _match_key(parts, key):
    """Return True if ``key`` occurs in ``parts`` as a run of whole components.

    ``key`` is split on "."; its components must appear in ``parts``
    adjacent to each other and in the same order. A single-component key
    (e.g. "experts") therefore matches any one component, and a
    multi-component key (e.g. "experts.w1") matches a contiguous run.

    Args:
        parts: Sequence of name components, e.g. ``"a.b.c".split(".")``.
            Lists and tuples are both accepted.
        key: Key to look for; dots separate its components.

    Returns:
        bool: True when the key's components are found, False otherwise
        (including when the key is longer than ``parts``).
    """
    key_parts = key.split(".")
    # An empty component ("", "experts.", ".w1", "a..b") marks a malformed
    # key; never let it match a stray empty component in the name.
    if not all(key_parts):
        return False
    # Slices are compared against a list below, and a tuple slice never
    # equals a list, so normalise the container type first.
    parts = list(parts)
    span = len(key_parts)
    # A negative range (key longer than the name) is empty -> no match.
    return any(parts[start:start + span] == key_parts
               for start in range(len(parts) - span + 1))
 def is_expert_param(name, expert_keys):
     """Tell whether ``name`` is matched by at least one of ``expert_keys``.

     The name is passed through ``normalize_fqn`` and split on "." into
     components; each key is then checked against those components with
     ``_match_key``. Returns False without normalizing when ``expert_keys``
     is empty or None.
     """
     if expert_keys:
         components = normalize_fqn(name).split(".")
         for candidate in expert_keys:
             if _match_key(components, candidate):
                 return True
     return False
 def default_is_muon(name, x, expert_keys=None):