| | """Unit tests for FQN normalization (no GPU / distributed required).""" |
| |
|
| | from optimizer.core import default_is_muon, is_expert_param, normalize_fqn |
| | from optimizer.qk_clip import parse_qk_layer |
| |
|
| |
|
class TestNormalizeFqn:
    """normalize_fqn strips compile/checkpoint wrapper components from FQNs."""

    def test_passthrough(self):
        # A name with no wrapper components comes back unchanged.
        fqn = "model.layers.3.attn.q_proj.weight"
        assert normalize_fqn(fqn) == fqn

    def test_strip_orig_mod(self):
        # torch.compile inserts `_orig_mod`; it must be removed.
        wrapped = "model._orig_mod.layers.3.attn.q_proj.weight"
        assert normalize_fqn(wrapped) == "model.layers.3.attn.q_proj.weight"

    def test_strip_checkpoint_wrapped(self):
        # Activation checkpointing inserts `_checkpoint_wrapped_module`.
        wrapped = "model.layers.0._checkpoint_wrapped_module.moe.experts.w1.weight"
        assert normalize_fqn(wrapped) == "model.layers.0.moe.experts.w1.weight"

    def test_strip_both(self):
        # Both wrapper kinds may appear in a single name.
        wrapped = (
            "model._orig_mod.layers.0._checkpoint_wrapped_module"
            ".attn.q_proj.weight"
        )
        assert normalize_fqn(wrapped) == "model.layers.0.attn.q_proj.weight"

    def test_strip_nested_orig_mod(self):
        # Repeated wrapping (e.g. compile-of-compile) yields nested markers;
        # every occurrence must be stripped, not just the first.
        wrapped = "_orig_mod._orig_mod.layers.0.mlp.gate_proj.weight"
        assert normalize_fqn(wrapped) == "layers.0.mlp.gate_proj.weight"
| |
|
| |
|
class TestParseQkLayerWithWrappers:
    """parse_qk_layer extracts (proj_kind, layer_idx) even through wrappers."""

    def test_plain_name(self):
        result = parse_qk_layer("model.layers.3.attn.q_proj.weight")
        assert result == ("q_proj", 3)

    def test_orig_mod(self):
        # torch.compile's `_orig_mod` component must not break parsing.
        result = parse_qk_layer("model._orig_mod.layers.3.attn.wq.weight")
        assert result == ("wq", 3)

    def test_checkpoint_wrapped(self):
        # Activation-checkpoint wrapper component must not break parsing.
        fqn = "model.layers.5._checkpoint_wrapped_module.self_attn.k_proj.weight"
        assert parse_qk_layer(fqn) == ("k_proj", 5)

    def test_both_wrappers(self):
        # Both wrapper kinds present at once.
        fqn = "_orig_mod.model._checkpoint_wrapped_module.layers.7.attn.wk.weight"
        assert parse_qk_layer(fqn) == ("wk", 7)

    def test_non_qk_still_none(self):
        # Non-Q/K projections report the (None, -1) sentinel regardless
        # of wrapper components.
        fqn = "model._orig_mod.layers.2.attn.v_proj.weight"
        assert parse_qk_layer(fqn) == (None, -1)
| |
|
| |
|
class TestExpertKeyMatching:
    """Verify expert_keys uses component-level matching, not substring.

    A substring match on "experts" would wrongly classify
    "shared_experts" parameters as (routed) expert parameters; these
    tests pin down the component-wise behavior.
    """

    class FakeParam:
        # Minimal stand-in for a tensor/Parameter: only `ndim` is read
        # by default_is_muon.

        def __init__(self, ndim):
            self.ndim = ndim

    def test_experts_matches(self):
        # A 3-D stacked routed-expert weight is routed to Muon.
        name = "model.layers.0.moe.experts.w1.weight"
        assert default_is_muon(name,
                               self.FakeParam(3),
                               expert_keys=["experts"])

    def test_shared_experts_does_not_match(self):
        # "shared_experts" must NOT match the expert key "experts".
        # The param is still Muon-eligible, but via the regular 2-D
        # matrix path, not the expert path: under buggy substring
        # matching it would be classed as an expert and this 2-D param
        # would be rejected, so the assert would fail.
        name = "model.layers.0.moe.shared_experts.w1.weight"
        assert default_is_muon(name,
                               self.FakeParam(2),
                               expert_keys=["experts"])

    def test_shared_experts_3d_not_treated_as_expert(self):
        # BUGFIX: this test previously used FakeParam(2), duplicating
        # the test above and never exercising the 3-D case its name
        # promises. A 3-D parameter under "shared_experts" is NOT a
        # routed expert (component match fails), so it must not take
        # the expert path — substring matching would wrongly accept it.
        name = "model.layers.0.moe.shared_experts.gate_proj.weight"
        assert not default_is_muon(name,
                                   self.FakeParam(3),
                                   expert_keys=["experts"])

    def test_multi_component_key_matches(self):
        # A dotted key matches a contiguous run of name components.
        name = "model.layers.0.moe.experts.w1.weight"
        assert is_expert_param(name, expert_keys=["experts.w1"])

    def test_multi_component_key_no_false_positive(self):
        # A dotted key must match all of its components, not a prefix.
        name = "model.layers.0.moe.experts.w1.weight"
        assert not is_expert_param(name, expert_keys=["experts.w2"])

    def test_multi_component_key_shared_experts(self):
        # Dotted keys are also component-anchored: "experts.w1" must not
        # match inside "shared_experts.w1".
        name = "model.layers.0.moe.shared_experts.w1.weight"
        assert not is_expert_param(name, expert_keys=["experts.w1"])
| |
|