Charlie81
/

LoRE

TensorBoard

Safetensors

Model card Files Files and versions

xet

Metrics Training metrics Community

Charlie81 committed on Jul 5, 2025

Commit

a82f934

1 Parent(s): 438a56a

attempt fix and more prints

Browse files

Files changed (1) hide show

scripts/train.py +22 -7

scripts/train.py CHANGED Viewed

@@ -41,7 +41,8 @@ def expand_model_with_small_experts(base_model):
     print("# DEBUG: Expanding model with small experts...")
     config = base_model.config
     config.num_small_experts = 64  # Add 64 small experts
-    config.small_expert_intermediate_size = config.intermediate_size // 16  # Half size
     expanded_model = MyOlmoeForCausalLM(config)
     base_state_dict = base_model.state_dict()
@@ -61,11 +62,17 @@ def expand_model_with_small_experts(base_model):
             key = f'model.layers.{i}.mlp.experts.{i}.{proj}.weight'
             if key in base_state_dict:
                 orig_weight = base_state_dict[key]
                 if proj == 'down_proj':
-                    expanded_state_dict[key].copy_(orig_weight[:, :config.small_expert_intermediate_size])
                 else:
-                    expanded_state_dict[key].copy_(orig_weight[:config.small_expert_intermediate_size])
-                print(f"# DEBUG: Copied {proj} weights for expert {i}")
             else:
                 print(f"# DEBUG: Missing {key} in base model")
@@ -75,9 +82,18 @@ def expand_model_with_small_experts(base_model):
         if gate_key in base_state_dict:
             original_gate = base_state_dict[gate_key]
             new_gate = expanded_state_dict[gate_key]
             new_gate[:, :config.num_experts].copy_(original_gate)
-            torch.nn.init.normal_(new_gate[:, config.num_experts:], mean=0.0, std=config.initializer_range * 0.1)
-            print(f"# DEBUG: Initialized gate for layer {i}")
         else:
             print(f"# DEBUG: Missing gate weight {gate_key}")
@@ -85,7 +101,6 @@ def expand_model_with_small_experts(base_model):
     expanded_model.load_state_dict(expanded_state_dict, strict=False)
     return expanded_model
 def main():
     model_path = "myolmoe"
     print("# DEBUG: Loading base model...")

     print("# DEBUG: Expanding model with small experts...")
     config = base_model.config
     config.num_small_experts = 64  # Add 64 small experts
+    # Changed from //16 to //2 for more reasonable size
+    config.small_expert_intermediate_size = config.intermediate_size // 2
     expanded_model = MyOlmoeForCausalLM(config)
     base_state_dict = base_model.state_dict()
             key = f'model.layers.{i}.mlp.experts.{i}.{proj}.weight'
             if key in base_state_dict:
                 orig_weight = base_state_dict[key]
+                target_weight = expanded_state_dict[key]
                 if proj == 'down_proj':
+                    # For down_proj, we copy the first part of the input dimension
+                    target_weight.copy_(orig_weight[:, :config.small_expert_intermediate_size])
                 else:
+                    # For gate_proj and up_proj, we copy the first part of the output dimension
+                    target_weight.copy_(orig_weight[:config.small_expert_intermediate_size, :])
+                print(f"# DEBUG: Copied {proj} weights for expert {i} "
+                      f"(original shape: {orig_weight.shape}, new shape: {target_weight.shape})")
             else:
                 print(f"# DEBUG: Missing {key} in base model")
         if gate_key in base_state_dict:
             original_gate = base_state_dict[gate_key]
             new_gate = expanded_state_dict[gate_key]
+            # Copy original gate weights
             new_gate[:, :config.num_experts].copy_(original_gate)
+            # Initialize small experts gate weights
+            torch.nn.init.normal_(
+                new_gate[:, config.num_experts:],
+                mean=0.0,
+                std=config.initializer_range * 0.1
+            )
+            print(f"# DEBUG: Initialized gate for layer {i} "
+                  f"(original shape: {original_gate.shape}, new shape: {new_gate.shape})")
         else:
             print(f"# DEBUG: Missing gate weight {gate_key}")
     expanded_model.load_state_dict(expanded_state_dict, strict=False)
     return expanded_model
 def main():
     model_path = "myolmoe"
     print("# DEBUG: Loading base model...")