Charlie81
/

LoRE

TensorBoard

Safetensors

Model card Files Files and versions

xet

Metrics Training metrics Community

Charlie81 committed on Jul 4, 2025

Commit

b63994d

1 Parent(s): 65e7011

handle base and product architecture differences

Browse files

Files changed (1) hide show

scripts/train.py +93 -19

scripts/train.py CHANGED Viewed

@@ -1,13 +1,14 @@
 # scripts/train.py
 import torch
-from transformers import TrainingArguments, Trainer
 from datasets import load_dataset
-from myolmoe.modeling_myolmoe import MyOlmoeForCausalLM, OlmoeConfig
 from torch.utils.data import Dataset
 class CustomDataset(Dataset):
     def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
-        self.dataset = load_dataset(dataset_name)
         self.tokenizer = tokenizer
         self.max_length = max_length
@@ -30,24 +31,74 @@ class CustomDataset(Dataset):
             "labels": encoding["input_ids"].squeeze()
         }
-def main():
-    # Load base model
-    model_path = "myolmoe"
-    base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
     # Create new config with small experts
     config = base_model.config
     config.num_small_experts = 64  # Add 64 small experts
-    config.small_expert_intermediate_size = 512  # Half the size of regular experts
-    # Initialize new model with same weights but expanded architecture
-    model = MyOlmoeForCausalLM(config)
-    # Copy existing weights
-    model.load_state_dict(base_model.state_dict(), strict=False)
-    # Initialize small experts (they'll start with random weights)
-    # You might want to initialize them differently, perhaps with smaller variance
     # Prepare dataset
     tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -67,19 +118,42 @@ def main():
         eval_steps=500,
         fp16=True,
         gradient_checkpointing=True,
-        report_to="tensorboard"
     )
-    # Trainer
-    trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset,
-        eval_dataset=dataset,  # In practice, use a separate validation set
     )
     # Train
     trainer.train()
 if __name__ == "__main__":
     main()

 # scripts/train.py
 import torch
+from transformers import TrainingArguments, Trainer, AutoTokenizer
 from datasets import load_dataset
+from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
 from torch.utils.data import Dataset
+import os
 class CustomDataset(Dataset):
     def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
+        self.dataset = load_dataset(dataset_name, split="train")  # Use train split
         self.tokenizer = tokenizer
         self.max_length = max_length
             "labels": encoding["input_ids"].squeeze()
         }
def _copy_router_gate(new_gate, original_gate, std):
    """Copy ``original_gate`` into ``new_gate`` and randomly fill the added slots.

    The two tensors must have the same rank and may differ along at most one
    axis (the expert axis).  That axis is detected from the shapes rather than
    hard-coded, so both ``(experts, hidden)`` and ``(hidden, experts)`` layouts
    are handled.

    Args:
        new_gate: Destination tensor, modified in place.
        original_gate: Router weights of the base model.
        std: Standard deviation for the normal initialisation of the slots
            that have no counterpart in ``original_gate``.

    Raises:
        ValueError: If the shapes differ in rank, differ along more than one
            axis, or the destination is smaller than the source.
    """
    if new_gate.shape == original_gate.shape:
        # The router was not widened; a plain copy is all that is needed.
        new_gate.copy_(original_gate)
        return

    if new_gate.dim() != original_gate.dim():
        raise ValueError(
            f"Gate rank mismatch: {tuple(new_gate.shape)} vs "
            f"{tuple(original_gate.shape)}"
        )

    grown_axes = [
        axis
        for axis in range(new_gate.dim())
        if new_gate.shape[axis] != original_gate.shape[axis]
    ]
    if len(grown_axes) != 1:
        raise ValueError(
            f"Cannot map gate of shape {tuple(original_gate.shape)} onto "
            f"{tuple(new_gate.shape)}: expected exactly one differing axis"
        )

    axis = grown_axes[0]
    kept = original_gate.shape[axis]
    added = new_gate.shape[axis] - kept
    if added < 0:
        raise ValueError(
            f"Expanded gate {tuple(new_gate.shape)} is smaller than the "
            f"original gate {tuple(original_gate.shape)}"
        )

    # narrow() returns views, so both operations write straight into new_gate.
    new_gate.narrow(axis, 0, kept).copy_(original_gate)
    torch.nn.init.normal_(new_gate.narrow(axis, kept, added), mean=0.0, std=std)


def expand_model_with_small_experts(base_model):
    """Build a copy of ``base_model`` that additionally has small experts.

    A copy of the base config is extended with ``num_small_experts = 64`` and
    ``small_expert_intermediate_size = intermediate_size // 2``, a fresh
    ``MyOlmoeForCausalLM`` is created from it, and every tensor of the base
    model is copied into the new model.  Router gate weights are copied into
    the leading slots of the (possibly wider) new gate; the remaining slots are
    drawn from ``N(0, config.initializer_range)``.  Tensors that exist only in
    the expanded model keep whatever initialisation the constructor gave them.

    NOTE(review): this assumes the regular experts keep their original shapes
    in the expanded model (``intermediate_size`` is not changed here) -- a
    mismatch raises ``ValueError`` instead of silently truncating weights.

    Args:
        base_model: Model whose weights and config seed the expanded model.
            Its config is left unmodified.

    Returns:
        The expanded model with the base weights loaded.

    Raises:
        KeyError: If a base tensor has no counterpart in the expanded model.
        ValueError: If a non-gate tensor changes shape, or a gate changes
            shape along more than one axis.
    """
    import copy

    # Work on a copy so the base model's config is not changed behind its back.
    config = copy.deepcopy(base_model.config)
    config.num_small_experts = 64  # Add 64 small experts
    config.small_expert_intermediate_size = config.intermediate_size // 2  # Half size

    expanded_model = MyOlmoeForCausalLM(config)

    base_state_dict = base_model.state_dict()
    expanded_state_dict = expanded_model.state_dict()

    gate_names = [
        f"model.layers.{layer}.mlp.gate.weight"
        for layer in range(config.num_hidden_layers)
    ]
    gate_name_set = set(gate_names)

    # 1. Copy everything except the router gates verbatim.  This covers both
    #    the shared weights and every original expert in every layer.
    for name, tensor in base_state_dict.items():
        if name in gate_name_set:
            continue
        if name not in expanded_state_dict:
            raise KeyError(f"Expanded model has no parameter named {name!r}")
        target = expanded_state_dict[name]
        if target.shape != tensor.shape:
            raise ValueError(
                f"Shape mismatch for {name!r}: base {tuple(tensor.shape)} vs "
                f"expanded {tuple(target.shape)}"
            )
        target.copy_(tensor)

    # 2. Router gates: keep the learned routing for the original experts and
    #    give the new slots a small random initialisation.
    for name in gate_names:
        _copy_router_gate(
            expanded_state_dict[name],
            base_state_dict[name],
            config.initializer_range,
        )

    # Load the combined state dict into the new model
    expanded_model.load_state_dict(expanded_state_dict)
    return expanded_model
+def main():
+    # Load base model (with only 64 experts)
+    model_path = "myolmoe"
+    base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
+    # Verify base model has only 64 experts
+    print(f"Base model has {base_model.config.num_experts} experts")
+    # Expand model to include small experts
+    model = expand_model_with_small_experts(base_model)
+    # Verify expanded model
+    print(f"Expanded model has {model.config.num_experts} regular experts and {model.config.num_small_experts} small experts")
     # Prepare dataset
     tokenizer = AutoTokenizer.from_pretrained(model_path)
         eval_steps=500,
         fp16=True,
         gradient_checkpointing=True,
+        report_to="tensorboard",
+        # Important: Only train the new parameters initially
+        # Freeze original experts first, then unfreeze later
+        # You may want to modify this based on your training strategy
+        freeze_existing_experts=True
     )
+    # Custom trainer to handle expert freezing
+    class MoETrainer(Trainer):
+        def __init__(self, *args, **kwargs):
+            self.freeze_existing = kwargs.pop('freeze_existing_experts', False)
+            super().__init__(*args, **kwargs)
+            if self.freeze_existing:
+                # Freeze all original expert parameters
+                for name, param in self.model.named_parameters():
+                    if "experts" in name and "small_experts" not in name:
+                        param.requires_grad = False
+                print("Frozen original experts, only training small experts")
+    trainer = MoETrainer(
         model=model,
         args=training_args,
         train_dataset=dataset,
+        eval_dataset=dataset,
+        freeze_existing_experts=training_args.freeze_existing_experts
     )
     # Train
     trainer.train()
+    # Save final model
+    output_dir = "./final_model"
+    os.makedirs(output_dir, exist_ok=True)
+    model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
 if __name__ == "__main__":
     main()