Charlie81 committed on
Commit
b63994d
·
1 Parent(s): 65e7011

handle base and product architecture differences

Browse files
Files changed (1) hide show
  1. scripts/train.py +93 -19
scripts/train.py CHANGED
@@ -1,13 +1,14 @@
1
  # scripts/train_small_experts.py
2
  import torch
3
- from transformers import TrainingArguments, Trainer
4
  from datasets import load_dataset
5
- from myolmoe.modeling_myolmoe import MyOlmoeForCausalLM, OlmoeConfig
6
  from torch.utils.data import Dataset
 
7
 
8
  class CustomDataset(Dataset):
9
  def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
10
- self.dataset = load_dataset(dataset_name)
11
  self.tokenizer = tokenizer
12
  self.max_length = max_length
13
 
@@ -30,24 +31,74 @@ class CustomDataset(Dataset):
30
  "labels": encoding["input_ids"].squeeze()
31
  }
32
 
33
- def main():
34
- # Load base model
35
- model_path = "myolmoe"
36
- base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
37
-
38
  # Create new config with small experts
39
  config = base_model.config
40
  config.num_small_experts = 64 # Add 64 small experts
41
- config.small_expert_intermediate_size = 512 # Half the size of regular experts
 
 
 
 
 
 
 
42
 
43
- # Initialize new model with same weights but expanded architecture
44
- model = MyOlmoeForCausalLM(config)
 
 
45
 
46
- # Copy existing weights
47
- model.load_state_dict(base_model.state_dict(), strict=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- # Initialize small experts (they'll start with random weights)
50
- # You might want to initialize them differently, perhaps with smaller variance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  # Prepare dataset
53
  tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -67,19 +118,42 @@ def main():
67
  eval_steps=500,
68
  fp16=True,
69
  gradient_checkpointing=True,
70
- report_to="tensorboard"
 
 
 
 
71
  )
72
 
73
- # Trainer
74
- trainer = Trainer(
 
 
 
 
 
 
 
 
 
 
 
 
75
  model=model,
76
  args=training_args,
77
  train_dataset=dataset,
78
- eval_dataset=dataset, # In practice, use a separate validation set
 
79
  )
80
 
81
  # Train
82
  trainer.train()
 
 
 
 
 
 
83
 
84
  if __name__ == "__main__":
85
  main()
 
1
  # scripts/train_small_experts.py
2
  import torch
3
+ from transformers import TrainingArguments, Trainer, AutoTokenizer
4
  from datasets import load_dataset
5
+ from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
6
  from torch.utils.data import Dataset
7
+ import os
8
 
9
  class CustomDataset(Dataset):
10
  def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
11
+ self.dataset = load_dataset(dataset_name, split="train") # Use train split
12
  self.tokenizer = tokenizer
13
  self.max_length = max_length
14
 
 
31
  "labels": encoding["input_ids"].squeeze()
32
  }
33
 
34
def expand_model_with_small_experts(base_model):
    """Build an expanded MyOlmoeForCausalLM that adds 64 small experts.

    The base model's config is extended with ``num_small_experts`` and
    ``small_expert_intermediate_size`` (half of ``intermediate_size``), a
    fresh model is constructed from that config, and the base model's
    weights are transplanted: non-expert parameters and the original
    (full-size) experts are copied exactly, the router gate columns for
    the original experts are preserved, and the new small-expert gate
    columns are randomly initialized. The small experts themselves keep
    the random initialization from model construction.

    Args:
        base_model: a MyOlmoeForCausalLM with only the original experts.

    Returns:
        A new MyOlmoeForCausalLM with the expanded architecture and the
        base model's weights loaded.
    """
    # Create new config with small experts
    config = base_model.config
    config.num_small_experts = 64  # Add 64 small experts
    config.small_expert_intermediate_size = config.intermediate_size // 2  # Half size

    # Create new model with expanded architecture
    expanded_model = MyOlmoeForCausalLM(config)

    base_state_dict = base_model.state_dict()
    expanded_state_dict = expanded_model.state_dict()

    # 1. Copy all non-expert parameters exactly (embeddings, norms,
    # attention, lm_head, ...). Expert and gate tensors are handled below.
    for name, param in base_state_dict.items():
        if "experts" not in name:  # Skip expert-specific parameters
            expanded_state_dict[name].copy_(param)

    # 2. Copy the original experts' weights in full.
    # BUG FIX: the previous version used one index for both the layer and
    # the expert (model.layers.{i}.mlp.experts.{i}), touching only the
    # diagonal pairs, and truncated regular-expert weights to the small
    # size, which would shape-mismatch in copy_(). Regular experts keep
    # their full intermediate size in the expanded model, so copy verbatim.
    for layer in range(config.num_hidden_layers):
        for expert in range(config.num_experts):
            for proj in ("gate_proj", "up_proj", "down_proj"):
                key = f"model.layers.{layer}.mlp.experts.{expert}.{proj}.weight"
                expanded_state_dict[key].copy_(base_state_dict[key])

    # 3. Expand the router gate for all experts (original + small).
    # NOTE(review): this slices the gate weight along dim 1 as if its shape
    # were (hidden_size, num_experts); an nn.Linear weight would be
    # (num_experts, hidden_size). Preserved as-is — confirm against the
    # MyOlmoe gate implementation.
    for layer in range(config.num_hidden_layers):
        original_gate = base_state_dict[f"model.layers.{layer}.mlp.gate.weight"]
        new_gate = expanded_state_dict[f"model.layers.{layer}.mlp.gate.weight"]

        # Keep the learned routing weights for the original experts.
        new_gate[:, :config.num_experts].copy_(original_gate)

        # Randomly initialize routing weights for the new small experts.
        torch.nn.init.normal_(
            new_gate[:, config.num_experts:],
            mean=0.0,
            std=config.initializer_range,
        )

    # Load the combined state dict into the new model
    expanded_model.load_state_dict(expanded_state_dict)

    return expanded_model
88
+
89
+ def main():
90
+ # Load base model (with only 64 experts)
91
+ model_path = "myolmoe"
92
+ base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
93
+
94
+ # Verify base model has only 64 experts
95
+ print(f"Base model has {base_model.config.num_experts} experts")
96
+
97
+ # Expand model to include small experts
98
+ model = expand_model_with_small_experts(base_model)
99
+
100
+ # Verify expanded model
101
+ print(f"Expanded model has {model.config.num_experts} regular experts and {model.config.num_small_experts} small experts")
102
 
103
  # Prepare dataset
104
  tokenizer = AutoTokenizer.from_pretrained(model_path)
 
118
  eval_steps=500,
119
  fp16=True,
120
  gradient_checkpointing=True,
121
+ report_to="tensorboard",
122
+ # Important: Only train the new parameters initially
123
+ # Freeze original experts first, then unfreeze later
124
+ # You may want to modify this based on your training strategy
125
+ freeze_existing_experts=True
126
  )
127
 
128
+ # Custom trainer to handle expert freezing
129
+ class MoETrainer(Trainer):
130
+ def __init__(self, *args, **kwargs):
131
+ self.freeze_existing = kwargs.pop('freeze_existing_experts', False)
132
+ super().__init__(*args, **kwargs)
133
+
134
+ if self.freeze_existing:
135
+ # Freeze all original expert parameters
136
+ for name, param in self.model.named_parameters():
137
+ if "experts" in name and "small_experts" not in name:
138
+ param.requires_grad = False
139
+ print("Frozen original experts, only training small experts")
140
+
141
+ trainer = MoETrainer(
142
  model=model,
143
  args=training_args,
144
  train_dataset=dataset,
145
+ eval_dataset=dataset,
146
+ freeze_existing_experts=training_args.freeze_existing_experts
147
  )
148
 
149
  # Train
150
  trainer.train()
151
+
152
+ # Save final model
153
+ output_dir = "./final_model"
154
+ os.makedirs(output_dir, exist_ok=True)
155
+ model.save_pretrained(output_dir)
156
+ tokenizer.save_pretrained(output_dir)
157
 
158
  if __name__ == "__main__":
159
  main()