Charlie81 committed on
Commit
01ec808
·
1 Parent(s): 9ac5c9f

Add better small-expert debug output and delete checkpoints

Browse files
checkpoints/checkpoint-20/config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "architectures": [
3
- "MyOlmoeForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "clip_qkv": null,
8
- "eos_token_id": 50279,
9
- "hidden_act": "silu",
10
- "hidden_size": 2048,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
- "max_position_embeddings": 4096,
14
- "max_small_expert_count": 64,
15
- "model_type": "olmoe",
16
- "norm_topk_prob": false,
17
- "num_attention_heads": 16,
18
- "num_experts": 64,
19
- "num_experts_per_tok": 2,
20
- "num_hidden_layers": 16,
21
- "num_key_value_heads": 16,
22
- "num_small_experts": 64,
23
- "output_router_logits": false,
24
- "pad_token_id": 1,
25
- "rms_norm_eps": 1e-05,
26
- "rope_scaling": null,
27
- "rope_theta": 10000.0,
28
- "router_aux_loss_coef": 0.01,
29
- "small_expert_count": 64,
30
- "small_expert_intermediate_ratio": 64,
31
- "small_expert_intermediate_size": 0,
32
- "small_expert_sparsity_coef": 0.1,
33
- "tie_word_embeddings": false,
34
- "torch_dtype": "bfloat16",
35
- "transformers_version": "4.55.4",
36
- "use_cache": true,
37
- "vocab_size": 50304
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-20/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "eos_token_id": 50279,
4
- "pad_token_id": 1,
5
- "transformers_version": "4.55.4"
6
- }
 
 
 
 
 
 
 
checkpoints/checkpoint-20/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec65de6b7f2b08b49208c8bda60870ae14a8b7d9dfd8bfa480c07ac1e41ab5fd
3
- size 4998915096
 
 
 
 
checkpoints/checkpoint-20/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0721ebd6dbf82e24daf03a07c53b758404f8a06b4c725d05689dcef590a0e2e
3
- size 4998953920
 
 
 
 
checkpoints/checkpoint-20/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4443578f094776023ea4802948984fec808bce132bacb3a947ec4d034aef8c0
3
- size 4046769808
 
 
 
 
checkpoints/checkpoint-20/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-20/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4542de7483192cb194833c57436064d36a1ef0be956887e0cb5593c00cf1e36a
3
- size 384761474
 
 
 
 
checkpoints/checkpoint-20/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
- size 14645
 
 
 
 
checkpoints/checkpoint-20/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:795c503600292c8c704d0cbf6e623255ed54848f02f567e3d52d4c939d5df78e
3
- size 1465
 
 
 
 
checkpoints/checkpoint-20/trainer_state.json DELETED
@@ -1,48 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.0009811316126737675,
6
- "eval_steps": 500,
7
- "global_step": 20,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0004905658063368838,
14
- "grad_norm": 90.5,
15
- "learning_rate": 1.471550032701112e-07,
16
- "loss": 104.1216,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0009811316126737675,
21
- "grad_norm": 65.5,
22
- "learning_rate": 3.106605624591236e-07,
23
- "loss": 104.3206,
24
- "step": 20
25
- }
26
- ],
27
- "logging_steps": 10,
28
- "max_steps": 61155,
29
- "num_input_tokens_seen": 0,
30
- "num_train_epochs": 3,
31
- "save_steps": 20,
32
- "stateful_callbacks": {
33
- "TrainerControl": {
34
- "args": {
35
- "should_epoch_stop": false,
36
- "should_evaluate": false,
37
- "should_log": false,
38
- "should_save": true,
39
- "should_training_stop": false
40
- },
41
- "attributes": {}
42
- }
43
- },
44
- "total_flos": 5.441244138307584e+16,
45
- "train_batch_size": 2,
46
- "trial_name": null,
47
- "trial_params": null
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-20/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
3
- size 5713
 
 
 
 
scripts/train.py CHANGED
@@ -102,8 +102,21 @@ def main():
102
  ):
103
  param.requires_grad = True
104
  trainable_params.append(name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
-
107
  print(f"Total trainable parameters: {len(trainable_params)}")
108
 
109
  # Verify gradient requirements
@@ -155,6 +168,28 @@ def main():
155
  print("Checkpoint pushed successfully.")
156
  except subprocess.CalledProcessError as e:
157
  print(f"Git push failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  # Initialize trainer
160
  trainer = CustomTrainer(
@@ -162,7 +197,10 @@ def main():
162
  args=training_args,
163
  train_dataset=tokenized_dataset,
164
  data_collator=data_collator,
165
- callbacks=[GitPushCallback()]
 
 
 
166
  )
167
 
168
  # Test forward/backward pass before training
 
102
  ):
103
  param.requires_grad = True
104
  trainable_params.append(name)
105
+ ### ADDED: Check if small experts were found
106
+ if trainable_params:
107
+ print(f"[INFO] Found {len(trainable_params)} small_expert/small_gate parameters.")
108
+ else:
109
+ print("[WARNING] No small_expert or small_gate parameters found in model!")
110
+
111
+ # Verify gradient requirements
112
+ unfrozen = [name for name, param in model.named_parameters() if param.requires_grad]
113
+ if unfrozen:
114
+ print(f"[INFO] {len(unfrozen)} parameters are unfrozen and trainable.")
115
+ for name in unfrozen:
116
+ print(f" - {name}")
117
+ else:
118
+ print("[ERROR] No parameters were unfrozen! Training will not update anything.")
119
 
 
120
  print(f"Total trainable parameters: {len(trainable_params)}")
121
 
122
  # Verify gradient requirements
 
168
  print("Checkpoint pushed successfully.")
169
  except subprocess.CalledProcessError as e:
170
  print(f"Git push failed: {e}")
171
class SmallExpertSaveCallback(TrainerCallback):
    """Trainer callback that writes only the small_expert/small_gate
    parameters next to each regular checkpoint.

    This lets the (much smaller) trained subset be restored without
    re-downloading the full sharded model weights.
    """

    def __init__(self, model, trainable_params):
        # model: the model being trained (queried via named_parameters()).
        # trainable_params: iterable of parameter names that were unfrozen.
        self.model = model
        # Store as a set: membership is tested once per model parameter,
        # and a list would make that O(len(trainable_params)) each time.
        self.trainable_params = set(trainable_params)

    def on_save(self, args, state, control, **kwargs):
        # The Trainer saves to <output_dir>/checkpoint-<global_step>;
        # mirror that layout so our file sits inside the same directory.
        checkpoint_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        small_expert_path = os.path.join(checkpoint_dir, "small_experts_and_gates.bin")

        # FIX: snapshot plain CPU tensors instead of the live Parameter
        # objects. Saving `param` directly serializes (possibly GPU-resident,
        # autograd-attached) Parameters that still alias training memory;
        # detach().cpu().clone() yields a stable, device-independent copy.
        small_expert_state_dict = {
            name: param.detach().cpu().clone()
            for name, param in self.model.named_parameters()
            if name in self.trainable_params
        }

        if small_expert_state_dict:
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(small_expert_state_dict, small_expert_path)
            print(f"[INFO] Saved {len(small_expert_state_dict)} small_expert/small_gate parameters "
                  f"to {small_expert_path}")
        else:
            print("[ERROR] No small_expert or small_gate parameters found to save!")
193
 
194
  # Initialize trainer
195
  trainer = CustomTrainer(
 
197
  args=training_args,
198
  train_dataset=tokenized_dataset,
199
  data_collator=data_collator,
200
+ callbacks=[
201
+ GitPushCallback(),
202
+ SmallExpertSaveCallback(model, trainable_params)
203
+ ]
204
  )
205
 
206
  # Test forward/backward pass before training