Add better small-expert debug output and delete checkpoints

Browse files

Files changed (12) hide show

checkpoints/checkpoint-20/config.json +0 -38
checkpoints/checkpoint-20/generation_config.json +0 -6
checkpoints/checkpoint-20/model-00001-of-00003.safetensors +0 -3
checkpoints/checkpoint-20/model-00002-of-00003.safetensors +0 -3
checkpoints/checkpoint-20/model-00003-of-00003.safetensors +0 -3
checkpoints/checkpoint-20/model.safetensors.index.json +0 -0
checkpoints/checkpoint-20/optimizer.pt +0 -3
checkpoints/checkpoint-20/rng_state.pth +0 -3
checkpoints/checkpoint-20/scheduler.pt +0 -3
checkpoints/checkpoint-20/trainer_state.json +0 -48
checkpoints/checkpoint-20/training_args.bin +0 -3
scripts/train.py +40 -2

checkpoints/checkpoint-20/config.json DELETED Viewed

@@ -1,38 +0,0 @@
-{
-  "architectures": [
-    "MyOlmoeForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "clip_qkv": null,
-  "eos_token_id": 50279,
-  "hidden_act": "silu",
-  "hidden_size": 2048,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "max_position_embeddings": 4096,
-  "max_small_expert_count": 64,
-  "model_type": "olmoe",
-  "norm_topk_prob": false,
-  "num_attention_heads": 16,
-  "num_experts": 64,
-  "num_experts_per_tok": 2,
-  "num_hidden_layers": 16,
-  "num_key_value_heads": 16,
-  "num_small_experts": 64,
-  "output_router_logits": false,
-  "pad_token_id": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "router_aux_loss_coef": 0.01,
-  "small_expert_count": 64,
-  "small_expert_intermediate_ratio": 64,
-  "small_expert_intermediate_size": 0,
-  "small_expert_sparsity_coef": 0.1,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.55.4",
-  "use_cache": true,
-  "vocab_size": 50304
-}

checkpoints/checkpoint-20/generation_config.json DELETED Viewed

@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "eos_token_id": 50279,
-  "pad_token_id": 1,
-  "transformers_version": "4.55.4"
-}

checkpoints/checkpoint-20/model-00001-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ec65de6b7f2b08b49208c8bda60870ae14a8b7d9dfd8bfa480c07ac1e41ab5fd
-size 4998915096

checkpoints/checkpoint-20/model-00002-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a0721ebd6dbf82e24daf03a07c53b758404f8a06b4c725d05689dcef590a0e2e
-size 4998953920

checkpoints/checkpoint-20/model-00003-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e4443578f094776023ea4802948984fec808bce132bacb3a947ec4d034aef8c0
-size 4046769808

checkpoints/checkpoint-20/model.safetensors.index.json DELETED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/checkpoint-20/optimizer.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4542de7483192cb194833c57436064d36a1ef0be956887e0cb5593c00cf1e36a
-size 384761474

checkpoints/checkpoint-20/rng_state.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
-size 14645

checkpoints/checkpoint-20/scheduler.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:795c503600292c8c704d0cbf6e623255ed54848f02f567e3d52d4c939d5df78e
-size 1465

checkpoints/checkpoint-20/trainer_state.json DELETED Viewed

@@ -1,48 +0,0 @@
-{
-  "best_global_step": null,
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.0009811316126737675,
-  "eval_steps": 500,
-  "global_step": 20,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0004905658063368838,
-      "grad_norm": 90.5,
-      "learning_rate": 1.471550032701112e-07,
-      "loss": 104.1216,
-      "step": 10
-    },
-    {
-      "epoch": 0.0009811316126737675,
-      "grad_norm": 65.5,
-      "learning_rate": 3.106605624591236e-07,
-      "loss": 104.3206,
-      "step": 20
-    }
-  ],
-  "logging_steps": 10,
-  "max_steps": 61155,
-  "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
-  "save_steps": 20,
-  "stateful_callbacks": {
-    "TrainerControl": {
-      "args": {
-        "should_epoch_stop": false,
-        "should_evaluate": false,
-        "should_log": false,
-        "should_save": true,
-        "should_training_stop": false
-      },
-      "attributes": {}
-    }
-  },
-  "total_flos": 5.441244138307584e+16,
-  "train_batch_size": 2,
-  "trial_name": null,
-  "trial_params": null
-}

checkpoints/checkpoint-20/training_args.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
-size 5713

scripts/train.py CHANGED Viewed

@@ -102,8 +102,21 @@ def main():
         ):
             param.requires_grad = True
             trainable_params.append(name)
     print(f"Total trainable parameters: {len(trainable_params)}")
     # Verify gradient requirements
@@ -155,6 +168,28 @@ def main():
                 print("Checkpoint pushed successfully.")
             except subprocess.CalledProcessError as e:
                 print(f"Git push failed: {e}")
     # Initialize trainer
     trainer = CustomTrainer(
@@ -162,7 +197,10 @@ def main():
         args=training_args,
         train_dataset=tokenized_dataset,
         data_collator=data_collator,
-        callbacks=[GitPushCallback()]
     )
     # Test forward/backward pass before training

         ):
             param.requires_grad = True
             trainable_params.append(name)
+    ### ADDED: Check if small experts were found
+    if trainable_params:
+        print(f"[INFO] Found {len(trainable_params)} small_expert/small_gate parameters.")
+    else:
+        print("[WARNING] No small_expert or small_gate parameters found in model!")
+    # Verify gradient requirements
+    unfrozen = [name for name, param in model.named_parameters() if param.requires_grad]
+    if unfrozen:
+        print(f"[INFO] {len(unfrozen)} parameters are unfrozen and trainable.")
+        for name in unfrozen:
+            print(f"   - {name}")
+    else:
+        print("[ERROR] No parameters were unfrozen! Training will not update anything.")
     print(f"Total trainable parameters: {len(trainable_params)}")
     # Verify gradient requirements
                 print("Checkpoint pushed successfully.")
             except subprocess.CalledProcessError as e:
                 print(f"Git push failed: {e}")
+    class SmallExpertSaveCallback(TrainerCallback):
+        """Save only the selected small-expert/small-gate weights per checkpoint.
+
+        On every save event, the parameters of ``model`` whose names appear in
+        ``trainable_params`` are written to ``small_experts_and_gates.bin``
+        inside ``<args.output_dir>/checkpoint-<state.global_step>``.
+
+        Args:
+            model: Object exposing ``named_parameters()``; its parameters are
+                filtered by name and saved.
+            trainable_params: Iterable of parameter names to include.
+        """
+        def __init__(self, model, trainable_params):
+            self.model = model
+            self.trainable_params = trainable_params
+        def on_save(self, args, state, control, **kwargs):
+            """Write the selected parameters into the current checkpoint dir."""
+            # Callbacks are invoked on every process; only the process that is
+            # allowed to write checkpoints should touch the file, otherwise
+            # distributed runs race on the same path. Falls back to saving
+            # when ``args`` does not expose ``should_save``.
+            if not getattr(args, "should_save", True):
+                return
+            # Define save path inside the checkpoint dir
+            checkpoint_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
+            small_expert_path = os.path.join(checkpoint_dir, "small_experts_and_gates.bin")
+            # Build the lookup set at save time: O(1) membership per parameter
+            # while still honouring any later change to the name list.
+            wanted_names = set(self.trainable_params)
+            # Store detached CPU tensors so the file loads on any device and
+            # does not carry autograd state.
+            small_expert_state_dict = {
+                name: param.detach().cpu()
+                for name, param in self.model.named_parameters()
+                if name in wanted_names
+            }
+            if small_expert_state_dict:
+                os.makedirs(checkpoint_dir, exist_ok=True)
+                torch.save(small_expert_state_dict, small_expert_path)
+                print(f"[INFO] Saved {len(small_expert_state_dict)} small_expert/small_gate parameters "
+                    f"to {small_expert_path}")
+            else:
+                print("[ERROR] No small_expert or small_gate parameters found to save!")
     # Initialize trainer
     trainer = CustomTrainer(
         args=training_args,
         train_dataset=tokenized_dataset,
         data_collator=data_collator,
+        callbacks=[
+            GitPushCallback(),
+            SmallExpertSaveCallback(model, trainable_params)
+        ]
     )
     # Test forward/backward pass before training