fix scheduler bug: don't prepare scheduler with accelerate (was over-stepping cosine by a factor of num_processes); add grow40_winning_v2 config; set kl_chunk_size = 256 in grow40_winning

Files changed (4) hide show

configs/grow40_winning.toml CHANGED Viewed

@@ -34,7 +34,7 @@ attn_implementation  = "flash_attention_2"
 student_dtype        = "bfloat16"
 teacher_dtype        = "bfloat16"
 mixed_precision      = "bf16"
-kl_chunk_size        = 0
 [eval]
 every_steps = 50

 student_dtype        = "bfloat16"
 teacher_dtype        = "bfloat16"
 mixed_precision      = "bf16"
+kl_chunk_size        = 256
 [eval]
 every_steps = 50

configs/grow40_winning_v2.toml ADDED Viewed

+# grow40_winning v2: same hyperparameters as grow40_winning.toml, but the
+# scheduler bug is fixed in distill.py (we no longer prepare the scheduler with
+# accelerate, so cosine reaches its minimum at step max_steps instead of step
+# max_steps / num_processes).
+[model]
+teacher    = "Qwen/Qwen3.5-35B-A3B"
+student    = "Troiaaa/m-6a3lnzvb"
+tokenizer  = "Qwen/Qwen3.5-35B-A3B"
+[data]
+dataset        = "karpathy/climbmix-400b-shuffle"
+text_field     = "text"
+min_chars      = 2560
+max_seq_len    = 2048
+kl_start_pos   = 128
+seed           = 6767
+shuffle_buffer = 10000
+[train]
+seed                 = 6767
+lr                   = 5.0e-7
+schedule             = "cosine"
+warmup_steps         = 100
+weight_decay         = 0.0
+grad_clip            = 1.0
+betas                = [0.9, 0.999]
+eps                  = 1.0e-3
+samples_per_step     = 4
+micro_batch_size     = 4
+max_steps            = 2000
+grad_checkpointing   = true
+attn_implementation  = "flash_attention_2"
+student_dtype        = "bfloat16"
+teacher_dtype        = "bfloat16"
+mixed_precision      = "bf16"
+kl_chunk_size        = 256
+[eval]
+every_steps = 50
+samples     = 500
+seed        = 4242
+[log]
+wandb         = true
+wandb_project = "distil-subnet97"
+wandb_run     = "grow40_winning_v2"
+log_every     = 1
+output_dir    = "./out/grow40_winning_v2"
+[init]
+zero_layers        = []
+target_num_layers  = 40

distill.py CHANGED Viewed

@@ -541,9 +541,13 @@ def main():
     optimizer = make_optimizer(student, cfg["train"])
     scheduler = make_scheduler(optimizer, cfg["train"])
-    student, optimizer, scheduler = accelerator.prepare(
-        student, optimizer, scheduler
-    )
     # ---- Output dir + config snapshot
     output_dir = Path(cfg["log"]["output_dir"])

     optimizer = make_optimizer(student, cfg["train"])
     scheduler = make_scheduler(optimizer, cfg["train"])
+    # NB: do NOT pass `scheduler` to accelerator.prepare. A prepared scheduler is
+    # wrapped by accelerate so that every scheduler.step() call advances it by
+    # `num_processes` steps (unless split_batches=True), to match the
+    # "single-GPU equivalent" timeline. Our loop already does explicit max_steps
+    # accounting, so that wrapping makes the cosine run `num_processes` times too
+    # fast: it reaches its minimum at step max_steps / num_processes and ends up
+    # cycling multiple times mid-run. By leaving the scheduler unprepared,
+    # scheduler.step() advances exactly once per training step, matching how
+    # max_steps is interpreted in this script.
+    student, optimizer = accelerator.prepare(student, optimizer)
     # ---- Output dir + config snapshot
     output_dir = Path(cfg["log"]["output_dir"])

scripts/backup_to_hf.py CHANGED Viewed

@@ -21,6 +21,7 @@ INCLUDE = [
     "configs/replicate_zero4.toml",
     "configs/grow40_winning.toml",
     "configs/grow40_simple.toml",
     "configs/accelerate.yaml",
     "scripts/backup_to_hf.py",
     "scripts/run_sweep.sh",

     "configs/replicate_zero4.toml",
     "configs/grow40_winning.toml",
     "configs/grow40_simple.toml",
+    "configs/grow40_winning_v2.toml",
     "configs/accelerate.yaml",
     "scripts/backup_to_hf.py",
     "scripts/run_sweep.sh",