Delta-Vector
/

distill-m-6a3lnzvb-code

Delta-Vector committed on Apr 7

Commit

e9ce4f0

verified ·

1 Parent(s): cd6b583

grow40_winning: switch student to bf16 to fit in B200 memory + 40-layer Adam state

Files changed (1) hide show

configs/grow40_winning.toml CHANGED Viewed

@@ -1,6 +1,7 @@
 # Grow student to 40 layers AND apply the winning hparams from zero4_long.
-# New layers (32-39) are appended at the end with output projections zeroed
-# (identity at init, gradients still flow). No layer zeroing.
 [model]
 teacher    = "Qwen/Qwen3.5-35B-A3B"
@@ -26,14 +27,14 @@ grad_clip            = 1.0
 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
-micro_batch_size     = 1
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"
-student_dtype        = "float32"
 teacher_dtype        = "bfloat16"
 mixed_precision      = "bf16"
-kl_chunk_size        = 256
 [eval]
 every_steps = 50

 # Grow student to 40 layers AND apply the winning hparams from zero4_long.
+# Note: the student is bf16 (not fp32 as in the original winning run) because
+# fp32 master weights for 40 layers, plus Adam state and the bf16 teacher, run
+# out of memory (OOM) on a B200 without sharding. Everything else matches the
+# winning recipe.
 [model]
 teacher    = "Qwen/Qwen3.5-35B-A3B"
 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
+micro_batch_size     = 4
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"
+student_dtype        = "bfloat16"
 teacher_dtype        = "bfloat16"
 mixed_precision      = "bf16"
+kl_chunk_size        = 0
 [eval]
 every_steps = 50