Delta-Vector committed on
Commit
e9ce4f0
·
verified ·
1 Parent(s): cd6b583

grow40_winning: switch student to bf16 so the 40-layer student and its Adam state fit in B200 memory

Browse files
Files changed (1) hide show
  1. configs/grow40_winning.toml +6 -5
configs/grow40_winning.toml CHANGED
@@ -1,6 +1,7 @@
1
  # Grow student to 40 layers AND apply the winning hparams from zero4_long.
2
- # New layers (32-39) are appended at the end with output projections zeroed
3
- # (identity at init, gradients still flow). No layer zeroing.
 
4
 
5
  [model]
6
  teacher = "Qwen/Qwen3.5-35B-A3B"
@@ -26,14 +27,14 @@ grad_clip = 1.0
26
  betas = [0.9, 0.999]
27
  eps = 1.0e-3
28
  samples_per_step = 4
29
- micro_batch_size = 1
30
  max_steps = 2000
31
  grad_checkpointing = true
32
  attn_implementation = "flash_attention_2"
33
- student_dtype = "float32"
34
  teacher_dtype = "bfloat16"
35
  mixed_precision = "bf16"
36
- kl_chunk_size = 256
37
 
38
  [eval]
39
  every_steps = 50
 
1
  # Grow student to 40 layers AND apply the winning hparams from zero4_long.
2
+ # Note: the student is bf16 (not fp32 as in the original winning run) because
3
+ # fp32 master weights for 40 layers, plus Adam state and the bf16 teacher, run
4
+ # out of memory on a B200 without sharding. All else matches the winning recipe.
5
 
6
  [model]
7
  teacher = "Qwen/Qwen3.5-35B-A3B"
 
27
  betas = [0.9, 0.999]
28
  eps = 1.0e-3
29
  samples_per_step = 4
30
+ micro_batch_size = 4
31
  max_steps = 2000
32
  grad_checkpointing = true
33
  attn_implementation = "flash_attention_2"
34
+ student_dtype = "bfloat16"
35
  teacher_dtype = "bfloat16"
36
  mixed_precision = "bf16"
37
+ kl_chunk_size = 0
38
 
39
  [eval]
40
  every_steps = 50