grow40_winning: switch student to bf16 so the 40-layer model and its Adam state fit in B200 memory
Browse files
configs/grow40_winning.toml
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# Grow student to 40 layers AND apply the winning hparams from zero4_long.
|
| 2 |
-
#
|
| 3 |
-
#
|
|
|
|
| 4 |
|
| 5 |
[model]
|
| 6 |
teacher = "Qwen/Qwen3.5-35B-A3B"
|
|
@@ -26,14 +27,14 @@ grad_clip = 1.0
|
|
| 26 |
betas = [0.9, 0.999]
|
| 27 |
eps = 1.0e-3
|
| 28 |
samples_per_step = 4
|
| 29 |
-
micro_batch_size =
|
| 30 |
max_steps = 2000
|
| 31 |
grad_checkpointing = true
|
| 32 |
attn_implementation = "flash_attention_2"
|
| 33 |
-
student_dtype = "float32"
|
| 34 |
teacher_dtype = "bfloat16"
|
| 35 |
mixed_precision = "bf16"
|
| 36 |
-
kl_chunk_size =
|
| 37 |
|
| 38 |
[eval]
|
| 39 |
every_steps = 50
|
|
|
|
| 1 |
# Grow student to 40 layers AND apply the winning hparams from zero4_long.
|
| 2 |
+
# Note: student is bf16 (not fp32 as in the original winning run) because the
|
| 3 |
+
# combination of fp32 master weights, 40 layers, Adam state, and the bf16 teacher OOMs on a B200
|
| 4 |
+
# without sharding. Everything else matches the winning recipe.
|
| 5 |
|
| 6 |
[model]
|
| 7 |
teacher = "Qwen/Qwen3.5-35B-A3B"
|
|
|
|
| 27 |
betas = [0.9, 0.999]
|
| 28 |
eps = 1.0e-3
|
| 29 |
samples_per_step = 4
|
| 30 |
+
micro_batch_size = 4
|
| 31 |
max_steps = 2000
|
| 32 |
grad_checkpointing = true
|
| 33 |
attn_implementation = "flash_attention_2"
|
| 34 |
+
student_dtype = "bfloat16"
|
| 35 |
teacher_dtype = "bfloat16"
|
| 36 |
mixed_precision = "bf16"
|
| 37 |
+
kl_chunk_size = 0
|
| 38 |
|
| 39 |
[eval]
|
| 40 |
every_steps = 50
|