Fix scheduler bug: don't prepare the scheduler with accelerate (it was over-stepping the cosine schedule by a factor of num_processes); add grow40_winning_v2 config
Browse files
- configs/grow40_winning.toml +1 -1
- configs/grow40_winning_v2.toml +52 -0
- distill.py +7 -3
- scripts/backup_to_hf.py +1 -0
configs/grow40_winning.toml
CHANGED
|
@@ -34,7 +34,7 @@ attn_implementation = "flash_attention_2"
|
|
| 34 |
student_dtype = "bfloat16"
|
| 35 |
teacher_dtype = "bfloat16"
|
| 36 |
mixed_precision = "bf16"
|
| 37 |
-
kl_chunk_size =
|
| 38 |
|
| 39 |
[eval]
|
| 40 |
every_steps = 50
|
|
|
|
| 34 |
student_dtype = "bfloat16"
|
| 35 |
teacher_dtype = "bfloat16"
|
| 36 |
mixed_precision = "bf16"
|
| 37 |
+
kl_chunk_size = 256
|
| 38 |
|
| 39 |
[eval]
|
| 40 |
every_steps = 50
|
configs/grow40_winning_v2.toml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# grow40_winning v2: same hparams, but the scheduler bug is fixed in distill.py
|
| 2 |
+
# (we no longer prepare the scheduler with accelerate, so cosine reaches its
|
| 3 |
+
# minimum at step max_steps instead of step max_steps / num_processes).
|
| 4 |
+
|
| 5 |
+
[model]
|
| 6 |
+
teacher = "Qwen/Qwen3.5-35B-A3B"
|
| 7 |
+
student = "Troiaaa/m-6a3lnzvb"
|
| 8 |
+
tokenizer = "Qwen/Qwen3.5-35B-A3B"
|
| 9 |
+
|
| 10 |
+
[data]
|
| 11 |
+
dataset = "karpathy/climbmix-400b-shuffle"
|
| 12 |
+
text_field = "text"
|
| 13 |
+
min_chars = 2560
|
| 14 |
+
max_seq_len = 2048
|
| 15 |
+
kl_start_pos = 128
|
| 16 |
+
seed = 6767
|
| 17 |
+
shuffle_buffer = 10000
|
| 18 |
+
|
| 19 |
+
[train]
|
| 20 |
+
seed = 6767
|
| 21 |
+
lr = 5.0e-7
|
| 22 |
+
schedule = "cosine"
|
| 23 |
+
warmup_steps = 100
|
| 24 |
+
weight_decay = 0.0
|
| 25 |
+
grad_clip = 1.0
|
| 26 |
+
betas = [0.9, 0.999]
|
| 27 |
+
eps = 1.0e-3
|
| 28 |
+
samples_per_step = 4
|
| 29 |
+
micro_batch_size = 4
|
| 30 |
+
max_steps = 2000
|
| 31 |
+
grad_checkpointing = true
|
| 32 |
+
attn_implementation = "flash_attention_2"
|
| 33 |
+
student_dtype = "bfloat16"
|
| 34 |
+
teacher_dtype = "bfloat16"
|
| 35 |
+
mixed_precision = "bf16"
|
| 36 |
+
kl_chunk_size = 256
|
| 37 |
+
|
| 38 |
+
[eval]
|
| 39 |
+
every_steps = 50
|
| 40 |
+
samples = 500
|
| 41 |
+
seed = 4242
|
| 42 |
+
|
| 43 |
+
[log]
|
| 44 |
+
wandb = true
|
| 45 |
+
wandb_project = "distil-subnet97"
|
| 46 |
+
wandb_run = "grow40_winning_v2"
|
| 47 |
+
log_every = 1
|
| 48 |
+
output_dir = "./out/grow40_winning_v2"
|
| 49 |
+
|
| 50 |
+
[init]
|
| 51 |
+
zero_layers = []
|
| 52 |
+
target_num_layers = 40
|
distill.py
CHANGED
|
@@ -541,9 +541,13 @@ def main():
|
|
| 541 |
optimizer = make_optimizer(student, cfg["train"])
|
| 542 |
scheduler = make_scheduler(optimizer, cfg["train"])
|
| 543 |
|
| 544 |
-
student, optimizer, scheduler = accelerator.prepare(
|
| 545 |
-
student, optimizer, scheduler
|
| 546 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
|
| 548 |
# ---- Output dir + config snapshot
|
| 549 |
output_dir = Path(cfg["log"]["output_dir"])
|
|
|
|
| 541 |
optimizer = make_optimizer(student, cfg["train"])
|
| 542 |
scheduler = make_scheduler(optimizer, cfg["train"])
|
| 543 |
|
| 544 |
+
# NB: do NOT pass `scheduler` to accelerator.prepare. When prepared, accelerate
|
| 545 |
+
# advances the scheduler by `num_processes` steps per call (to match the
|
| 546 |
+
# "single-GPU equivalent" timeline). Combined with our explicit max_steps
|
| 547 |
+
# accounting, that causes the cosine to cycle multiple times mid-run. By
|
| 548 |
+
# leaving the scheduler unprepared, scheduler.step() advances exactly once
|
| 549 |
+
# per training step, matching how max_steps is interpreted in this script.
|
| 550 |
+
student, optimizer = accelerator.prepare(student, optimizer)
|
| 551 |
|
| 552 |
# ---- Output dir + config snapshot
|
| 553 |
output_dir = Path(cfg["log"]["output_dir"])
|
scripts/backup_to_hf.py
CHANGED
|
@@ -21,6 +21,7 @@ INCLUDE = [
|
|
| 21 |
"configs/replicate_zero4.toml",
|
| 22 |
"configs/grow40_winning.toml",
|
| 23 |
"configs/grow40_simple.toml",
|
|
|
|
| 24 |
"configs/accelerate.yaml",
|
| 25 |
"scripts/backup_to_hf.py",
|
| 26 |
"scripts/run_sweep.sh",
|
|
|
|
| 21 |
"configs/replicate_zero4.toml",
|
| 22 |
"configs/grow40_winning.toml",
|
| 23 |
"configs/grow40_simple.toml",
|
| 24 |
+
"configs/grow40_winning_v2.toml",
|
| 25 |
"configs/accelerate.yaml",
|
| 26 |
"scripts/backup_to_hf.py",
|
| 27 |
"scripts/run_sweep.sh",
|