Delta-Vector committed on
Commit
35d9db6
·
verified ·
1 Parent(s): e9ce4f0

fix scheduler bug: don't prepare scheduler with accelerate (was over-stepping cosine by num_processes); add grow40_winning_v2 config

Browse files
configs/grow40_winning.toml CHANGED
@@ -34,7 +34,7 @@ attn_implementation = "flash_attention_2"
34
  student_dtype = "bfloat16"
35
  teacher_dtype = "bfloat16"
36
  mixed_precision = "bf16"
37
- kl_chunk_size = 0
38
 
39
  [eval]
40
  every_steps = 50
 
34
  student_dtype = "bfloat16"
35
  teacher_dtype = "bfloat16"
36
  mixed_precision = "bf16"
37
+ kl_chunk_size = 256
38
 
39
  [eval]
40
  every_steps = 50
configs/grow40_winning_v2.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # grow40_winning v2: same hparams, but the scheduler bug is fixed in distill.py
2
+ # (we no longer prepare the scheduler with accelerate, so cosine reaches its
3
+ # minimum at step max_steps instead of step max_steps / num_processes).
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 2048
15
+ kl_start_pos = 128
16
+ seed = 6767
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 6767
21
+ lr = 5.0e-7
22
+ schedule = "cosine"
23
+ warmup_steps = 100
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.999]
27
+ eps = 1.0e-3
28
+ samples_per_step = 4
29
+ micro_batch_size = 4
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 256
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "grow40_winning_v2"
47
+ log_every = 1
48
+ output_dir = "./out/grow40_winning_v2"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
distill.py CHANGED
@@ -541,9 +541,13 @@ def main():
541
  optimizer = make_optimizer(student, cfg["train"])
542
  scheduler = make_scheduler(optimizer, cfg["train"])
543
 
544
- student, optimizer, scheduler = accelerator.prepare(
545
- student, optimizer, scheduler
546
- )
 
 
 
 
547
 
548
  # ---- Output dir + config snapshot
549
  output_dir = Path(cfg["log"]["output_dir"])
 
541
  optimizer = make_optimizer(student, cfg["train"])
542
  scheduler = make_scheduler(optimizer, cfg["train"])
543
 
544
+ # NB: do NOT pass `scheduler` to accelerator.prepare. When prepared, accelerate
545
+ # advances the scheduler by `num_processes` steps per call (to match the
546
+ # "single-GPU equivalent" timeline). Combined with our explicit max_steps
547
+ # accounting, that causes the cosine to cycle multiple times mid-run. By
548
+ # leaving the scheduler unprepared, scheduler.step() advances exactly once
549
+ # per training step, matching how max_steps is interpreted in this script.
550
+ student, optimizer = accelerator.prepare(student, optimizer)
551
 
552
  # ---- Output dir + config snapshot
553
  output_dir = Path(cfg["log"]["output_dir"])
scripts/backup_to_hf.py CHANGED
@@ -21,6 +21,7 @@ INCLUDE = [
21
  "configs/replicate_zero4.toml",
22
  "configs/grow40_winning.toml",
23
  "configs/grow40_simple.toml",
 
24
  "configs/accelerate.yaml",
25
  "scripts/backup_to_hf.py",
26
  "scripts/run_sweep.sh",
 
21
  "configs/replicate_zero4.toml",
22
  "configs/grow40_winning.toml",
23
  "configs/grow40_simple.toml",
24
+ "configs/grow40_winning_v2.toml",
25
  "configs/accelerate.yaml",
26
  "scripts/backup_to_hf.py",
27
  "scripts/run_sweep.sh",