# Phase 2: same tiny LR but a larger inner batch (16/rank → effective 128) so the
# gradients are much smoother. Should give the smoothest descent of all.
[model]
# The student points at a local path; the name suggests it is the best
# checkpoint from an earlier phase — TODO confirm against the phase-1 config.
student = "./out/phase1_best"
# Teacher and tokenizer use the same identifier.
teacher = "Qwen/Qwen3.5-35B-A3B"
tokenizer = "Qwen/Qwen3.5-35B-A3B"
[data]
# Corpus identifier and the record field to read from it.
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"

# Length-related limits. Units are not stated in this file: min_chars is
# presumably characters, max_seq_len and kl_start_pos presumably token
# positions — TODO confirm against the data loader.
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128

# Shuffling. The seed value matches [train].seed.
shuffle_buffer = 10_000
seed = 6767
[train]
# Run length and reproducibility. The seed value matches [data].seed.
max_steps = 2000
seed = 6767

# Learning-rate settings: constant schedule with zero warmup steps.
lr = 2.0e-8
schedule = "constant"
warmup_steps = 0
# A multiplier of 1.0 leaves the value it scales unchanged; presumably this
# applies to the learning rate of newly added layers — TODO confirm.
new_layer_lr_mul = 1.0

# Optimizer hyperparameters (betas/eps suggest an Adam-family optimizer —
# verify in the training code).
betas = [0.9, 0.99]
eps = 1.0e-3
weight_decay = 0.0
grad_clip = 1.0

# Batch sizing. Presumably samples_per_step is the "16/rank" that the header
# comment of this file says yields an effective batch of 128 — TODO confirm.
samples_per_step = 16
micro_batch_size = 1

# Precision, attention backend and memory-related switches.
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
attn_implementation = "flash_attention_2"
grad_checkpointing = true
kl_chunk_size = 256
[eval]
# Evaluation uses its own seed (4242), distinct from the 6767 used by
# [data] and [train].
seed = 4242
# Presumably: run an evaluation every 50 training steps over 500 samples —
# TODO confirm against the evaluation loop.
every_steps = 50
samples = 500
[log]
log_every = 1

# Weights & Biases settings.
wandb = true
wandb_project = "distil-subnet97"

# The run name and the last component of output_dir are the same string;
# keep them in sync when copying this config for another sweep entry.
wandb_run = "M_phase2_lr2e8_largebatch"
output_dir = "./out/sweep/M_phase2_lr2e8_largebatch"
[init]
# Presumably the layer count the student is initialised to — TODO confirm
# how this interacts with the checkpoint named in [model].student.
target_num_layers = 40
# Empty list: no layers are listed for zeroing in this run.
zero_layers = []