File size: 1,226 Bytes
2d38ae8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Base distillation config (smoketest variant).
# Every value the script reads must live in this file - no defaults in code.

# Model identifiers for the distillation run.
# NOTE(review): the values look like Hugging Face Hub repo IDs - confirm how
# the script resolves them.
[model]
teacher    = "Qwen/Qwen3.5-35B-A3B"
student    = "Troiaaa/m-6a3lnzvb"
# Same identifier as `teacher`.
tokenizer  = "Qwen/Qwen3.5-35B-A3B"

# Dataset selection and per-sample settings.
[data]
# NOTE(review): looks like a Hugging Face Hub dataset ID - confirm.
dataset        = "karpathy/climbmix-400b-shuffle"
text_field     = "text"
# 2560 = 4 x max_seq_len (640). NOTE(review): presumably a chars-per-token
# heuristic so that kept samples can fill a full sequence - confirm.
min_chars      = 2560
max_seq_len    = 640
# NOTE(review): presumably the first token position that contributes to the
# KL loss; it must then stay below max_seq_len - TODO confirm in the script.
kl_start_pos   = 128
# Same value as [train].seed.
seed           = 42
shuffle_buffer = 10000

# Training settings: seed, optimizer hyperparameters, schedule, batching,
# step count and precision.
[train]
# Same value as [data].seed.
seed                 = 42
lr                   = 5.0e-7
# Constant schedule with zero warmup steps.
schedule             = "constant"
warmup_steps         = 0
weight_decay         = 0.0
grad_clip            = 1.0
# NOTE(review): betas/eps presumably feed an Adam-style optimizer - confirm.
betas                = [0.9, 0.95]
eps                  = 1.0e-8
# samples_per_step equals micro_batch_size here.
# NOTE(review): presumably this means one micro-batch per optimizer step
# (no gradient accumulation) - confirm.
samples_per_step     = 4
micro_batch_size     = 4
# Only 5 steps in this smoketest; equals [eval].every_steps.
max_steps            = 5
grad_checkpointing   = true
attn_implementation  = "flash_attention_2"
student_dtype        = "bfloat16"
teacher_dtype        = "bfloat16"
mixed_precision      = "bf16"
# NOTE(review): 0 presumably disables chunking of the KL computation -
# TODO confirm how the script treats 0.
kl_chunk_size        = 0
# NOTE(review): presumably a learning-rate multiplier for newly added layers;
# at 1.0 it would leave `lr` unchanged - confirm.
new_layer_lr_mul     = 1.0

# Evaluation settings.
[eval]
# Equals [train].max_steps (5).
# NOTE(review): presumably evaluation therefore runs once, at the last
# training step - confirm.
every_steps = 5
samples     = 16
# Differs from the seed used in [data] and [train] (42).
seed        = 1234

# Logging and output settings.
[log]
# NOTE(review): enabled even in this smoketest variant; presumably requires
# Weights & Biases credentials and network access - confirm this is intended.
wandb         = true
wandb_project = "distil-subnet97"
wandb_run     = "smoketest"
log_every     = 1
# Relative path. NOTE(review): presumably resolved against the working
# directory of the script - confirm.
output_dir    = "./out/smoketest"

# Student initialisation settings.
[init]
# Empty list in this variant.
# NOTE(review): presumably indices of layers to zero-initialise - confirm.
zero_layers        = []
# NOTE(review): presumably the layer count the student should have after
# initialisation - confirm against the script.
target_num_layers  = 32