# distill-m-6a3lnzvb-code / sweep / I_cold_paramgroups_grow40.toml
# Delta-Vector's picture
# Upload folder using huggingface_hub
# 2d38ae8 verified
# Cold start, 40 layers: low base LR for the original layers, 5x that LR for the new ones.
# This lets the new layers wake up faster without disturbing the trained layers.
[model]
# Identifiers of the models used in this run.
# NOTE(review): these look like Hugging Face Hub repo ids — confirm with the loader.
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "Troiaaa/m-6a3lnzvb"
# Same id as `teacher`, i.e. the tokenizer is taken from the teacher's repo.
tokenizer = "Qwen/Qwen3.5-35B-A3B"
[data]
# Training data source and sample selection.
dataset = "karpathy/climbmix-400b-shuffle"
# Name of the record field that is read as the sample text.
text_field = "text"
# NOTE(review): presumably samples shorter than this many characters are skipped — confirm.
min_chars = 2560
# NOTE(review): presumably measured in tokens — confirm against the data pipeline.
max_seq_len = 2048
# NOTE(review): presumably the first sequence position that contributes to the KL loss — confirm.
kl_start_pos = 128
# Same value as [train].seed.
seed = 6767
shuffle_buffer = 10000
[train]
# Optimisation settings for this sweep entry.
# Same value as [data].seed.
seed = 6767
# Base learning rate; see `new_layer_lr_mul` at the end of this table.
lr = 1.0e-7
schedule = "cosine"
warmup_steps = 100
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.999]
# NOTE(review): if this feeds an Adam-style optimizer (as `betas` suggests), 1e-3 is
# far above the commonly used 1e-8 default — confirm it is intentional.
eps = 1.0e-3
# NOTE(review): equal to `micro_batch_size`, so presumably one micro-batch per
# optimizer step (no gradient accumulation) — confirm with the trainer.
samples_per_step = 4
micro_batch_size = 4
max_steps = 2000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
# Student, teacher and mixed-precision settings all select bfloat16.
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
# NOTE(review): presumably the KL computation is split into chunks of this size
# to bound memory — confirm the unit (positions vs. rows).
kl_chunk_size = 256
# Learning-rate multiplier for the newly added layers (5x, matching the header
# comment of this file); presumably applied on top of `lr` — confirm.
new_layer_lr_mul = 5.0
[eval]
# Periodic evaluation settings.
# NOTE(review): presumably counted in training steps, i.e. 40 evaluations over
# the 2000-step run — confirm.
every_steps = 50
samples = 500
# Differs from the 6767 seed used in [data] and [train].
seed = 4242
[log]
# Logging and output locations.
wandb = true
wandb_project = "distil-subnet97"
# Run name and output directory both carry this sweep entry's name.
wandb_run = "I_cold_paramgroups_grow40"
log_every = 1
output_dir = "./out/sweep/I_cold_paramgroups_grow40"
[init]
# Student initialisation.
# Empty list: no layers are selected here.
# NOTE(review): presumably indices of layers to zero-initialise — confirm.
zero_layers = []
# Matches the "40 layers" stated in the header comment of this file.
# NOTE(review): presumably the student is grown to this layer count — confirm.
target_num_layers = 40