distill-m-6a3lnzvb-code / sweep /I_cold_paramgroups_grow40.toml

Upload folder using huggingface_hub

2d38ae8 verified about 2 months ago

1.3 kB

	# Cold start, grown to 40 layers: a low base LR for the original layers and
	# 5x that LR for the newly added ones, so the new layers wake up faster
	# without disturbing the already-trained layers.

	# Model identifiers for the run: a teacher, a student, and a tokenizer.
	# NOTE(review): these look like Hugging Face Hub repo ids — confirm how the loader resolves them.
	[model]
	teacher = "Qwen/Qwen3.5-35B-A3B"
	student = "Troiaaa/m-6a3lnzvb"
	# Same id as `teacher`.
	tokenizer = "Qwen/Qwen3.5-35B-A3B"

	# Training data source and sample-shaping parameters.
	[data]
	dataset = "karpathy/climbmix-400b-shuffle"
	# Name of the record field that holds the raw text.
	text_field = "text"
	# NOTE(review): presumably a minimum document length in characters and a
	# maximum sequence length in tokens — confirm units against the data loader.
	min_chars = 2560
	max_seq_len = 2048
	# NOTE(review): presumably the first sequence position included in the KL
	# loss — confirm against the trainer.
	kl_start_pos = 128
	# Same value as [train].seed.
	seed = 6767
	shuffle_buffer = 10000

	# Optimization settings for the run.
	[train]
	# Same value as [data].seed.
	seed = 6767
	# Base learning rate; new_layer_lr_mul below scales it for the new layers.
	lr = 1.0e-7
	schedule = "cosine"
	warmup_steps = 100
	weight_decay = 0.0
	grad_clip = 1.0
	betas = [0.9, 0.999]
	# NOTE(review): 1.0e-3 is far larger than the 1e-8 eps default of common
	# Adam-style optimizers — confirm this is intentional.
	eps = 1.0e-3
	# NOTE(review): samples_per_step equals micro_batch_size, so presumably one
	# micro-batch per optimizer step — confirm against the trainer.
	samples_per_step = 4
	micro_batch_size = 4
	max_steps = 2000
	grad_checkpointing = true
	attn_implementation = "flash_attention_2"
	# Student weights, teacher weights and mixed precision all name bfloat16.
	student_dtype = "bfloat16"
	teacher_dtype = "bfloat16"
	mixed_precision = "bf16"
	kl_chunk_size = 256
	# LR multiplier for the newly added layers (the "5x" in the header comment
	# at the top of this file).
	new_layer_lr_mul = 5.0

	# Periodic evaluation settings.
	[eval]
	# With [train].max_steps = 2000, an interval of 50 steps divides the run evenly.
	every_steps = 50
	samples = 500
	# Differs from the seed used in [data] and [train] (6767).
	seed = 4242

	# Logging and output locations.
	[log]
	wandb = true
	wandb_project = "distil-subnet97"
	# Run name and the last component of output_dir both match this config's
	# file name; keep the three in sync when copying this file for a new sweep entry.
	wandb_run = "I_cold_paramgroups_grow40"
	log_every = 1
	output_dir = "./out/sweep/I_cold_paramgroups_grow40"

	# Student initialization options.
	[init]
	# Empty list in this configuration.
	# NOTE(review): presumably indices of layers to zero at init — confirm against the trainer.
	zero_layers = []
	# Matches the "40 layers" stated in the header comment at the top of this file.
	target_num_layers = 40