Refresh code/ with latest BLT-Reasoner sources (post-campaign)

bc7101b verified 8 days ago

1.68 kB

	{
	"_doc": "EXPERIMENT: Options 1+3 = richer InfoNCE target (full y, max_len=128) + MLP projector (d->4d->d with GELU). Tests whether stronger supervision + more expressive compression fixes the absolute-accuracy ceiling we hit at 7B. Block_z_to_x=False because pilot 7B AR Δ_random=13pp already shows the architecture is content-load-bearing in AR (TF just understates it); the real problem is absolute accuracy (13% on GSM8K). 2500 K=8 steps then 1000 K=16 steps (3500 total) to match pilot 7B's compute, but with the two new mechanisms. Hypothesis: richer signal + MLP pushes absolute acc materially above pilot 7B's 13%.",

	"base_model": "Qwen/Qwen2.5-Math-7B-Instruct",
	"use_lora": true,
	"lora_r": 16,
	"lora_alpha": 32,
	"lora_dropout": 0.05,
	"lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
	"dtype": "bfloat16",
	"attn_impl": "eager",
	"gradient_checkpointing": false,

	"K_latents": 8,
	"K_curriculum": [[0, 8], [2500, 16]],
	"block_y_to_x": true,
	"block_z_to_x": false,
	"proj_init_scale": 0.02,
	"proj_mlp": true,
	"proj_hidden_mult": 4,

	"lambda_lm": 1.0,
	"lambda_id": 1.0,
	"lambda_kl": 0.0001,
	"tau_infonce": 0.2,
	"infonce_full_answer": true,
	"infonce_target_max_len": 128,

	"lr_lora": 2e-4,
	"lr_proj": 1e-4,
	"lr_head": 3e-4,
	"weight_decay": 0.01,
	"max_grad_norm": 1.0,
	"warmup_steps": 200,

	"batch_size": 4,
	"grad_accum": 4,
	"max_steps": 3500,
	"max_prompt_len": 192,
	"max_answer_len": 192,

	"log_every": 10,
	"eval_every": 0,
	"eval_size": 200,
	"save_every": 1000,
	"seed": 42,

	"output_dir": "/home/ubuntu/work/blt_exp7b_opt13",
	"data_train_size": null,
	"data_eval_size": 200
	}