| { | |
| "_doc": "Longer SFT continuation from the BLT 7B GRPO checkpoint (52.5% AR, H1 thresholds passed). Validation loss was still decreasing at the end of the prior SFT run, and GRPO further consolidated those gains. Hypothesis: another 2000 SFT steps at K=16 (same recipe: InfoNCE over the full answer y + MLP projector) lifts absolute accuracy by 2-4 pp. K_curriculum is just [[0,16]] (no curriculum, because we resume at K=16). Optimizer state is reset on resume (existing limitation).", | |
| "base_model": "Qwen/Qwen2.5-Math-7B-Instruct", | |
| "use_lora": true, | |
| "lora_r": 16, | |
| "lora_alpha": 32, | |
| "lora_dropout": 0.05, | |
| "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], | |
| "dtype": "bfloat16", | |
| "attn_impl": "eager", | |
| "gradient_checkpointing": false, | |
| "K_latents": 16, | |
| "K_curriculum": [[0, 16]], | |
| "block_y_to_x": true, | |
| "block_z_to_x": false, | |
| "proj_init_scale": 0.02, | |
| "proj_mlp": true, | |
| "proj_hidden_mult": 4, | |
| "lambda_lm": 1.0, | |
| "lambda_id": 1.0, | |
| "lambda_kl": 0.0001, | |
| "tau_infonce": 0.2, | |
| "infonce_full_answer": true, | |
| "infonce_target_max_len": 128, | |
| "lr_lora": 1e-4, | |
| "lr_proj": 5e-5, | |
| "lr_head": 1e-4, | |
| "weight_decay": 0.01, | |
| "max_grad_norm": 1.0, | |
| "warmup_steps": 50, | |
| "batch_size": 4, | |
| "grad_accum": 4, | |
| "max_steps": 2000, | |
| "max_prompt_len": 192, | |
| "max_answer_len": 192, | |
| "log_every": 10, | |
| "eval_every": 0, | |
| "eval_size": 200, | |
| "save_every": 500, | |
| "seed": 7, | |
| "output_dir": "/home/ubuntu/work/blt_longer_sft", | |
| "data_train_size": null, | |
| "data_eval_size": 200 | |
| } | |