{ "_doc": "EXPERIMENT: Options 1+3 = richer InfoNCE target (full y, max_len=128) + MLP projector (d->4d->d with GELU). Tests whether stronger supervision + more expressive compression fixes the absolute-accuracy ceiling we hit at 7B. block_z_to_x=false because the pilot 7B AR Δ_random=13pp already shows the architecture is content-load-bearing in AR (TF just understates it); the real problem is absolute accuracy (13% on GSM8K). 2500 steps at K=8, then 1000 steps at K=16 (3500 total) to match pilot 7B's compute, but with the two new mechanisms. Hypothesis: richer signal + MLP pushes absolute accuracy materially above pilot 7B's 13%.", "base_model": "Qwen/Qwen2.5-Math-7B-Instruct", "use_lora": true, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], "dtype": "bfloat16", "attn_impl": "eager", "gradient_checkpointing": false, "K_latents": 8, "K_curriculum": [[0, 8], [2500, 16]], "block_y_to_x": true, "block_z_to_x": false, "proj_init_scale": 0.02, "proj_mlp": true, "proj_hidden_mult": 4, "lambda_lm": 1.0, "lambda_id": 1.0, "lambda_kl": 0.0001, "tau_infonce": 0.2, "infonce_full_answer": true, "infonce_target_max_len": 128, "lr_lora": 2e-4, "lr_proj": 1e-4, "lr_head": 3e-4, "weight_decay": 0.01, "max_grad_norm": 1.0, "warmup_steps": 200, "batch_size": 4, "grad_accum": 4, "max_steps": 3500, "max_prompt_len": 192, "max_answer_len": 192, "log_every": 10, "eval_every": 0, "eval_size": 200, "save_every": 1000, "seed": 42, "output_dir": "/home/ubuntu/work/blt_exp7b_opt13", "data_train_size": null, "data_eval_size": 200 }