{
  "_doc": "MATH dataset experiment. Fine-tune the GRPO checkpoint (best GSM8K model: 52.5% AR, H1 supported) on MATH. Tests two hypotheses: (1) the recipe transfers to harder problems; (2) richer reasoning chains force the model to use more of its K=16 latents (stable_rank rises). If accuracy holds above 25% and stable_rank rises, the recipe scales. If accuracy collapses, MATH is too hard for the bottleneck architecture at our scale.",

  "base_model": "Qwen/Qwen2.5-Math-7B-Instruct",
  "use_lora": true,
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.05,
  "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
  "dtype": "bfloat16",
  "attn_impl": "eager",
  "gradient_checkpointing": false,

  "K_latents": 16,
  "K_curriculum": [[0, 16]],
  "block_y_to_x": true,
  "block_z_to_x": false,
  "proj_init_scale": 0.02,
  "proj_mlp": true,
  "proj_hidden_mult": 4,

  "lambda_lm": 1.0,
  "lambda_id": 1.0,
  "lambda_kl": 0.0001,
  "tau_infonce": 0.2,
  "infonce_full_answer": true,
  "infonce_target_max_len": 256,

  "lr_lora": 1e-4,
  "lr_proj": 5e-5,
  "lr_head": 1e-4,
  "weight_decay": 0.01,
  "max_grad_norm": 1.0,
  "warmup_steps": 100,

  "batch_size": 4,
  "grad_accum": 4,
  "max_steps": 2000,
  "max_prompt_len": 256,
  "max_answer_len": 256,

  "log_every": 10,
  "eval_every": 0,
  "eval_size": 200,
  "save_every": 500,
  "seed": 23,

  "dataset": "math",
  "output_dir": "/home/ubuntu/work/blt_math",
  "data_train_size": null,
  "data_eval_size": 200
}