{ "_doc": "Control A: identical to the pilot config except lambda_id=0 (no InfoNCE identifiability loss). Tests whether InfoNCE is the load-bearing component that makes z informative. Hypothesis: without InfoNCE, z collapses to being purely decorative (like Abstract-CoT). Runs 3000 steps at K=4, ~1.6h on GH200. Compare the resulting Delta_random/Delta_zero to the pilot's step-2000 K=4 result (Delta=3pp at the same compute scale) and to its step-6000 K=8 result (Delta=11pp peak).", "base_model": "Qwen/Qwen2.5-1.5B-Instruct", "use_lora": true, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], "dtype": "bfloat16", "attn_impl": "eager", "K_latents": 4, "K_curriculum": [[0, 4]], "block_y_to_x": true, "proj_init_scale": 0.02, "lambda_lm": 1.0, "lambda_id": 0.0, "lambda_kl": 0.0001, "tau_infonce": 0.2, "infonce_target": "final_number", "lr_lora": 3e-4, "lr_proj": 1e-4, "lr_head": 3e-4, "weight_decay": 0.01, "max_grad_norm": 1.0, "warmup_steps": 100, "batch_size": 16, "grad_accum": 2, "max_steps": 3000, "max_prompt_len": 192, "max_answer_len": 192, "log_every": 25, "eval_every": 500, "eval_size": 200, "save_every": 3000, "seed": 42, "output_dir": "/home/ubuntu/work/blt_control_no_infonce", "data_train_size": null, "data_eval_size": 200 }