{
"_doc": "Control B: identical to the pilot config except block_y_to_x=false (no bottleneck; y can attend to x directly). Tests whether the mask that restricts y to attend only to z is necessary. Hypothesis: without the bottleneck, the model uses the direct x->y path and ignores z (Delta near 0), even with InfoNCE still active. 3000 steps at K=4, ~1.6h on GH200.",
"base_model": "Qwen/Qwen2.5-1.5B-Instruct",
"use_lora": true,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
"dtype": "bfloat16",
"attn_impl": "eager",
"K_latents": 4,
"K_curriculum": [[0, 4]],
"block_y_to_x": false,
"proj_init_scale": 0.02,
"lambda_lm": 1.0,
"lambda_id": 1.0,
"lambda_kl": 0.0001,
"tau_infonce": 0.2,
"infonce_target": "final_number",
"lr_lora": 3e-4,
"lr_proj": 1e-4,
"lr_head": 3e-4,
"weight_decay": 0.01,
"max_grad_norm": 1.0,
"warmup_steps": 100,
"batch_size": 16,
"grad_accum": 2,
"max_steps": 3000,
"max_prompt_len": 192,
"max_answer_len": 192,
"log_every": 25,
"eval_every": 500,
"eval_size": 200,
"save_every": 3000,
"seed": 42,
"output_dir": "/home/ubuntu/work/blt_control_no_bottleneck",
"data_train_size": null,
"data_eval_size": 200
}