Refresh code/ with latest BLT-Reasoner sources (post-campaign)

bc7101b verified 8 days ago

1.59 kB

	{
	"_doc": "Slot-decorrelation experiment. Adds a regularizer that penalizes pairwise alignment of the latent slot input embeddings, with the goal of forcing the slots to encode orthogonal directions. Tests whether the redundancy we observed at the GRPO ckpt (stable_rank=6.73 across K=16 slots) is a curable architectural problem. Resumes from whichever of the GRPO ckpt and the longer-SFT ckpt is the latest best. Trains for 1000 steps at K=16. lambda_decorr=0.5 — strong enough to noticeably push slots apart, but not so strong that it dominates the LM loss.",

	"base_model": "Qwen/Qwen2.5-Math-7B-Instruct",
	"use_lora": true,
	"lora_r": 16,
	"lora_alpha": 32,
	"lora_dropout": 0.05,
	"lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
	"dtype": "bfloat16",
	"attn_impl": "eager",
	"gradient_checkpointing": false,

	"K_latents": 16,
	"K_curriculum": [[0, 16]],
	"block_y_to_x": true,
	"block_z_to_x": false,
	"proj_init_scale": 0.02,
	"proj_mlp": true,
	"proj_hidden_mult": 4,

	"lambda_lm": 1.0,
	"lambda_id": 1.0,
	"lambda_kl": 0.0001,
	"lambda_decorr": 0.5,
	"tau_infonce": 0.2,
	"infonce_full_answer": true,
	"infonce_target_max_len": 128,

	"lr_lora": 1e-4,
	"lr_proj": 5e-5,
	"lr_head": 1e-4,
	"weight_decay": 0.01,
	"max_grad_norm": 1.0,
	"warmup_steps": 50,

	"batch_size": 4,
	"grad_accum": 4,
	"max_steps": 1000,
	"max_prompt_len": 192,
	"max_answer_len": 192,

	"log_every": 10,
	"eval_every": 0,
	"eval_size": 200,
	"save_every": 500,
	"seed": 11,

	"output_dir": "/home/ubuntu/work/blt_decorr_exp",
	"data_train_size": null,
	"data_eval_size": 200
	}