{
  "model": {
    "phase1_dir": "omrisap/LMMS_phase1",
    "v_z": 512,
    "gumbel_tau_start": 1.0,
    "gumbel_tau_end": 0.3,
    "gumbel_anneal_steps": 3000,
    "z_prefix": "Z_",
    "latent_token": "<|latent|>",
    "answer_token": ""
  },
  "data": {
    "dataset_name": "omrisap/phaseZ",
    "train_split": "train",
    "eval_split": "eval",
    "data_path": null,
    "max_length": null,
    "batch_size": 64,
    "rebalance_train": true,
    "k_max": 20,
    "target_k_dist": {
      "K1": 0.075,
      "K2": 0.1,
      "K3": 0.125,
      "K4_7": 0.3,
      "K8_12": 0.2,
      "K13_20": 0.2
    }
  },
  "loss": {
    "lambda_ans": 0.1,
    "lambda_ans_start": 0.05,
    "lambda_ans_end": 0.5,
    "lambda_ans_anneal_steps": 1000,
    "lambda_sft": 0.05,
    "lambda_cf": 1.0,
    "lambda_batch": 0.5,
    "lambda_consistency": 0.0,
    "lambda_no_answer_on_latent": 0.95,
    "digit_temperature": 0.1,
    "keep_prob": [
      0.02,
      0.05,
      0.1,
      0.5,
      1
    ],
    "counterfactual_schedule": {
      "1": 0.0,
      "2": 0.1,
      "3": 0.15,
      "4": 0.2,
      "5": 0.3,
      "6": 0.4,
      "7": 0.5,
      "8": 0.6,
      "9": 0.65,
      "10": 0.7,
      "11": 0.75,
      "12": 0.8,
      "13": 0.85,
      "14": 0.85,
      "15": 0.9,
      "16": 0.9,
      "17": 0.9,
      "18": 0.9,
      "19": 0.9,
      "20": 0.9
    }
  },
  "train": {
    "lr": 3e-05,
    "weight_decay": 0.0,
    "steps": 3000,
    "grad_accum": 1,
    "print_every": 5,
    "eval_every": 50,
    "eval_generate_every_mult": 2,
    "eval_generate_max_new_tokens": 64,
    "eval_generate_temperature": 1.0,
    "eval_generate_top_p": 0.95,
    "save_every": 500,
    "cf_debug_every": 0,
    "cf_warmup_steps": 100,
    "cf_bias_anneal_steps": 300,
    "cf_attention_bias_strength": 2.0,
    "cf_attention_bias_enabled": true,
    "cf_bias_apply_cf_path_only": true,
    "seed": 42,
    "output_dir": "./runs/phase23_gs"
  }
}