{
  "model": {
    "phase1_dir": "omrisap/LMMS_phase1",
    "v_z": 512,
    "gumbel_tau_start": 1.0,
    "gumbel_tau_end": 0.3,
    "gumbel_anneal_steps": 3000,
    "z_prefix": "Z_",
    "latent_token": "<|latent|>",
    "answer_token": "<ANSWER>"
  },
  "data": {
    "dataset_name": "omrisap/phaseZ",
    "train_split": "train",
    "eval_split": "eval",
    "data_path": null,
    "max_length": null,
    "batch_size": 64,
    "rebalance_train": true,
    "k_max": 20,
    "target_k_dist": {
      "K1": 0.075,
      "K2": 0.1,
      "K3": 0.125,
      "K4_7": 0.3,
      "K8_12": 0.2,
      "K13_20": 0.2
    }
  },
  "loss": {
    "lambda_ans": 0.1,
    "lambda_ans_start": 0.05,
    "lambda_ans_end": 0.5,
    "lambda_ans_anneal_steps": 1000,
    "lambda_sft": 0.05,
    "lambda_cf": 1.0,
    "lambda_batch": 0.5,
    "lambda_consistency": 0.0,
    "lambda_no_answer_on_latent": 0.95,
    "digit_temperature": 0.1,
    "keep_prob": [
      0.02,
      0.05,
      0.1,
      0.5,
      1
    ],
    "counterfactual_schedule": {
      "1": 0.0,
      "2": 0.1,
      "3": 0.15,
      "4": 0.2,
      "5": 0.3,
      "6": 0.4,
      "7": 0.5,
      "8": 0.6,
      "9": 0.65,
      "10": 0.7,
      "11": 0.75,
      "12": 0.8,
      "13": 0.85,
      "14": 0.85,
      "15": 0.9,
      "16": 0.9,
      "17": 0.9,
      "18": 0.9,
      "19": 0.9,
      "20": 0.9
    }
  },
  "train": {
    "lr": 3e-05,
    "weight_decay": 0.0,
    "steps": 3000,
    "grad_accum": 1,
    "print_every": 5,
    "eval_every": 50,
    "eval_generate_every_mult": 2,
    "eval_generate_max_new_tokens": 64,
    "eval_generate_temperature": 1.0,
    "eval_generate_top_p": 0.95,
    "save_every": 500,
    "cf_debug_every": 0,
    "cf_warmup_steps": 100,
    "cf_bias_anneal_steps": 300,
    "cf_attention_bias_strength": 2.0,
    "cf_attention_bias_enabled": true,
    "cf_bias_apply_cf_path_only": true,
    "seed": 42,
    "output_dir": "./runs/phase23_gs"
  }
}