{
  "model_name": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
  "tokenizer_name": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
  "dataset_config": "config/data_megamath.json",
  "seq_len": 2048,
  "total_tokens": 1000000,
  "per_device_batch_size": 4,
  "gradient_accumulation_steps": 4,
  "learning_rate": 8e-05,
  "min_learning_rate": 8e-06,
  "weight_decay": 0.1,
  "grad_clip": 1.0,
  "warmup_steps": 0,
  "lambda_hi": 0.3,
  "p_inject": 0.1,
  "max_injections_per_seq": 16,
  "p_inject_warmup_steps": 10,
  "lambda_hi_warmup_steps": 10,
  "normalize_hidden": true,
  "alpha_scale": 1.0,
  "output_dir": "outputs/tinyllama_megamath_1m_debug",
  "log_every_steps": 5,
  "wandb_project": "megamath-debug",
  "wandb_run_name": "tinyllama-1m-debug",
  "hf_repo_id": "Onlydrinkwater/tinyllama-megamath-1m-debug",
  "hf_upload": true,
  "hf_private": false,
  "seed": 42,
  "bf16": true,
  "dataloader_num_workers": 2
}