{
  "model": "mistralai/Mistral-7B-v0.3",
  "dataset": "wikitext-103",
  "max_steps": 2000,
  "batch_size": 1,
  "grad_accum": 8,
  "seq_len": 2048,
  "lr": 5e-05,
  "lr_coda": 0.001,
  "warmup_steps": 100,
  "weight_decay": 0.01,
  "freeze": "attention",
  "eval_every": 500,
  "save_every": 500,
  "eval_tokens": 50000,
  "eval_bounded_tokens": 20000,
  "no_eval_bounded": false,
  "head_norm_mode": "identity",
  "theta_init": 0.0,
  "bounded_steps": 2000,
  "bounded_config": "medium",
  "bounded_lr_scale": 0.5,
  "bounded_block_size": 512,
  "no_detach_evicted": false,
  "dtype": "bf16",
  "gradient_checkpointing": true,
  "output_dir": "/workspace/runs/mistral7b",
  "total_params": 7248023552,
  "trainable_params": 1346438144,
  "n_adapters": 32,
  "eff_batch_tokens": 16384
}