Mistral-7B-v0.3-CoDA-GQA-L / training_config.json
anthonym21's picture
Mistral 7B v0.3 + CoDA-GQA-L: trained in two phases (unbounded + bounded)
a1acae5 verified
raw
history blame contribute delete
787 Bytes
{
"model": "mistralai/Mistral-7B-v0.3",
"dataset": "wikitext-103",
"max_steps": 2000,
"batch_size": 1,
"grad_accum": 8,
"seq_len": 2048,
"lr": 5e-05,
"lr_coda": 0.001,
"warmup_steps": 100,
"weight_decay": 0.01,
"freeze": "attention",
"eval_every": 500,
"save_every": 500,
"eval_tokens": 50000,
"eval_bounded_tokens": 20000,
"no_eval_bounded": false,
"head_norm_mode": "identity",
"theta_init": 0.0,
"bounded_steps": 2000,
"bounded_config": "medium",
"bounded_lr_scale": 0.5,
"bounded_block_size": 512,
"no_detach_evicted": false,
"dtype": "bf16",
"gradient_checkpointing": true,
"output_dir": "/workspace/runs/mistral7b",
"total_params": 7248023552,
"trainable_params": 1346438144,
"n_adapters": 32,
"eff_batch_tokens": 16384
}