moe-emergence/moe-main/final-model.json
{
"format_version": 1,
"step": 9999,
"preset": "moe-main",
"mode": "moe",
"config": {
"preset": "moe-main",
"mode": "moe",
"run_name": "moe-main",
"seed": 42,
"max_steps": 10000,
"batch_size": 2,
"grad_accum_steps": 4,
"effective_batch_size": 8,
"block_size": 512,
"learning_rate": 5e-05,
"weight_decay": 0.01,
"warmup_fraction": 0.1,
"max_grad_norm": 1.0,
"lb_coef": 0.01,
"z_coef": 0.001,
"n_experts": 8,
"topk": 1,
"noise_std": 0.1,
"moe_layers": [
8,
9,
10,
11
],
"size_mb": 10.0,
"balance_tokens": true,
"eval_every": 200,
"save_every": 500,
"collapse_early_stop": false
},
"metrics_summary": {
"eval_loss": 2.0798,
"eval_perplexity": 7.9147
}
}
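
For reference, the metadata above can be consumed directly with the Python standard library. The following is a minimal sketch, assuming the file has been downloaded locally as final-model.json; all field names are taken straight from the JSON above, and nothing here is part of any checkpoint-loading API:

# Sketch: read the checkpoint metadata and sanity-check a few derived values.
import json
import math

with open("final-model.json") as f:  # assumed local download of this file
    meta = json.load(f)

cfg = meta["config"]

# Effective batch size is batch_size * grad_accum_steps (2 * 4 = 8 here).
assert cfg["effective_batch_size"] == cfg["batch_size"] * cfg["grad_accum_steps"]

# MoE routing setup: 8 experts with top-1 routing (topk=1) in layers 8-11.
print(f"mode={meta['mode']}, experts={cfg['n_experts']}, top-k={cfg['topk']}")
print(f"MoE layers: {cfg['moe_layers']}")

# Reported metrics; perplexity should be roughly exp(eval_loss).
m = meta["metrics_summary"]
print(f"eval_loss={m['eval_loss']}, eval_perplexity={m['eval_perplexity']}, "
      f"exp(eval_loss)={math.exp(m['eval_loss']):.4f}")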