{ "format_version": 1, "step": 9999, "preset": "moe-main", "mode": "moe", "config": { "preset": "moe-main", "mode": "moe", "run_name": "moe-main", "seed": 42, "max_steps": 10000, "batch_size": 2, "grad_accum_steps": 4, "effective_batch_size": 8, "block_size": 512, "learning_rate": 5e-05, "weight_decay": 0.01, "warmup_fraction": 0.1, "max_grad_norm": 1.0, "lb_coef": 0.01, "z_coef": 0.001, "n_experts": 8, "topk": 1, "noise_std": 0.1, "moe_layers": [ 8, 9, 10, 11 ], "size_mb": 10.0, "balance_tokens": true, "eval_every": 200, "save_every": 500, "collapse_early_stop": false }, "metrics_summary": { "eval_loss": 2.0798, "eval_perplexity": 7.9147 } }