{
  "vocab_size": 50304,
  "d_model": 768,
  "n_layers": 16,
  "n_dense_layers": 1,
  "n_q_heads": 12,
  "n_kv_heads": 3,
  "head_dim": 128,
  "qk_norm": true,
  "rope_theta": 10000.0,
  "attn_softcap": 0.0,
  "dense_ffn": 2304,
  "expert_ffn": 320,
  "n_experts": 36,
  "top_k": 6,
  "n_shared": 1,
  "gating": "sigmoid",
  "norm_topk_prob": false,
  "balancing": "aux_free",
  "aux_loss_coef": 0.001,
  "z_loss_coef": 0.001,
  "bias_update_rate": 0.001,
  "router_init_std": 0.02,
  "tie_embeddings": true,
  "scale_embeddings": false,
  "final_z_loss_coef": 0.0001,
  "logit_softcap": 0.0,
  "n_mtp": 0,
  "mtp_weight": 0.1,
  "init_std": 0.02,
  "expert_backend": "grouped",
  "fused_ce": true,
  "ce_chunk": 4096,
  "fp8_head": false,
  "fp8_x_scale": 1.0,
  "fp8_w_scale": 1.0,
  "fp8_grad_scale": 1.0,
  "preset": "500M"
}