{
  "vocab_size": 2660,
  "d_model": 1480,
  "n_heads": 20,
  "n_kv_heads": 4,
  "d_head": 74,
  "max_seq_len": 4096,
  "n_prelude_layers": 18,
  "n_msa_layers": 9,
  "n_experts": 8,
  "n_shared_experts": 1,
  "top_k": 2,
  "d_ff_expert": 732,
  "dynmoe_min_k": 1,
  "dynmoe_max_k": 4,
  "dynmoe_budget_loss": 0.01,
  "n_recurrent_layers": 4,
  "t_max": 4,
  "lora_rank": 32,
  "n_coda_layers": 6,
  "dropout": 0.1,
  "rope_base": 500000.0,
  "yarn_original_len": 2048,
  "yarn_target_len": 4096,
  "msa_block_size": 32,
  "msa_k_by_level": [
    2,
    4,
    8,
    12,
    16
  ],
  "msa_default_k": 8,
  "mla_latent_dim": 512,
  "think_t_map": [
    0,
    1,
    2,
    3,
    4
  ],
  "mtp_depth": 2
}