```json
{
  "architectures": [
    "NRM"
  ],
  "model_type": "nrm",
  "dim": 2048,
  "n_layers": 1,
  "n_heads": 16,
  "n_mem_tokens": 64,
  "vocab_size": 32000,
  "inner_loops": 8,
  "outer_loops": 16,
  "truncation_loops": 2,
  "moe_experts": 8,
  "experts_per_token": 2,
  "num_shared_experts": 2,
  "use_mla": true,
  "kv_latent_dim": 512,
  "rope_head_dim": 64,
  "rope_base": 10000.0,
  "mtp_num_heads": 4,
  "use_conv_swiglu": true,
  "p_exit": 0.1,
  "tokenizer_class": "LlamaTokenizer"
}
```
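Since the config above is a plain JSON file, a minimal sketch of how it might be loaded in Python is shown below. The `NRMConfig` dataclass and `load_config` helper are assumptions for illustration, not part of any released code, and the field semantics noted in comments (e.g., that `inner_loops` nests inside `outer_loops`) are inferred from the key names alone.

```python
import json
from dataclasses import dataclass

@dataclass
class NRMConfig:
    # Core transformer dimensions
    dim: int
    n_layers: int
    n_heads: int
    n_mem_tokens: int
    vocab_size: int
    # Recurrence schedule (assumed: total unrolled depth = outer_loops * inner_loops)
    inner_loops: int
    outer_loops: int
    truncation_loops: int
    # Mixture-of-experts routing
    moe_experts: int
    experts_per_token: int
    num_shared_experts: int
    # Multi-head latent attention (MLA) and RoPE settings
    use_mla: bool
    kv_latent_dim: int
    rope_head_dim: int
    rope_base: float
    # Multi-token prediction heads, feed-forward variant, early-exit probability
    mtp_num_heads: int
    use_conv_swiglu: bool
    p_exit: float
    tokenizer_class: str

def load_config(path: str) -> NRMConfig:
    """Parse a config.json like the one above into an NRMConfig."""
    with open(path) as f:
        raw = json.load(f)
    # "architectures" and "model_type" identify the model class rather than
    # hyperparameters, so they are dropped before constructing the dataclass.
    raw.pop("architectures", None)
    raw.pop("model_type", None)
    return NRMConfig(**raw)

if __name__ == "__main__":
    cfg = load_config("config.json")
    # Derived quantities (assumptions, inferred from the key names):
    print("max recurrent steps:", cfg.outer_loops * cfg.inner_loops)  # 16 * 8 = 128
    print("active experts per token:", cfg.experts_per_token + cfg.num_shared_experts)  # 2 + 2 = 4
```

One detail worth noting from the values themselves: with `n_layers` set to 1, depth appears to come from the loop counts rather than from stacked layers, which is why the derived step count in the sketch multiplies `outer_loops` by `inner_loops`.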