{
  "architectures": [
    "MetisMambaLMHeadModel"
  ],
  "model_type": "metis_mamba2_hybrid",
  "name": "Metis-1.3",
  "architecture": "mamba2_hybrid_decoder",
  "vocab_size": 8192,
  "block_size": 4096,
  "d_model": 1152,
  "n_layer": 28,
  "n_heads": 18,
  "n_kv_heads": 6,
  "head_dim": 64,
  "attn_layer_idx": [
    3,
    7,
    11,
    15,
    19,
    23,
    27
  ],
  "attn_d_conv": 4,
  "attn_rotary_emb_dim": 0,
  "ssm_layer": "Mamba2",
  "ssm_d_state": 64,
  "ssm_d_conv": 4,
  "ssm_expand": 2,
  "ssm_cfg": {
    "layer": "Mamba2",
    "d_state": 64,
    "d_conv": 4,
    "expand": 2
  },
  "attn_cfg": {
    "causal": true,
    "d_conv": 4,
    "head_dim": 64,
    "num_heads": 18,
    "num_heads_kv": 6,
    "qkv_proj_bias": false,
    "out_proj_bias": false,
    "rotary_emb_dim": 0
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "unk_token_id": 3,
  "rms_norm": true,
  "residual_in_fp32": false,
  "fused_add_norm": false,
  "pad_vocab_size_multiple": 16,
  "tie_embeddings": true,
  "torch_dtype": "bfloat16",
  "estimated_params": 201490560
}