Metis-1.4-base / config.json
GiuliannoV's picture
Fix Metis-1.4 base runtime config
5f864b3 verified
{
  "architectures": [
    "MetisMoRLMHeadModel"
  ],
  "model_type": "metis_mor_transformer",
  "name": "Metis-1.4",
  "architecture": "metis_mor_decoder",
  "vocab_size": 16384,
  "block_size": 1024,
  "d_model": 1536,
  "n_layer": 19,
  "n_heads": 24,
  "n_kv_heads": 8,
  "head_dim": 64,
  "intermediate_size": 4096,
  "hidden_act": "swiglu",
  "attn_cfg": {},
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "unk_token_id": 3,
  "rms_norm": true,
  "residual_in_fp32": false,
  "fused_add_norm": false,
  "pad_vocab_size_multiple": 16,
  "tie_embeddings": true,
  "torch_dtype": "bfloat16",
  "estimated_params": 503772163,
  "attention_bias": false,
  "mlp_bias": false,
  "attention_dropout": 0.0,
  "rope_theta": 10000.0,
  "attention_backend": "flash_attention_3",
  "fp8_pad_multiple": 16,
  "mor_max_depth": 3,
  "mor_router_hidden_dim": 256,
  "mor_router_temperature": 1.0,
  "mor_router_aux_loss_coef": 0.01,
  "mor_target_avg_depth": 1.4,
  "effective_layer_count": 57,
  "target_effective_layer_count": 28.5,
  "training_mode": "static_sequence_mor",
  "mor_runtime_mode": "static_sequence",
  "mor_enabled": true,
  "mor_train_router": false,
  "mor_depth2_capacity_sequences": 10,
  "mor_depth3_capacity_sequences": 6,
  "attention_mask_mode": "causal_none",
  "disable_token_packing": true,
  "disable_token_scatter": true,
  "lm_loss_impl": "standard",
  "te_fused_mlp": false
}