Metis-1.4-think / config.json
GiuliannoV's picture
Fix release config for static sequence MoR
b34c15c verified
{
"name": "Metis-1.4",
"model_type": "metis_mor_transformer",
"architecture": "metis_mor_decoder",
"vocab_size": 16384,
"block_size": 1024,
"d_model": 1536,
"n_layer": 19,
"n_heads": 24,
"n_kv_heads": 8,
"head_dim": 64,
"intermediate_size": 4096,
"hidden_act": "swiglu",
"tie_embeddings": true,
"rms_norm": true,
"residual_in_fp32": false,
"fused_add_norm": false,
"pad_vocab_size_multiple": 16,
"initializer_range": 0.02,
"torch_dtype": "bfloat16",
"attention_bias": false,
"mlp_bias": false,
"attention_dropout": 0.0,
"rope_theta": 10000.0,
"attention_backend": "flash_attention_3",
"training_mode": "static_sequence_mor",
"mor_enabled": true,
"mor_train_router": false,
"mor_runtime_mode": "static_sequence",
"attention_mask_mode": "causal_none",
"disable_depth_stack": false,
"disable_token_packing": true,
"disable_token_scatter": true,
"debug_attention_backend": false,
"debug_perf_counters": false,
"native_gqa_attention": true,
"te_dot_product_attention": false,
"cuda_graphs": false,
"cuda_graph_scope": "none",
"fp8_dpa": false,
"fp8_mha": false,
"te_fused_mlp": true,
"lm_loss_impl": "liger_fused_linear_ce",
"mor_max_depth": 3,
"mor_router_hidden_dim": 256,
"mor_router_temperature": 1.0,
"mor_router_aux_loss_coef": 0.0,
"mor_target_avg_depth": 1.4,
"mor_depth2_capacity_sequences": 10,
"mor_depth3_capacity_sequences": 6,
"mor_block_size": 128,
"mor_depth2_capacity_blocks": 0,
"mor_depth3_capacity_blocks": 0,
"block_mor_attention_mode": "local_block_refinement",
"fp8_pad_multiple": 16,
"estimated_params": 503772163,
"attn_cfg": {
"causal": true,
"head_dim": 64,
"num_heads": 24,
"num_heads_kv": 8,
"attention_bias": false,
"dropout": 0.0,
"rope_theta": 10000.0,
"backend": "flash_attention_3",
"native_gqa_attention": true,
"te_dot_product_attention": false
},
"effective_layer_count": 57,
"target_effective_layer_count": 26.599999999999998
}