{
  "name": "Metis-1.4",
  "model_type": "metis_mor_transformer",
  "architecture": "metis_mor_decoder",
  "vocab_size": 16384,
  "block_size": 1024,
  "d_model": 1536,
  "n_layer": 19,
  "n_heads": 24,
  "n_kv_heads": 8,
  "head_dim": 64,
  "intermediate_size": 4096,
  "hidden_act": "swiglu",
  "tie_embeddings": true,
  "rms_norm": true,
  "residual_in_fp32": false,
  "fused_add_norm": false,
  "pad_vocab_size_multiple": 16,
  "initializer_range": 0.02,
  "torch_dtype": "bfloat16",
  "attention_bias": false,
  "mlp_bias": false,
  "attention_dropout": 0.0,
  "rope_theta": 10000.0,
  "attention_backend": "flash_attention_3",
  "training_mode": "static_sequence_mor",
  "mor_enabled": true,
  "mor_train_router": false,
  "mor_runtime_mode": "static_sequence",
  "attention_mask_mode": "causal_none",
  "disable_depth_stack": false,
  "disable_token_packing": true,
  "disable_token_scatter": true,
  "debug_attention_backend": false,
  "debug_perf_counters": false,
  "native_gqa_attention": true,
  "te_dot_product_attention": false,
  "cuda_graphs": false,
  "cuda_graph_scope": "none",
  "fp8_dpa": false,
  "fp8_mha": false,
  "te_fused_mlp": true,
  "lm_loss_impl": "liger_fused_linear_ce",
  "mor_max_depth": 3,
  "mor_router_hidden_dim": 256,
  "mor_router_temperature": 1.0,
  "mor_router_aux_loss_coef": 0.0,
  "mor_target_avg_depth": 1.4,
  "mor_depth2_capacity_sequences": 10,
  "mor_depth3_capacity_sequences": 6,
  "mor_block_size": 128,
  "mor_depth2_capacity_blocks": 0,
  "mor_depth3_capacity_blocks": 0,
  "block_mor_attention_mode": "local_block_refinement",
  "fp8_pad_multiple": 16,
  "estimated_params": 503772163,
  "attn_cfg": {
    "causal": true,
    "head_dim": 64,
    "num_heads": 24,
    "num_heads_kv": 8,
    "attention_bias": false,
    "dropout": 0.0,
    "rope_theta": 10000.0,
    "backend": "flash_attention_3",
    "native_gqa_attention": true,
    "te_dot_product_attention": false
  },
  "effective_layer_count": 57,
  "target_effective_layer_count": 26.599999999999998
}