{ "architectures": [ "MetisMoRLMHeadModel" ], "model_type": "metis_mor_transformer", "name": "Metis-1.4", "architecture": "metis_mor_decoder", "vocab_size": 16384, "block_size": 1024, "d_model": 1536, "n_layer": 19, "n_heads": 24, "n_kv_heads": 8, "head_dim": 64, "intermediate_size": 4096, "hidden_act": "swiglu", "attn_cfg": {}, "bos_token_id": 1, "eos_token_id": 2, "pad_token_id": 0, "unk_token_id": 3, "rms_norm": true, "residual_in_fp32": false, "fused_add_norm": false, "pad_vocab_size_multiple": 16, "tie_embeddings": true, "torch_dtype": "bfloat16", "estimated_params": 503772163, "attention_bias": false, "mlp_bias": false, "attention_dropout": 0.0, "rope_theta": 10000.0, "attention_backend": "flash_attention_3", "fp8_pad_multiple": 16, "mor_max_depth": 3, "mor_router_hidden_dim": 256, "mor_router_temperature": 1.0, "mor_router_aux_loss_coef": 0.01, "mor_target_avg_depth": 1.4, "effective_layer_count": 57, "target_effective_layer_count": 28.5, "training_mode": "static_sequence_mor", "mor_runtime_mode": "static_sequence", "mor_enabled": true, "mor_train_router": false, "mor_depth2_capacity_sequences": 10, "mor_depth3_capacity_sequences": 6, "attention_mask_mode": "causal_none", "disable_token_packing": true, "disable_token_scatter": true, "lm_loss_impl": "standard", "te_fused_mlp": false }