{
  "name": "Metis-1.4",
  "model_type": "metis_mor_transformer",
  "architecture": "metis_mor_decoder",
  "vocab_size": 16384,
  "block_size": 1024,
  "d_model": 1536,
  "n_layer": 19,
  "n_heads": 24,
  "n_kv_heads": 8,
  "head_dim": 64,
  "intermediate_size": 4096,
  "hidden_act": "swiglu",
  "tie_embeddings": true,
  "rms_norm": true,
  "residual_in_fp32": false,
  "fused_add_norm": false,
  "pad_vocab_size_multiple": 16,
  "initializer_range": 0.02,
  "torch_dtype": "bfloat16",
  "attention_bias": false,
  "mlp_bias": false,
  "attention_dropout": 0.0,
  "rope_theta": 10000.0,
  "attention_backend": "flash_attention_3",
  "training_mode": "static_sequence_mor",
  "mor_enabled": true,
  "mor_train_router": false,
  "mor_runtime_mode": "static_sequence",
  "attention_mask_mode": "causal_none",
  "disable_depth_stack": false,
  "disable_token_packing": true,
  "disable_token_scatter": true,
  "debug_attention_backend": false,
  "debug_perf_counters": false,
  "native_gqa_attention": true,
  "te_dot_product_attention": false,
  "cuda_graphs": false,
  "cuda_graph_scope": "none",
  "fp8_dpa": false,
  "fp8_mha": false,
  "te_fused_mlp": true,
  "lm_loss_impl": "liger_fused_linear_ce",
  "mor_max_depth": 3,
  "mor_router_hidden_dim": 256,
  "mor_router_temperature": 1.0,
  "mor_router_aux_loss_coef": 0.0,
  "mor_target_avg_depth": 1.4,
  "mor_depth2_capacity_sequences": 10,
  "mor_depth3_capacity_sequences": 6,
  "mor_block_size": 128,
  "mor_depth2_capacity_blocks": 0,
  "mor_depth3_capacity_blocks": 0,
  "block_mor_attention_mode": "local_block_refinement",
  "fp8_pad_multiple": 16,
  "estimated_params": 503772163,
  "attn_cfg": {
    "causal": true,
    "head_dim": 64,
    "num_heads": 24,
    "num_heads_kv": 8,
    "attention_bias": false,
    "dropout": 0.0,
    "rope_theta": 10000.0,
    "backend": "flash_attention_3",
    "native_gqa_attention": true,
    "te_dot_product_attention": false
  },
  "effective_layer_count": 57,
  "target_effective_layer_count": 26.599999999999998
}