Lernex
/

Metis-1.4-think

Text Generation

metis_mor_transformer

Model card Files and versions

Metis-1.4-think / config.json

GiuliannoV's picture

Fix release config for static sequence MoR

b34c15c verified 25 days ago

history blame contribute delete

2.02 kB

	{
	"name": "Metis-1.4",
	"model_type": "metis_mor_transformer",
	"architecture": "metis_mor_decoder",
	"vocab_size": 16384,
	"block_size": 1024,
	"d_model": 1536,
	"n_layer": 19,
	"n_heads": 24,
	"n_kv_heads": 8,
	"head_dim": 64,
	"intermediate_size": 4096,
	"hidden_act": "swiglu",
	"tie_embeddings": true,
	"rms_norm": true,
	"residual_in_fp32": false,
	"fused_add_norm": false,
	"pad_vocab_size_multiple": 16,
	"initializer_range": 0.02,
	"torch_dtype": "bfloat16",
	"attention_bias": false,
	"mlp_bias": false,
	"attention_dropout": 0.0,
	"rope_theta": 10000.0,
	"attention_backend": "flash_attention_3",
	"training_mode": "static_sequence_mor",
	"mor_enabled": true,
	"mor_train_router": false,
	"mor_runtime_mode": "static_sequence",
	"attention_mask_mode": "causal_none",
	"disable_depth_stack": false,
	"disable_token_packing": true,
	"disable_token_scatter": true,
	"debug_attention_backend": false,
	"debug_perf_counters": false,
	"native_gqa_attention": true,
	"te_dot_product_attention": false,
	"cuda_graphs": false,
	"cuda_graph_scope": "none",
	"fp8_dpa": false,
	"fp8_mha": false,
	"te_fused_mlp": true,
	"lm_loss_impl": "liger_fused_linear_ce",
	"mor_max_depth": 3,
	"mor_router_hidden_dim": 256,
	"mor_router_temperature": 1.0,
	"mor_router_aux_loss_coef": 0.0,
	"mor_target_avg_depth": 1.4,
	"mor_depth2_capacity_sequences": 10,
	"mor_depth3_capacity_sequences": 6,
	"mor_block_size": 128,
	"mor_depth2_capacity_blocks": 0,
	"mor_depth3_capacity_blocks": 0,
	"block_mor_attention_mode": "local_block_refinement",
	"fp8_pad_multiple": 16,
	"estimated_params": 503772163,
	"attn_cfg": {
	"causal": true,
	"head_dim": 64,
	"num_heads": 24,
	"num_heads_kv": 8,
	"attention_bias": false,
	"dropout": 0.0,
	"rope_theta": 10000.0,
	"backend": "flash_attention_3",
	"native_gqa_attention": true,
	"te_dot_product_attention": false
	},
	"effective_layer_count": 57,
	"target_effective_layer_count": 26.6
	}