{ "model_name": "Qwen3-5M-MoE-2exp-active", "model_type": "Qwen3MoeForCausalLM", "tokenizer": "gpt2", "dtype": "bfloat16", "vocab_size": 50257, "hidden_size": 128, "num_layers": 8, "num_attention_heads": 4, "num_key_value_heads": 2, "head_dim": 32, "moe_intermediate_size": 1024, "num_experts": 64, "num_experts_per_tok": 2, "sliding_window": 512, "max_position_embeddings": 8192, "rope_theta": 500000, "layer_types": [ "sliding_attention", "sliding_attention", "full_attention", "sliding_attention", "sliding_attention", "full_attention", "sliding_attention", "sliding_attention" ], "max_window_layers": 6, "parameters_total": 208220928, "parameters_active": 13186816, "active_ratio": 0.03125, "positional_encoding": "rope", "normalization": "rmsnorm", "activation": "swiglu", "tie_word_embeddings": true }