{
  "model_name": "Qwen3-5M-MoE-2exp-active",
  "model_type": "Qwen3MoeForCausalLM",
  "tokenizer": "gpt2",
  "dtype": "bfloat16",
  "vocab_size": 50257,
  "hidden_size": 128,
  "num_layers": 8,
  "num_attention_heads": 4,
  "num_key_value_heads": 2,
  "head_dim": 32,
  "moe_intermediate_size": 1024,
  "num_experts": 64,
  "num_experts_per_tok": 2,
  "sliding_window": 512,
  "max_position_embeddings": 8192,
  "rope_theta": 500000,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention"
  ],
  "max_window_layers": 6,
  "parameters_total": 208220928,
  "parameters_active": 13186816,
  "active_ratio": 0.03125,
  "positional_encoding": "rope",
  "normalization": "rmsnorm",
  "activation": "swiglu",
  "tie_word_embeddings": true
}
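A minimal sketch of how this spec could be instantiated with Hugging Face `transformers`, assuming a release that ships `Qwen3MoeConfig` and `Qwen3MoeForCausalLM`. Keyword names follow the upstream config class where they differ from the JSON (for example, `num_layers` maps to `num_hidden_layers`); the per-layer sliding/full attention pattern (`layer_types`) is left to library defaults here because its handling varies across versions.

import torch
from transformers import AutoTokenizer, Qwen3MoeConfig, Qwen3MoeForCausalLM

# Assumption: a transformers version that includes the Qwen3-MoE architecture.
config = Qwen3MoeConfig(
    vocab_size=50257,          # matches the GPT-2 tokenizer vocabulary
    hidden_size=128,
    num_hidden_layers=8,       # "num_layers" in the JSON spec
    num_attention_heads=4,
    num_key_value_heads=2,     # grouped-query attention
    head_dim=32,               # 4 heads * 32 = hidden_size
    moe_intermediate_size=1024,
    num_experts=64,
    num_experts_per_tok=2,     # 2 of 64 experts -> active_ratio = 0.03125
    max_position_embeddings=8192,
    rope_theta=500000,
    sliding_window=512,        # assumed to be honored by this config version
    max_window_layers=6,
    tie_word_embeddings=True,  # lm_head shares weights with the embedding
    hidden_act="silu",         # SiLU gate/up/down projections, i.e. SwiGLU
)

model = Qwen3MoeForCausalLM(config).to(dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

total = sum(p.numel() for p in model.parameters())
print(f"total parameters: {total:,}")  # expected to land near 208,220,928

The printed total should roughly match `parameters_total`; `parameters_active` is smaller because only 2 of the 64 experts per layer participate in any given token's forward pass.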