{ "architectures": [ "DeepSeekMoE" ], "block_size": 2048, "dtype": "bfloat16", "expert_intermediate_size": 1408, "head_dim": 64, "model_type": "eve_moe", "n_embd": 512, "n_head": 8, "n_layer": 12, "num_experts": 8, "rope_theta": 10000.0, "router_aux_loss_coef": 0.01, "shared_expert_intermediate_size": 1408, "top_k": 2, "transformers_version": "5.1.0", "use_cache": false, "use_checkpointing": false, "vocab_size": 50304 }