{
  "num_attention_heads": 8,
  "input_dim": 512,
  "embed_dim": 512,
  "q_latent_dim": 128,
  "kv_latent_dim": 128,
  "max_token_len": 512,
  "num_shared_experts": 2,
  "num_routed_experts": 4,
  "moe_top_k": 2,
  "expert_intermediate_dim": 1536,
  "num_dense_ffn": 1,
  "num_moe_ffn": 2,
  "vocab_size": 50257,
  "max_batch_size": 24
}