{
  "vocab_size": 114,
  "hidden_size": 256,
  "num_hidden_layers": 6,
  "num_attention_heads": 8,
  "num_key_value_heads": 8,
  "intermediate_size": 768,
  "max_position_embeddings": 8192,
  "rms_norm_eps": 1e-6,
  "initializer_range": 0.02,
  "use_cache": true,
  "num_experts": 8,
  "num_experts_per_tok": 2,
  "expert_capacity_factor": 1.5,
  "router_aux_loss_coef": 0.01,
  "moe_implementation": "megablocks",
  "moe_world_size": 4,
  "resid_dropout": 0.0,
  "hidden_dropout": 0.0,
  "dropout_warmup_steps": 0,
  "dropout_ramp_steps": 0,
  "dropout_schedule": "linear",
  "attention_dropout": 0.0,
  "gradient_clip_norm": 0.0,
  "label_smoothing": 0.0,
  "eos_loss_weight": 10.0
}