molm_inverted_100BT / config.json
{
"architectures": [
"MoLM"
],
"auto_map": {
"AutoConfig": "configuration.MoLMConfig",
"AutoModelForCausalLM": "modeling.MoLM",
"AutoTokenizer": "GPT2TokenizerFast"
},
"bias": false,
"dropout": 0.0,
"expert_configs": [
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
},
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
},
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
},
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
},
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
},
{
"bias": false,
"dropout": 0.0,
"dtype": "torch.bfloat16",
"mlp_dim_exp_factor": 1,
"moe": false,
"moe_aux_loss_factor": 0.01,
"moe_num_experts": 1,
"moe_num_experts_per_tok": 2,
"moe_router_loss": "load_balancing_z_loss",
"moe_routing": null,
"moe_softmax_order": "softmax_topk",
"moe_z_loss_factor": 1,
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"routing": null,
"sequence_length": 1024,
"shared_attention": true,
"vocab_size": 50304
}
],
"mlp_dim_exp_factor": 1.0,
"model_type": "MoLM",
"n_embd": 1152,
"n_head": 16,
"n_layer": 24,
"num_experts": 6,
"sequence_length": 1024,
"use_router": false,
"vocab_size": 50304
}
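
A minimal loading sketch, assuming the checkpoint lives at the hub path robinfaro/molm_inverted_100BT (inferred from the page title, not confirmed by this file). Because auto_map points at custom classes shipped with the repo (configuration.MoLMConfig and modeling.MoLM), transformers must be allowed to execute the repo's own code via trust_remote_code=True.

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "robinfaro/molm_inverted_100BT"  # assumed hub repo id; adjust as needed

# auto_map routes AutoConfig / AutoModelForCausalLM to the custom classes in
# this repo, so trust_remote_code=True is required to instantiate them.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# sequence_length is 1024, so keep prompt plus generation within that window.
inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))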