{ "architectures": [ "MoLM" ], "auto_map": { "AutoConfig": "configuration.MoLMConfig", "AutoModelForCausalLM": "robinfaro/molm_log_prob_router--modeling.MoLM", "AutoTokenizer": "robinfaro/molm_log_prob_router--GPT2TokenizerFast" }, "bias": false, "dropout": 0.0, "expert_configs": [ { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 }, { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 }, { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 }, { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 }, { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 }, { "bias": false, "dropout": 0.0, "dtype": "torch.bfloat16", "mlp_dim_exp_factor": 1, "moe": false, "moe_aux_loss_factor": 0.01, "moe_num_experts": 1, "moe_num_experts_per_tok": 2, "moe_router_loss": "load_balancing_z_loss", "moe_routing": null, "moe_softmax_order": "softmax_topk", "moe_z_loss_factor": 1, "n_embd": 1152, "n_head": 16, "n_layer": 24, "routing": null, "sequence_length": 1024, "shared_attention": true, "vocab_size": 50304 } ], "mlp_dim_exp_factor": 1.0, "model_type": "MoLM", "n_embd": 1152, "n_head": 16, "n_layer": 24, "num_experts": 6, "sequence_length": 1024, "top_k_experts": 6, "torch_dtype": "float32", "transformers_version": "4.51.0", "use_router": true, "vocab_size": 50304 }