{
  "activation": "swiglu",
  "alternate_sparsity": true,
  "architectures": [
    "BalmMoEForMaskedLM"
  ],
  "attention_classifier": false,
  "attention_dropout": 0.1,
  "classifier_activation": "tanh",
  "classifier_attention_heads": 20,
  "classifier_freeze_base": true,
  "dropout": 0.1,
  "expert_activation": "swiglu",
  "expert_bias": true,
  "expert_capacity": 1.0,
  "expert_capacity_type": "multiplier",
  "expert_dropout": 0.1,
  "expert_intermediate_size": [
    2560,
    2560,
    2560,
    2560,
    2560,
    2560,
    2560,
    2560
  ],
  "ffn_bias": true,
  "hidden_dropout": 0.1,
  "hidden_size": 640,
  "homogeneous_experts": false,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "layer_norm_eps": 1e-05,
  "mask_token_id": 31,
  "max_position_embeddings": 256,
  "mlm_activation": "gelu",
  "model_type": "balm_moe",
  "num_attention_heads": 20,
  "num_experts": 8,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 30,
  "num_initial_dense_layers": 1,
  "num_shared_experts": 0,
  "output_classifier_attentions": false,
  "output_expert_indexes": false,
  "output_router_logits": false,
  "pad_token_id": 1,
  "position_embedding_type": "rotary",
  "router_aux_loss_coef": 0.01,
  "router_bias": false,
  "router_dtype": "float32",
  "router_dynamic_loss_coef": 0.0001,
  "router_jitter": 0.0,
  "router_mask_aux_loss": true,
  "router_mask_pad_logits": false,
  "router_mask_pad_probs": true,
  "router_penalty_loss_coef": 0.1,
  "router_type": "top-k",
  "router_use_penalty_loss": false,
  "router_z_loss_coef": 0.001,
  "shared_expert_intermediate_size": 2560,
  "top_p_threshold": 0.7,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0",
  "use_cache": true,
  "vocab_size": 32
}