```python
from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,                 # embedding / hidden dimension
        n_layer=12,                 # number of transformer blocks
        n_head=12,                  # attention heads per block
        sequence_length=1024,       # maximum context length
        moe=False,                  # use MoE layers instead of dense MLPs
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,  # top-k experts activated per token
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,   # weight of the load-balancing auxiliary loss
        moe_z_loss_factor=1.0,      # weight of the router z-loss
        mlp_dim_exp_factor=1.0,     # expansion factor for the MLP hidden dim
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        # auto_map tells the Auto* classes which custom modules to load
        # from the checkpoint repo when trust_remote_code=True.
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast",
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = architectures
        self.auto_map = auto_map
```
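
To see how this config plugs into the `Auto*` machinery, here is a minimal sketch of registering it locally and round-tripping it through `save_pretrained`/`from_pretrained`. It assumes the class above lives in `configuration.py` (matching the `auto_map` entry); the checkpoint directory name is illustrative.

```python
from transformers import AutoConfig

from configuration import MoEGPTConfig  # the class defined above

# Register the custom config so AutoConfig can resolve model_type="moegpt"
# for local checkpoints without trust_remote_code.
AutoConfig.register("moegpt", MoEGPTConfig)

# Instantiate an MoE variant and serialize it to config.json.
config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_num_experts_per_tok=2)
config.save_pretrained("moegpt-checkpoint")  # illustrative path

# Reload: the saved model_type ("moegpt") maps back to MoEGPTConfig.
reloaded = AutoConfig.from_pretrained("moegpt-checkpoint")
assert reloaded.moe_num_experts == 8
```

When the checkpoint is pushed to the Hub instead, no local registration is needed: the `auto_map` entries baked into `config.json` let `AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)` resolve `modeling.MoEGPTForCausalLM` directly from the repo's files.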