from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    """Configuration for MoEGPT, a GPT-style causal language model with
    optional Mixture-of-Experts (MoE) routing in its feed-forward blocks."""

    model_type = "moegpt"

    def __init__(
        self,
        # Transformer backbone
        vocab_size=50304,  # GPT-2 BPE vocab (50257) padded up to a multiple of 64
        n_embd=1152,
        n_layer=24,
        n_head=16,
        sequence_length=1024,
        # Mixture-of-Experts settings (only consulted when moe=True)
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        # MLP width multiplier and regularization
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        # Default to None instead of a mutable list/dict so that instances
        # never share (and accidentally mutate) the same default objects.
        architectures=None,
        auto_map=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = (
            architectures if architectures is not None else ["MoEGPTForCausalLM"]
        )
        self.auto_map = (
            auto_map
            if auto_map is not None
            else {
                "AutoConfig": "configuration.MoEGPTConfig",
                "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
                "AutoTokenizer": "GPT2TokenizerFast",
            }
        )
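

# Minimal usage sketch (illustrative only, not required by the module):
# instantiate the config and round-trip it through save_pretrained /
# from_pretrained to confirm it serializes cleanly to config.json. The
# directory name "moegpt-checkpoint" is a placeholder, not a real checkpoint.
if __name__ == "__main__":
    config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_num_experts_per_tok=2)
    config.save_pretrained("moegpt-checkpoint")  # writes moegpt-checkpoint/config.json
    reloaded = MoEGPTConfig.from_pretrained("moegpt-checkpoint")
    assert reloaded.moe_num_experts == 8 and reloaded.moe is True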