from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=1152,
        n_layer=24,
        n_head=16,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast",
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Transformer backbone hyperparameters.
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        # Mixture-of-Experts settings: routing scheme, expert counts, and router loss weights.
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        # Hub metadata so the Auto* classes resolve to the custom implementation.
        self.architectures = architectures
        self.auto_map = auto_map
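For context, a minimal usage sketch of this config class. It assumes the file above is saved as `configuration.py` (the module name referenced in `auto_map`) and that the directory name `moegpt-checkpoint` is just a placeholder; neither is specified by the original file.

```python
from transformers import AutoConfig

from configuration import MoEGPTConfig  # assumed local module name, matching auto_map

# Build a config with MoE layers enabled, overriding a few routing defaults.
config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_aux_loss_factor=0.02)
config.save_pretrained("moegpt-checkpoint")  # writes config.json incl. auto_map/architectures

# Reloading through AutoConfig works as long as configuration.py ships alongside
# the checkpoint and custom code is explicitly trusted.
reloaded = AutoConfig.from_pretrained("moegpt-checkpoint", trust_remote_code=True)
print(reloaded.model_type, reloaded.moe_num_experts)  # "moegpt" 8
```

The `auto_map` entries are what let `AutoConfig` and `AutoModelForCausalLM` find the custom classes when `trust_remote_code=True` is passed, while `AutoTokenizer` falls back to the stock `GPT2TokenizerFast`.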