from transformers import PretrainedConfig

class MoEGPTConfig(PretrainedConfig):
    """Configuration for a GPT-style causal language model with optional
    Mixture-of-Experts (MoE) feed-forward layers, covering routing strategy,
    expert counts, and router auxiliary-loss settings."""

    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,
        n_layer=12,
        n_head=12,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast"
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = architectures
        self.auto_map = auto_map
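

# Minimal usage sketch (an illustrative addition, not part of the hosted file):
# round-trip the config with transformers' standard save/load API. The output
# directory name and the chosen field values below are hypothetical.
if __name__ == "__main__":
    # Build a config with MoE routing enabled; unspecified fields keep the defaults above.
    config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_num_experts_per_tok=2)

    # Serialize to config.json; the auto_map entries let AutoModelForCausalLM.from_pretrained
    # resolve modeling.MoEGPTForCausalLM when loading with trust_remote_code=True.
    config.save_pretrained("moegpt-checkpoint")

    # Reload from disk and confirm the MoE fields survived the round trip.
    reloaded = MoEGPTConfig.from_pretrained("moegpt-checkpoint")
    print(reloaded.moe, reloaded.moe_num_experts, reloaded.n_embd)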