cr0sh commited on
Commit
991ee72
·
verified ·
1 Parent(s): 87345f9

Upload configuration_afmoe.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_afmoe.py +133 -0
configuration_afmoe.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+ from transformers.configuration_utils import layer_type_validation
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ class AfmoeConfig(PretrainedConfig):
23
+ """
24
+ n_group (`int`, *optional*, defaults to 1):
25
+ Number of groups for routed experts.
26
+ topk_group (`int`, *optional*, defaults to 1):
27
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
28
+ """
29
+ model_type = "afmoe"
30
+ base_model_pp_plan = {
31
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
32
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
33
+ "norm": (["hidden_states"], ["hidden_states"]),
34
+ }
35
+
36
+ def __init__(
37
+ self,
38
+ num_hidden_layers: int = 32,
39
+ vocab_size: int = 200192,
40
+ hidden_size: int = 2048,
41
+ intermediate_size: int = 6144,
42
+ moe_intermediate_size=1408,
43
+ num_dense_layers=1,
44
+ num_attention_heads=16,
45
+ num_key_value_heads=None,
46
+ head_dim=128,
47
+ hidden_act="silu",
48
+ max_position_embeddings=16384,
49
+ initializer_range=0.02,
50
+ rms_norm_eps=1e-5,
51
+ use_cache=True,
52
+ tie_word_embeddings=False,
53
+ rope_theta=10000.0,
54
+ rope_scaling=None,
55
+ num_experts=64,
56
+ num_experts_per_tok=6,
57
+ num_shared_experts=2,
58
+ num_expert_groups=1,
59
+ num_limited_groups=1,
60
+ score_func="sigmoid",
61
+ route_norm=True,
62
+ route_scale=1.0,
63
+ global_attn_every_n_layers=4,
64
+ sliding_window=1024,
65
+ mup_enabled=False,
66
+ layer_types=None,
67
+ attention_dropout: float = 0.0,
68
+ n_group: int = 1,
69
+ topk_group: int = 1,
70
+ **kwargs,
71
+ ):
72
+ self.vocab_size = vocab_size
73
+ self.max_position_embeddings = max_position_embeddings
74
+ self.hidden_size = hidden_size
75
+ self.intermediate_size = intermediate_size
76
+ self.num_hidden_layers = num_hidden_layers
77
+ self.num_dense_layers = num_dense_layers
78
+ self.num_attention_heads = num_attention_heads
79
+ self.head_dim = head_dim
80
+ self.hidden_act = hidden_act
81
+ self.initializer_range = initializer_range
82
+ self.rms_norm_eps = rms_norm_eps
83
+ self.use_cache = use_cache
84
+ self.rope_theta = rope_theta
85
+ self.rope_scaling = rope_scaling
86
+
87
+
88
+ # MoE specific
89
+ self.moe_intermediate_size = moe_intermediate_size
90
+ self.num_experts_per_tok = num_experts_per_tok
91
+ self.n_group = n_group
92
+ self.topk_group = topk_group
93
+ self.num_experts = num_experts
94
+ self.num_shared_experts = num_shared_experts
95
+ self.num_expert_groups = num_expert_groups
96
+ self.num_limited_groups = num_limited_groups
97
+ self.score_func = score_func
98
+ self.route_norm = route_norm
99
+ self.route_scale = route_scale
100
+
101
+
102
+ # Attention specific
103
+ self.attention_dropout = attention_dropout
104
+ self.global_attn_every_n_layers = global_attn_every_n_layers
105
+ self.sliding_window = sliding_window
106
+ self.layer_types = layer_types
107
+ if self.layer_types is None:
108
+ self.layer_types = [
109
+ "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers)
110
+ ]
111
+ layer_type_validation(self.layer_types)
112
+
113
+ # muP specific
114
+ self.mup_enabled = mup_enabled
115
+
116
+ if num_key_value_heads is None:
117
+ num_key_value_heads = num_attention_heads
118
+
119
+ self.num_key_value_heads = num_key_value_heads
120
+
121
+
122
+ # Validate rope configs
123
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
124
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
125
+ rope_config_validation(self)
126
+
127
+ super().__init__(
128
+ tie_word_embeddings=tie_word_embeddings,
129
+ **kwargs,
130
+ )
131
+
132
+
133
+ __all__ = ["AfmoeConfig"]