jeffkang-lunit committed
Commit 97cd573 · verified · 1 Parent(s): ab02c4c

Upload configuration_gravity_moe.py with huggingface_hub

Files changed (1)
  1. configuration_gravity_moe.py +40 -107
configuration_gravity_moe.py CHANGED
@@ -11,95 +11,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""GravityMoE model configuration"""
+"""GravityMoE model configuration — inherits from DeepSeek V3."""

-try:
-    from transformers.configuration_utils import PreTrainedConfig
-except ImportError:
-    from transformers.configuration_utils import PretrainedConfig as PreTrainedConfig
+from transformers import DeepseekV3Config


-class GravityMoEConfig(PreTrainedConfig):
+class GravityMoEConfig(DeepseekV3Config):
     r"""
-    Configuration class for the GravityMoE model.
+    Configuration class for the GravityMoE model, inheriting from
+    [`DeepseekV3Config`]. GravityMoE shares the same architecture as
+    DeepSeek V3 (sparse MoE with MLA) but uses different hyperparameters.

-    Args:
-        vocab_size (`int`, *optional*, defaults to 151552):
-            Vocabulary size of the model.
-        hidden_size (`int`, *optional*, defaults to 2048):
-            Dimensionality of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 8192):
-            Dimensionality of the dense MLP intermediate representations.
-        moe_intermediate_size (`int`, *optional*, defaults to 1408):
-            Dimensionality of the MoE expert intermediate representations.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
-            Number of hidden layers in the model.
-        num_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
-            Number of key-value heads.
-        n_shared_experts (`int`, *optional*, defaults to 1):
-            Number of shared experts.
-        n_routed_experts (`int`, *optional*, defaults to 64):
-            Number of routed experts.
-        routed_scaling_factor (`float`, *optional*, defaults to 2.446):
-            Scaling factor for routed expert weights.
-        kv_lora_rank (`int`, *optional*, defaults to 512):
-            Rank of the low-rank KV projection.
-        q_lora_rank (`int`, *optional*, defaults to `None`):
-            Rank of the low-rank Q projection. If `None`, uses standard Q projection.
-        qk_rope_head_dim (`int`, *optional*, defaults to 64):
-            Dimensionality of the RoPE component of QK heads.
-        v_head_dim (`int`, *optional*, defaults to 128):
-            Dimensionality of value heads.
-        qk_nope_head_dim (`int`, *optional*, defaults to 128):
-            Dimensionality of the non-RoPE component of QK heads.
-        n_group (`int`, *optional*, defaults to 1):
-            Number of groups for routed experts.
-        topk_group (`int`, *optional*, defaults to 1):
-            Number of top-k groups.
-        num_experts_per_tok (`int`, *optional*, defaults to 8):
-            Number of experts activated per token.
-        first_k_dense_replace (`int`, *optional*, defaults to 1):
-            Number of initial dense layers before MoE layers.
-        norm_topk_prob (`bool`, *optional*, defaults to `True`):
-            Whether to normalize top-k probabilities.
-        hidden_act (`str`, *optional*, defaults to `"silu"`):
-            Activation function.
-        max_position_embeddings (`int`, *optional*, defaults to 65536):
-            Maximum sequence length.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            Standard deviation for weight initialization.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
-            Epsilon for RMS normalization.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether to use KV cache.
-        rope_theta (`float`, *optional*, defaults to 1000000.0):
-            Base frequency for RoPE.
-        rope_scaling (`dict`, *optional*):
-            RoPE scaling configuration.
-        rope_interleave (`bool`, *optional*, defaults to `True`):
-            Whether to interleave the rotary position embeddings.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in attention projections.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            Dropout rate for attention weights.
+    Only default values that differ from DeepSeek V3 are overridden here.
+    See [`DeepseekV3Config`] for full documentation of all parameters.

     Example:

     ```python
     >>> from configuration_gravity_moe import GravityMoEConfig

-    >>> # Initializing a GravityMoE style configuration
     >>> configuration = GravityMoEConfig()
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
+    >>> configuration.model_type
+    'gravity_moe'
     ```
     """

     model_type = "gravity_moe"
-    keys_to_ignore_at_inference = ["past_key_values"]

     def __init__(
         self,
@@ -139,45 +76,41 @@ class GravityMoEConfig(PreTrainedConfig):
         attention_dropout=0.0,
         **kwargs,
     ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.num_local_experts = n_routed_experts
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
-        self.head_dim = qk_rope_head_dim
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.hidden_act = hidden_act
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.rope_interleave = rope_interleave
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-
         super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            moe_intermediate_size=moe_intermediate_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            n_shared_experts=n_shared_experts,
+            n_routed_experts=n_routed_experts,
+            routed_scaling_factor=routed_scaling_factor,
+            kv_lora_rank=kv_lora_rank,
+            q_lora_rank=q_lora_rank,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            qk_nope_head_dim=qk_nope_head_dim,
+            n_group=n_group,
+            topk_group=topk_group,
+            num_experts_per_tok=num_experts_per_tok,
+            first_k_dense_replace=first_k_dense_replace,
+            norm_topk_prob=norm_topk_prob,
+            hidden_act=hidden_act,
+            max_position_embeddings=max_position_embeddings,
+            initializer_range=initializer_range,
+            rms_norm_eps=rms_norm_eps,
+            use_cache=use_cache,
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_interleave=rope_interleave,
+            attention_bias=attention_bias,
+            attention_dropout=attention_dropout,
             **kwargs,
         )
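Usage note: because the new `GravityMoEConfig` subclasses `DeepseekV3Config` and ships as custom code in the repository rather than inside `transformers`, loading it presumably requires a `transformers` release that includes DeepSeek V3 support and `trust_remote_code=True`. A minimal sketch of how the uploaded configuration might be loaded is below; the repository id is a placeholder, not taken from this commit.

```python
from transformers import AutoConfig

# Placeholder repo id -- substitute the Hub repository that hosts this file.
# trust_remote_code=True lets transformers import configuration_gravity_moe.py
# from the repository instead of a built-in configuration class.
config = AutoConfig.from_pretrained(
    "your-org/gravity-moe",
    trust_remote_code=True,
)

print(config.model_type)           # "gravity_moe"
print(config.num_experts_per_tok)  # MoE hyperparameter inherited from DeepseekV3Config
```

Direct instantiation (`GravityMoEConfig()`), as in the docstring example, also works when `configuration_gravity_moe.py` is importable from the local path.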