drizzlezyk committed on
Commit cd42f96 · verified · 1 Parent(s): 7bb83c2

Upload configuration_pangu_moe.py with huggingface_hub

Files changed (1)
  1. configuration_pangu_moe.py +96 -0
configuration_pangu_moe.py ADDED
@@ -0,0 +1,96 @@
+ # coding=utf-8
+ # Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """PanguProMoE model configuration."""
+
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class PanguProMoEConfig(PretrainedConfig):
+
+     model_type = "PanguProMoE"
+     _auto_class = "AutoConfig"
+
+     def __init__(
+         self,
+         vocab_size=153376,
+         hidden_size=4608,
+         intermediate_size=10240,
+         num_hidden_layers=50,
+         num_attention_heads=64,
+         num_key_value_heads=4,
+         mlp_only_layers=[0, 1, 2, 3],
+         hidden_act="silu",
+         max_position_embeddings=8192,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=100000,
+         moe_intermediate_size=1280,
+         shared_expert_intermediate_size=2560,
+         num_experts_per_tok=8,
+         num_experts=80,
+         norm_topk_prob=True,
+         router_enable_expert_bias=True,
+         output_router_logits=False,
+         routed_scaling_factor=2.5,
+         qk_nope_dim=128,
+         qk_rope_dim=64,
+         v_channels=128,
+         sandwich_norm=True,
+         param_sink_number=128,
+         param_sink_with_value=True,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.mlp_only_layers = mlp_only_layers
+         self.intermediate_size = intermediate_size
+
+         # MoE arguments
+         self.moe_intermediate_size = moe_intermediate_size
+         self.shared_expert_intermediate_size = shared_expert_intermediate_size
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_experts = num_experts
+         self.norm_topk_prob = norm_topk_prob
+         self.output_router_logits = output_router_logits
+         self.router_enable_expert_bias = router_enable_expert_bias
+         self.routed_scaling_factor = routed_scaling_factor
+         self.qk_nope_dim = qk_nope_dim
+         self.qk_rope_dim = qk_rope_dim
+         self.v_channels = v_channels
+         self.sandwich_norm = sandwich_norm
+         self.param_sink_number = param_sink_number
+         self.param_sink_with_value = param_sink_with_value
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
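
For reference, a minimal usage sketch (not part of the commit) showing how the uploaded configuration class can be instantiated and round-tripped with the standard PretrainedConfig serialization helpers. The local import path and the save directory name are assumptions for illustration only.

# Usage sketch, assuming configuration_pangu_moe.py is importable from the working directory.
from configuration_pangu_moe import PanguProMoEConfig

# Build a config with the defaults defined above, overriding one field as an example.
config = PanguProMoEConfig(max_position_embeddings=4096)
print(config.model_type)                                # "PanguProMoE"
print(config.num_experts, config.num_experts_per_tok)   # 80 routed experts, top-8 routing

# PretrainedConfig provides JSON round-tripping out of the box;
# "./pangu_moe_config" is an illustrative local directory, not part of this commit.
config.save_pretrained("./pangu_moe_config")             # writes config.json
reloaded = PanguProMoEConfig.from_pretrained("./pangu_moe_config")
assert reloaded.hidden_size == config.hidden_size

Because the class sets _auto_class = "AutoConfig", a repository that ships this file alongside its config.json can also expose it through AutoConfig.from_pretrained(..., trust_remote_code=True).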