Monoclebear committed
Commit 3243d04 · verified · 1 Parent(s): 1874f18

Upload configuration_bailing_moe_v2_5.py with huggingface_hub

Files changed (1)
  1. configuration_bailing_moe_v2_5.py +120 -0
configuration_bailing_moe_v2_5.py ADDED
@@ -0,0 +1,120 @@
+ """Bailing MoE V2.5 model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class BailingMoeV2_5Config(PretrainedConfig):
+
+     def __init__(
+         self,
+         vocab_size=157184,
+         hidden_size=2048,
+         intermediate_size=5120,
+         num_hidden_layers=20,
+         num_attention_heads=16,
+         num_key_value_heads=4,
+         hidden_act="silu",
+         use_qkv_bias=False,  # bailing only
+         use_bias=False,  # bailing only
+         rms_norm_eps=1e-06,
+         tie_word_embeddings=False,  # PretrainedConfig key; the default is overridden here.
+         embedding_dropout=0.0,
+         attention_dropout=0.0,
+         output_dropout=0.0,
+         initializer_range=0.02,
+         max_position_embeddings=32768,
+         rope_theta=600000.0,
+         use_cache=True,
+         max_window_layers=20,
+         rope_scaling=None,
+         pad_token_id=156892,
+         eos_token_id=156892,
+         num_experts=256,
+         num_shared_experts=1,
+         num_experts_per_tok=8,
+         n_group=8,
+         topk_group=4,
+         moe_intermediate_size=512,
+         first_k_dense_replace=1,
+         head_dim=128,
+         output_router_logits=False,
+         use_qk_norm=True,
+         num_nextn_predict_layers=0,
+         mtp_loss_scaling_factor=0,
+         moe_router_enable_expert_bias=True,
+         routed_scaling_factor=1.0,
+         layer_group_size=5,
+         group_norm_size=4,
+         linear_silu=False,
+         kv_lora_rank=512,
+         q_lora_rank=None,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         qk_nope_head_dim=128,
+         rope_interleave=True,
+         partial_rotary_factor=0.5,
+         score_function="sigmoid",
+         scoring_func="sigmoid",
+         seq_aux=True,
+         topk_method="noaux_tc",
+         router_dtype="fp32",
+         **kwargs,
+     ):
+         self.num_hidden_layers = num_hidden_layers
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.use_qkv_bias = use_qkv_bias
+         self.use_bias = use_bias
+         self.rms_norm_eps = rms_norm_eps
+         self.embedding_dropout = embedding_dropout
+         self.attention_dropout = attention_dropout
+         self.output_dropout = output_dropout
+         self.num_nextn_predict_layers = num_nextn_predict_layers
+         self.mtp_loss_scaling_factor = mtp_loss_scaling_factor
+         self.initializer_range = initializer_range
+         self.max_position_embeddings = max_position_embeddings
+         self.rope_theta = rope_theta
+         self.use_cache = use_cache
+         self.max_window_layers = max_window_layers
+         self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
+         self.rope_scaling = rope_scaling
+         self.use_qk_norm = use_qk_norm
+         self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+         self.routed_scaling_factor = routed_scaling_factor
+
+         # MoE configs
+         self.num_experts = num_experts
+         self.num_shared_experts = num_shared_experts
+         self.num_experts_per_tok = num_experts_per_tok
+         self.n_group = n_group
+         self.topk_group = topk_group
+         self.moe_intermediate_size = moe_intermediate_size
+         self.first_k_dense_replace = first_k_dense_replace
+         self.output_router_logits = output_router_logits
+
+         # Linear configs
+         self.layer_group_size = layer_group_size
+         self.group_norm_size = group_norm_size
+         self.linear_silu = linear_silu
+         # MLA (multi-head latent attention) configs
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+
+         self.score_function = score_function
+         self.scoring_func = scoring_func
+         self.seq_aux = seq_aux
+         self.topk_method = topk_method
+         self.v_head_dim = v_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+         self.rope_interleave = rope_interleave
+         self.router_dtype = router_dtype
+         self.partial_rotary_factor = partial_rotary_factor
+         super().__init__(
+             pad_token_id=pad_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+         )
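For reference, a minimal usage sketch follows. It is an illustration, not part of the commit, and assumes configuration_bailing_moe_v2_5.py is importable from the working directory:

from configuration_bailing_moe_v2_5 import BailingMoeV2_5Config

# Instantiate with the defaults shown in the diff above.
config = BailingMoeV2_5Config()

# head_dim only falls back to hidden_size // num_attention_heads when a
# falsy value is passed; with the defaults both routes give 2048 // 16 = 128.
print(config.head_dim)     # 128
# qk_head_dim is derived: qk_nope_head_dim + qk_rope_head_dim = 128 + 64 = 192
print(config.qk_head_dim)  # 192
print(config.num_experts, config.num_experts_per_tok)  # 256 8

With n_group=8, the 256 routed experts split into 8 groups of 32. Assuming the DeepSeek-style group-limited routing that topk_method="noaux_tc" suggests (an inference from the name, not stated in this file), each token would draw its num_experts_per_tok=8 experts from the topk_group=4 best-scoring groups, with num_shared_experts=1 shared expert always active.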