yarkcy commited on
Commit
7de7b06
·
verified ·
1 Parent(s): c0f7843

Upload configuration_bailing_moe_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_bailing_moe_v2.py +86 -0
configuration_bailing_moe_v2.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bailing MoE model configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class BailingMoeV2Config(PretrainedConfig):
7
+ model_type = "bailing_moe_v2"
8
+
9
+ def __init__(
10
+ self,
11
+ vocab_size=30592,
12
+ hidden_size=1024,
13
+ intermediate_size=None,
14
+ num_hidden_layers=24,
15
+ num_attention_heads=16,
16
+ num_key_value_heads=0,
17
+ hidden_act="silu",
18
+ use_qkv_bias=False, # bailing only
19
+ use_qk_norm=False,
20
+ use_bias=True, # bailing only
21
+ rms_norm_eps=1e-05,
22
+ norm_head=False, # bailing only
23
+ tie_word_embeddings=False, # PretrainedConfig key, here change default value.
24
+ embedding_dropout=0.1,
25
+ attention_dropout=0.1,
26
+ output_dropout=0.1,
27
+ initializer_range=0.02,
28
+ max_position_embeddings=16384,
29
+ rope_theta=10000.0,
30
+ use_cache=True,
31
+ use_sliding_window=False,
32
+ sliding_window=4096,
33
+ max_window_layers=28,
34
+ rope_scaling=None,
35
+ pad_token_id=126081,
36
+ num_experts=16,
37
+ num_shared_experts=0,
38
+ num_experts_per_tok=2,
39
+ n_group=8,
40
+ topk_group=4,
41
+ routed_scaling_factor=2.5,
42
+ moe_intermediate_size=None,
43
+ first_k_dense_replace=0,
44
+ head_dim=None,
45
+ output_router_logits=False,
46
+ partial_rotary_factor=0.5,
47
+ **kwargs,
48
+ ):
49
+ self.num_hidden_layers = num_hidden_layers
50
+ self.vocab_size = vocab_size
51
+ self.hidden_size = hidden_size
52
+ self.intermediate_size = intermediate_size
53
+ self.num_attention_heads = num_attention_heads
54
+ self.num_key_value_heads = num_key_value_heads
55
+ self.hidden_act = hidden_act
56
+ self.use_qkv_bias = use_qkv_bias
57
+ self.use_qk_norm = use_qk_norm
58
+ self.use_bias = use_bias
59
+ self.norm_head = norm_head
60
+ self.rms_norm_eps = rms_norm_eps
61
+ self.embedding_dropout = embedding_dropout
62
+ self.attention_dropout = attention_dropout
63
+ self.output_dropout = output_dropout
64
+ self.initializer_range = initializer_range
65
+ self.max_position_embeddings = max_position_embeddings
66
+ self.rope_theta = rope_theta
67
+ self.use_cache = use_cache
68
+ self.use_sliding_window = use_sliding_window
69
+ self.sliding_window = sliding_window
70
+ self.max_window_layers = max_window_layers
71
+ self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
72
+ self.rope_scaling = rope_scaling
73
+
74
+ # MoE configs
75
+ self.num_experts = num_experts
76
+ self.num_shared_experts = num_shared_experts
77
+ self.num_experts_per_tok = num_experts_per_tok
78
+ self.n_group = n_group
79
+ self.topk_group = topk_group
80
+ self.moe_intermediate_size = moe_intermediate_size
81
+ self.first_k_dense_replace = first_k_dense_replace
82
+ self.output_router_logits = output_router_logits
83
+ self.routed_scaling_factor = routed_scaling_factor
84
+ self.partial_rotary_factor = partial_rotary_factor
85
+
86
+ super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)