drizzlezyk committed on
Commit
d21a5ac
·
verified ·
1 Parent(s): 04406f4

Upload configuration_openpangu_dense.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_openpangu_dense.py +92 -0
configuration_openpangu_dense.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+
4
+ from transformers.utils import logging
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+
8
# Module-level logger, obtained from the `transformers.utils.logging` helper and keyed by this module's name.
+ logger = logging.get_logger(__name__)
9
+
10
+
11
class PanguEmbeddedConfig(PretrainedConfig):
    """Configuration container for the ``pangu_embedded`` model type.

    Every constructor argument is stored verbatim as an attribute of the same
    name, except for ``pad_token_id``, ``bos_token_id``, ``eos_token_id``,
    ``tie_word_embeddings`` and any extra ``**kwargs``, which are handed to
    ``PretrainedConfig.__init__``.

    Args:
        vocab_size, hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, num_key_value_heads, head_dim, hidden_act,
        max_position_embeddings, initializer_range, rms_norm_eps, use_cache,
        rope_theta, sliding_window, attention_dropout, bias:
            Architecture hyper-parameters, stored unchanged. Their exact
            interpretation is defined by the modeling code, which is not part
            of this file.
        pad_token_id, bos_token_id, eos_token_id, tie_word_embeddings:
            Forwarded to the parent ``PretrainedConfig`` constructor.
        layer_types:
            Optional per-layer sequence of attention kinds. When ``None``, a
            list of length ``num_hidden_layers`` is generated in which
            even-indexed layers (0, 2, 4, ...) are ``"sliding_attention"`` and
            odd-indexed layers are ``"full_attention"``.
        param_sink_number, attn_groupnorm, attn_elementwise_gate,
        router_sliding_window, router_win_decay:
            Model-specific options that are not standard in most HF models;
            stored unchanged.
        **kwargs:
            Passed through to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``layer_types`` does not contain exactly
            ``num_hidden_layers`` entries.
    """

    model_type = "pangu_embedded"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=153376,
        hidden_size=4096,
        intermediate_size=16384,
        num_hidden_layers=28,
        num_attention_heads=32,
        num_key_value_heads=4,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=16000000.0,
        sliding_window=127,
        attention_dropout=0.0,
        bias=True,
        layer_types=None,
        param_sink_number=128,
        attn_groupnorm=True,
        attn_elementwise_gate=True,
        router_sliding_window=3,
        router_win_decay=0.5,
        **kwargs,
    ):
        # Plain hyper-parameters are copied onto the instance as-is.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.sliding_window = sliding_window
        self.attention_dropout = attention_dropout
        self.bias = bias

        # Options specific to this model family (not standard in most HF models).
        self.param_sink_number = param_sink_number
        self.attn_groupnorm = attn_groupnorm
        self.attn_elementwise_gate = attn_elementwise_gate
        self.router_sliding_window = router_sliding_window
        self.router_win_decay = router_win_decay

        if layer_types is not None:
            self.layer_types = layer_types
        else:
            # Default mirrors Megatron's swa_layers 1,3,5,...,27 (1-based), i.e.
            # the even 0-based indices 0, 2, 4, ..., 26 use sliding attention.
            self.layer_types = [
                "full_attention" if layer_idx % 2 else "sliding_attention"
                for layer_idx in range(num_hidden_layers)
            ]

        # One entry per layer is required, whether supplied or generated.
        if len(self.layer_types) != self.num_hidden_layers:
            raise ValueError(
                f"`layer_types` must have a length equal to `num_hidden_layers` ({self.num_hidden_layers}), "
                f"but has length {len(self.layer_types)}."
            )

        # Special-token ids and weight tying are managed by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )