fhai50032 committed
Commit 94c09b9 · verified · 1 Parent(s): e71f236

Create configuration_bibo.py

Files changed (1)
  1. configuration_bibo.py  +164 -0
configuration_bibo.py ADDED
@@ -0,0 +1,164 @@
# coding=utf-8
# Copyright 2024 The BiBo Authors and The HuggingFace Inc. team. All rights reserved.


from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

BIBO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    # not now
}


class BiBoConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BiBoModel`]. It is used to
    instantiate a BiBo model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read
    the documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 128000):
            Vocabulary size of the BiBo model.
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimension of the hidden states.
        intermediate_size (`int`, *optional*, defaults to 8960):
            Dimension of the MLP representations in Dense layers.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            Number of key and value heads for Grouped Query Attention.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/value attentions.
        pad_token_id (`int`, *optional*):
            The index of the padding token in the vocabulary. Defaults to None.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end-of-sequence token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the weights of the input embeddings and the output embeddings.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 32768):
            Sliding window attention window size. If `use_sliding_window` is `False`, this value is ignored
            and the window falls back to `max_position_embeddings`.
        max_window_layers (`int`, *optional*, defaults to 21):
            The number of layers that use sliding window attention.

        # --- MoE Specific Parameters ---
        moe_intermediate_size (`int`, *optional*, defaults to 1024):
            Dimension of the MLP representations in MoE layers.
        num_routed_experts (`int`, *optional*, defaults to 11):
            Total number of routed experts (MLP + Identity) in MoE layers.
        num_shared_experts (`int`, *optional*, defaults to 1):
            Total number of shared experts (Convolutional) in MoE layers.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            Number of routed experts to select per token (Top-K).
        router_temperature (`float`, *optional*, defaults to 1.3):
            Temperature used to scale the router logits.
        router_noise (`float`, *optional*, defaults to 0.5):
            Scale of the noise added to the router logits.
        bias_update_factor (`float`, *optional*, defaults to 1e-4):
            Update factor for the router bias.
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size of the convolutional shared experts.

        # --- Hybrid Layer Control ---
        # Implicitly defined: First (idx=0) and Last (idx=N-1) layers are Dense, others are MoE.
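
    Example (an illustrative sketch of the usual `PretrainedConfig` workflow; it assumes this file is
    importable as `configuration_bibo`):

        >>> from configuration_bibo import BiBoConfig
        >>> # Instantiate a configuration with the default BiBo hyperparameters
        >>> configuration = BiBoConfig()
        >>> configuration.num_routed_experts, configuration.num_experts_per_tok
        (11, 2)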
    """
    model_type = "bibo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=128000,
        hidden_size=1536,
        intermediate_size=8960,
        num_hidden_layers=28,
        num_attention_heads=12,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=True,
        rope_theta=1000000.0,
        rope_scaling=None,
        attention_dropout=0.0,
        use_sliding_window=False,
        sliding_window=32768,
        max_window_layers=21,
        # MoE defaults
        moe_intermediate_size=1024,
        num_routed_experts=11,
        num_shared_experts=1,
        num_experts_per_tok=2,
        router_temperature=1.3,
        bias_update_factor=1e-4,
        router_noise=0.5,
        kernel_size=3,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

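        # If no key/value head count is given, fall back to one KV head per attention head,
        # i.e. standard multi-head attention rather than grouped-query attention.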
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout

        # MoE parameters
        self.moe_intermediate_size = moe_intermediate_size
        self.num_routed_experts = num_routed_experts
        self.num_shared_experts = num_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.router_temperature = router_temperature
        self.router_noise = router_noise
        self.bias_update_factor = bias_update_factor
        self.kernel_size = kernel_size

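        # When sliding-window attention is disabled, `sliding_window` falls back to the full
        # `max_position_embeddings` context length.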
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else self.max_position_embeddings
        self.max_window_layers = max_window_layers

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


# from transformers import AutoConfig
# AutoConfig.register("bibo", BiBoConfig)
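
For reference, the following is a minimal illustrative sketch (not part of this commit) of how the commented-out registration above could be used and what the default values imply. It assumes configuration_bibo.py is importable from the working directory; the dense/MoE layout below simply restates the rule from the class docstring.

from transformers import AutoConfig

from configuration_bibo import BiBoConfig

# Register the config class so the Auto classes can resolve model_type "bibo".
AutoConfig.register("bibo", BiBoConfig)

config = BiBoConfig()

# Hybrid layout implied by the docstring: first and last layers are Dense, the rest are MoE.
layer_types = [
    "dense" if i in (0, config.num_hidden_layers - 1) else "moe"
    for i in range(config.num_hidden_layers)
]
assert layer_types.count("moe") == 26  # 26 of the 28 default layers are MoE

# With use_sliding_window=False (the default), sliding_window falls back to the full context length.
assert config.sliding_window == config.max_position_embeddings == 32768

Whether the modeling code actually follows this layer layout is determined by the companion modeling file, which is not part of this commit.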