Kilinskiy committed on
Commit
acad00d
·
verified ·
1 Parent(s): d18839b

Upload config.json

Browse files
Files changed (1) hide show
  1. config.json +264 -0
config.json ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Step3p5ForCausalLM"
4
+ ],
5
+ "att_impl_type": "GQA",
6
+ "attention_other_setting": {
7
+ "attention_type": "sliding_attention",
8
+ "head_dim": 128,
9
+ "num_attention_groups": 8,
10
+ "num_attention_heads": 96,
11
+ "true_head_dim": 128
12
+ },
13
+ "auto_map": {
14
+ "AutoConfig": "configuration_step3p5.Step3p5Config",
15
+ "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
16
+ },
17
+ "bos_token_id": 0,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 1,
20
+ "head_dim": 128,
21
+ "hidden_size": 4096,
22
+ "intermediate_size": 11264,
23
+ "layer_types": [
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "full_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "full_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "full_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention"
72
+ ],
73
+ "max_position_embeddings": 262144,
74
+ "max_seq_len": 262144,
75
+ "model_type": "step3p5",
76
+ "moe_every_n_layer": 1,
77
+ "moe_intermediate_size": 1280,
78
+ "moe_layer_offset": 0,
79
+ "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
80
+ "moe_num_experts": 288,
81
+ "moe_router_activation": "sigmoid",
82
+ "moe_router_scaling_factor": 3.0,
83
+ "moe_top_k": 8,
84
+ "need_fp32_gate": true,
85
+ "norm_expert_weight": true,
86
+ "num_attention_groups": 8,
87
+ "num_attention_heads": 64,
88
+ "num_hidden_layers": 45,
89
+ "num_nextn_predict_layers": 3,
90
+ "output_hidden_states": true,
91
+ "pad_token_id": 1,
92
+ "partial_rotary_factor": 0.5,
93
+ "partial_rotary_factors": [
94
+ 0.5,
95
+ 1.0,
96
+ 1.0,
97
+ 1.0,
98
+ 0.5,
99
+ 1.0,
100
+ 1.0,
101
+ 1.0,
102
+ 0.5,
103
+ 1.0,
104
+ 1.0,
105
+ 1.0,
106
+ 0.5,
107
+ 1.0,
108
+ 1.0,
109
+ 1.0,
110
+ 0.5,
111
+ 1.0,
112
+ 1.0,
113
+ 1.0,
114
+ 0.5,
115
+ 1.0,
116
+ 1.0,
117
+ 1.0,
118
+ 0.5,
119
+ 1.0,
120
+ 1.0,
121
+ 1.0,
122
+ 0.5,
123
+ 1.0,
124
+ 1.0,
125
+ 1.0,
126
+ 0.5,
127
+ 1.0,
128
+ 1.0,
129
+ 1.0,
130
+ 0.5,
131
+ 1.0,
132
+ 1.0,
133
+ 1.0,
134
+ 0.5,
135
+ 1.0,
136
+ 1.0,
137
+ 1.0,
138
+ 0.5,
139
+ 1.0,
140
+ 1.0,
141
+ 1.0
142
+ ],
143
+ "rms_norm_eps": 1e-05,
144
+ "rope_parameters": null,
145
+ "rope_theta": 5000000.0,
146
+ "share_expert_dim": 1280,
147
+ "sink": false,
148
+ "sliding_window": 512,
149
+ "swiglu_limits": [
150
+ 0.0,
151
+ 0.0,
152
+ 0.0,
153
+ 0.0,
154
+ 0.0,
155
+ 0.0,
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.0,
163
+ 0.0,
164
+ 0.0,
165
+ 0.0,
166
+ 0.0,
167
+ 0.0,
168
+ 0.0,
169
+ 0.0,
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 0.0,
177
+ 0.0,
178
+ 0.0,
179
+ 0.0,
180
+ 0.0,
181
+ 0.0,
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.0,
189
+ 0.0,
190
+ 0.0,
191
+ 0.0,
192
+ 0.0,
193
+ 7.0,
194
+ 7.0,
195
+ 0.0,
196
+ 0.0,
197
+ 0.0
198
+ ],
199
+ "swiglu_limits_shared": [
200
+ 0.0,
201
+ 0.0,
202
+ 0.0,
203
+ 0.0,
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0,
209
+ 0.0,
210
+ 0.0,
211
+ 0.0,
212
+ 0.0,
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0,
220
+ 0.0,
221
+ 0.0,
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0,
229
+ 0.0,
230
+ 0.0,
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0,
238
+ 0.0,
239
+ 0.0,
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 16.0,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0
248
+ ],
249
+ "transformers_version": "5.1.0",
250
+ "use_cache": false,
251
+ "use_head_wise_attn_gate": true,
252
+ "use_moe": true,
253
+ "use_moe_router_bias": true,
254
+ "use_qk_norm": true,
255
+ "use_return_dict": true,
256
+ "use_rope_layers": [],
257
+ "vocab_size": 128896,
258
+ "yarn_only_types": [
259
+ "full_attention"
260
+ ],
261
+ "zero_centered": true,
262
+ "num_key_value_heads": 8,
263
+ "expert_intermediate_size": 1280
264
+ }