WinstonDeng committed on
Commit
457483d
·
verified ·
1 Parent(s): 6578499

add step-3.7-flash bf16 model config

Browse files
Files changed (1) hide show
  1. config.json +338 -336
config.json CHANGED
@@ -1,343 +1,345 @@
1
  {
2
- "architectures": [
3
- "MMGPTStepRoboticsForCausalLM"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_step_robotics.StepRoboticsConfig"
7
- },
8
- "model_type": "step3p5v",
9
- "im_end_token": "<im_end>",
10
- "im_patch_token": "<im_patch>",
11
- "im_start_token": "<im_start>",
12
- "image_token_len": 169,
13
- "patch_token_len": 81,
14
- "image_token_id": 128001,
15
- "understand_projector_stride": 2,
16
- "use_im_start_end": "true",
17
- "vision_select_layer": -1,
18
- "projector_bias": false,
19
- "vision_config": {
20
- "model_type": "perception_encoder",
21
- "image_size": 728,
22
- "patch_size": 14,
23
- "width": 1536,
24
- "layers": 47,
25
- "heads": 16,
26
- "pool_type": "none",
27
- "output_dim": null,
28
- "use_cls_token": false,
29
- "ls_init_value": 0.1,
30
- "use_ln_post": false,
31
- "hidden_act": "quick_gelu"
32
- },
33
- "text_config": {
34
  "architectures": [
35
- "Step3p5ForCausalLM"
36
  ],
37
- "rope_scaling": {
38
- "rope_type": "llama3",
39
- "factor": 2.0,
40
- "original_max_position_embeddings": 131072,
41
- "low_freq_factor": 1.0,
42
- "high_freq_factor": 32.0
43
  },
44
- "yarn_only_types": [
45
- "full_attention"
46
- ],
47
- "model_type": "step3p5",
48
- "hidden_size": 4096,
49
- "intermediate_size": 11264,
50
- "num_hidden_layers": 45,
51
- "max_seq_len": 262144,
52
- "max_position_embeddings": 262144,
53
- "vocab_size": 128896,
54
- "torch_dtype": "bfloat16",
55
- "use_qk_norm": false,
56
- "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
57
- "use_mfa": false,
58
- "num_attention_heads": 64,
59
- "num_attention_groups": 8,
60
- "head_dim": 128,
61
- "use_moe": true,
62
- "moe_num_experts": 288,
63
- "moe_top_k": 8,
64
- "moe_intermediate_size": 1280,
65
- "share_expert_dim": 1280,
66
- "moe_layer_offset": 0,
67
- "moe_every_n_layer": 1,
68
- "norm_expert_weight": true,
69
- "moe_router_activation": "sigmoid",
70
- "moe_router_scaling_factor": 3.0,
71
- "att_impl_type": "GQA",
72
- "num_nextn_predict_layers": 3,
73
- "rope_theta": [
74
- 5000000.0,
75
- 10000.0,
76
- 10000.0,
77
- 10000.0,
78
- 5000000.0,
79
- 10000.0,
80
- 10000.0,
81
- 10000.0,
82
- 5000000.0,
83
- 10000.0,
84
- 10000.0,
85
- 10000.0,
86
- 5000000.0,
87
- 10000.0,
88
- 10000.0,
89
- 10000.0,
90
- 5000000.0,
91
- 10000.0,
92
- 10000.0,
93
- 10000.0,
94
- 5000000.0,
95
- 10000.0,
96
- 10000.0,
97
- 10000.0,
98
- 5000000.0,
99
- 10000.0,
100
- 10000.0,
101
- 10000.0,
102
- 5000000.0,
103
- 10000.0,
104
- 10000.0,
105
- 10000.0,
106
- 5000000.0,
107
- 10000.0,
108
- 10000.0,
109
- 10000.0,
110
- 5000000.0,
111
- 10000.0,
112
- 10000.0,
113
- 10000.0,
114
- 5000000.0,
115
- 10000.0,
116
- 10000.0,
117
- 10000.0,
118
- 5000000.0,
119
- 10000.0,
120
- 10000.0,
121
- 10000.0
122
- ],
123
- "use_head_wise_attn_gate": true,
124
- "sliding_window": 512,
125
- "use_moe_router_bias": true,
126
- "need_fp32_gate": true,
127
- "sink": false,
128
- "layer_types": [
129
- "full_attention",
130
- "sliding_attention",
131
- "sliding_attention",
132
- "sliding_attention",
133
- "full_attention",
134
- "sliding_attention",
135
- "sliding_attention",
136
- "sliding_attention",
137
- "full_attention",
138
- "sliding_attention",
139
- "sliding_attention",
140
- "sliding_attention",
141
- "full_attention",
142
- "sliding_attention",
143
- "sliding_attention",
144
- "sliding_attention",
145
- "full_attention",
146
- "sliding_attention",
147
- "sliding_attention",
148
- "sliding_attention",
149
- "full_attention",
150
- "sliding_attention",
151
- "sliding_attention",
152
- "sliding_attention",
153
- "full_attention",
154
- "sliding_attention",
155
- "sliding_attention",
156
- "sliding_attention",
157
- "full_attention",
158
- "sliding_attention",
159
- "sliding_attention",
160
- "sliding_attention",
161
- "full_attention",
162
- "sliding_attention",
163
- "sliding_attention",
164
- "sliding_attention",
165
- "full_attention",
166
- "sliding_attention",
167
- "sliding_attention",
168
- "sliding_attention",
169
- "full_attention",
170
- "sliding_attention",
171
- "sliding_attention",
172
- "sliding_attention",
173
- "full_attention",
174
- "sliding_attention",
175
- "sliding_attention",
176
- "sliding_attention"
177
- ],
178
- "use_rope_layers": [],
179
- "partial_rotary_factors": [
180
- 0.5,
181
- 1.0,
182
- 1.0,
183
- 1.0,
184
- 0.5,
185
- 1.0,
186
- 1.0,
187
- 1.0,
188
- 0.5,
189
- 1.0,
190
- 1.0,
191
- 1.0,
192
- 0.5,
193
- 1.0,
194
- 1.0,
195
- 1.0,
196
- 0.5,
197
- 1.0,
198
- 1.0,
199
- 1.0,
200
- 0.5,
201
- 1.0,
202
- 1.0,
203
- 1.0,
204
- 0.5,
205
- 1.0,
206
- 1.0,
207
- 1.0,
208
- 0.5,
209
- 1.0,
210
- 1.0,
211
- 1.0,
212
- 0.5,
213
- 1.0,
214
- 1.0,
215
- 1.0,
216
- 0.5,
217
- 1.0,
218
- 1.0,
219
- 1.0,
220
- 0.5,
221
- 1.0,
222
- 1.0,
223
- 1.0,
224
- 0.5,
225
- 1.0,
226
- 1.0,
227
- 1.0
228
- ],
229
- "eos_token_id": [
230
- 1,
231
- 2,
232
- 128007
233
- ],
234
- "bos_token_id": 0,
235
- "attention_other_setting": {
236
- "attention_type": "sliding_attention",
237
- "num_attention_heads": 96,
238
  "num_attention_groups": 8,
239
  "head_dim": 128,
240
- "true_head_dim": 128
241
- },
242
- "swiglu_limits": [
243
- 0.0,
244
- 0.0,
245
- 0.0,
246
- 0.0,
247
- 0.0,
248
- 0.0,
249
- 0.0,
250
- 0.0,
251
- 0.0,
252
- 0.0,
253
- 0.0,
254
- 0.0,
255
- 0.0,
256
- 0.0,
257
- 0.0,
258
- 0.0,
259
- 0.0,
260
- 0.0,
261
- 0.0,
262
- 0.0,
263
- 0.0,
264
- 0.0,
265
- 0.0,
266
- 0.0,
267
- 0.0,
268
- 0.0,
269
- 0.0,
270
- 0.0,
271
- 0.0,
272
- 0.0,
273
- 0.0,
274
- 0.0,
275
- 0.0,
276
- 0.0,
277
- 0.0,
278
- 0.0,
279
- 0.0,
280
- 0.0,
281
- 0.0,
282
- 0.0,
283
- 0.0,
284
- 0.0,
285
- 0.0,
286
- 7,
287
- 7,
288
- 0.0,
289
- 0.0,
290
- 0.0
291
- ],
292
- "swiglu_limits_shared": [
293
- 0.0,
294
- 0.0,
295
- 0.0,
296
- 0.0,
297
- 0.0,
298
- 0.0,
299
- 0.0,
300
- 0.0,
301
- 0.0,
302
- 0.0,
303
- 0.0,
304
- 0.0,
305
- 0.0,
306
- 0.0,
307
- 0.0,
308
- 0.0,
309
- 0.0,
310
- 0.0,
311
- 0.0,
312
- 0.0,
313
- 0.0,
314
- 0.0,
315
- 0.0,
316
- 0.0,
317
- 0.0,
318
- 0.0,
319
- 0.0,
320
- 0.0,
321
- 0.0,
322
- 0.0,
323
- 0.0,
324
- 0.0,
325
- 0.0,
326
- 0.0,
327
- 0.0,
328
- 0.0,
329
- 0.0,
330
- 0.0,
331
- 0.0,
332
- 0.0,
333
- 0.0,
334
- 0.0,
335
- 0.0,
336
- 16,
337
- 16,
338
- 0.0,
339
- 0.0,
340
- 0.0
341
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  }
343
- }
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "architectures": [
3
+ "Step3p7ForConditionalGeneration"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_step3p7.Step3p7Config",
7
+ "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
 
 
 
8
  },
9
+ "model_type": "step3p7",
10
+ "im_end_token": "<im_end>",
11
+ "im_patch_token": "<im_patch>",
12
+ "im_start_token": "<im_start>",
13
+ "image_token_len": 169,
14
+ "patch_token_len": 81,
15
+ "image_token_id": 128001,
16
+ "understand_projector_stride": 2,
17
+ "use_im_start_end": "true",
18
+ "vision_select_layer": -1,
19
+ "projector_bias": false,
20
+ "vision_config": {
21
+ "model_type": "perception_encoder",
22
+ "image_size": 728,
23
+ "patch_size": 14,
24
+ "width": 1536,
25
+ "layers": 47,
26
+ "heads": 16,
27
+ "pool_type": "none",
28
+ "output_dim": null,
29
+ "use_cls_token": false,
30
+ "ls_init_value": 0.1,
31
+ "use_ln_post": false,
32
+ "hidden_act": "quick_gelu"
33
+ },
34
+ "text_config": {
35
+ "architectures": [
36
+ "Step3p5ForCausalLM"
37
+ ],
38
+ "rope_scaling": {
39
+ "rope_type": "llama3",
40
+ "factor": 2.0,
41
+ "original_max_position_embeddings": 131072,
42
+ "low_freq_factor": 1.0,
43
+ "high_freq_factor": 32.0
44
+ },
45
+ "yarn_only_types": [
46
+ "full_attention"
47
+ ],
48
+ "model_type": "step3p5",
49
+ "hidden_size": 4096,
50
+ "intermediate_size": 11264,
51
+ "num_hidden_layers": 45,
52
+ "max_seq_len": 262144,
53
+ "max_position_embeddings": 262144,
54
+ "vocab_size": 128896,
55
+ "torch_dtype": "bfloat16",
56
+ "use_qk_norm": false,
57
+ "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
58
+ "use_mfa": false,
59
+ "num_attention_heads": 64,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "num_attention_groups": 8,
61
  "head_dim": 128,
62
+ "use_moe": true,
63
+ "moe_num_experts": 288,
64
+ "moe_top_k": 8,
65
+ "moe_intermediate_size": 1280,
66
+ "share_expert_dim": 1280,
67
+ "moe_layer_offset": 0,
68
+ "moe_every_n_layer": 1,
69
+ "norm_expert_weight": true,
70
+ "moe_router_activation": "sigmoid",
71
+ "moe_router_scaling_factor": 3.0,
72
+ "att_impl_type": "GQA",
73
+ "num_nextn_predict_layers": 3,
74
+ "rope_theta": [
75
+ 5000000.0,
76
+ 10000.0,
77
+ 10000.0,
78
+ 10000.0,
79
+ 5000000.0,
80
+ 10000.0,
81
+ 10000.0,
82
+ 10000.0,
83
+ 5000000.0,
84
+ 10000.0,
85
+ 10000.0,
86
+ 10000.0,
87
+ 5000000.0,
88
+ 10000.0,
89
+ 10000.0,
90
+ 10000.0,
91
+ 5000000.0,
92
+ 10000.0,
93
+ 10000.0,
94
+ 10000.0,
95
+ 5000000.0,
96
+ 10000.0,
97
+ 10000.0,
98
+ 10000.0,
99
+ 5000000.0,
100
+ 10000.0,
101
+ 10000.0,
102
+ 10000.0,
103
+ 5000000.0,
104
+ 10000.0,
105
+ 10000.0,
106
+ 10000.0,
107
+ 5000000.0,
108
+ 10000.0,
109
+ 10000.0,
110
+ 10000.0,
111
+ 5000000.0,
112
+ 10000.0,
113
+ 10000.0,
114
+ 10000.0,
115
+ 5000000.0,
116
+ 10000.0,
117
+ 10000.0,
118
+ 10000.0,
119
+ 5000000.0,
120
+ 10000.0,
121
+ 10000.0,
122
+ 10000.0
123
+ ],
124
+ "use_head_wise_attn_gate": true,
125
+ "sliding_window": 512,
126
+ "use_moe_router_bias": true,
127
+ "need_fp32_gate": true,
128
+ "sink": false,
129
+ "layer_types": [
130
+ "full_attention",
131
+ "sliding_attention",
132
+ "sliding_attention",
133
+ "sliding_attention",
134
+ "full_attention",
135
+ "sliding_attention",
136
+ "sliding_attention",
137
+ "sliding_attention",
138
+ "full_attention",
139
+ "sliding_attention",
140
+ "sliding_attention",
141
+ "sliding_attention",
142
+ "full_attention",
143
+ "sliding_attention",
144
+ "sliding_attention",
145
+ "sliding_attention",
146
+ "full_attention",
147
+ "sliding_attention",
148
+ "sliding_attention",
149
+ "sliding_attention",
150
+ "full_attention",
151
+ "sliding_attention",
152
+ "sliding_attention",
153
+ "sliding_attention",
154
+ "full_attention",
155
+ "sliding_attention",
156
+ "sliding_attention",
157
+ "sliding_attention",
158
+ "full_attention",
159
+ "sliding_attention",
160
+ "sliding_attention",
161
+ "sliding_attention",
162
+ "full_attention",
163
+ "sliding_attention",
164
+ "sliding_attention",
165
+ "sliding_attention",
166
+ "full_attention",
167
+ "sliding_attention",
168
+ "sliding_attention",
169
+ "sliding_attention",
170
+ "full_attention",
171
+ "sliding_attention",
172
+ "sliding_attention",
173
+ "sliding_attention",
174
+ "full_attention",
175
+ "sliding_attention",
176
+ "sliding_attention",
177
+ "sliding_attention"
178
+ ],
179
+ "use_rope_layers": [],
180
+ "partial_rotary_factors": [
181
+ 0.5,
182
+ 1.0,
183
+ 1.0,
184
+ 1.0,
185
+ 0.5,
186
+ 1.0,
187
+ 1.0,
188
+ 1.0,
189
+ 0.5,
190
+ 1.0,
191
+ 1.0,
192
+ 1.0,
193
+ 0.5,
194
+ 1.0,
195
+ 1.0,
196
+ 1.0,
197
+ 0.5,
198
+ 1.0,
199
+ 1.0,
200
+ 1.0,
201
+ 0.5,
202
+ 1.0,
203
+ 1.0,
204
+ 1.0,
205
+ 0.5,
206
+ 1.0,
207
+ 1.0,
208
+ 1.0,
209
+ 0.5,
210
+ 1.0,
211
+ 1.0,
212
+ 1.0,
213
+ 0.5,
214
+ 1.0,
215
+ 1.0,
216
+ 1.0,
217
+ 0.5,
218
+ 1.0,
219
+ 1.0,
220
+ 1.0,
221
+ 0.5,
222
+ 1.0,
223
+ 1.0,
224
+ 1.0,
225
+ 0.5,
226
+ 1.0,
227
+ 1.0,
228
+ 1.0
229
+ ],
230
+ "eos_token_id": [
231
+ 1,
232
+ 2,
233
+ 128007
234
+ ],
235
+ "bos_token_id": 0,
236
+ "attention_other_setting": {
237
+ "attention_type": "sliding_attention",
238
+ "num_attention_heads": 96,
239
+ "num_attention_groups": 8,
240
+ "head_dim": 128,
241
+ "true_head_dim": 128
242
+ },
243
+ "swiglu_limits": [
244
+ 0.0,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.0,
252
+ 0.0,
253
+ 0.0,
254
+ 0.0,
255
+ 0.0,
256
+ 0.0,
257
+ 0.0,
258
+ 0.0,
259
+ 0.0,
260
+ 0.0,
261
+ 0.0,
262
+ 0.0,
263
+ 0.0,
264
+ 0.0,
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0,
278
+ 0.0,
279
+ 0.0,
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0,
284
+ 0.0,
285
+ 0.0,
286
+ 0.0,
287
+ 7,
288
+ 7,
289
+ 0.0,
290
+ 0.0,
291
+ 0.0
292
+ ],
293
+ "swiglu_limits_shared": [
294
+ 0.0,
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0,
302
+ 0.0,
303
+ 0.0,
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0,
313
+ 0.0,
314
+ 0.0,
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.0,
330
+ 0.0,
331
+ 0.0,
332
+ 0.0,
333
+ 0.0,
334
+ 0.0,
335
+ 0.0,
336
+ 0.0,
337
+ 16,
338
+ 16,
339
+ 0.0,
340
+ 0.0,
341
+ 0.0
342
+ ]
343
+ }
344
  }
345
+