luotingdan committed on
Commit
a9c0171
·
1 Parent(s): 7805a18

update processor config

Browse files
Files changed (4) hide show
  1. config.json +338 -338
  2. configuration_step3p7.py +3 -15
  3. modeling_step3p7.py +37 -27
  4. processing_step3.py +11 -0
config.json CHANGED
@@ -1,345 +1,345 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "architectures": [
3
- "Step3p7ForConditionalGeneration"
4
  ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_step3p7.Step3p7Config",
7
- "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
 
 
 
8
  },
9
- "model_type": "step3p7",
10
- "im_end_token": "<im_end>",
11
- "im_patch_token": "<im_patch>",
12
- "im_start_token": "<im_start>",
13
- "image_token_len": 169,
14
- "patch_token_len": 81,
15
- "image_token_id": 128001,
16
- "understand_projector_stride": 2,
17
- "use_im_start_end": "true",
18
- "vision_select_layer": -1,
19
- "projector_bias": false,
20
- "vision_config": {
21
- "model_type": "perception_encoder",
22
- "image_size": 728,
23
- "patch_size": 14,
24
- "width": 1536,
25
- "layers": 47,
26
- "heads": 16,
27
- "pool_type": "none",
28
- "output_dim": null,
29
- "use_cls_token": false,
30
- "ls_init_value": 0.1,
31
- "use_ln_post": false,
32
- "hidden_act": "quick_gelu"
33
- },
34
- "text_config": {
35
- "architectures": [
36
- "Step3p5ForCausalLM"
37
- ],
38
- "rope_scaling": {
39
- "rope_type": "llama3",
40
- "factor": 2.0,
41
- "original_max_position_embeddings": 131072,
42
- "low_freq_factor": 1.0,
43
- "high_freq_factor": 32.0
44
- },
45
- "yarn_only_types": [
46
- "full_attention"
47
- ],
48
- "model_type": "step3p5",
49
- "hidden_size": 4096,
50
- "intermediate_size": 11264,
51
- "num_hidden_layers": 45,
52
- "max_seq_len": 262144,
53
- "max_position_embeddings": 262144,
54
- "vocab_size": 128896,
55
- "torch_dtype": "bfloat16",
56
- "use_qk_norm": false,
57
- "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
58
- "use_mfa": false,
59
- "num_attention_heads": 64,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "num_attention_groups": 8,
61
  "head_dim": 128,
62
- "use_moe": true,
63
- "moe_num_experts": 288,
64
- "moe_top_k": 8,
65
- "moe_intermediate_size": 1280,
66
- "share_expert_dim": 1280,
67
- "moe_layer_offset": 0,
68
- "moe_every_n_layer": 1,
69
- "norm_expert_weight": true,
70
- "moe_router_activation": "sigmoid",
71
- "moe_router_scaling_factor": 3.0,
72
- "att_impl_type": "GQA",
73
- "num_nextn_predict_layers": 3,
74
- "rope_theta": [
75
- 5000000.0,
76
- 10000.0,
77
- 10000.0,
78
- 10000.0,
79
- 5000000.0,
80
- 10000.0,
81
- 10000.0,
82
- 10000.0,
83
- 5000000.0,
84
- 10000.0,
85
- 10000.0,
86
- 10000.0,
87
- 5000000.0,
88
- 10000.0,
89
- 10000.0,
90
- 10000.0,
91
- 5000000.0,
92
- 10000.0,
93
- 10000.0,
94
- 10000.0,
95
- 5000000.0,
96
- 10000.0,
97
- 10000.0,
98
- 10000.0,
99
- 5000000.0,
100
- 10000.0,
101
- 10000.0,
102
- 10000.0,
103
- 5000000.0,
104
- 10000.0,
105
- 10000.0,
106
- 10000.0,
107
- 5000000.0,
108
- 10000.0,
109
- 10000.0,
110
- 10000.0,
111
- 5000000.0,
112
- 10000.0,
113
- 10000.0,
114
- 10000.0,
115
- 5000000.0,
116
- 10000.0,
117
- 10000.0,
118
- 10000.0,
119
- 5000000.0,
120
- 10000.0,
121
- 10000.0,
122
- 10000.0
123
- ],
124
- "use_head_wise_attn_gate": true,
125
- "sliding_window": 512,
126
- "use_moe_router_bias": true,
127
- "need_fp32_gate": true,
128
- "sink": false,
129
- "layer_types": [
130
- "full_attention",
131
- "sliding_attention",
132
- "sliding_attention",
133
- "sliding_attention",
134
- "full_attention",
135
- "sliding_attention",
136
- "sliding_attention",
137
- "sliding_attention",
138
- "full_attention",
139
- "sliding_attention",
140
- "sliding_attention",
141
- "sliding_attention",
142
- "full_attention",
143
- "sliding_attention",
144
- "sliding_attention",
145
- "sliding_attention",
146
- "full_attention",
147
- "sliding_attention",
148
- "sliding_attention",
149
- "sliding_attention",
150
- "full_attention",
151
- "sliding_attention",
152
- "sliding_attention",
153
- "sliding_attention",
154
- "full_attention",
155
- "sliding_attention",
156
- "sliding_attention",
157
- "sliding_attention",
158
- "full_attention",
159
- "sliding_attention",
160
- "sliding_attention",
161
- "sliding_attention",
162
- "full_attention",
163
- "sliding_attention",
164
- "sliding_attention",
165
- "sliding_attention",
166
- "full_attention",
167
- "sliding_attention",
168
- "sliding_attention",
169
- "sliding_attention",
170
- "full_attention",
171
- "sliding_attention",
172
- "sliding_attention",
173
- "sliding_attention",
174
- "full_attention",
175
- "sliding_attention",
176
- "sliding_attention",
177
- "sliding_attention"
178
- ],
179
- "use_rope_layers": [],
180
- "partial_rotary_factors": [
181
- 0.5,
182
- 1.0,
183
- 1.0,
184
- 1.0,
185
- 0.5,
186
- 1.0,
187
- 1.0,
188
- 1.0,
189
- 0.5,
190
- 1.0,
191
- 1.0,
192
- 1.0,
193
- 0.5,
194
- 1.0,
195
- 1.0,
196
- 1.0,
197
- 0.5,
198
- 1.0,
199
- 1.0,
200
- 1.0,
201
- 0.5,
202
- 1.0,
203
- 1.0,
204
- 1.0,
205
- 0.5,
206
- 1.0,
207
- 1.0,
208
- 1.0,
209
- 0.5,
210
- 1.0,
211
- 1.0,
212
- 1.0,
213
- 0.5,
214
- 1.0,
215
- 1.0,
216
- 1.0,
217
- 0.5,
218
- 1.0,
219
- 1.0,
220
- 1.0,
221
- 0.5,
222
- 1.0,
223
- 1.0,
224
- 1.0,
225
- 0.5,
226
- 1.0,
227
- 1.0,
228
- 1.0
229
- ],
230
- "eos_token_id": [
231
- 1,
232
- 2,
233
- 128007
234
- ],
235
- "bos_token_id": 0,
236
- "attention_other_setting": {
237
- "attention_type": "sliding_attention",
238
- "num_attention_heads": 96,
239
- "num_attention_groups": 8,
240
- "head_dim": 128,
241
- "true_head_dim": 128
242
- },
243
- "swiglu_limits": [
244
- 0.0,
245
- 0.0,
246
- 0.0,
247
- 0.0,
248
- 0.0,
249
- 0.0,
250
- 0.0,
251
- 0.0,
252
- 0.0,
253
- 0.0,
254
- 0.0,
255
- 0.0,
256
- 0.0,
257
- 0.0,
258
- 0.0,
259
- 0.0,
260
- 0.0,
261
- 0.0,
262
- 0.0,
263
- 0.0,
264
- 0.0,
265
- 0.0,
266
- 0.0,
267
- 0.0,
268
- 0.0,
269
- 0.0,
270
- 0.0,
271
- 0.0,
272
- 0.0,
273
- 0.0,
274
- 0.0,
275
- 0.0,
276
- 0.0,
277
- 0.0,
278
- 0.0,
279
- 0.0,
280
- 0.0,
281
- 0.0,
282
- 0.0,
283
- 0.0,
284
- 0.0,
285
- 0.0,
286
- 0.0,
287
- 7,
288
- 7,
289
- 0.0,
290
- 0.0,
291
- 0.0
292
- ],
293
- "swiglu_limits_shared": [
294
- 0.0,
295
- 0.0,
296
- 0.0,
297
- 0.0,
298
- 0.0,
299
- 0.0,
300
- 0.0,
301
- 0.0,
302
- 0.0,
303
- 0.0,
304
- 0.0,
305
- 0.0,
306
- 0.0,
307
- 0.0,
308
- 0.0,
309
- 0.0,
310
- 0.0,
311
- 0.0,
312
- 0.0,
313
- 0.0,
314
- 0.0,
315
- 0.0,
316
- 0.0,
317
- 0.0,
318
- 0.0,
319
- 0.0,
320
- 0.0,
321
- 0.0,
322
- 0.0,
323
- 0.0,
324
- 0.0,
325
- 0.0,
326
- 0.0,
327
- 0.0,
328
- 0.0,
329
- 0.0,
330
- 0.0,
331
- 0.0,
332
- 0.0,
333
- 0.0,
334
- 0.0,
335
- 0.0,
336
- 0.0,
337
- 16,
338
- 16,
339
- 0.0,
340
- 0.0,
341
- 0.0
342
- ]
343
- }
344
  }
345
-
 
1
  {
2
+ "architectures": [
3
+ "Step3p7ForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_step3p7.Step3p7Config",
7
+ "AutoProcessor": "processing_step3.Step3VLProcessor",
8
+ "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
9
+ },
10
+ "model_type": "step3p7",
11
+ "im_end_token": "<im_end>",
12
+ "im_patch_token": "<im_patch>",
13
+ "im_start_token": "<im_start>",
14
+ "image_token_len": 169,
15
+ "patch_token_len": 81,
16
+ "image_token_id": 128001,
17
+ "understand_projector_stride": 2,
18
+ "use_im_start_end": "true",
19
+ "vision_select_layer": -1,
20
+ "projector_bias": false,
21
+ "vision_config": {
22
+ "model_type": "perception_encoder",
23
+ "image_size": 728,
24
+ "patch_size": 14,
25
+ "width": 1536,
26
+ "layers": 47,
27
+ "heads": 16,
28
+ "pool_type": "none",
29
+ "output_dim": null,
30
+ "use_cls_token": false,
31
+ "ls_init_value": 0.1,
32
+ "use_ln_post": false,
33
+ "hidden_act": "quick_gelu"
34
+ },
35
+ "text_config": {
36
  "architectures": [
37
+ "Step3p5ForCausalLM"
38
  ],
39
+ "rope_scaling": {
40
+ "rope_type": "llama3",
41
+ "factor": 2.0,
42
+ "original_max_position_embeddings": 131072,
43
+ "low_freq_factor": 1.0,
44
+ "high_freq_factor": 32.0
45
  },
46
+ "yarn_only_types": [
47
+ "full_attention"
48
+ ],
49
+ "model_type": "step3p5",
50
+ "hidden_size": 4096,
51
+ "intermediate_size": 11264,
52
+ "num_hidden_layers": 45,
53
+ "max_seq_len": 262144,
54
+ "max_position_embeddings": 262144,
55
+ "vocab_size": 128896,
56
+ "torch_dtype": "bfloat16",
57
+ "use_qk_norm": false,
58
+ "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
59
+ "use_mfa": false,
60
+ "num_attention_heads": 64,
61
+ "num_attention_groups": 8,
62
+ "head_dim": 128,
63
+ "use_moe": true,
64
+ "moe_num_experts": 288,
65
+ "moe_top_k": 8,
66
+ "moe_intermediate_size": 1280,
67
+ "share_expert_dim": 1280,
68
+ "moe_layer_offset": 0,
69
+ "moe_every_n_layer": 1,
70
+ "norm_expert_weight": true,
71
+ "moe_router_activation": "sigmoid",
72
+ "moe_router_scaling_factor": 3.0,
73
+ "att_impl_type": "GQA",
74
+ "num_nextn_predict_layers": 3,
75
+ "rope_theta": [
76
+ 5000000.0,
77
+ 10000.0,
78
+ 10000.0,
79
+ 10000.0,
80
+ 5000000.0,
81
+ 10000.0,
82
+ 10000.0,
83
+ 10000.0,
84
+ 5000000.0,
85
+ 10000.0,
86
+ 10000.0,
87
+ 10000.0,
88
+ 5000000.0,
89
+ 10000.0,
90
+ 10000.0,
91
+ 10000.0,
92
+ 5000000.0,
93
+ 10000.0,
94
+ 10000.0,
95
+ 10000.0,
96
+ 5000000.0,
97
+ 10000.0,
98
+ 10000.0,
99
+ 10000.0,
100
+ 5000000.0,
101
+ 10000.0,
102
+ 10000.0,
103
+ 10000.0,
104
+ 5000000.0,
105
+ 10000.0,
106
+ 10000.0,
107
+ 10000.0,
108
+ 5000000.0,
109
+ 10000.0,
110
+ 10000.0,
111
+ 10000.0,
112
+ 5000000.0,
113
+ 10000.0,
114
+ 10000.0,
115
+ 10000.0,
116
+ 5000000.0,
117
+ 10000.0,
118
+ 10000.0,
119
+ 10000.0,
120
+ 5000000.0,
121
+ 10000.0,
122
+ 10000.0,
123
+ 10000.0
124
+ ],
125
+ "use_head_wise_attn_gate": true,
126
+ "sliding_window": 512,
127
+ "use_moe_router_bias": true,
128
+ "need_fp32_gate": true,
129
+ "sink": false,
130
+ "layer_types": [
131
+ "full_attention",
132
+ "sliding_attention",
133
+ "sliding_attention",
134
+ "sliding_attention",
135
+ "full_attention",
136
+ "sliding_attention",
137
+ "sliding_attention",
138
+ "sliding_attention",
139
+ "full_attention",
140
+ "sliding_attention",
141
+ "sliding_attention",
142
+ "sliding_attention",
143
+ "full_attention",
144
+ "sliding_attention",
145
+ "sliding_attention",
146
+ "sliding_attention",
147
+ "full_attention",
148
+ "sliding_attention",
149
+ "sliding_attention",
150
+ "sliding_attention",
151
+ "full_attention",
152
+ "sliding_attention",
153
+ "sliding_attention",
154
+ "sliding_attention",
155
+ "full_attention",
156
+ "sliding_attention",
157
+ "sliding_attention",
158
+ "sliding_attention",
159
+ "full_attention",
160
+ "sliding_attention",
161
+ "sliding_attention",
162
+ "sliding_attention",
163
+ "full_attention",
164
+ "sliding_attention",
165
+ "sliding_attention",
166
+ "sliding_attention",
167
+ "full_attention",
168
+ "sliding_attention",
169
+ "sliding_attention",
170
+ "sliding_attention",
171
+ "full_attention",
172
+ "sliding_attention",
173
+ "sliding_attention",
174
+ "sliding_attention",
175
+ "full_attention",
176
+ "sliding_attention",
177
+ "sliding_attention",
178
+ "sliding_attention"
179
+ ],
180
+ "use_rope_layers": [],
181
+ "partial_rotary_factors": [
182
+ 0.5,
183
+ 1.0,
184
+ 1.0,
185
+ 1.0,
186
+ 0.5,
187
+ 1.0,
188
+ 1.0,
189
+ 1.0,
190
+ 0.5,
191
+ 1.0,
192
+ 1.0,
193
+ 1.0,
194
+ 0.5,
195
+ 1.0,
196
+ 1.0,
197
+ 1.0,
198
+ 0.5,
199
+ 1.0,
200
+ 1.0,
201
+ 1.0,
202
+ 0.5,
203
+ 1.0,
204
+ 1.0,
205
+ 1.0,
206
+ 0.5,
207
+ 1.0,
208
+ 1.0,
209
+ 1.0,
210
+ 0.5,
211
+ 1.0,
212
+ 1.0,
213
+ 1.0,
214
+ 0.5,
215
+ 1.0,
216
+ 1.0,
217
+ 1.0,
218
+ 0.5,
219
+ 1.0,
220
+ 1.0,
221
+ 1.0,
222
+ 0.5,
223
+ 1.0,
224
+ 1.0,
225
+ 1.0,
226
+ 0.5,
227
+ 1.0,
228
+ 1.0,
229
+ 1.0
230
+ ],
231
+ "eos_token_id": [
232
+ 1,
233
+ 2,
234
+ 128007
235
+ ],
236
+ "bos_token_id": 0,
237
+ "attention_other_setting": {
238
+ "attention_type": "sliding_attention",
239
+ "num_attention_heads": 96,
240
  "num_attention_groups": 8,
241
  "head_dim": 128,
242
+ "true_head_dim": 128
243
+ },
244
+ "swiglu_limits": [
245
+ 0.0,
246
+ 0.0,
247
+ 0.0,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.0,
252
+ 0.0,
253
+ 0.0,
254
+ 0.0,
255
+ 0.0,
256
+ 0.0,
257
+ 0.0,
258
+ 0.0,
259
+ 0.0,
260
+ 0.0,
261
+ 0.0,
262
+ 0.0,
263
+ 0.0,
264
+ 0.0,
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0,
278
+ 0.0,
279
+ 0.0,
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0,
284
+ 0.0,
285
+ 0.0,
286
+ 0.0,
287
+ 0.0,
288
+ 7,
289
+ 7,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0
293
+ ],
294
+ "swiglu_limits_shared": [
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0,
302
+ 0.0,
303
+ 0.0,
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0,
313
+ 0.0,
314
+ 0.0,
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.0,
330
+ 0.0,
331
+ 0.0,
332
+ 0.0,
333
+ 0.0,
334
+ 0.0,
335
+ 0.0,
336
+ 0.0,
337
+ 0.0,
338
+ 16,
339
+ 16,
340
+ 0.0,
341
+ 0.0,
342
+ 0.0
343
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  }
345
+ }
configuration_step3p7.py CHANGED
@@ -91,23 +91,10 @@ class Step3p7TextConfig(PretrainedConfig):
91
  **kwargs,
92
  ) -> None:
93
  torch_dtype = kwargs.get("torch_dtype")
94
- layer_types = _normalize_per_layer_values(layer_types,
95
  num_hidden_layers)
96
- swiglu_limits = _normalize_per_layer_values(swiglu_limits,
97
- num_hidden_layers)
98
- swiglu_limits_shared = _normalize_per_layer_values(
99
- swiglu_limits_shared, num_hidden_layers)
100
- partial_rotary_factors = kwargs.get("partial_rotary_factors")
101
- kwargs["partial_rotary_factors"] = _normalize_per_layer_values(
102
- partial_rotary_factors, num_hidden_layers)
103
- if isinstance(rope_theta, list):
104
- rope_theta = _normalize_per_layer_values(rope_theta,
105
- num_hidden_layers)
106
  if isinstance(rope_scaling, dict):
107
  rope_scaling = dict(rope_scaling)
108
- if use_rope_layers:
109
- use_rope_layers = _normalize_per_layer_values(
110
- use_rope_layers, num_hidden_layers)
111
  if share_expert_dim is None:
112
  share_expert_dim = share_expert_dims
113
  self.hidden_size = hidden_size
@@ -128,7 +115,7 @@ class Step3p7TextConfig(PretrainedConfig):
128
  self.head_dim = head_dim
129
  self.norm_expert_weight = norm_expert_weight
130
  self.moe_layers_enum = moe_layers_enum
131
- self.layer_types = layer_types
132
  self.sliding_window = sliding_window
133
  self.pad_token_id = pad_token_id
134
  self.attention_dropout = attention_dropout
@@ -145,6 +132,7 @@ class Step3p7TextConfig(PretrainedConfig):
145
  super().__init__(**kwargs)
146
  if torch_dtype is not None:
147
  self.torch_dtype = torch_dtype
 
148
 
149
  def to_dict(self):
150
  output = super().to_dict()
 
91
  **kwargs,
92
  ) -> None:
93
  torch_dtype = kwargs.get("torch_dtype")
94
+ trim_layer_types = _normalize_per_layer_values(layer_types,
95
  num_hidden_layers)
 
 
 
 
 
 
 
 
 
 
96
  if isinstance(rope_scaling, dict):
97
  rope_scaling = dict(rope_scaling)
 
 
 
98
  if share_expert_dim is None:
99
  share_expert_dim = share_expert_dims
100
  self.hidden_size = hidden_size
 
115
  self.head_dim = head_dim
116
  self.norm_expert_weight = norm_expert_weight
117
  self.moe_layers_enum = moe_layers_enum
118
+ self.layer_types = trim_layer_types
119
  self.sliding_window = sliding_window
120
  self.pad_token_id = pad_token_id
121
  self.attention_dropout = attention_dropout
 
132
  super().__init__(**kwargs)
133
  if torch_dtype is not None:
134
  self.torch_dtype = torch_dtype
135
+ self.layer_types = layer_types
136
 
137
  def to_dict(self):
138
  output = super().to_dict()
modeling_step3p7.py CHANGED
@@ -199,36 +199,40 @@ class Step3p7PreTrainedModel(PreTrainedModel):
199
  class Step3p7RotaryEmbedding(nn.Module):
200
  def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
201
  super().__init__()
202
- # BC: "rope_type" was originally "type"
203
  self.layer_idx = layer_idx
204
- self.original_rope_parameters = None
205
- if config.rope_parameters is not None:
206
- self.original_rope_parameters = config.rope_parameters
207
- config.rope_parameters = dict(config.rope_parameters)
208
- self.rope_type = config.rope_parameters.get(
209
- "rope_type", config.rope_parameters.get("type")
210
- )
211
- else:
212
- self.rope_type = "default"
213
  self.max_seq_len_cached = config.max_position_embeddings
214
  self.original_max_seq_len = config.max_position_embeddings
215
 
216
- partial_rotary_factors = getattr(
217
- config, "partial_rotary_factors", None
218
- )
 
 
 
219
  if partial_rotary_factors is not None:
220
- config.partial_rotary_factor = partial_rotary_factors[self.layer_idx]
221
- else:
222
- config.partial_rotary_factor = 1.0
223
 
224
- self.rope_theta = config.rope_theta
225
- if isinstance(config.rope_theta, list):
226
- self.rope_theta = config.rope_theta.copy()
227
- config.rope_theta = self.rope_theta[self.layer_idx]
228
 
229
  self.config = copy.copy(config)
 
 
 
230
  if config.rope_parameters is not None:
231
- self.config.rope_parameters = dict(config.rope_parameters)
 
 
 
 
 
 
 
 
 
 
232
  self.rope_init_fn = self.compute_default_rope_parameters
233
  if self.rope_type != "default":
234
  self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
@@ -238,8 +242,6 @@ class Step3p7RotaryEmbedding(nn.Module):
238
 
239
  self.register_buffer("inv_freq", inv_freq, persistent=False)
240
  self.original_inv_freq = self.inv_freq
241
- config.rope_theta = self.rope_theta
242
- config.rope_parameters = self.original_rope_parameters
243
 
244
  @torch.no_grad()
245
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -288,10 +290,14 @@ class Step3p7RotaryEmbedding(nn.Module):
288
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
289
  """
290
  base = config.rope_theta
291
- dim = (
 
 
 
292
  getattr(config, "head_dim", None)
293
  or config.hidden_size // config.num_attention_heads
294
  )
 
295
 
296
  attention_factor = 1.0 # Unused in this type of RoPE
297
 
@@ -968,7 +974,6 @@ class Step3p7TextModel(Step3p7TextPreTrainedModel, GenerationMixin):
968
  mask_kwargs = {
969
  "config": self.config,
970
  "attention_mask": attention_mask,
971
- "cache_position": cache_position,
972
  "past_key_values": past_key_values,
973
  "position_ids": position_ids,
974
  }
@@ -1381,7 +1386,12 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
1381
  **kwargs,
1382
  )
1383
 
1384
- if cache_position[0] == 0:
 
 
 
 
 
1385
  # During cached decoding, input ids no longer contain image tokens,
1386
  # so pixel values should only be passed at the first step.
1387
  model_inputs["pixel_values"] = pixel_values
@@ -1392,4 +1402,4 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
1392
  if key.startswith("language_model."):
1393
  return key[len("language_model.") :], True
1394
 
1395
- return key, False
 
199
  class Step3p7RotaryEmbedding(nn.Module):
200
  def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
201
  super().__init__()
 
202
  self.layer_idx = layer_idx
 
 
 
 
 
 
 
 
 
203
  self.max_seq_len_cached = config.max_position_embeddings
204
  self.original_max_seq_len = config.max_position_embeddings
205
 
206
+ rope_theta = config.rope_theta
207
+ if isinstance(rope_theta, list):
208
+ rope_theta = rope_theta[0 if layer_idx is None else layer_idx]
209
+
210
+ partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
211
+ partial_rotary_factors = getattr(config, "partial_rotary_factors", None)
212
  if partial_rotary_factors is not None:
213
+ partial_rotary_factor = partial_rotary_factors[
214
+ 0 if layer_idx is None else layer_idx
215
+ ]
216
 
217
+ self.rope_theta = rope_theta
218
+ self.partial_rotary_factor = partial_rotary_factor
 
 
219
 
220
  self.config = copy.copy(config)
221
+ self.config.rope_theta = rope_theta
222
+ self.config.partial_rotary_factor = partial_rotary_factor
223
+
224
  if config.rope_parameters is not None:
225
+ self.config.rope_parameters = copy.deepcopy(config.rope_parameters)
226
+ self.config.rope_parameters["rope_theta"] = rope_theta
227
+ self.config.rope_parameters["partial_rotary_factor"] = (
228
+ partial_rotary_factor
229
+ )
230
+ self.rope_type = self.config.rope_parameters.get(
231
+ "rope_type", self.config.rope_parameters.get("type")
232
+ )
233
+ else:
234
+ self.rope_type = "default"
235
+
236
  self.rope_init_fn = self.compute_default_rope_parameters
237
  if self.rope_type != "default":
238
  self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
242
 
243
  self.register_buffer("inv_freq", inv_freq, persistent=False)
244
  self.original_inv_freq = self.inv_freq
 
 
245
 
246
  @torch.no_grad()
247
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
 
290
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
291
  """
292
  base = config.rope_theta
293
+ partial_rotary_factor = getattr(
294
+ config, "partial_rotary_factor", 1.0
295
+ )
296
+ head_dim = (
297
  getattr(config, "head_dim", None)
298
  or config.hidden_size // config.num_attention_heads
299
  )
300
+ dim = int(head_dim * partial_rotary_factor)
301
 
302
  attention_factor = 1.0 # Unused in this type of RoPE
303
 
 
974
  mask_kwargs = {
975
  "config": self.config,
976
  "attention_mask": attention_mask,
 
977
  "past_key_values": past_key_values,
978
  "position_ids": position_ids,
979
  }
 
1386
  **kwargs,
1387
  )
1388
 
1389
+ generation_cache_position = model_inputs.get("cache_position", cache_position)
1390
+ is_prefill = past_key_values is None
1391
+ if generation_cache_position is not None and generation_cache_position.numel() > 0:
1392
+ is_prefill = generation_cache_position[0].item() == 0
1393
+
1394
+ if is_prefill:
1395
  # During cached decoding, input ids no longer contain image tokens,
1396
  # so pixel values should only be passed at the first step.
1397
  model_inputs["pixel_values"] = pixel_values
 
1402
  if key.startswith("language_model."):
1403
  return key[len("language_model.") :], True
1404
 
1405
+ return key, False
processing_step3.py CHANGED
@@ -16,6 +16,7 @@ from torchvision.transforms.functional import InterpolationMode
16
  from transformers.feature_extraction_utils import BatchFeature, TensorType
17
  from transformers.image_utils import ImageInput
18
  from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 
19
  from math import ceil
20
  from itertools import product
21
 
@@ -255,6 +256,16 @@ class Step3VLProcessor(ProcessorMixin):
255
  attributes = ["tokenizer"]
256
  tokenizer_class = "AutoTokenizer"
257
 
 
 
 
 
 
 
 
 
 
 
258
  def __init__(
259
  self,
260
  tokenizer=None,
 
16
  from transformers.feature_extraction_utils import BatchFeature, TensorType
17
  from transformers.image_utils import ImageInput
18
  from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
19
+ from transformers.tokenization_utils_tokenizers import TokenizersBackend
20
  from math import ceil
21
  from itertools import product
22
 
 
256
  attributes = ["tokenizer"]
257
  tokenizer_class = "AutoTokenizer"
258
 
259
+ @classmethod
260
+ def _load_tokenizer_from_pretrained(
261
+ cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
262
+ ):
263
+ return TokenizersBackend.from_pretrained(
264
+ pretrained_model_name_or_path,
265
+ subfolder=subfolder,
266
+ **kwargs,
267
+ )
268
+
269
  def __init__(
270
  self,
271
  tokenizer=None,