shilinxu committed on
Commit 9d795b3 · verified
1 Parent(s): 8d12c0b

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen2VisionTransformerPretrainedModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen2_vl.Qwen2VLVisionConfig",
7
+ "AutoModel": "modeling_qwen2_vl.Qwen2VisionTransformerPretrainedModel"
8
+ },
9
+ "depth": 32,
10
+ "embed_dim": 1280,
11
+ "hidden_act": "quick_gelu",
12
+ "hidden_size": 1536,
13
+ "in_channels": 3,
14
+ "in_chans": 3,
15
+ "initializer_range": 0.02,
16
+ "mlp_ratio": 4,
17
+ "model_type": "qwen2_vl",
18
+ "num_heads": 16,
19
+ "patch_size": 14,
20
+ "spatial_merge_size": 2,
21
+ "spatial_patch_size": 14,
22
+ "temporal_patch_size": 2,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.1"
25
+ }
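
For reference, the `auto_map` entries above wire this standalone vision tower into the Auto classes. A minimal loading sketch (the repository path is a placeholder, and `trust_remote_code=True` is needed because the config and model classes ship with this folder rather than with the transformers release):

from transformers import AutoConfig, AutoModel

repo = "path/to/this/folder"  # placeholder: local path or Hub id of this repository
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)  # Qwen2VisionTransformerPretrainedModel
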
configuration_qwen2_vl.py ADDED
@@ -0,0 +1,57 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen2VL model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class Qwen2VLVisionConfig(PretrainedConfig):
25
+ model_type = "qwen2_vl"
26
+ base_config_key = "vision_config"
27
+
28
+ def __init__(
29
+ self,
30
+ depth=32,
31
+ embed_dim=1280,
32
+ hidden_size=3584,
33
+ hidden_act="quick_gelu",
34
+ mlp_ratio=4,
35
+ num_heads=16,
36
+ in_channels=3,
37
+ patch_size=14,
38
+ spatial_merge_size=2,
39
+ temporal_patch_size=2,
40
+ initializer_range=0.02,
41
+ **kwargs,
42
+ ):
43
+ super().__init__(**kwargs)
44
+
45
+ self.depth = depth
46
+ self.embed_dim = embed_dim
47
+ self.hidden_size = hidden_size
48
+ self.hidden_act = hidden_act
49
+ self.mlp_ratio = mlp_ratio
50
+ self.num_heads = num_heads
51
+ self.in_channels = in_channels
52
+ self.patch_size = patch_size
53
+ self.spatial_merge_size = spatial_merge_size
54
+ self.temporal_patch_size = temporal_patch_size
55
+ self.initializer_range = initializer_range
56
+
57
+
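
Because this file only depends on `transformers.configuration_utils`, the config can also be instantiated directly; a small sketch, assuming the file is on the Python path. Note that the class defaults describe a larger tower (`hidden_size=3584`), while this repository's config.json overrides it to 1536:

from configuration_qwen2_vl import Qwen2VLVisionConfig

cfg = Qwen2VLVisionConfig(hidden_size=1536)
print(cfg.depth, cfg.embed_dim, cfg.hidden_size)  # 32 1280 1536
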
image_processing_qwen2_vl.py ADDED
@@ -0,0 +1,494 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Qwen2-VL."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+
25
+ import numpy as np
26
+
27
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature
28
+ from ...image_transforms import (
29
+ convert_to_rgb,
30
+ resize,
31
+ to_channel_dimension_format,
32
+ )
33
+ from ...image_utils import (
34
+ OPENAI_CLIP_MEAN,
35
+ OPENAI_CLIP_STD,
36
+ ChannelDimension,
37
+ ImageInput,
38
+ PILImageResampling,
39
+ get_image_size,
40
+ infer_channel_dimension_format,
41
+ is_scaled_image,
42
+ make_flat_list_of_images,
43
+ make_list_of_images,
44
+ to_numpy_array,
45
+ valid_images,
46
+ validate_preprocess_arguments,
47
+ )
48
+ from ...utils import TensorType, logging
49
+ from ...video_utils import VideoInput, make_batched_videos
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+
55
+ def smart_resize(
56
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
57
+ ):
58
+ """Rescales the image so that the following conditions are met:
59
+
60
+ 1. Both dimensions (height and width) are divisible by 'factor'.
61
+
62
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
63
+
64
+ 3. The aspect ratio of the image is maintained as closely as possible.
65
+
66
+ """
67
+ if height < factor or width < factor:
68
+ raise ValueError(f"height:{height} and width:{width} must be larger than factor:{factor}")
69
+ elif max(height, width) / min(height, width) > 200:
70
+ raise ValueError(
71
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
72
+ )
73
+ h_bar = round(height / factor) * factor
74
+ w_bar = round(width / factor) * factor
75
+ if h_bar * w_bar > max_pixels:
76
+ beta = math.sqrt((height * width) / max_pixels)
77
+ h_bar = math.floor(height / beta / factor) * factor
78
+ w_bar = math.floor(width / beta / factor) * factor
79
+ elif h_bar * w_bar < min_pixels:
80
+ beta = math.sqrt(min_pixels / (height * width))
81
+ h_bar = math.ceil(height * beta / factor) * factor
82
+ w_bar = math.ceil(width * beta / factor) * factor
83
+ return h_bar, w_bar
84
+
85
+
86
+ class Qwen2VLImageProcessor(BaseImageProcessor):
87
+ r"""
88
+ Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
89
+
90
+ Args:
91
+ do_resize (`bool`, *optional*, defaults to `True`):
92
+ Whether to resize the image's (height, width) dimensions.
93
+ size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}`):
94
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
95
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
96
+ Resampling filter to use when resizing the image.
97
+ do_rescale (`bool`, *optional*, defaults to `True`):
98
+ Whether to rescale the image by the specified scale `rescale_factor`.
99
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
100
+ Scale factor to use if rescaling the image.
101
+ do_normalize (`bool`, *optional*, defaults to `True`):
102
+ Whether to normalize the image.
103
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
104
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
105
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
106
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
107
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
108
+ Whether to convert the image to RGB.
109
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
110
+ The min pixels of the image to resize the image.
111
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
112
+ The max pixels of the image to resize the image.
113
+ patch_size (`int`, *optional*, defaults to 14):
114
+ The spatial patch size of the vision encoder.
115
+ temporal_patch_size (`int`, *optional*, defaults to 2):
116
+ The temporal patch size of the vision encoder.
117
+ merge_size (`int`, *optional*, defaults to 2):
118
+ The merge size of the vision encoder to llm encoder.
119
+ """
120
+
121
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
122
+
123
+ def __init__(
124
+ self,
125
+ do_resize: bool = True,
126
+ size: Optional[Dict[str, int]] = None,
127
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
128
+ do_rescale: bool = True,
129
+ rescale_factor: Union[int, float] = 1 / 255,
130
+ do_normalize: bool = True,
131
+ image_mean: Optional[Union[float, List[float]]] = None,
132
+ image_std: Optional[Union[float, List[float]]] = None,
133
+ do_convert_rgb: bool = True,
134
+ min_pixels: Optional[int] = None,
135
+ max_pixels: Optional[int] = None,
136
+ patch_size: int = 14,
137
+ temporal_patch_size: int = 2,
138
+ merge_size: int = 2,
139
+ **kwargs,
140
+ ) -> None:
141
+ super().__init__(**kwargs)
142
+ if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
143
+ raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
144
+ elif size is None:
145
+ size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
146
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
147
+ if min_pixels is not None:
148
+ size["shortest_edge"] = min_pixels
149
+ if max_pixels is not None:
150
+ size["longest_edge"] = max_pixels
151
+ self.min_pixels = size["shortest_edge"]
152
+ self.max_pixels = size["longest_edge"]
153
+ self.size = size
154
+
155
+ self.do_resize = do_resize
156
+ self.resample = resample
157
+ self.do_rescale = do_rescale
158
+ self.rescale_factor = rescale_factor
159
+ self.do_normalize = do_normalize
160
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
161
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
162
+
163
+ self.patch_size = patch_size
164
+ self.temporal_patch_size = temporal_patch_size
165
+ self.merge_size = merge_size
166
+ self.do_convert_rgb = do_convert_rgb
167
+
168
+ def _preprocess(
169
+ self,
170
+ images: Union[ImageInput, VideoInput],
171
+ do_resize: Optional[bool] = None,
172
+ size: Optional[Dict[str, int]] = None,
173
+ resample: PILImageResampling = None,
174
+ do_rescale: Optional[bool] = None,
175
+ rescale_factor: Optional[float] = None,
176
+ do_normalize: Optional[bool] = None,
177
+ image_mean: Optional[Union[float, List[float]]] = None,
178
+ image_std: Optional[Union[float, List[float]]] = None,
179
+ patch_size: Optional[int] = None,
180
+ temporal_patch_size: Optional[int] = None,
181
+ merge_size: Optional[int] = None,
182
+ do_convert_rgb: Optional[bool] = None,
183
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
184
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
185
+ ):
186
+ """
187
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
188
+
189
+ Args:
190
+ images (`ImageInput`):
191
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
192
+ vision_info (`List[Dict]`, *optional*):
193
+ Optional list of dictionaries containing additional information about vision inputs.
194
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
195
+ Whether to resize the image.
196
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
197
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
198
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
199
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
200
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
201
+ Whether to rescale the image.
202
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
203
+ Scale factor to use if rescaling the image.
204
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
205
+ Whether to normalize the image.
206
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
207
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
208
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
209
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
210
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
211
+ The spatial patch size of the vision encoder.
212
+ temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
213
+ The temporal patch size of the vision encoder.
214
+ merge_size (`int`, *optional*, defaults to `self.merge_size`):
215
+ The merge size of the vision encoder to llm encoder.
216
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
217
+ Whether to convert the image to RGB.
218
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
219
+ The channel dimension format for the output image. Can be one of:
220
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
221
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
222
+ - Unset: Use the channel dimension format of the input image.
223
+ input_data_format (`ChannelDimension` or `str`, *optional*):
224
+ The channel dimension format for the input image. Can be one of:
225
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
226
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
227
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
228
+ """
229
+ images = make_list_of_images(images)
230
+
231
+ if do_convert_rgb:
232
+ images = [convert_to_rgb(image) for image in images]
233
+
234
+ # All transformations expect numpy arrays.
235
+ images = [to_numpy_array(image) for image in images]
236
+
237
+ if do_rescale and is_scaled_image(images[0]):
238
+ logger.warning_once(
239
+ "It looks like you are trying to rescale already rescaled images. If the input"
240
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
241
+ )
242
+ if input_data_format is None:
243
+ # We assume that all images have the same channel dimension format.
244
+ input_data_format = infer_channel_dimension_format(images[0])
245
+
246
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
247
+ resized_height, resized_width = height, width
248
+ processed_images = []
249
+ for image in images:
250
+ if do_resize:
251
+ resized_height, resized_width = smart_resize(
252
+ height,
253
+ width,
254
+ factor=patch_size * merge_size,
255
+ min_pixels=size["shortest_edge"],
256
+ max_pixels=size["longest_edge"],
257
+ )
258
+ image = resize(
259
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
260
+ )
261
+
262
+ if do_rescale:
263
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
264
+
265
+ if do_normalize:
266
+ image = self.normalize(
267
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
268
+ )
269
+
270
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
271
+ processed_images.append(image)
272
+
273
+ patches = np.array(processed_images)
274
+ if data_format == ChannelDimension.LAST:
275
+ patches = patches.transpose(0, 3, 1, 2)
276
+ if patches.shape[0] % temporal_patch_size != 0:
277
+ repeats = np.repeat(
278
+ patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
279
+ )
280
+ patches = np.concatenate([patches, repeats], axis=0)
281
+ channel = patches.shape[1]
282
+ grid_t = patches.shape[0] // temporal_patch_size
283
+ grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
284
+ patches = patches.reshape(
285
+ grid_t,
286
+ temporal_patch_size,
287
+ channel,
288
+ grid_h // merge_size,
289
+ merge_size,
290
+ patch_size,
291
+ grid_w // merge_size,
292
+ merge_size,
293
+ patch_size,
294
+ )
295
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
296
+ flatten_patches = patches.reshape(
297
+ grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
298
+ )
299
+
300
+ return flatten_patches, (grid_t, grid_h, grid_w)
301
+
302
+ def preprocess(
303
+ self,
304
+ images: ImageInput,
305
+ videos: VideoInput = None,
306
+ do_resize: Optional[bool] = None,
307
+ size: Optional[Dict[str, int]] = None,
308
+ min_pixels: Optional[int] = None,
309
+ max_pixels: Optional[int] = None,
310
+ resample: PILImageResampling = None,
311
+ do_rescale: Optional[bool] = None,
312
+ rescale_factor: Optional[float] = None,
313
+ do_normalize: Optional[bool] = None,
314
+ image_mean: Optional[Union[float, List[float]]] = None,
315
+ image_std: Optional[Union[float, List[float]]] = None,
316
+ patch_size: Optional[int] = None,
317
+ temporal_patch_size: Optional[int] = None,
318
+ merge_size: Optional[int] = None,
319
+ do_convert_rgb: Optional[bool] = None,
320
+ return_tensors: Optional[Union[str, TensorType]] = None,
321
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
322
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
323
+ ):
324
+ """
325
+ Args:
326
+ images (`ImageInput`):
327
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
328
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
329
+ videos (`VideoInput`):
330
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
331
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
332
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
333
+ Whether to resize the image.
334
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
335
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
336
+ the longest edge resized to keep the input aspect ratio.
337
+ resample (`int`, *optional*, defaults to `self.resample`):
338
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
339
+ has an effect if `do_resize` is set to `True`.
340
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
341
+ Whether to rescale the image.
342
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
343
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
344
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
345
+ Whether to normalize the image.
346
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
347
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
348
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
349
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
350
+ `True`.
351
+ min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
352
+ The min pixels of the image to resize the image.
353
+ max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
354
+ The max pixels of the image to resize the image.
355
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
356
+ The spatial patch size of the vision encoder.
357
+ temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
358
+ The temporal patch size of the vision encoder.
359
+ merge_size (`int`, *optional*, defaults to `self.merge_size`):
360
+ The merge size of the vision encoder to llm encoder.
361
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
362
+ Whether to convert the image to RGB.
363
+ return_tensors (`str` or `TensorType`, *optional*):
364
+ The type of tensors to return. Can be one of:
365
+ - Unset: Return a list of `np.ndarray`.
366
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
367
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
368
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
369
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
370
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
371
+ The channel dimension format for the output image. Can be one of:
372
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
373
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
374
+ - Unset: Use the channel dimension format of the input image.
375
+ input_data_format (`ChannelDimension` or `str`, *optional*):
376
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
377
+ from the input image. Can be one of:
378
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
379
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
380
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
381
+
382
+ """
383
+ min_pixels = min_pixels if min_pixels is not None else self.min_pixels
384
+ max_pixels = max_pixels if max_pixels is not None else self.max_pixels
385
+
386
+ if size is not None:
387
+ if "shortest_edge" not in size or "longest_edge" not in size:
388
+ raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
389
+ min_pixels = size["shortest_edge"]
390
+ elif min_pixels is not None and max_pixels is not None:
391
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
392
+ size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
393
+ else:
394
+ size = {**self.size}
395
+
396
+ do_resize = do_resize if do_resize is not None else self.do_resize
397
+
398
+ resample = resample if resample is not None else self.resample
399
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
400
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
401
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
402
+ image_mean = image_mean if image_mean is not None else self.image_mean
403
+ image_std = image_std if image_std is not None else self.image_std
404
+ patch_size = patch_size if patch_size is not None else self.patch_size
405
+ temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
406
+ merge_size = merge_size if merge_size is not None else self.merge_size
407
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
408
+
409
+ if images is not None:
410
+ images = make_flat_list_of_images(images)
411
+
412
+ if images is not None and not valid_images(images):
413
+ raise ValueError(
414
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
415
+ "torch.Tensor, tf.Tensor or jax.ndarray."
416
+ )
417
+
418
+ validate_preprocess_arguments(
419
+ rescale_factor=rescale_factor,
420
+ do_normalize=do_normalize,
421
+ image_mean=image_mean,
422
+ image_std=image_std,
423
+ do_resize=do_resize,
424
+ size=size,
425
+ resample=resample,
426
+ )
427
+
428
+ data = {}
429
+ if images is not None:
430
+ pixel_values, vision_grid_thws = [], []
431
+ for image in images:
432
+ patches, image_grid_thw = self._preprocess(
433
+ image,
434
+ do_resize=do_resize,
435
+ size=size,
436
+ resample=resample,
437
+ do_rescale=do_rescale,
438
+ rescale_factor=rescale_factor,
439
+ do_normalize=do_normalize,
440
+ image_mean=image_mean,
441
+ image_std=image_std,
442
+ patch_size=patch_size,
443
+ temporal_patch_size=temporal_patch_size,
444
+ merge_size=merge_size,
445
+ data_format=data_format,
446
+ do_convert_rgb=do_convert_rgb,
447
+ input_data_format=input_data_format,
448
+ )
449
+ pixel_values.extend(patches)
450
+ vision_grid_thws.append(image_grid_thw)
451
+ pixel_values = np.array(pixel_values)
452
+ vision_grid_thws = np.array(vision_grid_thws)
453
+ data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
454
+
455
+ # kept for BC only and should be removed after v5.0
456
+ if videos is not None:
457
+ logger.warning(
458
+ "`Qwen2VLImageProcessor` works only with image inputs and doesn't process videos anymore. "
459
+ "This is a deprecated behavior and will be removed in v5.0. "
460
+ "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
461
+ )
462
+ videos = make_batched_videos(videos)
463
+ pixel_values_videos, vision_grid_thws_videos = [], []
464
+ for images in videos:
465
+ patches, video_grid_thw = self._preprocess(
466
+ images,
467
+ do_resize=do_resize,
468
+ size=size,
469
+ resample=resample,
470
+ do_rescale=do_rescale,
471
+ rescale_factor=rescale_factor,
472
+ do_normalize=do_normalize,
473
+ image_mean=image_mean,
474
+ image_std=image_std,
475
+ patch_size=patch_size,
476
+ temporal_patch_size=temporal_patch_size,
477
+ merge_size=merge_size,
478
+ data_format=data_format,
479
+ do_convert_rgb=do_convert_rgb,
480
+ input_data_format=input_data_format,
481
+ )
482
+ pixel_values_videos.extend(patches)
483
+ vision_grid_thws_videos.append(video_grid_thw)
484
+ data.update(
485
+ {
486
+ "pixel_values_videos": np.array(pixel_values_videos),
487
+ "video_grid_thw": np.array(vision_grid_thws_videos),
488
+ }
489
+ )
490
+
491
+ return BatchFeature(data=data, tensor_type=return_tensors)
492
+
493
+
494
+ __all__ = ["Qwen2VLImageProcessor"]
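
As a back-of-envelope illustration of the patch bookkeeping in `_preprocess` above (not part of the file): with the default `patch_size=14`, `merge_size=2`, `temporal_patch_size=2`, a hypothetical 392x560 image is already a multiple of 28 on both sides, so `smart_resize` leaves it unchanged and the output shapes follow directly:

patch_size, merge_size, temporal_patch_size, channels = 14, 2, 2, 3

height, width = 392, 560                          # hypothetical input, already multiples of 28
grid_t = 1                                        # a single image is padded to one temporal patch
grid_h, grid_w = height // patch_size, width // patch_size
num_patches = grid_t * grid_h * grid_w            # rows of pixel_values
patch_dim = channels * temporal_patch_size * patch_size * patch_size  # columns of pixel_values
llm_tokens = num_patches // merge_size**2         # tokens left after the 2x2 spatial merge
print(num_patches, patch_dim, llm_tokens)         # 1120 1176 280
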
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e3f1f331eb3aa68636be825a1b058b147bbf0d6d87af2333d3c317fcb8bdff
3
+ size 1330580320
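
A rough plausibility check on the reported size (an aside, not part of the upload): at 2 bytes per bfloat16 value, 1,330,580,320 bytes corresponds to roughly 0.67B parameters, consistent with a depth-32, width-1280 vision transformer of this shape plus its patch merger.

params_from_size = 1_330_580_320 / 2  # bfloat16 stores 2 bytes per value
print(f"{params_from_size / 1e6:.1f}M parameters")  # ≈ 665.3M
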
modeling_qwen2_vl.py ADDED
@@ -0,0 +1,410 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2-VL model."""
21
+
22
+ import math
23
+ from dataclasses import dataclass
24
+ from typing import Any, Dict, List, Optional, Tuple, Union
25
+
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ from torch.nn import LayerNorm
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.modeling_flash_attention_utils import is_flash_attn_available
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.utils import auto_docstring, logging
35
+ from .configuration_qwen2_vl import Qwen2VLVisionConfig
36
+
37
+
38
+ if is_flash_attn_available():
39
+ from transformers.modeling_flash_attention_utils import flash_attn_varlen_func
40
+
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+
46
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
47
+ def rotate_half(x):
48
+ """Rotates half the hidden dims of the input."""
49
+ x1 = x[..., : x.shape[-1] // 2]
50
+ x2 = x[..., x.shape[-1] // 2 :]
51
+ return torch.cat((-x2, x1), dim=-1)
52
+
53
+ def apply_rotary_pos_emb_vision(
54
+ q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
55
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
56
+ orig_q_dtype = q.dtype
57
+ orig_k_dtype = k.dtype
58
+ q, k = q.float(), k.float()
59
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
60
+ q_embed = (q * cos) + (rotate_half(q) * sin)
61
+ k_embed = (k * cos) + (rotate_half(k) * sin)
62
+ q_embed = q_embed.to(orig_q_dtype)
63
+ k_embed = k_embed.to(orig_k_dtype)
64
+ return q_embed, k_embed
65
+
66
+
67
+ class VisionRotaryEmbedding(nn.Module):
68
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
69
+ super().__init__()
70
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
71
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
72
+
73
+ def forward(self, seqlen: int) -> torch.Tensor:
74
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
75
+ freqs = torch.outer(seq, self.inv_freq)
76
+ return freqs
77
+
78
+
79
+ class PatchEmbed(nn.Module):
80
+ def __init__(
81
+ self,
82
+ patch_size: int = 14,
83
+ temporal_patch_size: int = 2,
84
+ in_channels: int = 3,
85
+ embed_dim: int = 1152,
86
+ ) -> None:
87
+ super().__init__()
88
+ self.patch_size = patch_size
89
+ self.temporal_patch_size = temporal_patch_size
90
+ self.in_channels = in_channels
91
+ self.embed_dim = embed_dim
92
+
93
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
94
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
95
+
96
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
97
+ target_dtype = self.proj.weight.dtype
98
+ hidden_states = hidden_states.view(
99
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
100
+ )
101
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
102
+ return hidden_states
103
+
104
+
105
+ class PatchMerger(nn.Module):
106
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
107
+ super().__init__()
108
+ self.hidden_size = context_dim * (spatial_merge_size**2)
109
+ self.ln_q = LayerNorm(context_dim, eps=1e-6)
110
+ self.mlp = nn.Sequential(
111
+ nn.Linear(self.hidden_size, self.hidden_size),
112
+ nn.GELU(),
113
+ nn.Linear(self.hidden_size, dim),
114
+ )
115
+
116
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
117
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
118
+ return x
119
+
120
+
121
+ class VisionMlp(nn.Module):
122
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
123
+ super().__init__()
124
+ self.fc1 = nn.Linear(dim, hidden_dim)
125
+ self.act = ACT2FN[hidden_act]
126
+ self.fc2 = nn.Linear(hidden_dim, dim)
127
+
128
+ def forward(self, x) -> torch.Tensor:
129
+ return self.fc2(self.act(self.fc1(x)))
130
+
131
+
132
+ class VisionAttention(nn.Module):
133
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
134
+ super().__init__()
135
+ self.num_heads = num_heads
136
+ self.head_dim = dim // num_heads
137
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
138
+ self.proj = nn.Linear(dim, dim)
139
+
140
+ def forward(
141
+ self,
142
+ hidden_states: torch.Tensor,
143
+ cu_seqlens: torch.Tensor,
144
+ rotary_pos_emb: Optional[torch.Tensor] = None,
145
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
146
+ ) -> torch.Tensor:
147
+ seq_length = hidden_states.shape[0]
148
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
149
+ if position_embeddings is None:
150
+ logger.warning_once(
151
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
152
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
153
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
154
+ "removed and `position_embeddings` will be mandatory."
155
+ )
156
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
157
+ cos = emb.cos()
158
+ sin = emb.sin()
159
+ else:
160
+ cos, sin = position_embeddings
161
+ q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
162
+
163
+ attention_mask = torch.full(
164
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
165
+ )
166
+ for i in range(1, len(cu_seqlens)):
167
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
168
+
169
+ q = q.transpose(0, 1)
170
+ k = k.transpose(0, 1)
171
+ v = v.transpose(0, 1)
172
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
173
+ attn_weights = attn_weights + attention_mask
174
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
175
+ attn_output = torch.matmul(attn_weights, v)
176
+ attn_output = attn_output.transpose(0, 1)
177
+ attn_output = attn_output.reshape(seq_length, -1)
178
+ attn_output = self.proj(attn_output)
179
+ return attn_output
180
+
181
+
182
+ class VisionFlashAttention2(nn.Module):
183
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
184
+ super().__init__()
185
+ self.num_heads = num_heads
186
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
187
+ self.proj = nn.Linear(dim, dim)
188
+
189
+ def forward(
190
+ self,
191
+ hidden_states: torch.Tensor,
192
+ cu_seqlens: torch.Tensor,
193
+ rotary_pos_emb: Optional[torch.Tensor] = None,
194
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
195
+ ) -> torch.Tensor:
196
+ seq_length = hidden_states.shape[0]
197
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
198
+ if position_embeddings is None:
199
+ logger.warning_once(
200
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
201
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
202
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
203
+ "removed and `position_embeddings` will be mandatory."
204
+ )
205
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
206
+ cos = emb.cos()
207
+ sin = emb.sin()
208
+ else:
209
+ cos, sin = position_embeddings
210
+ q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
211
+
212
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
213
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
214
+ seq_length, -1
215
+ )
216
+ attn_output = self.proj(attn_output)
217
+ return attn_output
218
+
219
+
220
+ class VisionSdpaAttention(nn.Module):
221
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
222
+ super().__init__()
223
+ self.num_heads = num_heads
224
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
225
+ self.proj = nn.Linear(dim, dim)
226
+
227
+ def forward(
228
+ self,
229
+ hidden_states: torch.Tensor,
230
+ cu_seqlens: torch.Tensor,
231
+ rotary_pos_emb: Optional[torch.Tensor] = None,
232
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
233
+ ) -> torch.Tensor:
234
+ seq_length = hidden_states.shape[0]
235
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
236
+ if position_embeddings is None:
237
+ logger.warning_once(
238
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
239
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
240
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
241
+ "removed and `position_embeddings` will be mandatory."
242
+ )
243
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
244
+ cos = emb.cos()
245
+ sin = emb.sin()
246
+ else:
247
+ cos, sin = position_embeddings
248
+ q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
249
+
250
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
251
+ for i in range(1, len(cu_seqlens)):
252
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
253
+ q = q.transpose(0, 1)
254
+ k = k.transpose(0, 1)
255
+ v = v.transpose(0, 1)
256
+ attn_output = F.scaled_dot_product_attention(
257
+ q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), attention_mask, dropout_p=0.0
258
+ )
259
+ attn_output = attn_output.squeeze(0).transpose(0, 1)
260
+ attn_output = attn_output.reshape(seq_length, -1)
261
+ attn_output = self.proj(attn_output)
262
+ return attn_output
263
+
264
+
265
+ QWEN2_VL_VISION_ATTENTION_CLASSES = {
266
+ "eager": VisionAttention,
267
+ "flash_attention_2": VisionFlashAttention2,
268
+ "sdpa": VisionSdpaAttention,
269
+ }
270
+
271
+
272
+ class Qwen2VLVisionBlock(nn.Module):
273
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
274
+ super().__init__()
275
+ self.norm1 = LayerNorm(config.embed_dim, eps=1e-6)
276
+ self.norm2 = LayerNorm(config.embed_dim, eps=1e-6)
277
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
278
+
279
+ self.attn = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation](
280
+ config.embed_dim, num_heads=config.num_heads
281
+ )
282
+ self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
283
+
284
+ def forward(
285
+ self,
286
+ hidden_states: torch.Tensor,
287
+ cu_seqlens: torch.Tensor,
288
+ rotary_pos_emb: Optional[torch.Tensor] = None,
289
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
290
+ ) -> torch.Tensor:
291
+ hidden_states = hidden_states + self.attn(
292
+ self.norm1(hidden_states),
293
+ cu_seqlens=cu_seqlens,
294
+ rotary_pos_emb=rotary_pos_emb,
295
+ position_embeddings=position_embeddings,
296
+ )
297
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
298
+ return hidden_states
299
+
300
+ @auto_docstring
301
+ class Qwen2VisionTransformerPretrainedModel(PreTrainedModel):
302
+ config_class = Qwen2VLVisionConfig
303
+ base_model_prefix = "model"
304
+ supports_gradient_checkpointing = True
305
+ _no_split_modules = ["Qwen2VLVisionBlock"]
306
+ _skip_keys_device_placement = "past_key_values"
307
+ _supports_flash_attn_2 = True
308
+ _supports_sdpa = True
309
+ _supports_cache_class = True
310
+ _supports_static_cache = False # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
311
+
312
+ def __init__(self, config) -> None:
313
+ super().__init__(config)
314
+ self.spatial_merge_size = config.spatial_merge_size
315
+
316
+ self.patch_embed = PatchEmbed(
317
+ patch_size=config.patch_size,
318
+ temporal_patch_size=config.temporal_patch_size,
319
+ in_channels=config.in_channels,
320
+ embed_dim=config.embed_dim,
321
+ )
322
+
323
+ head_dim = config.embed_dim // config.num_heads
324
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
325
+
326
+ self.blocks = nn.ModuleList(
327
+ [Qwen2VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
328
+ )
329
+ self.merger = PatchMerger(
330
+ dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
331
+ )
332
+ self.gradient_checkpointing = False
333
+
334
+ def _init_weights(self, module):
335
+ std = self.config.initializer_range
336
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
337
+ module.weight.data.normal_(mean=0.0, std=std)
338
+ if module.bias is not None:
339
+ module.bias.data.zero_()
340
+ elif isinstance(module, nn.Embedding):
341
+ module.weight.data.normal_(mean=0.0, std=std)
342
+ if module.padding_idx is not None:
343
+ module.weight.data[module.padding_idx].zero_()
344
+
345
+ def get_dtype(self) -> torch.dtype:
346
+ return self.blocks[0].mlp.fc2.weight.dtype
347
+
348
+ def get_device(self) -> torch.device:
349
+ return self.blocks[0].mlp.fc2.weight.device
350
+
351
+ def rot_pos_emb(self, grid_thw):
352
+ pos_ids = []
353
+ for t, h, w in grid_thw:
354
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
355
+ hpos_ids = hpos_ids.reshape(
356
+ h // self.spatial_merge_size,
357
+ self.spatial_merge_size,
358
+ w // self.spatial_merge_size,
359
+ self.spatial_merge_size,
360
+ )
361
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
362
+ hpos_ids = hpos_ids.flatten()
363
+
364
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
365
+ wpos_ids = wpos_ids.reshape(
366
+ h // self.spatial_merge_size,
367
+ self.spatial_merge_size,
368
+ w // self.spatial_merge_size,
369
+ self.spatial_merge_size,
370
+ )
371
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
372
+ wpos_ids = wpos_ids.flatten()
373
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
374
+ pos_ids = torch.cat(pos_ids, dim=0)
375
+ max_grid_size = grid_thw[:, 1:].max()
376
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
377
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
378
+ return rotary_pos_emb
379
+
380
+ @auto_docstring
381
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
382
+ r"""
383
+ grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
384
+ The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
385
+ """
386
+ hidden_states = self.patch_embed(hidden_states)
387
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
388
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
389
+ position_embeddings = (emb.cos(), emb.sin())
390
+
391
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
392
+ dim=0,
393
+ # Select dtype based on the following factors:
394
+ # - FA2 requires that cu_seqlens_q must have dtype int32
395
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
396
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
397
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
398
+ )
399
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
400
+
401
+ for blk in self.blocks:
402
+ if self.gradient_checkpointing and self.training:
403
+ hidden_states = self._gradient_checkpointing_func(
404
+ blk.__call__, hidden_states, cu_seqlens, None, position_embeddings
405
+ )
406
+ else:
407
+ hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
408
+
409
+ return self.merger(hidden_states)
410
+
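
A minimal forward-pass sketch for the tower defined above, assuming both repository files are importable from the working directory. The tiny configuration (2 blocks, 64-dim embeddings) is purely illustrative and does not match the released checkpoint:

import torch
from configuration_qwen2_vl import Qwen2VLVisionConfig
from modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel

config = Qwen2VLVisionConfig(depth=2, embed_dim=64, num_heads=4, hidden_size=32)
model = Qwen2VisionTransformerPretrainedModel(config).eval()

grid_thw = torch.tensor([[1, 4, 4]])  # one image: 1 temporal x 4 x 4 spatial patches
patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2
pixel_values = torch.randn(int(grid_thw.prod()), patch_dim)  # (16, 1176) flattened patches

with torch.no_grad():
    out = model(pixel_values, grid_thw)
print(out.shape)  # torch.Size([4, 32]): 16 patches merged 2x2 into 4 output tokens
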
preprocessor_config.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_qwen2_vl.Qwen2VLImageProcessor"
4
+ },
5
+ "do_convert_rgb": true,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "max_pixels": 12845056,
20
+ "merge_size": 2,
21
+ "min_pixels": 3136,
22
+ "patch_size": 14,
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "longest_edge": 12845056,
27
+ "shortest_edge": 3136
28
+ },
29
+ "temporal_patch_size": 2
30
+ }
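
A quick sanity check of the numeric choices above (illustrative, not part of the config): the pixel bounds are multiples of the 28x28 merged-patch cell, `rescale_factor` is simply 1/255, and `resample` 3 is PIL's bicubic filter.

from PIL import Image

assert 3136 == 56 * 56                              # min_pixels: a 2x2 grid of 28x28 cells
assert 12845056 == 28 * 28 * 16384                  # max_pixels: up to 16384 cells of 28x28 pixels
assert abs(0.00392156862745098 - 1 / 255) < 1e-12   # rescale_factor is 1/255
assert int(Image.Resampling.BICUBIC) == 3           # resample=3 selects bicubic interpolation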