TYTTYTTYT committed on
Commit
9118e49
·
verified ·
1 Parent(s): 8e9cbfb

Fixed bug in resize logic

Browse files
image_processing_qwen2_vl.py CHANGED
@@ -30,7 +30,7 @@ from transformers.video_utils import VideoInput
30
  logger = logging.get_logger(__name__)
31
 
32
 
33
- class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
34
  r"""
35
  min_pixels (`int`, *optional*, defaults to `56 * 56`):
36
  The min pixels of the image to resize the image.
@@ -42,6 +42,8 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
42
  The temporal patch size of the vision encoder.
43
  merge_size (`int`, *optional*, defaults to 2):
44
  The merge size of the vision encoder to llm encoder.
 
 
45
  """
46
 
47
  min_pixels: int
@@ -49,6 +51,7 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
49
  patch_size: int
50
  temporal_patch_size: int
51
  merge_size: int
 
52
 
53
 
54
  def smart_resize(
@@ -116,7 +119,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
116
  """
117
 
118
  model_input_names = ["pixel_values", "image_grid_thw"]
119
- valid_kwargs = Qwen2VLImageProcessorKwargs
120
 
121
  def __init__(
122
  self,
@@ -471,4 +474,4 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
471
  return grid_h * grid_w
472
 
473
 
474
- __all__ = ["ZFQwen2VLImageProcessor"]
 
30
  logger = logging.get_logger(__name__)
31
 
32
 
33
+ class ZFQwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
34
  r"""
35
  min_pixels (`int`, *optional*, defaults to `56 * 56`):
36
  The min pixels of the image to resize the image.
 
42
  The temporal patch size of the vision encoder.
43
  merge_size (`int`, *optional*, defaults to 2):
44
  The merge size of the vision encoder to llm encoder.
45
+ focus_size (`int`, *optional*, defaults to 2):
46
+ The focus size of the VLLM model.
47
  """
48
 
49
  min_pixels: int
 
51
  patch_size: int
52
  temporal_patch_size: int
53
  merge_size: int
54
+ focus_size: int
55
 
56
 
57
  def smart_resize(
 
119
  """
120
 
121
  model_input_names = ["pixel_values", "image_grid_thw"]
122
+ valid_kwargs = ZFQwen2VLImageProcessorKwargs
123
 
124
  def __init__(
125
  self,
 
474
  return grid_h * grid_w
475
 
476
 
477
+ __all__ = ["ZFQwen2VLImageProcessor", "ZFQwen2VLImageProcessorKwargs"]
image_processing_qwen2_vl_fast.py CHANGED
@@ -3,28 +3,21 @@ from typing import Optional, Union
3
  import torch
4
  import torchvision.transforms.v2.functional as tvF
5
 
6
- from transformers.image_processing_utils import BatchFeature
7
- from transformers.image_processing_utils_fast import (
8
- BaseImageProcessorFast,
9
- group_images_by_shape,
10
- reorder_images,
11
- )
12
  from transformers.image_utils import (
13
- OPENAI_CLIP_MEAN,
14
- OPENAI_CLIP_STD,
15
  ChannelDimension,
16
  ImageInput,
17
  PILImageResampling,
18
  SizeDict,
19
  )
20
  from transformers.processing_utils import Unpack
21
- from transformers.utils import (
22
- TensorType,
23
- auto_docstring,
24
- logging,
25
- )
26
- from .image_processing_qwen2_vl import Qwen2VLImageProcessorKwargs, smart_resize
27
-
28
 
29
  logger = logging.get_logger(__name__)
30
 
@@ -42,27 +35,28 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
42
  patch_size = 14
43
  temporal_patch_size = 2
44
  merge_size = 2
45
- valid_kwargs = Qwen2VLImageProcessorKwargs
 
46
  model_input_names = ["pixel_values", "image_grid_thw"]
47
 
48
- def __init__(self, **kwargs: Unpack[Qwen2VLImageProcessorKwargs]):
49
  size = kwargs.pop("size", None)
50
  min_pixels = kwargs.pop("min_pixels", None)
51
  max_pixels = kwargs.pop("max_pixels", None)
52
  # backward compatibility: override size with min_pixels and max_pixels if they are provided
53
  size = self.size if size is None else size
54
  if min_pixels is not None:
55
- size["shortest_edge"] = min_pixels
56
- size.pop("min_pixels", None)
57
  if max_pixels is not None:
58
- size["longest_edge"] = max_pixels
59
- size.pop("max_pixels", None)
60
- if "shortest_edge" not in size or "longest_edge" not in size:
61
  raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
62
 
63
- super().__init__(size=size, **kwargs)
64
 
65
- def _further_process_kwargs(
66
  self,
67
  size: SizeDict | None = None,
68
  min_pixels: int | None = None,
@@ -74,32 +68,32 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
74
  Can be overridden by subclasses to customize the processing of kwargs.
75
  """
76
  if min_pixels is not None and max_pixels is not None:
77
- size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
78
  elif size is not None:
79
  if "shortest_edge" not in size or "longest_edge" not in size:
80
  raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
81
  min_pixels = size["shortest_edge"]
82
  max_pixels = size["longest_edge"]
83
  else:
84
- size = {**self.size}
85
 
86
  return super()._further_process_kwargs(size=size, **kwargs)
87
 
88
  @auto_docstring
89
- def preprocess(
90
  self,
91
  images: ImageInput,
92
- **kwargs: Unpack[Qwen2VLImageProcessorKwargs],
93
  ) -> BatchFeature:
94
  return super().preprocess(images, **kwargs)
95
 
96
- def _preprocess_image_like_inputs(
97
  self,
98
  images: ImageInput,
99
  do_convert_rgb: bool,
100
  input_data_format: ChannelDimension,
101
  device: Union[str, "torch.device"] | None = None,
102
- **kwargs: Unpack[Qwen2VLImageProcessorKwargs],
103
  ) -> BatchFeature:
104
  """
105
  Preprocess image-like inputs.
@@ -109,12 +103,12 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
109
  # Prepare input images
110
  batch_feature = BatchFeature()
111
  images = self._prepare_image_like_inputs(
112
- images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
113
  )
114
- batch_feature = self._preprocess(images, **kwargs)
115
  return batch_feature
116
 
117
- def _preprocess(
118
  self,
119
  images: list["torch.Tensor"],
120
  do_resize: bool,
@@ -128,6 +122,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
128
  patch_size: int,
129
  temporal_patch_size: int,
130
  merge_size: int,
 
131
  disable_grouping: bool | None,
132
  return_tensors: str | TensorType | None,
133
  **kwargs,
@@ -141,7 +136,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
141
  resized_height, resized_width = smart_resize(
142
  height,
143
  width,
144
- factor=patch_size * merge_size,
145
  min_pixels=size["shortest_edge"],
146
  max_pixels=size["longest_edge"],
147
  )
@@ -162,7 +157,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
162
  resized_height, resized_width = stacked_images.shape[-2:]
163
  # Fused rescale and normalize
164
  patches = self.rescale_and_normalize(
165
- stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
166
  )
167
  if patches.ndim == 4:
168
  # add a temporal dimension if we have images
@@ -200,7 +195,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
200
 
201
  processed_images = reorder_images(processed_images_grouped, grouped_images_index)
202
  processed_grids = reorder_images(processed_grids, grouped_images_index)
203
- pixel_values = torch.cat(processed_images, dim=0)
204
  image_grid_thw = torch.tensor(processed_grids)
205
 
206
  return BatchFeature(
@@ -224,12 +219,13 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
224
  Returns:
225
  `int`: Number of image patches per image.
226
  """
227
- min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
228
- max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
229
- patch_size = images_kwargs.get("patch_size", self.patch_size)
230
- merge_size = images_kwargs.get("merge_size", self.merge_size)
 
231
 
232
- factor = patch_size * merge_size
233
  resized_height, resized_width = smart_resize(
234
  height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
235
  )
 
3
  import torch
4
  import torchvision.transforms.v2.functional as tvF
5
 
6
+ from transformers.image_processing_base import BatchFeature
7
+ from transformers.image_processing_utils_fast import BaseImageProcessorFast
8
+ from transformers.image_transforms import group_images_by_shape, reorder_images
9
+ from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 
 
10
  from transformers.image_utils import (
 
 
11
  ChannelDimension,
12
  ImageInput,
13
  PILImageResampling,
14
  SizeDict,
15
  )
16
  from transformers.processing_utils import Unpack
17
+ from transformers.utils.generic import TensorType
18
+ from transformers.utils.auto_docstring import auto_docstring
19
+ from transformers.utils import logging
20
+ from .image_processing_qwen2_vl import ZFQwen2VLImageProcessorKwargs, smart_resize
 
 
 
21
 
22
  logger = logging.get_logger(__name__)
23
 
 
35
  patch_size = 14
36
  temporal_patch_size = 2
37
  merge_size = 2
38
+ focus_size = 2
39
+ valid_kwargs = ZFQwen2VLImageProcessorKwargs
40
  model_input_names = ["pixel_values", "image_grid_thw"]
41
 
42
+ def __init__(self, **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs]):
43
  size = kwargs.pop("size", None)
44
  min_pixels = kwargs.pop("min_pixels", None)
45
  max_pixels = kwargs.pop("max_pixels", None)
46
  # backward compatibility: override size with min_pixels and max_pixels if they are provided
47
  size = self.size if size is None else size
48
  if min_pixels is not None:
49
+ size["shortest_edge"] = min_pixels # type: ignore
50
+ size.pop("min_pixels", None) # type: ignore
51
  if max_pixels is not None:
52
+ size["longest_edge"] = max_pixels # type: ignore
53
+ size.pop("max_pixels", None) # type: ignore
54
+ if "shortest_edge" not in size or "longest_edge" not in size: # type: ignore
55
  raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
56
 
57
+ super().__init__(size=size, **kwargs) # type: ignore
58
 
59
+ def _further_process_kwargs( # type: ignore
60
  self,
61
  size: SizeDict | None = None,
62
  min_pixels: int | None = None,
 
68
  Can be overridden by subclasses to customize the processing of kwargs.
69
  """
70
  if min_pixels is not None and max_pixels is not None:
71
+ size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} # type: ignore
72
  elif size is not None:
73
  if "shortest_edge" not in size or "longest_edge" not in size:
74
  raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
75
  min_pixels = size["shortest_edge"]
76
  max_pixels = size["longest_edge"]
77
  else:
78
+ size = {**self.size} # type: ignore
79
 
80
  return super()._further_process_kwargs(size=size, **kwargs)
81
 
82
  @auto_docstring
83
+ def preprocess( # type: ignore
84
  self,
85
  images: ImageInput,
86
+ **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs],
87
  ) -> BatchFeature:
88
  return super().preprocess(images, **kwargs)
89
 
90
+ def _preprocess_image_like_inputs( # type: ignore
91
  self,
92
  images: ImageInput,
93
  do_convert_rgb: bool,
94
  input_data_format: ChannelDimension,
95
  device: Union[str, "torch.device"] | None = None,
96
+ **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs], # type: ignore
97
  ) -> BatchFeature:
98
  """
99
  Preprocess image-like inputs.
 
103
  # Prepare input images
104
  batch_feature = BatchFeature()
105
  images = self._prepare_image_like_inputs(
106
+ images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device # type: ignore
107
  )
108
+ batch_feature = self._preprocess(images, **kwargs) # type: ignore
109
  return batch_feature
110
 
111
+ def _preprocess( # type: ignore
112
  self,
113
  images: list["torch.Tensor"],
114
  do_resize: bool,
 
122
  patch_size: int,
123
  temporal_patch_size: int,
124
  merge_size: int,
125
+ focus_size: int,
126
  disable_grouping: bool | None,
127
  return_tensors: str | TensorType | None,
128
  **kwargs,
 
136
  resized_height, resized_width = smart_resize(
137
  height,
138
  width,
139
+ factor=patch_size * merge_size * focus_size,
140
  min_pixels=size["shortest_edge"],
141
  max_pixels=size["longest_edge"],
142
  )
 
157
  resized_height, resized_width = stacked_images.shape[-2:]
158
  # Fused rescale and normalize
159
  patches = self.rescale_and_normalize(
160
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std # type: ignore
161
  )
162
  if patches.ndim == 4:
163
  # add a temporal dimension if we have images
 
195
 
196
  processed_images = reorder_images(processed_images_grouped, grouped_images_index)
197
  processed_grids = reorder_images(processed_grids, grouped_images_index)
198
+ pixel_values = torch.cat(processed_images, dim=0) # type: ignore
199
  image_grid_thw = torch.tensor(processed_grids)
200
 
201
  return BatchFeature(
 
219
  Returns:
220
  `int`: Number of image patches per image.
221
  """
222
+ min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] # type: ignore
223
+ max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] # type: ignore
224
+ patch_size = images_kwargs.get("patch_size", self.patch_size) # type: ignore
225
+ merge_size = images_kwargs.get("merge_size", self.merge_size) # type: ignore
226
+ focus_size = images_kwargs.get("focus_size", self.focus_size) # type: ignore
227
 
228
+ factor = patch_size * merge_size * focus_size
229
  resized_height, resized_width = smart_resize(
230
  height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
231
  )
processor_config.json CHANGED
@@ -12,6 +12,7 @@
12
  "do_normalize": true,
13
  "do_rescale": true,
14
  "do_resize": true,
 
15
  "image_mean": [
16
  0.5,
17
  0.5,
@@ -24,7 +25,7 @@
24
  0.5
25
  ],
26
  "merge_size": 2,
27
- "patch_size": 16,
28
  "resample": 3,
29
  "rescale_factor": 0.00392156862745098,
30
  "size": {
@@ -46,6 +47,7 @@
46
  "do_rescale": true,
47
  "do_resize": true,
48
  "do_sample_frames": true,
 
49
  "fps": 2,
50
  "image_mean": [
51
  0.5,
 
12
  "do_normalize": true,
13
  "do_rescale": true,
14
  "do_resize": true,
15
+ "focus_size": 2,
16
  "image_mean": [
17
  0.5,
18
  0.5,
 
25
  0.5
26
  ],
27
  "merge_size": 2,
28
+ "patch_size": 14,
29
  "resample": 3,
30
  "rescale_factor": 0.00392156862745098,
31
  "size": {
 
47
  "do_rescale": true,
48
  "do_resize": true,
49
  "do_sample_frames": true,
50
+ "focus_size": 2,
51
  "fps": 2,
52
  "image_mean": [
53
  0.5,