Fixed bug in resize logic
- image_processing_qwen2_vl.py  +6 -3
- image_processing_qwen2_vl_fast.py  +37 -41
- processor_config.json  +3 -1
image_processing_qwen2_vl.py  CHANGED

@@ -30,7 +30,7 @@ from transformers.video_utils import VideoInput
 logger = logging.get_logger(__name__)
 
 
-class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
+class ZFQwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     min_pixels (`int`, *optional*, defaults to `56 * 56`):
         The min pixels of the image to resize the image.
@@ -42,6 +42,8 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
         The temporal patch size of the vision encoder.
     merge_size (`int`, *optional*, defaults to 2):
         The merge size of the vision encoder to llm encoder.
+    focus_size (`int`, *optional*, defaults to 1280):
+        The focus size of the VLLM model.
     """
 
     min_pixels: int
@@ -49,6 +51,7 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
     patch_size: int
     temporal_patch_size: int
     merge_size: int
+    focus_size: int
 
 
 def smart_resize(
@@ -116,7 +119,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
     """
 
     model_input_names = ["pixel_values", "image_grid_thw"]
-    valid_kwargs = Qwen2VLImageProcessorKwargs
+    valid_kwargs = ZFQwen2VLImageProcessorKwargs
 
     def __init__(
         self,
@@ -471,4 +474,4 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         return grid_h * grid_w
 
 
-__all__ = ["ZFQwen2VLImageProcessor"]
+__all__ = ["ZFQwen2VLImageProcessor", "ZFQwen2VLImageProcessorKwargs"]
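Put together, the renamed kwargs class now looks roughly like the sketch below. This is assembled only from the hunks above: the `ImagesKwargs` import path and the `max_pixels` field are assumptions (they sit in unchanged lines not shown in the diff), and fields are elided where the diff elides them.

from transformers.processing_utils import ImagesKwargs


class ZFQwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_pixels (`int`, *optional*, defaults to `56 * 56`):
        The min pixels of the image to resize the image.
    ...
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    focus_size (`int`, *optional*, defaults to 1280):
        The focus size of the VLLM model.
    """

    min_pixels: int
    max_pixels: int  # assumed: present in the unchanged part of the original file
    patch_size: int
    temporal_patch_size: int
    merge_size: int
    focus_size: int  # new in this commit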
image_processing_qwen2_vl_fast.py  CHANGED

@@ -3,28 +3,21 @@ from typing import Optional, Union
 import torch
 import torchvision.transforms.v2.functional as tvF
 
-from transformers.
-from transformers.image_processing_utils_fast import
-
-
-    reorder_images,
-)
+from transformers.image_processing_base import BatchFeature
+from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.image_transforms import group_images_by_shape, reorder_images
+from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.image_utils import (
-    OPENAI_CLIP_MEAN,
-    OPENAI_CLIP_STD,
     ChannelDimension,
     ImageInput,
     PILImageResampling,
     SizeDict,
 )
 from transformers.processing_utils import Unpack
-from transformers.utils import
-
-
-
-)
-from .image_processing_qwen2_vl import Qwen2VLImageProcessorKwargs, smart_resize
-
+from transformers.utils.generic import TensorType
+from transformers.utils.auto_docstring import auto_docstring
+from transformers.utils import logging
+from .image_processing_qwen2_vl import ZFQwen2VLImageProcessorKwargs, smart_resize
 
 logger = logging.get_logger(__name__)
 
@@ -42,27 +35,28 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
     patch_size = 14
     temporal_patch_size = 2
     merge_size = 2
-
+    focus_size = 2
+    valid_kwargs = ZFQwen2VLImageProcessorKwargs
     model_input_names = ["pixel_values", "image_grid_thw"]
 
-    def __init__(self, **kwargs: Unpack[Qwen2VLImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs]):
         size = kwargs.pop("size", None)
         min_pixels = kwargs.pop("min_pixels", None)
         max_pixels = kwargs.pop("max_pixels", None)
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
         size = self.size if size is None else size
         if min_pixels is not None:
-            size["shortest_edge"] = min_pixels
-            size.pop("min_pixels", None)
+            size["shortest_edge"] = min_pixels  # type: ignore
+            size.pop("min_pixels", None)  # type: ignore
         if max_pixels is not None:
-            size["longest_edge"] = max_pixels
-            size.pop("max_pixels", None)
-        if "shortest_edge" not in size or "longest_edge" not in size:
+            size["longest_edge"] = max_pixels  # type: ignore
+            size.pop("max_pixels", None)  # type: ignore
+        if "shortest_edge" not in size or "longest_edge" not in size:  # type: ignore
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
 
-        super().__init__(size=size, **kwargs)
+        super().__init__(size=size, **kwargs)  # type: ignore
 
-    def _further_process_kwargs(
+    def _further_process_kwargs(  # type: ignore
         self,
         size: SizeDict | None = None,
         min_pixels: int | None = None,
@@ -74,32 +68,32 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         Can be overridden by subclasses to customize the processing of kwargs.
         """
         if min_pixels is not None and max_pixels is not None:
-            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
+            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}  # type: ignore
         elif size is not None:
             if "shortest_edge" not in size or "longest_edge" not in size:
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
             max_pixels = size["longest_edge"]
         else:
-            size = {**self.size}
+            size = {**self.size}  # type: ignore
 
         return super()._further_process_kwargs(size=size, **kwargs)
 
     @auto_docstring
-    def preprocess(
+    def preprocess(  # type: ignore
         self,
         images: ImageInput,
-        **kwargs: Unpack[Qwen2VLImageProcessorKwargs],
+        **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs],
     ) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
-    def _preprocess_image_like_inputs(
+    def _preprocess_image_like_inputs(  # type: ignore
         self,
         images: ImageInput,
         do_convert_rgb: bool,
         input_data_format: ChannelDimension,
         device: Union[str, "torch.device"] | None = None,
-        **kwargs: Unpack[Qwen2VLImageProcessorKwargs],
+        **kwargs: Unpack[ZFQwen2VLImageProcessorKwargs],  # type: ignore
     ) -> BatchFeature:
         """
         Preprocess image-like inputs.
@@ -109,12 +103,12 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         # Prepare input images
         batch_feature = BatchFeature()
         images = self._prepare_image_like_inputs(
-            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device  # type: ignore
         )
-        batch_feature = self._preprocess(images, **kwargs)
+        batch_feature = self._preprocess(images, **kwargs)  # type: ignore
         return batch_feature
 
-    def _preprocess(
+    def _preprocess(  # type: ignore
         self,
         images: list["torch.Tensor"],
         do_resize: bool,
@@ -128,6 +122,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         patch_size: int,
         temporal_patch_size: int,
         merge_size: int,
+        focus_size: int,
         disable_grouping: bool | None,
         return_tensors: str | TensorType | None,
         **kwargs,
@@ -141,7 +136,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
             resized_height, resized_width = smart_resize(
                 height,
                 width,
-                factor=patch_size * merge_size,
+                factor=patch_size * merge_size * focus_size,
                 min_pixels=size["shortest_edge"],
                 max_pixels=size["longest_edge"],
             )
@@ -162,7 +157,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
             resized_height, resized_width = stacked_images.shape[-2:]
             # Fused rescale and normalize
             patches = self.rescale_and_normalize(
-                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std  # type: ignore
             )
             if patches.ndim == 4:
                 # add a temporal dimension if we have images
@@ -200,7 +195,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_grids = reorder_images(processed_grids, grouped_images_index)
-        pixel_values = torch.cat(processed_images, dim=0)
+        pixel_values = torch.cat(processed_images, dim=0)  # type: ignore
         image_grid_thw = torch.tensor(processed_grids)
 
         return BatchFeature(
@@ -224,12 +219,13 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
-        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", self.patch_size)
-        merge_size = images_kwargs.get("merge_size", self.merge_size)
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]  # type: ignore
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]  # type: ignore
+        patch_size = images_kwargs.get("patch_size", self.patch_size)  # type: ignore
+        merge_size = images_kwargs.get("merge_size", self.merge_size)  # type: ignore
+        focus_size = images_kwargs.get("focus_size", self.focus_size)  # type: ignore
 
-        factor = patch_size * merge_size
+        factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(
             height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
         )
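The resize-logic fix itself is the `factor` change in `_preprocess` and in the patch-count helper: resized dimensions are now aligned to `patch_size * merge_size * focus_size` instead of `patch_size * merge_size`. A rough illustration of the difference, assuming `focus_size = 2` as in processor_config.json; this sketch only shows the rounding step, while the real `smart_resize` additionally clamps the image area to the [min_pixels, max_pixels] budget.

def round_to_factor(height: int, width: int, factor: int) -> tuple[int, int]:
    # Round each side to the nearest multiple of `factor` (the alignment step
    # that this commit changes; not the full smart_resize logic).
    return round(height / factor) * factor, round(width / factor) * factor


patch_size, merge_size, focus_size = 14, 2, 2

# Before the fix, sides were aligned to patch_size * merge_size = 28 ...
print(round_to_factor(1080, 1920, patch_size * merge_size))               # (1092, 1932)
# ... after the fix they are aligned to patch_size * merge_size * focus_size = 56,
# so the resized dims are also divisible by the larger focus block.
print(round_to_factor(1080, 1920, patch_size * merge_size * focus_size))  # (1064, 1904)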
processor_config.json  CHANGED

@@ -12,6 +12,7 @@
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
+  "focus_size": 2,
   "image_mean": [
     0.5,
     0.5,
@@ -24,7 +25,7 @@
     0.5
   ],
   "merge_size": 2,
-  "patch_size":
+  "patch_size": 14,
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
@@ -46,6 +47,7 @@
   "do_rescale": true,
   "do_resize": true,
   "do_sample_frames": true,
+  "focus_size": 2,
   "fps": 2,
   "image_mean": [
     0.5,
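With the config updated, `focus_size` should be picked up when the processor is loaded. A usage sketch, assuming the ZFQwen2VL processors ship as custom code in a Hub repo; the repo id and image path below are placeholders, not part of this commit.

from PIL import Image
from transformers import AutoImageProcessor

# Placeholder repo id; trust_remote_code is required because the ZFQwen2VL*
# processors are custom classes shipped alongside processor_config.json.
processor = AutoImageProcessor.from_pretrained("your-org/zf-qwen2-vl", trust_remote_code=True)

image = Image.open("example.jpg")
# focus_size is now a recognized per-call kwarg (see ZFQwen2VLImageProcessorKwargs above).
inputs = processor(images=image, focus_size=2, return_tensors="pt")
print(inputs["pixel_values"].shape, inputs["image_grid_thw"])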