Ignore type errors in the processing code

Browse files

Files changed (4) hide show

image_processing_qwen2_vl.py +38 -37
image_processing_qwen2_vl_fast.py +33 -37
processing_qwen3_vl.py +42 -42
video_processing_qwen3_vl.py +25 -23

image_processing_qwen2_vl.py CHANGED Viewed

@@ -3,15 +3,15 @@ from typing import Optional, Union
 import numpy as np
-from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
 from transformers.image_transforms import (
     convert_to_rgb,
     resize,
     to_channel_dimension_format,
 )
 from transformers.image_utils import (
-    OPENAI_CLIP_MEAN,
-    OPENAI_CLIP_STD,
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -23,7 +23,8 @@ from transformers.image_utils import (
     valid_images,
     validate_preprocess_arguments,
 )
-from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput, make_batched_videos
@@ -205,16 +206,16 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
-        images = make_flat_list_of_images(images)
         if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
         # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
         if do_rescale and is_scaled_image(images[0]):
-            logger.warning_once(
                 "It looks like you are trying to rescale already rescaled images. If the input"
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
@@ -222,7 +223,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images[0])
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
         resized_height, resized_width = height, width
         processed_images = []
         for image in images:
@@ -230,55 +231,55 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
                 resized_height, resized_width = smart_resize(
                     height,
                     width,
-                    factor=patch_size * merge_size * focus_size,
-                    min_pixels=size["shortest_edge"],
-                    max_pixels=size["longest_edge"],
                 )
                 image = resize(
                     image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                 )
             if do_rescale:
-                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
             if do_normalize:
                 image = self.normalize(
-                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                 )
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
             processed_images.append(image)
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] % temporal_patch_size != 0:
             repeats = np.repeat(
-                patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
             )
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
-        grid_t = patches.shape[0] // temporal_patch_size
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
         patches = patches.reshape(
             grid_t,
-            temporal_patch_size,
             channel,
-            grid_h // merge_size,
-            merge_size,
-            patch_size,
-            grid_w // merge_size,
-            merge_size,
-            patch_size,
         )
         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
         flatten_patches = patches.reshape(
-            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
         )
         return flatten_patches, (grid_t, grid_h, grid_w)
-    def preprocess(
         self,
         images: ImageInput,
         videos: Optional[VideoInput] = None,
@@ -386,7 +387,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         if images is not None:
-            images = self.fetch_images(images)
             images = make_flat_list_of_images(images)
         if images is not None and not valid_images(images):
@@ -408,7 +409,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         data = {}
         if images is not None:
             pixel_values, vision_grid_thws = [], []
-            for image in images:
                 patches, image_grid_thw = self._preprocess(
                     image,
                     do_resize=do_resize,
@@ -439,9 +440,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
                 "This is a deprecated behavior and will be removed in v5.0. "
                 "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
             )
-            videos = make_batched_videos(videos)
             pixel_values_videos, vision_grid_thws_videos = [], []
-            for images in videos:
                 patches, video_grid_thw = self._preprocess(
                     images,
                     do_resize=do_resize,
@@ -484,11 +485,11 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
-        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", self.patch_size)
-        merge_size = images_kwargs.get("merge_size", self.merge_size)
-        focus_size = images_kwargs.get("focus_size", self.focus_size)
         factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(

 import numpy as np
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_processing_base import BatchFeature
 from transformers.image_transforms import (
     convert_to_rgb,
     resize,
     to_channel_dimension_format,
 )
+from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.image_utils import (
     ChannelDimension,
     ImageInput,
     PILImageResampling,
     valid_images,
     validate_preprocess_arguments,
 )
+from transformers.utils.generic import TensorType
+from transformers.utils import logging
 from transformers.video_utils import VideoInput, make_batched_videos
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
+        images = make_flat_list_of_images(images) # type: ignore
         if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images] # type: ignore
         # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images] # type: ignore
         if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once( # type: ignore
                 "It looks like you are trying to rescale already rescaled images. If the input"
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images[0])
+        height, width = get_image_size(images[0], channel_dim=input_data_format) # type: ignore
         resized_height, resized_width = height, width
         processed_images = []
         for image in images:
                 resized_height, resized_width = smart_resize(
                     height,
                     width,
+                    factor=patch_size * merge_size * focus_size, # type: ignore
+                    min_pixels=size["shortest_edge"], # type: ignore
+                    max_pixels=size["longest_edge"], # type: ignore
                 )
                 image = resize(
                     image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                 )
             if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) # type: ignore
             if do_normalize:
                 image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format # type: ignore
                 )
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # type: ignore
             processed_images.append(image)
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
+        if patches.shape[0] % temporal_patch_size != 0: # type: ignore
             repeats = np.repeat(
+                patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0 # type: ignore
             )
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
+        grid_t = patches.shape[0] // temporal_patch_size # type: ignore
+        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size # type: ignore
         patches = patches.reshape(
             grid_t,
+            temporal_patch_size, # type: ignore
             channel,
+            grid_h // merge_size, # type: ignore
+            merge_size, # type: ignore
+            patch_size, # type: ignore
+            grid_w // merge_size, # type: ignore
+            merge_size, # type: ignore
+            patch_size, # type: ignore
         )
         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
         flatten_patches = patches.reshape(
+            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size # type: ignore
         )
         return flatten_patches, (grid_t, grid_h, grid_w)
+    def preprocess( # type: ignore
         self,
         images: ImageInput,
         videos: Optional[VideoInput] = None,
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         if images is not None:
+            images = self.fetch_images(images) # type: ignore
             images = make_flat_list_of_images(images)
         if images is not None and not valid_images(images):
         data = {}
         if images is not None:
             pixel_values, vision_grid_thws = [], []
+            for image in images: # type: ignore
                 patches, image_grid_thw = self._preprocess(
                     image,
                     do_resize=do_resize,
                 "This is a deprecated behavior and will be removed in v5.0. "
                 "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
             )
+            videos = make_batched_videos(videos) # type: ignore
             pixel_values_videos, vision_grid_thws_videos = [], []
+            for images in videos: # type: ignore
                 patches, video_grid_thw = self._preprocess(
                     images,
                     do_resize=do_resize,
         Returns:
             `int`: Number of image patches per image.
         """
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] # type: ignore
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] # type: ignore
+        patch_size = images_kwargs.get("patch_size", self.patch_size) # type: ignore
+        merge_size = images_kwargs.get("merge_size", self.merge_size) # type: ignore
+        focus_size = images_kwargs.get("focus_size", self.focus_size) # type: ignore
         factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(

image_processing_qwen2_vl_fast.py CHANGED Viewed

@@ -3,27 +3,23 @@ from typing import Optional, Union
 import torch
 from torchvision.transforms.v2 import functional as F
-from transformers.image_processing_utils import BatchFeature
 from transformers.image_processing_utils_fast import (
     BaseImageProcessorFast,
     DefaultFastImageProcessorKwargs,
-    group_images_by_shape,
-    reorder_images,
 )
 from transformers.image_utils import (
-    OPENAI_CLIP_MEAN,
-    OPENAI_CLIP_STD,
     ChannelDimension,
     ImageInput,
     PILImageResampling,
     SizeDict,
 )
 from transformers.processing_utils import Unpack
-from transformers.utils import (
-    TensorType,
-    auto_docstring,
-    logging,
-)
 from transformers.video_utils import VideoInput, make_batched_videos
 from .image_processing_qwen2_vl import smart_resize
@@ -81,17 +77,17 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
         size = self.size if size is None else size
         if min_pixels is not None:
-            size["shortest_edge"] = min_pixels
             size.pop("min_pixels", None)
         if max_pixels is not None:
-            size["longest_edge"] = max_pixels
             size.pop("max_pixels", None)
         if "shortest_edge" not in size or "longest_edge" not in size:
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
-    def _further_process_kwargs(
         self,
         size: Optional[SizeDict] = None,
         min_pixels: Optional[int] = None,
@@ -103,19 +99,19 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         Can be overridden by subclasses to customize the processing of kwargs.
         """
         if min_pixels is not None and max_pixels is not None:
-            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
         elif size is not None:
             if "shortest_edge" not in size or "longest_edge" not in size:
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
             max_pixels = size["longest_edge"]
         else:
-            size = {**self.size}
         return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
     @auto_docstring
-    def preprocess(
         self,
         images: ImageInput,
         videos: Optional[VideoInput] = None,
@@ -123,14 +119,14 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
     ) -> BatchFeature:
         return super().preprocess(images, videos, **kwargs)
-    def _preprocess_image_like_inputs(
         self,
         images: ImageInput,
         videos: VideoInput,
         do_convert_rgb: bool,
         input_data_format: ChannelDimension,
         device: Optional[Union[str, "torch.device"]] = None,
-        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> BatchFeature:
         """
         Preprocess image-like inputs.
@@ -141,9 +137,9 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         batch_feature = BatchFeature()
         if images is not None:
             images = self._prepare_image_like_inputs(
-                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
             )
-            batch_feature = self._preprocess(images, **kwargs)
         if videos is not None:
             logger.warning(
                 "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
@@ -151,18 +147,18 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
                 "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
             )
             # Can't change _prepare_images_structure to work with videos because it also needs to work with images.
-            videos = make_batched_videos(videos)
             videos = [
-                torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device))
                 for video in videos
             ]
-            video_outputs = self._preprocess(videos, **kwargs)
             batch_feature.update(
                 {"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw}
             )
         return batch_feature
-    def _preprocess(
         self,
         images: list["torch.Tensor"],
         do_resize: bool,
@@ -182,10 +178,10 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         **kwargs,
     ):
         # Group images by size for batched resizing
-        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
         resized_images_grouped = {}
         for shape, stacked_images in grouped_images.items():
-            height, width = stacked_images.shape[-2:]
             if do_resize:
                 resized_height, resized_width = smart_resize(
                     height,
@@ -195,7 +191,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
                     max_pixels=size["longest_edge"],
                 )
                 stacked_images = self.resize(
-                    image=stacked_images,
                     size=SizeDict(height=resized_height, width=resized_width),
                     interpolation=interpolation,
                 )
@@ -204,14 +200,14 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         # Group images by size for further processing
         # Needed in case do_resize is False, or resize returns images with different sizes
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
         processed_images_grouped = {}
         processed_grids = {}
         for shape, stacked_images in grouped_images.items():
-            resized_height, resized_width = stacked_images.shape[-2:]
             # Fused rescale and normalize
             patches = self.rescale_and_normalize(
-                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
             if patches.ndim == 4:
                 # add a temporal dimension if we have images
@@ -249,7 +245,7 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_grids = reorder_images(processed_grids, grouped_images_index)
-        pixel_values = torch.cat(processed_images, dim=0)
         image_grid_thw = torch.tensor(processed_grids)
         return BatchFeature(
@@ -273,11 +269,11 @@ class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
-        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", self.patch_size)
-        merge_size = images_kwargs.get("merge_size", self.merge_size)
-        focus_size = images_kwargs.get("focus_size", self.focus_size)
         factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(

 import torch
 from torchvision.transforms.v2 import functional as F
+from transformers.image_processing_base import BatchFeature
 from transformers.image_processing_utils_fast import (
     BaseImageProcessorFast,
     DefaultFastImageProcessorKwargs,
 )
+from transformers.image_transforms import group_images_by_shape, reorder_images
+from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.image_utils import (
     ChannelDimension,
     ImageInput,
     PILImageResampling,
     SizeDict,
 )
 from transformers.processing_utils import Unpack
+from transformers.utils.generic import TensorType
+from transformers.utils.auto_docstring import auto_docstring
+from transformers.utils import logging
 from transformers.video_utils import VideoInput, make_batched_videos
 from .image_processing_qwen2_vl import smart_resize
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
         size = self.size if size is None else size
         if min_pixels is not None:
+            size["shortest_edge"] = min_pixels # type: ignore
             size.pop("min_pixels", None)
         if max_pixels is not None:
+            size["longest_edge"] = max_pixels # type: ignore
             size.pop("max_pixels", None)
         if "shortest_edge" not in size or "longest_edge" not in size:
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs) # type: ignore
+    def _further_process_kwargs( # type: ignore
         self,
         size: Optional[SizeDict] = None,
         min_pixels: Optional[int] = None,
         Can be overridden by subclasses to customize the processing of kwargs.
         """
         if min_pixels is not None and max_pixels is not None:
+            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} # type: ignore
         elif size is not None:
             if "shortest_edge" not in size or "longest_edge" not in size:
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
             max_pixels = size["longest_edge"]
         else:
+            size = {**self.size} # type: ignore
         return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
     @auto_docstring
+    def preprocess( # type: ignore
         self,
         images: ImageInput,
         videos: Optional[VideoInput] = None,
     ) -> BatchFeature:
         return super().preprocess(images, videos, **kwargs)
+    def _preprocess_image_like_inputs( # type: ignore
         self,
         images: ImageInput,
         videos: VideoInput,
         do_convert_rgb: bool,
         input_data_format: ChannelDimension,
         device: Optional[Union[str, "torch.device"]] = None,
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs], # type: ignore
     ) -> BatchFeature:
         """
         Preprocess image-like inputs.
         batch_feature = BatchFeature()
         if images is not None:
             images = self._prepare_image_like_inputs(
+                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device # type: ignore
             )
+            batch_feature = self._preprocess(images, **kwargs) # type: ignore
         if videos is not None:
             logger.warning(
                 "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
                 "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
             )
             # Can't change _prepare_images_structure to work with videos because it also needs to work with images.
+            videos = make_batched_videos(videos) # type: ignore
             videos = [
+                torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device)) # type: ignore
                 for video in videos
             ]
+            video_outputs = self._preprocess(videos, **kwargs) # type: ignore
             batch_feature.update(
                 {"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw}
             )
         return batch_feature
+    def _preprocess( # type: ignore
         self,
         images: list["torch.Tensor"],
         do_resize: bool,
         **kwargs,
     ):
         # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) # type: ignore
         resized_images_grouped = {}
         for shape, stacked_images in grouped_images.items():
+            height, width = stacked_images.shape[-2:] # type: ignore
             if do_resize:
                 resized_height, resized_width = smart_resize(
                     height,
                     max_pixels=size["longest_edge"],
                 )
                 stacked_images = self.resize(
+                    image=stacked_images, # type: ignore
                     size=SizeDict(height=resized_height, width=resized_width),
                     interpolation=interpolation,
                 )
         # Group images by size for further processing
         # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) # type: ignore
         processed_images_grouped = {}
         processed_grids = {}
         for shape, stacked_images in grouped_images.items():
+            resized_height, resized_width = stacked_images.shape[-2:] # type: ignore
             # Fused rescale and normalize
             patches = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std # type: ignore
             )
             if patches.ndim == 4:
                 # add a temporal dimension if we have images
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_grids = reorder_images(processed_grids, grouped_images_index)
+        pixel_values = torch.cat(processed_images, dim=0) # type: ignore
         image_grid_thw = torch.tensor(processed_grids)
         return BatchFeature(
         Returns:
             `int`: Number of image patches per image.
         """
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] # type: ignore
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] # type: ignore
+        patch_size = images_kwargs.get("patch_size", self.patch_size) # type: ignore
+        merge_size = images_kwargs.get("merge_size", self.merge_size) # type: ignore
+        focus_size = images_kwargs.get("focus_size", self.focus_size) # type: ignore
         factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(

processing_qwen3_vl.py CHANGED Viewed

@@ -27,9 +27,9 @@ class Qwen3VLImagesKwargs(ImagesKwargs):
 class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Qwen3VLImagesKwargs
-    videos_kwargs: Qwen3VLVideosProcessorKwargs
-    _defaults = {
         "text_kwargs": {
             "padding": False,
             "return_token_type_ids": False,
@@ -62,40 +62,40 @@ class ZFQwen3VLProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
-        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
-        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
         self.image_token_id = (
-            tokenizer.image_token_id
             if getattr(tokenizer, "image_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.image_token)
         )
         self.video_token_id = (
-            tokenizer.video_token_id
             if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
         )
         self.vision_start_token = (
-            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
         )
         self.vision_end_token = (
-            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
         )
         self.vision_start_token_id = (
-            tokenizer.vision_start_token_id
             if getattr(tokenizer, "vision_start_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.vision_start_token)
         )
         self.vision_end_token_id = (
-            tokenizer.vision_end_token_id
             if getattr(tokenizer, "vision_end_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
         )
-    def __call__(
         self,
-        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
-        videos: VideoInput = None,
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -135,19 +135,19 @@ class ZFQwen3VLProcessor(ProcessorMixin):
             - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
-            Qwen3VLProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
         if images is not None:
-            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
             image_grid_thw = image_inputs["image_grid_thw"]
         else:
             image_inputs = {}
             image_grid_thw = None
         if videos is not None:
-            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
             if "return_metadata" not in kwargs:
@@ -164,23 +164,23 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         text = text.copy()  # below lines change text in-place
         if image_grid_thw is not None:
-            merge_length = self.image_processor.merge_size**2
             index = 0
             for i in range(len(text)):
                 while self.image_token in text[i]:
                     num_image_tokens = image_grid_thw[index].prod() // merge_length
-                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                     index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.image_token)
         if video_grid_thw is not None:
-            merge_length = self.video_processor.merge_size**2
             index = 0
             for i in range(len(text)):
                 while self.video_token in text[i]:
-                    metadata = video_metadata[index]
                     if metadata.fps is None:
-                        logger.warning_once(
                             "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                             "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                             "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
@@ -191,8 +191,8 @@ class ZFQwen3VLProcessor(ProcessorMixin):
                     curr_timestamp = self._calculate_timestamps(
                         metadata.frames_indices,
                         metadata.fps,
-                        self.video_processor.merge_size,
-                        self.video_processor.focus_size,
                     )
                     print(len(curr_timestamp), curr_timestamp)
@@ -206,20 +206,20 @@ class ZFQwen3VLProcessor(ProcessorMixin):
                             self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                         )
                     if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
-                        text[i] = text[i].replace(
                             f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
                         )
                     else:
                         # vllm may input video token directly
-                        text[i] = text[i].replace(self.video_token, video_placeholder, 1)
                     index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.video_token)
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
-        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
         if return_mm_token_type_ids:
             array_ids = np.array(text_inputs["input_ids"])
@@ -246,10 +246,10 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         if image_sizes is not None:
             images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
             images_kwargs.update(kwargs)
-            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
             num_image_patches = [
-                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                 for image_size in image_sizes
             ]
             num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
@@ -259,10 +259,10 @@ class ZFQwen3VLProcessor(ProcessorMixin):
             videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
             videos_kwargs.update(kwargs)
             num_video_patches = [
-                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                 for video_size in video_sizes
             ]
-            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
             vision_data["num_video_tokens"] = num_video_tokens
         return MultiModalData(**vision_data)
@@ -287,7 +287,7 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         Returns:
             `list[str]`: The decoded text.
         """
-        return self.tokenizer.batch_decode(
             generated_outputs,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
@@ -306,7 +306,7 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         print(len(indices), indices)
         b_size = merge_size * focus_size
         if len(indices) % b_size != 0:
-            indices.extend(indices[-1] for _ in range(b_size - len(indices) % b_size))
         print(len(indices), indices)
         timestamps = [idx / video_fps for idx in indices]
         # @JJJYmmm frames are merged by self.merge_size, \

 class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Qwen3VLImagesKwargs # type: ignore
+    videos_kwargs: Qwen3VLVideosProcessorKwargs # type: ignore
+    _defaults = { # type: ignore
         "text_kwargs": {
             "padding": False,
             "return_token_type_ids": False,
     def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token # type: ignore
+        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token # type: ignore
         self.image_token_id = (
+            tokenizer.image_token_id # type: ignore
             if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.image_token) # type: ignore
         )
         self.video_token_id = (
+            tokenizer.video_token_id # type: ignore
             if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.video_token) # type: ignore
         )
         self.vision_start_token = (
+            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token # type: ignore
         )
         self.vision_end_token = (
+            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token # type: ignore
         )
         self.vision_start_token_id = (
+            tokenizer.vision_start_token_id # type: ignore
             if getattr(tokenizer, "vision_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_start_token) # type: ignore
         )
         self.vision_end_token_id = (
+            tokenizer.vision_end_token_id # type: ignore
             if getattr(tokenizer, "vision_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_end_token) # type: ignore
         )
+    def __call__( # type: ignore
         self,
+        images: ImageInput = None, # type: ignore
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, # type: ignore
+        videos: VideoInput = None, # type: ignore
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
             - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
+            Qwen3VLProcessorKwargs, # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs, # type: ignore
             **kwargs,
         )
         if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) # type: ignore
             image_grid_thw = image_inputs["image_grid_thw"]
         else:
             image_inputs = {}
             image_grid_thw = None
         if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) # type: ignore
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
             if "return_metadata" not in kwargs:
         text = text.copy()  # below lines change text in-place
         if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2 # type: ignore
             index = 0
             for i in range(len(text)):
                 while self.image_token in text[i]:
                     num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) # type: ignore
                     index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token) # type: ignore
         if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2 # type: ignore
             index = 0
             for i in range(len(text)):
                 while self.video_token in text[i]:
+                    metadata = video_metadata[index] # type: ignore
                     if metadata.fps is None:
+                        logger.warning_once( # type: ignore
                             "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                             "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                             "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                     curr_timestamp = self._calculate_timestamps(
                         metadata.frames_indices,
                         metadata.fps,
+                        self.video_processor.merge_size, # type: ignore
+                        self.video_processor.focus_size, # type: ignore
                     )
                     print(len(curr_timestamp), curr_timestamp)
                             self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                         )
                     if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
+                        text[i] = text[i].replace( # type: ignore
                             f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
                         )
                     else:
                         # vllm may input video token directly
+                        text[i] = text[i].replace(self.video_token, video_placeholder, 1) # type: ignore
                     index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.video_token) # type: ignore
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
         if return_mm_token_type_ids:
             array_ids = np.array(text_inputs["input_ids"])
         if image_sizes is not None:
             images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
             images_kwargs.update(kwargs)
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size # type: ignore
             num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) # type: ignore
                 for image_size in image_sizes
             ]
             num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
             videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
             videos_kwargs.update(kwargs)
             num_video_patches = [
+                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs) # type: ignore
                 for video_size in video_sizes
             ]
+            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches] # type: ignore
             vision_data["num_video_tokens"] = num_video_tokens
         return MultiModalData(**vision_data)
         Returns:
             `list[str]`: The decoded text.
         """
+        return self.tokenizer.batch_decode( # type: ignore
             generated_outputs,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
         print(len(indices), indices)
         b_size = merge_size * focus_size
         if len(indices) % b_size != 0:
+            indices.extend(indices[-1] for _ in range(b_size - len(indices) % b_size)) # type: ignore
         print(len(indices), indices)
         timestamps = [idx / video_fps for idx in indices]
         # @JJJYmmm frames are merged by self.merge_size, \

video_processing_qwen3_vl.py CHANGED Viewed

@@ -7,7 +7,9 @@ import torch
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 from transformers.processing_utils import Unpack, VideosKwargs
-from transformers.utils import TensorType, add_start_docstrings, logging
 from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
 from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
@@ -96,7 +98,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         ):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-    def _further_process_kwargs(
         self,
         size: Optional[SizeDict] = None,
         **kwargs,
@@ -110,7 +112,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         return super()._further_process_kwargs(size=size, **kwargs)
-    def sample_frames(
         self,
         metadata: VideoMetadata,
         num_frames: Optional[int] = None,
@@ -145,7 +147,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         if num_frames is None and fps is not None:
             if metadata.fps is None:
                 metadata.fps = 24
-                logger.warning_once(
                     "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                     "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                 )
@@ -159,7 +161,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         return indices
-    def _preprocess(
         self,
         videos: list[torch.Tensor],
         do_convert_rgb: bool = True,
@@ -189,16 +191,16 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
                     num_frames=num_frames,
                     height=height,
                     width=width,
-                    temporal_factor=temporal_patch_size,
-                    factor=patch_size * merge_size * focus_size,
-                    min_pixels=size.shortest_edge,
-                    max_pixels=size.longest_edge,
                 )
                 stacked_videos = stacked_videos.view(B * T, C, H, W)
                 stacked_videos = self.resize(
                     stacked_videos,
                     size=SizeDict(height=resized_height, width=resized_width),
-                    interpolation=interpolation,
                 )
                 stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
             resized_videos_grouped[shape] = stacked_videos
@@ -210,40 +212,40 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         processed_videos_grouped = {}
         processed_grids = {}
         for shape, stacked_videos in grouped_videos.items():
-            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
             # Fused rescale and normalize
             stacked_videos = self.rescale_and_normalize(
-                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
             patches = stacked_videos
-            temporal_focus_size = temporal_patch_size * focus_size
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
             if res := patches.shape[1] % temporal_focus_size:
                 repeats = patches[:, -1:].repeat(1, temporal_focus_size - res, 1, 1, 1)
                 patches = torch.cat([patches, repeats], dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
-            grid_t = grid_t // temporal_patch_size
-            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
             patches = patches.view(
                 batch_size,
                 grid_t,
-                temporal_patch_size,
                 channel,
-                grid_h // merge_size,
-                merge_size,
-                patch_size,
-                grid_w // merge_size,
-                merge_size,
-                patch_size,
             )
             patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
             flatten_patches = patches.reshape(
                 batch_size,
                 grid_t * grid_h * grid_w,
-                channel * temporal_patch_size * patch_size * patch_size,
             )
             processed_videos_grouped[shape] = flatten_patches

 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 from transformers.processing_utils import Unpack, VideosKwargs
+from transformers.utils.generic import TensorType
+from transformers.utils.doc import add_start_docstrings
+from transformers.utils import logging
 from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
 from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
         ):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+    def _further_process_kwargs( # type: ignore
         self,
         size: Optional[SizeDict] = None,
         **kwargs,
         return super()._further_process_kwargs(size=size, **kwargs)
+    def sample_frames( # type: ignore
         self,
         metadata: VideoMetadata,
         num_frames: Optional[int] = None,
         if num_frames is None and fps is not None:
             if metadata.fps is None:
                 metadata.fps = 24
+                logger.warning_once( # type: ignore
                     "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                     "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                 )
         return indices
+    def _preprocess( # type: ignore
         self,
         videos: list[torch.Tensor],
         do_convert_rgb: bool = True,
                     num_frames=num_frames,
                     height=height,
                     width=width,
+                    temporal_factor=temporal_patch_size, # type: ignore
+                    factor=patch_size * merge_size * focus_size, # type: ignore
+                    min_pixels=size.shortest_edge, # type: ignore
+                    max_pixels=size.longest_edge, # type: ignore
                 )
                 stacked_videos = stacked_videos.view(B * T, C, H, W)
                 stacked_videos = self.resize(
                     stacked_videos,
                     size=SizeDict(height=resized_height, width=resized_width),
+                    interpolation=interpolation, # type: ignore
                 )
                 stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
             resized_videos_grouped[shape] = stacked_videos
         processed_videos_grouped = {}
         processed_grids = {}
         for shape, stacked_videos in grouped_videos.items():
+            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST) # type: ignore
             # Fused rescale and normalize
             stacked_videos = self.rescale_and_normalize(
+                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std # type: ignore
             )
             patches = stacked_videos
+            temporal_focus_size = temporal_patch_size * focus_size # type: ignore
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
             if res := patches.shape[1] % temporal_focus_size:
                 repeats = patches[:, -1:].repeat(1, temporal_focus_size - res, 1, 1, 1)
                 patches = torch.cat([patches, repeats], dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
+            grid_t = grid_t // temporal_patch_size # type: ignore
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size # type: ignore
             patches = patches.view(
                 batch_size,
                 grid_t,
+                temporal_patch_size, # type: ignore
                 channel,
+                grid_h // merge_size, # type: ignore
+                merge_size, # type: ignore
+                patch_size, # type: ignore
+                grid_w // merge_size, # type: ignore
+                merge_size, # type: ignore
+                patch_size, # type: ignore
             )
             patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
             flatten_patches = patches.reshape(
                 batch_size,
                 grid_t * grid_h * grid_w,
+                channel * temporal_patch_size * patch_size * patch_size, # type: ignore
             )
             processed_videos_grouped[shape] = flatten_patches