Fixed bug in resize logic

Browse files

Files changed (4) hide show

image_processing_qwen2_vl.py +36 -31
processing_qwen3_vl.py +49 -41
processor_config.json +2 -2
video_processing_qwen3_vl.py +28 -25

image_processing_qwen2_vl.py CHANGED Viewed

@@ -2,15 +2,15 @@ import math
 import numpy as np
-from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
 from transformers.image_transforms import (
     convert_to_rgb,
     resize,
     to_channel_dimension_format,
 )
 from transformers.image_utils import (
-    OPENAI_CLIP_MEAN,
-    OPENAI_CLIP_STD,
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -23,7 +23,8 @@ from transformers.image_utils import (
     validate_preprocess_arguments,
 )
 from transformers.processing_utils import ImagesKwargs
-from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
@@ -137,6 +138,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         patch_size: int = 14,
         temporal_patch_size: int = 2,
         merge_size: int = 2,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -165,6 +167,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         self.patch_size = patch_size
         self.temporal_patch_size = temporal_patch_size
         self.merge_size = merge_size
         self.do_convert_rgb = do_convert_rgb
     def _preprocess(
@@ -181,6 +184,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         patch_size: int | None = None,
         temporal_patch_size: int | None = None,
         merge_size: int | None = None,
         do_convert_rgb: bool | None = None,
         data_format: ChannelDimension | None = ChannelDimension.FIRST,
         input_data_format: str | ChannelDimension | None = None,
@@ -228,16 +232,16 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
-        images = make_flat_list_of_images(images)
         if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
         # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
         if do_rescale and is_scaled_image(images[0]):
-            logger.warning_once(
                 "It looks like you are trying to rescale already rescaled images. If the input"
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
@@ -245,7 +249,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images[0])
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
         resized_height, resized_width = height, width
         processed_images = []
         for image in images:
@@ -253,23 +257,23 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
                 resized_height, resized_width = smart_resize(
                     height,
                     width,
-                    factor=patch_size * merge_size,
-                    min_pixels=size["shortest_edge"],
-                    max_pixels=size["longest_edge"],
                 )
                 image = resize(
                     image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                 )
             if do_rescale:
-                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
             if do_normalize:
                 image = self.normalize(
-                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                 )
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
             processed_images.append(image)
         patches = np.array(processed_images)
@@ -282,17 +286,17 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // temporal_patch_size
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
         patches = patches.reshape(
             grid_t,
-            temporal_patch_size,
             channel,
-            grid_h // merge_size,
-            merge_size,
-            patch_size,
-            grid_w // merge_size,
-            merge_size,
-            patch_size,
         )
         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
         flatten_patches = patches.reshape(
@@ -301,7 +305,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         return flatten_patches, (grid_t, grid_h, grid_w)
-    def preprocess(
         self,
         images: ImageInput,
         do_resize: bool | None = None,
@@ -403,7 +407,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         if images is not None:
-            images = self.fetch_images(images)
             images = make_flat_list_of_images(images)
         if images is not None and not valid_images(images):
@@ -421,7 +425,7 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         data = {}
         pixel_values, vision_grid_thws = [], []
-        for image in images:
             patches, image_grid_thw = self._preprocess(
                 image,
                 do_resize=do_resize,
@@ -461,12 +465,13 @@ class ZFQwen2VLImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
-        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", self.patch_size)
-        merge_size = images_kwargs.get("merge_size", self.merge_size)
-        factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
             height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
         )

 import numpy as np
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_processing_base import BatchFeature
 from transformers.image_transforms import (
     convert_to_rgb,
     resize,
     to_channel_dimension_format,
 )
+from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.image_utils import (
     ChannelDimension,
     ImageInput,
     PILImageResampling,
     validate_preprocess_arguments,
 )
 from transformers.processing_utils import ImagesKwargs
+from transformers.utils.generic import TensorType
+from transformers.utils import logging
 from transformers.video_utils import VideoInput
         patch_size: int = 14,
         temporal_patch_size: int = 2,
         merge_size: int = 2,
+        focus_size: int = 2,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         self.patch_size = patch_size
         self.temporal_patch_size = temporal_patch_size
         self.merge_size = merge_size
+        self.focus_size = focus_size
         self.do_convert_rgb = do_convert_rgb
     def _preprocess(
         patch_size: int | None = None,
         temporal_patch_size: int | None = None,
         merge_size: int | None = None,
+        focus_size: int | None = None,
         do_convert_rgb: bool | None = None,
         data_format: ChannelDimension | None = ChannelDimension.FIRST,
         input_data_format: str | ChannelDimension | None = None,
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
+        images = make_flat_list_of_images(images) # type: ignore
         if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images] # type: ignore
         # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images] # type: ignore
         if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once( # type: ignore
                 "It looks like you are trying to rescale already rescaled images. If the input"
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images[0])
+        height, width = get_image_size(images[0], channel_dim=input_data_format) # type: ignore
         resized_height, resized_width = height, width
         processed_images = []
         for image in images:
                 resized_height, resized_width = smart_resize(
                     height,
                     width,
+                    factor=patch_size * merge_size * focus_size, # type: ignore
+                    min_pixels=size["shortest_edge"], # type: ignore
+                    max_pixels=size["longest_edge"], # type: ignore
                 )
                 image = resize(
                     image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                 )
             if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) # type: ignore
             if do_normalize:
                 image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format # type: ignore
                 )
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # type: ignore
             processed_images.append(image)
         patches = np.array(processed_images)
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // temporal_patch_size
+        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size # type: ignore
         patches = patches.reshape(
             grid_t,
+            temporal_patch_size, # type: ignore
             channel,
+            grid_h // merge_size, # type: ignore
+            merge_size, # type: ignore
+            patch_size, # type: ignore
+            grid_w // merge_size, # type: ignore
+            merge_size, # type: ignore
+            patch_size, # type: ignore
         )
         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
         flatten_patches = patches.reshape(
         return flatten_patches, (grid_t, grid_h, grid_w)
+    def preprocess( # type: ignore
         self,
         images: ImageInput,
         do_resize: bool | None = None,
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         if images is not None:
+            images = self.fetch_images(images) # type: ignore
             images = make_flat_list_of_images(images)
         if images is not None and not valid_images(images):
         data = {}
         pixel_values, vision_grid_thws = [], []
+        for image in images: # type: ignore
             patches, image_grid_thw = self._preprocess(
                 image,
                 do_resize=do_resize,
         Returns:
             `int`: Number of image patches per image.
         """
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] # type: ignore
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] # type: ignore
+        patch_size = images_kwargs.get("patch_size", self.patch_size) # type: ignore
+        merge_size = images_kwargs.get("merge_size", self.merge_size) # type: ignore
+        focus_size = images_kwargs.get("focus_size", self.focus_size) # type: ignore
+        factor = patch_size * merge_size * focus_size
         resized_height, resized_width = smart_resize(
             height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
         )

processing_qwen3_vl.py CHANGED Viewed

@@ -25,42 +25,42 @@ class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
 @auto_docstring
 class ZFQwen3VLProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
-        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
-        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
         self.image_token_id = (
-            tokenizer.image_token_id
             if getattr(tokenizer, "image_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.image_token)
         )
         self.video_token_id = (
-            tokenizer.video_token_id
             if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
         )
         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
         self.vision_start_token = (
-            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
         )
         self.vision_end_token = (
-            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
         )
         self.vision_start_token_id = (
-            tokenizer.vision_start_token_id
             if getattr(tokenizer, "vision_start_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.vision_start_token)
         )
         self.vision_end_token_id = (
-            tokenizer.vision_end_token_id
             if getattr(tokenizer, "vision_end_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
         )
     @auto_docstring
-    def __call__(
         self,
-        images: ImageInput = None,
-        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
-        videos: VideoInput = None,
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         r"""
@@ -77,19 +77,19 @@ class ZFQwen3VLProcessor(ProcessorMixin):
             - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
-            Qwen3VLProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
         if images is not None:
-            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
             image_grid_thw = image_inputs["image_grid_thw"]
         else:
             image_inputs = {}
             image_grid_thw = None
         if videos is not None:
-            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
             if not kwargs.get("return_metadata"):
@@ -105,23 +105,23 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         text = text.copy()  # below lines change text in-place
         if image_grid_thw is not None:
-            merge_length = self.image_processor.merge_size**2
             index = 0
             for i in range(len(text)):
                 while self.image_token in text[i]:
                     num_image_tokens = image_grid_thw[index].prod() // merge_length
-                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                     index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.image_token)
         if video_grid_thw is not None:
-            merge_length = self.video_processor.merge_size**2
             index = 0
             for i in range(len(text)):
                 while self.video_token in text[i]:
-                    metadata = video_metadata[index]
                     if metadata.fps is None:
-                        logger.warning_once(
                             "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                             "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                             "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
@@ -132,9 +132,9 @@ class ZFQwen3VLProcessor(ProcessorMixin):
                     curr_timestamp = self._calculate_timestamps(
                         metadata.frames_indices,
                         metadata.fps,
-                        self.video_processor.merge_size,
                     )
                     video_placeholder = ""
                     frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
                     for frame_idx in range(video_grid_thw[index][0]):
@@ -144,20 +144,20 @@ class ZFQwen3VLProcessor(ProcessorMixin):
                             self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                         )
                     if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
-                        text[i] = text[i].replace(
                             f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
                         )
                     else:
                         # vllm may input video token directly
-                        text[i] = text[i].replace(self.video_token, video_placeholder, 1)
                     index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.video_token)
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
-        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
         if return_mm_token_type_ids:
             array_ids = np.array(text_inputs["input_ids"])
@@ -184,10 +184,10 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         if image_sizes is not None:
             images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
             images_kwargs.update(kwargs)
-            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
             num_image_patches = [
-                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                 for image_size in image_sizes
             ]
             num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
@@ -197,10 +197,10 @@ class ZFQwen3VLProcessor(ProcessorMixin):
             videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
             videos_kwargs.update(kwargs)
             num_video_patches = [
-                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                 for video_size in video_sizes
             ]
-            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
             vision_data["num_video_tokens"] = num_video_tokens
         return MultiModalData(**vision_data)
@@ -225,18 +225,26 @@ class ZFQwen3VLProcessor(ProcessorMixin):
         Returns:
             `list[str]`: The decoded text.
         """
-        return self.tokenizer.batch_decode(
             generated_outputs,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-    def _calculate_timestamps(self, indices: list[int] | np.ndarray, video_fps: float, merge_size: int = 2):
         if not isinstance(indices, list):
             indices = indices.tolist()
-        if len(indices) % merge_size != 0:
-            indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
         timestamps = [idx / video_fps for idx in indices]
         # @JJJYmmm frames are merged by self.merge_size, \
         # so we need to average the timestamps between the first/last frame within the temporal patch

 @auto_docstring
 class ZFQwen3VLProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
+        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token # type: ignore
+        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token # type: ignore
         self.image_token_id = (
+            tokenizer.image_token_id # type: ignore
             if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.image_token) # type: ignore
         )
         self.video_token_id = (
+            tokenizer.video_token_id # type: ignore
             if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.video_token) # type: ignore
         )
         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
         self.vision_start_token = (
+            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token # type: ignore
         )
         self.vision_end_token = (
+            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token # type: ignore
         )
         self.vision_start_token_id = (
+            tokenizer.vision_start_token_id # type: ignore
             if getattr(tokenizer, "vision_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_start_token) # type: ignore
         )
         self.vision_end_token_id = (
+            tokenizer.vision_end_token_id # type: ignore
             if getattr(tokenizer, "vision_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_end_token) # type: ignore
         )
     @auto_docstring
+    def __call__( # type: ignore
         self,
+        images: ImageInput = None, # type: ignore
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, # type: ignore
+        videos: VideoInput = None, # type: ignore
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         r"""
             - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
+            Qwen3VLProcessorKwargs, # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs, # type: ignore
             **kwargs,
         )
         if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) # type: ignore
             image_grid_thw = image_inputs["image_grid_thw"]
         else:
             image_inputs = {}
             image_grid_thw = None
         if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) # type: ignore
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
             if not kwargs.get("return_metadata"):
         text = text.copy()  # below lines change text in-place
         if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2 # type: ignore
             index = 0
             for i in range(len(text)):
                 while self.image_token in text[i]:
                     num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) # type: ignore
                     index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token) # type: ignore
         if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2 # type: ignore
             index = 0
             for i in range(len(text)):
                 while self.video_token in text[i]:
+                    metadata = video_metadata[index] # type: ignore
                     if metadata.fps is None:
+                        logger.warning_once( # type: ignore
                             "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                             "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                             "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                     curr_timestamp = self._calculate_timestamps(
                         metadata.frames_indices,
                         metadata.fps,
+                        self.video_processor.merge_size, # type: ignore
+                        self.video_processor.focus_size, # type: ignore
                     )
                     video_placeholder = ""
                     frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
                     for frame_idx in range(video_grid_thw[index][0]):
                             self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                         )
                     if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
+                        text[i] = text[i].replace( # type: ignore
                             f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
                         )
                     else:
                         # vllm may input video token directly
+                        text[i] = text[i].replace(self.video_token, video_placeholder, 1) # type: ignore
                     index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.video_token) # type: ignore
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
         if return_mm_token_type_ids:
             array_ids = np.array(text_inputs["input_ids"])
         if image_sizes is not None:
             images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
             images_kwargs.update(kwargs)
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size # type: ignore
             num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) # type: ignore
                 for image_size in image_sizes
             ]
             num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
             videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
             videos_kwargs.update(kwargs)
             num_video_patches = [
+                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs) # type: ignore
                 for video_size in video_sizes
             ]
+            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches] # type: ignore
             vision_data["num_video_tokens"] = num_video_tokens
         return MultiModalData(**vision_data)
         Returns:
             `list[str]`: The decoded text.
         """
+        return self.tokenizer.batch_decode( # type: ignore
             generated_outputs,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
+    def _calculate_timestamps(
+        self,
+        indices: list[int] | np.ndarray,
+        video_fps: float,
+        merge_size: int = 2,
+        focus_size: int = 2
+    ):
         if not isinstance(indices, list):
             indices = indices.tolist()
+        if len(indices) % (merge_size * focus_size) != 0:
+            indices.extend( # type: ignore
+                indices[-1] for _ in range((merge_size * focus_size) - len(indices) % (merge_size * focus_size))
+            )
         timestamps = [idx / video_fps for idx in indices]
         # @JJJYmmm frames are merged by self.merge_size, \
         # so we need to average the timestamps between the first/last frame within the temporal patch

processor_config.json CHANGED Viewed

@@ -25,7 +25,7 @@
       0.5
     ],
     "merge_size": 2,
-    "patch_size": 14,
     "resample": 3,
     "rescale_factor": 0.00392156862745098,
     "size": {
@@ -67,7 +67,7 @@
     "rescale_factor": 0.00392156862745098,
     "return_metadata": false,
     "size": {
-      "longest_edge": 25165824,
       "shortest_edge": 4096
     },
     "temporal_patch_size": 2,

       0.5
     ],
     "merge_size": 2,
+    "patch_size": 16,
     "resample": 3,
     "rescale_factor": 0.00392156862745098,
     "size": {
     "rescale_factor": 0.00392156862745098,
     "return_metadata": false,
     "size": {
+      "longest_edge": 251658240,
       "shortest_edge": 4096
     },
     "temporal_patch_size": 2,

video_processing_qwen3_vl.py CHANGED Viewed

@@ -47,10 +47,11 @@ def smart_resize(
     return h_bar, w_bar
-class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False):
     patch_size: int
     temporal_patch_size: int
     merge_size: int
     min_frames: int
     max_frames: int
@@ -79,21 +80,22 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
     patch_size = 16
     temporal_patch_size = 2
     merge_size = 2
     fps = 2
     min_frames = 4
     max_frames = 768
     do_sample_frames = True
-    valid_kwargs = Qwen3VLVideoProcessorInitKwargs
     model_input_names = ["pixel_values_videos", "video_grid_thw"]
-    def __init__(self, **kwargs: Unpack[Qwen3VLVideoProcessorInitKwargs]):
         super().__init__(**kwargs)
         if self.size is not None and (
             self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
         ):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-    def _further_process_kwargs(
         self,
         size: SizeDict | None = None,
         **kwargs,
@@ -107,7 +109,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         return super()._further_process_kwargs(size=size, **kwargs)
-    def sample_frames(
         self,
         metadata: VideoMetadata,
         num_frames: int | None = None,
@@ -142,7 +144,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         if num_frames is None and fps is not None:
             if metadata.fps is None:
                 metadata.fps = 24
-                logger.warning_once(
                     "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                     "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                 )
@@ -156,7 +158,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         return indices
-    def _preprocess(
         self,
         videos: list[torch.Tensor],
         do_convert_rgb: bool = True,
@@ -171,6 +173,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         patch_size: int | None = None,
         temporal_patch_size: int | None = None,
         merge_size: int | None = None,
         return_tensors: str | TensorType | None = None,
         **kwargs,
     ):
@@ -185,16 +188,16 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
                     num_frames=num_frames,
                     height=height,
                     width=width,
-                    temporal_factor=temporal_patch_size,
-                    factor=patch_size * merge_size,
-                    min_pixels=size.shortest_edge,
-                    max_pixels=size.longest_edge,
                 )
                 stacked_videos = stacked_videos.view(B * T, C, H, W)
                 stacked_videos = self.resize(
                     stacked_videos,
                     size=SizeDict(height=resized_height, width=resized_width),
-                    interpolation=interpolation,
                 )
                 stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
             resized_videos_grouped[shape] = stacked_videos
@@ -206,40 +209,40 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         processed_videos_grouped = {}
         processed_grids = {}
         for shape, stacked_videos in grouped_videos.items():
-            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
             # Fused rescale and normalize
             stacked_videos = self.rescale_and_normalize(
-                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
             patches = stacked_videos
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
             T = patches.shape[1]
-            if pad := -T % temporal_patch_size:
                 repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
                 patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
-            grid_t = grid_t // temporal_patch_size
-            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
             patches = patches.view(
                 batch_size,
                 grid_t,
-                temporal_patch_size,
                 channel,
-                grid_h // merge_size,
-                merge_size,
-                patch_size,
-                grid_w // merge_size,
-                merge_size,
-                patch_size,
             )
             patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
             flatten_patches = patches.reshape(
                 batch_size,
                 grid_t * grid_h * grid_w,
-                channel * temporal_patch_size * patch_size * patch_size,
             )
             processed_videos_grouped[shape] = flatten_patches

     return h_bar, w_bar
+class ZFQwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False):
     # Typed keyword arguments for the video processor, layered on VideosKwargs.
     # total=False means every key declared here is optional.
     patch_size: int
     temporal_patch_size: int
     merge_size: int
     # In _preprocess, focus_size multiplies into the spatial resize factor
     # (patch_size * merge_size * focus_size) and into the frame-count padding
     # multiple (temporal_patch_size * focus_size).
+    focus_size: int
     min_frames: int
     max_frames: int
     patch_size = 16
     temporal_patch_size = 2
     merge_size = 2
+    focus_size = 2
     fps = 2
     min_frames = 4
     max_frames = 768
     do_sample_frames = True
+    valid_kwargs = ZFQwen3VLVideoProcessorInitKwargs
     model_input_names = ["pixel_values_videos", "video_grid_thw"]
+    def __init__(self, **kwargs: Unpack[ZFQwen3VLVideoProcessorInitKwargs]):
         # Forward every keyword argument to the base-class initializer first;
         # the validation below reads self.size after that call.
         super().__init__(**kwargs)
         # A size of None skips the check. Otherwise both keys must be present:
         # _preprocess passes size.shortest_edge and size.longest_edge as the
         # min_pixels / max_pixels bounds of the resize computation.
         if self.size is not None and (
             self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
         ):
             # Raises ValueError when either key is missing or maps to None.
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+    def _further_process_kwargs( # type: ignore
         self,
         size: SizeDict | None = None,
         **kwargs,
         return super()._further_process_kwargs(size=size, **kwargs)
+    def sample_frames( # type: ignore
         self,
         metadata: VideoMetadata,
         num_frames: int | None = None,
         if num_frames is None and fps is not None:
             if metadata.fps is None:
                 metadata.fps = 24
+                logger.warning_once( # type: ignore
                     "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                     "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                 )
         return indices
+    def _preprocess( # type: ignore
         self,
         videos: list[torch.Tensor],
         do_convert_rgb: bool = True,
         patch_size: int | None = None,
         temporal_patch_size: int | None = None,
         merge_size: int | None = None,
+        focus_size: int | None = None,
         return_tensors: str | TensorType | None = None,
         **kwargs,
     ):
                     num_frames=num_frames,
                     height=height,
                     width=width,
+                    temporal_factor=temporal_patch_size, # type: ignore
+                    factor=patch_size * merge_size * focus_size, # type: ignore
+                    min_pixels=size.shortest_edge, # type: ignore
+                    max_pixels=size.longest_edge, # type: ignore
                 )
                 stacked_videos = stacked_videos.view(B * T, C, H, W)
                 stacked_videos = self.resize(
                     stacked_videos,
                     size=SizeDict(height=resized_height, width=resized_width),
+                    interpolation=interpolation, # type: ignore
                 )
                 stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
             resized_videos_grouped[shape] = stacked_videos
         processed_videos_grouped = {}
         processed_grids = {}
         for shape, stacked_videos in grouped_videos.items():
+            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST) # type: ignore
             # Fused rescale and normalize
             stacked_videos = self.rescale_and_normalize(
+                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std # type: ignore
             )
             patches = stacked_videos
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
             T = patches.shape[1]
+            if pad := -T % (temporal_patch_size * focus_size): # type: ignore
                 repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
                 patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
+            grid_t = grid_t // temporal_patch_size # type: ignore
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size # type: ignore
             patches = patches.view(
                 batch_size,
                 grid_t,
+                temporal_patch_size, # type: ignore
                 channel,
+                grid_h // merge_size, # type: ignore
+                merge_size, # type: ignore
+                patch_size, # type: ignore
+                grid_w // merge_size, # type: ignore
+                merge_size, # type: ignore
+                patch_size, # type: ignore
             )
             patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
             flatten_patches = patches.reshape(
                 batch_size,
                 grid_t * grid_h * grid_w,
+                channel * temporal_patch_size * patch_size * patch_size, # type: ignore
             )
             processed_videos_grouped[shape] = flatten_patches