Update video processor params to match the Qwen3-VL paper version

Browse files

Files changed (2) hide show

video_preprocessor_config.json +4 -3
video_processing_qwen3_vl.py +97 -3

video_preprocessor_config.json CHANGED Viewed

@@ -14,7 +14,7 @@
   "do_resize": true,
   "do_sample_frames": true,
   "focus_size": 2,
-  "fps": 1,
   "image_mean": [
     0.5,
     0.5,
@@ -26,18 +26,19 @@
     0.5
   ],
   "input_data_format": null,
-  "max_frames": 3600,
   "merge_size": 2,
   "min_frames": 4,
   "num_frames": null,
   "pad_size": null,
   "patch_size": 16,
   "processor_class": "ZFQwen3VLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "return_metadata": false,
   "size": {
-    "longest_edge": 235929600,
     "shortest_edge": 4096
   },
   "temporal_patch_size": 2,

   "do_resize": true,
   "do_sample_frames": true,
   "focus_size": 2,
+  "fps": 2,
   "image_mean": [
     0.5,
     0.5,
     0.5
   ],
   "input_data_format": null,
+  "max_frames": 2048,
   "merge_size": 2,
   "min_frames": 4,
   "num_frames": null,
   "pad_size": null,
   "patch_size": 16,
   "processor_class": "ZFQwen3VLProcessor",
+  "processor_device": "cpu",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "return_metadata": false,
   "size": {
+    "longest_edge": 458752000,
     "shortest_edge": 4096
   },
   "temporal_patch_size": 2,

video_processing_qwen3_vl.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import math
-from typing import Optional, Union
 import numpy as np
 import torch
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
@@ -11,8 +12,8 @@ from transformers.utils.generic import TensorType
 from transformers.utils.doc import add_start_docstrings
 from transformers.utils import logging
 from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
-from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
 logger = logging.get_logger(__name__)
@@ -57,6 +58,7 @@ class Qwen3VLVideoProcessorInitKwargs(VideosKwargs):
     focus_size: Optional[int]
     min_frames: Optional[int]
     max_frames: Optional[int]
 @add_start_docstrings(
@@ -88,6 +90,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
     min_frames = 4
     max_frames = 768
     do_sample_frames = True
     valid_kwargs = Qwen3VLVideoProcessorInitKwargs
     model_input_names = ["pixel_values_videos", "video_grid_thw"]
@@ -183,6 +186,9 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
         resized_videos_grouped = {}
         for shape, stacked_videos in grouped_videos.items():
             B, T, C, H, W = stacked_videos.shape
             num_frames, height, width = T, H, W
@@ -262,5 +268,93 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)
 __all__ = ["ZFQwen3VLVideoProcessor"]

 import math
+from typing import Optional, Union, Iterable
 import numpy as np
 import torch
+from torchvision.transforms.v2 import functional as F
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 from transformers.utils.doc import add_start_docstrings
 from transformers.utils import logging
 from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
+from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos, load_video, VideoInput
+from transformers.image_transforms import to_channel_dimension_format
 logger = logging.get_logger(__name__)
     focus_size: Optional[int]
     min_frames: Optional[int]
     max_frames: Optional[int]
+    processor_device: Optional[str]
 @add_start_docstrings(
     min_frames = 4
     max_frames = 768
     do_sample_frames = True
+    processor_device: str = "cpu"
     valid_kwargs = Qwen3VLVideoProcessorInitKwargs
     model_input_names = ["pixel_values_videos", "video_grid_thw"]
         grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
         resized_videos_grouped = {}
+        for vid in videos:
+            print(f'vid type: {type(vid)}, vid shape: {vid.shape}')
         for shape, stacked_videos in grouped_videos.items():
             B, T, C, H, W = stacked_videos.shape
             num_frames, height, width = T, H, W
         return BatchFeature(data=data, tensor_type=return_tensors)
+    def fetch_videos( # type: ignore
+        self,
+        video_url_or_urls: Union[str, list[str], list[list[str]]],
+        sample_indices_fn=None
+    ):
+        """
+        Load one video, or a (possibly nested) list of videos, through `load_video` with the `torchcodec` backend.
+        Decoding is requested on `self.processor_device`.
+        Args:
+            video_url_or_urls (`str`, `list[str]` or `list[list[str]]`):
+                A single video location, or a (nested) list of them. Lists are handled recursively, item by item.
+            sample_indices_fn (*optional*):
+                Forwarded unchanged to `load_video`.
+        Returns:
+            For a single location, the `(video, metadata)` pair returned by `load_video`. For a list, the per-item
+            results transposed with `zip`: a list holding a tuple of videos followed by a tuple of metadata (an
+            empty list when the input list is empty).
+        """
+        if isinstance(video_url_or_urls, list):
+            # Recurse per item, then transpose [(video, meta), ...] into [(videos...), (metas...)].
+            return list(zip(*[self.fetch_videos(x, sample_indices_fn=sample_indices_fn) for x in video_url_or_urls]))
+        video, metadata = load_video(
+            video_url_or_urls, # type: ignore
+            backend="torchcodec",
+            sample_indices_fn=sample_indices_fn,
+            device=self.processor_device
+        ) # type: ignore
+        # Debug-level log instead of `print`, so loading a video does not write to stdout by default.
+        logger.debug(
+            "Loaded video shape: %s, dtype: %s, device: %s",
+            video.shape,
+            video.dtype,
+            getattr(video, "device", None),
+        )
+        return video, metadata
+    def normalize(
+        self,
+        image: "torch.Tensor",
+        mean: Union[float, Iterable[float]],
+        std: Union[float, Iterable[float]],
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Normalize an image as (image - mean) / std via torchvision's functional `normalize`.
+        The call is made with `inplace=True`, i.e. the in-place variant is requested rather than a copy.
+        Args:
+            image (`torch.Tensor`):
+                Image to normalize.
+            mean (`float` or `Iterable[float]`):
+                Mean used for normalization.
+            std (`float` or `Iterable[float]`):
+                Standard deviation used for normalization.
+            **kwargs:
+                Accepted for signature compatibility; not used by this method.
+        Returns:
+            `torch.Tensor`: The normalized image.
+        """
+        normalized = F.normalize(image, mean, std, inplace=True) # type: ignore
+        return normalized
+    def rescale(
+        self,
+        image: "torch.Tensor",
+        scale: float,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Multiply an image by a scale factor in place and return it.
+        Args:
+            image (`torch.Tensor`):
+                Image to rescale; it is modified by `mul_`, so no copy is made.
+            scale (`float`):
+                Factor every pixel value is multiplied by.
+            **kwargs:
+                Accepted for signature compatibility; not used by this method.
+        Returns:
+            `torch.Tensor`: The rescaled image (the result of the in-place multiply).
+        """
+        # `mul_` mutates `image` and hands back the same tensor.
+        image.mul_(scale)
+        return image
+    def _prepare_input_videos(
+        self,
+        videos: VideoInput,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        device: Optional[str] = None,
+    ) -> list["torch.Tensor"]:
+        """
+        Prepare the input videos for processing.
+        NumPy videos are converted to channels-first layout and wrapped as contiguous torch tensors; any other
+        video object is passed through unchanged.
+        Args:
+            videos (`VideoInput`):
+                Iterable of videos to prepare.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                Channel layout of the incoming NumPy videos, forwarded to `to_channel_dimension_format`.
+            device (`str`, *optional*):
+                Not supported here; must be left as `None`.
+        Returns:
+            `list[torch.Tensor]`: The prepared videos, in input order.
+        Raises:
+            ValueError: If `device` is not `None`.
+        """
+        # Validate once up front: the check does not depend on the video, so fail before doing any conversion
+        # work (and also when `videos` is empty).
+        if device is not None:
+            raise ValueError("The `device` argument is not supported. Please use `processor_device` instead.")
+        processed_videos = []
+        for video in videos:
+            # `make_batched_videos` always returns a 4D array per video
+            if isinstance(video, np.ndarray):
+                video = to_channel_dimension_format(video, ChannelDimension.FIRST, input_data_format)
+                # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
+                video = torch.from_numpy(video).contiguous()
+            processed_videos.append(video)
+        return processed_videos
 __all__ = ["ZFQwen3VLVideoProcessor"]