| | from typing import Optional, Union |
| |
|
| | import torch |
| | from torchvision.transforms.v2 import functional as F |
| |
|
| | from transformers.image_processing_base import BatchFeature |
| | from transformers.image_processing_utils_fast import ( |
| | BaseImageProcessorFast, |
| | DefaultFastImageProcessorKwargs, |
| | ) |
| | from transformers.image_transforms import group_images_by_shape, reorder_images |
| | from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD |
| | from transformers.image_utils import ( |
| | ChannelDimension, |
| | ImageInput, |
| | PILImageResampling, |
| | SizeDict, |
| | ) |
| | from transformers.processing_utils import Unpack |
| | from transformers.utils.generic import TensorType |
| | from transformers.utils.auto_docstring import auto_docstring |
| | from transformers.utils import logging |
| | from transformers.video_utils import VideoInput, make_batched_videos |
| | from .image_processing_qwen2_vl import smart_resize |
| |
|
| |
|
# Module-level logger, following the transformers convention of one logger per module.
logger = logging.get_logger(__name__)
| |
|
| |
|
class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """
    Extra keyword arguments accepted by the fast Qwen2-VL image processor, on top of the
    default fast-image-processor kwargs.

    min_pixels (`int`, *optional*, defaults to `56 * 56`):
        The min pixels of the image to resize the image.
    max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
        The max pixels of the image to resize the image.
    patch_size (`int`, *optional*, defaults to 14):
        The spatial patch size of the vision encoder.
    temporal_patch_size (`int`, *optional*, defaults to 2):
        The temporal patch size of the vision encoder.
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    focus_size (`int`, *optional*, defaults to 2):
        The focus size of the ZoomFocus.
    """

    min_pixels: Optional[int]
    max_pixels: Optional[int]
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
    focus_size: Optional[int]
| |
|
| |
|
@auto_docstring
class ZFQwen2VLImageProcessorFast(BaseImageProcessorFast):
    """
    Fast (torchvision-backed) image processor for ZoomFocus Qwen2-VL.

    Images are resized so both spatial dimensions are multiples of
    ``patch_size * merge_size * focus_size``, then flattened into vision-encoder
    patches of size ``channel * temporal_patch_size * patch_size * patch_size``,
    accompanied by a per-image ``(grid_t, grid_h, grid_w)`` descriptor.
    """

    do_resize = True
    resample = PILImageResampling.BICUBIC
    # NOTE: "shortest_edge"/"longest_edge" carry min/max pixel *area* budgets
    # (Qwen2-VL convention), not edge lengths — they are forwarded to
    # `smart_resize(min_pixels=..., max_pixels=...)` below.
    size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
    do_rescale = True
    do_normalize = True
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    do_convert_rgb = True
    patch_size = 14
    temporal_patch_size = 2
    merge_size = 2
    focus_size = 2
    min_pixels = None
    max_pixels = None
    valid_kwargs = Qwen2VLFastImageProcessorKwargs
    model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]

    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]):
        """
        Initialize the processor, reconciling the legacy `min_pixels`/`max_pixels`
        kwargs with the `size` dict (`shortest_edge`/`longest_edge` keys).
        """
        size = kwargs.pop("size", None)
        min_pixels = kwargs.pop("min_pixels", None)
        max_pixels = kwargs.pop("max_pixels", None)

        # Work on a copy. The original mutated `self.size` in place when no
        # `size` kwarg was passed, corrupting the class-level default shared by
        # every instance (and mutated a caller-provided dict otherwise).
        size = dict(self.size) if size is None else dict(size)
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
            size.pop("min_pixels", None)
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
            size.pop("max_pixels", None)
        if "shortest_edge" not in size or "longest_edge" not in size:
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")

        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)

    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        **kwargs,
    ) -> dict:
        """
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        """
        # Precedence: an explicit (min_pixels, max_pixels) pair overrides `size`;
        # otherwise `size` wins; otherwise fall back to the instance default.
        # NOTE(review): a lone `min_pixels` (without `max_pixels`) is ignored
        # here, matching the upstream Qwen2-VL behavior.
        if min_pixels is not None and max_pixels is not None:
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        elif size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
            min_pixels = size["shortest_edge"]
            max_pixels = size["longest_edge"]
        else:
            size = {**self.size}

        return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)

    @auto_docstring
    def preprocess(
        self,
        images: ImageInput,
        videos: Optional[VideoInput] = None,
        **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs],
    ) -> BatchFeature:
        return super().preprocess(images, videos, **kwargs)

    def _preprocess_image_like_inputs(
        self,
        images: ImageInput,
        videos: VideoInput,
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Preprocess image-like inputs.
        To be overridden by subclasses when image-like inputs other than images should be processed.
        It can be used for segmentation maps, depth maps, etc.
        """
        batch_feature = BatchFeature()
        if images is not None:
            images = self._prepare_image_like_inputs(
                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
            )
            batch_feature = self._preprocess(images, **kwargs)
        if videos is not None:
            # Video support here is deprecated; kept only for backward compatibility.
            logger.warning(
                "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
                "This is a deprecated behavior and will be removed in v5.0. "
                "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
            )
            videos = make_batched_videos(videos)
            # Each video becomes a single (frames, C, H, W) tensor so the shared
            # `_preprocess` path treats the frame axis as the temporal axis.
            videos = [
                torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device))
                for video in videos
            ]
            video_outputs = self._preprocess(videos, **kwargs)
            batch_feature.update(
                {"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw}
            )
        return batch_feature

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        patch_size: int,
        temporal_patch_size: int,
        merge_size: int,
        focus_size: int,
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,
    ):
        """
        Resize, rescale/normalize, and patchify a list of images (or stacked
        video-frame tensors), returning `pixel_values` of shape
        ``(total_patches, channel * temporal_patch_size * patch_size**2)`` and
        the matching `image_grid_thw` of per-image ``[grid_t, grid_h, grid_w]``.
        """
        # Group same-shaped images so each group is resized in one batched call.
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            height, width = stacked_images.shape[-2:]
            if do_resize:
                # Both output dims become multiples of patch*merge*focus so the
                # view/permute below divides evenly.
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=patch_size * merge_size * focus_size,
                    min_pixels=size["shortest_edge"],
                    max_pixels=size["longest_edge"],
                )
                stacked_images = self.resize(
                    image=stacked_images,
                    size=SizeDict(height=resized_height, width=resized_width),
                    interpolation=interpolation,
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Regroup after resizing: images that now share a shape are normalized
        # and patchified together.
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
            resized_height, resized_width = stacked_images.shape[-2:]
            patches = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            if patches.ndim == 4:
                # Still images: add a singleton temporal (frame) dimension.
                patches = patches.unsqueeze(1)
            if patches.shape[1] % temporal_patch_size != 0:
                # Pad the temporal axis by repeating the last frame until the
                # frame count divides temporal_patch_size. The original always
                # appended `temporal_patch_size - 1` frames, which is only
                # correct for a remainder of 1; this generalizes while staying
                # identical for the default temporal_patch_size == 2.
                pad_frames = temporal_patch_size - patches.shape[1] % temporal_patch_size
                repeats = patches[:, -1:].repeat(1, pad_frames, 1, 1, 1)
                patches = torch.cat([patches, repeats], dim=1)
            batch_size, grid_t, channel = patches.shape[:3]
            grid_t = grid_t // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

            # Decompose H and W into (merge blocks, merge cells, patch pixels) ...
            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            # ... then order patches so merge-size neighborhoods are contiguous
            # before flattening each patch to a single vector.
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

            processed_images_grouped[shape] = flatten_patches
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_grids = reorder_images(processed_grids, grouped_images_index)
        pixel_values = torch.cat(processed_images, dim=0)
        image_grid_thw = torch.tensor(processed_grids)

        return BatchFeature(
            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
        )

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.

        Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders
        without an image input.

        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*)
                Any kwargs to override defaults of the image processor.
        Returns:
            `int`: Number of image patches per image.
        """
        # Tolerate the documented default of `images_kwargs=None`: the original
        # unconditionally did `"min_pixels" in images_kwargs` and raised a
        # TypeError when the argument was omitted.
        images_kwargs = images_kwargs if images_kwargs is not None else {}
        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
        patch_size = images_kwargs.get("patch_size", self.patch_size)
        merge_size = images_kwargs.get("merge_size", self.merge_size)
        focus_size = images_kwargs.get("focus_size", self.focus_size)

        factor = patch_size * merge_size * focus_size
        resized_height, resized_width = smart_resize(
            height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
        )
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        return grid_h * grid_w
| |
|
| |
|
# Public API of this module.
__all__ = ["ZFQwen2VLImageProcessorFast"]
| |
|