"""Villanova VLM Image Processor for HuggingFace. This is a standalone image processor file for use with trust_remote_code=True. It contains no imports from aithlas_trainer to ensure self-containment. """ from typing import Any import numpy as np from PIL import Image from transformers.image_processing_utils import BaseImageProcessor, BatchFeature from transformers.image_utils import ( ChannelDimension, ImageInput, make_list_of_images, to_numpy_array, valid_images, ) class VillanovaImageProcessor(BaseImageProcessor): """Image processor for Villanova VLM. Processes images for the ViT-L-14-CLIPA-336 vision encoder: - Resize to 336x336 - Normalize with ImageNet statistics (as used by OpenCLIP CLIPA models) - Convert to RGB if needed Args: do_resize: Whether to resize images size: Target size {"height": 336, "width": 336} resample: PIL resampling filter (default: BILINEAR as used by OpenCLIP) do_rescale: Whether to rescale pixel values rescale_factor: Rescale factor (1/255) do_normalize: Whether to normalize image_mean: Normalization mean (ImageNet: [0.485, 0.456, 0.406]) image_std: Normalization std (ImageNet: [0.229, 0.224, 0.225]) do_convert_rgb: Convert to RGB if needed Example: >>> processor = VillanovaImageProcessor() >>> image = Image.open("image.jpg") >>> inputs = processor(image, return_tensors="pt") >>> print(inputs.pixel_values.shape) torch.Size([1, 3, 336, 336]) """ model_input_names = ["pixel_values"] def __init__( self, do_resize: bool = True, size: dict[str, int] | None = None, resample: int = 2, # PIL.Image.BILINEAR (as used by OpenCLIP) do_rescale: bool = True, rescale_factor: float = 1 / 255, do_normalize: bool = True, image_mean: list[float] | None = None, image_std: list[float] | None = None, do_convert_rgb: bool = True, **kwargs: Any, ) -> None: super().__init__(**kwargs) self.do_resize = do_resize self.size = size or {"height": 336, "width": 336} self.resample = resample self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize # ImageNet normalization (same as OpenCLIP ViT-L-14-CLIPA-336) self.image_mean = image_mean or [0.485, 0.456, 0.406] self.image_std = image_std or [0.229, 0.224, 0.225] self.do_convert_rgb = do_convert_rgb def resize( self, image: np.ndarray, size: dict[str, int], resample: int = 2, data_format: ChannelDimension | None = None, **kwargs: Any, ) -> np.ndarray: """Resize image to target size.""" height, width = size["height"], size["width"] # Convert to PIL for resizing if isinstance(image, np.ndarray): pil_image = Image.fromarray(image.astype(np.uint8)) else: pil_image = image # Resize resized = pil_image.resize((width, height), resample=resample) # Convert back to numpy return np.array(resized) def rescale( self, image: np.ndarray, scale: float, data_format: ChannelDimension | None = None, **kwargs: Any, ) -> np.ndarray: """Rescale pixel values.""" return image.astype(np.float32) * scale def normalize( self, image: np.ndarray, mean: list[float], std: list[float], data_format: ChannelDimension | None = None, **kwargs: Any, ) -> np.ndarray: """Normalize image with mean and std.""" mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) # Ensure image is float image = image.astype(np.float32) # Normalize (assuming HWC format) if image.ndim == 3: image = (image - mean) / std return image def preprocess( self, images: ImageInput, do_resize: bool | None = None, size: dict[str, int] | None = None, resample: int | None = None, do_rescale: bool | None = None, rescale_factor: float | None = None, do_normalize: bool | None = None, image_mean: list[float] | None = None, image_std: list[float] | None = None, do_convert_rgb: bool | None = None, return_tensors: str | None = None, data_format: ChannelDimension = ChannelDimension.FIRST, **kwargs: Any, ) -> BatchFeature: """Preprocess images for the model. Args: images: Single image or list of images do_resize: Override resize setting size: Override target size resample: Override resampling filter do_rescale: Override rescale setting rescale_factor: Override rescale factor do_normalize: Override normalize setting image_mean: Override mean image_std: Override std do_convert_rgb: Override RGB conversion return_tensors: Output tensor format ("pt", "np", etc.) data_format: Channel dimension format Returns: BatchFeature with pixel_values """ do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb # Handle single image images = make_list_of_images(images) if not valid_images(images): raise ValueError("Invalid image input") processed_images = [] for image in images: # Convert to RGB if needed if do_convert_rgb: if isinstance(image, Image.Image): image = image.convert("RGB") elif isinstance(image, np.ndarray): if image.shape[-1] == 4: # RGBA image = image[..., :3] elif image.ndim == 2: # Grayscale image = np.stack([image] * 3, axis=-1) # Convert to numpy image = to_numpy_array(image) # Resize if do_resize: image = self.resize(image, size, resample) # Rescale if do_rescale: image = self.rescale(image, rescale_factor) # Normalize if do_normalize: image = self.normalize(image, image_mean, image_std) # Convert to CHW format if data_format == ChannelDimension.FIRST: image = np.transpose(image, (2, 0, 1)) processed_images.append(image) # Stack into batch pixel_values = np.stack(processed_images, axis=0) data = {"pixel_values": pixel_values} return BatchFeature(data=data, tensor_type=return_tensors)