# Villanova-2B-VL-2512-Preview / image_processing_villanova.py
# (Hugging Face Hub artifact header — uploaded via huggingface_hub by matteogabburo,
#  commit 46d882e, verified)
"""Villanova VLM Image Processor for HuggingFace.
This is a standalone image processor file for use with trust_remote_code=True.
It contains no imports from aithlas_trainer to ensure self-containment.
"""
from typing import Any
import numpy as np
from PIL import Image
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_utils import (
ChannelDimension,
ImageInput,
make_list_of_images,
to_numpy_array,
valid_images,
)
class VillanovaImageProcessor(BaseImageProcessor):
    """Image processor for Villanova VLM.

    Processes images for the ViT-L-14-CLIPA-336 vision encoder:
    - Resize to 336x336
    - Normalize with ImageNet statistics (as used by OpenCLIP CLIPA models)
    - Convert to RGB if needed

    Args:
        do_resize: Whether to resize images.
        size: Target size ``{"height": 336, "width": 336}``.
        resample: PIL resampling filter (default: BILINEAR as used by OpenCLIP).
        do_rescale: Whether to rescale pixel values.
        rescale_factor: Rescale factor (1/255).
        do_normalize: Whether to normalize.
        image_mean: Normalization mean (ImageNet: [0.485, 0.456, 0.406]).
        image_std: Normalization std (ImageNet: [0.229, 0.224, 0.225]).
        do_convert_rgb: Convert to RGB if needed.

    Example:
        >>> processor = VillanovaImageProcessor()
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(image, return_tensors="pt")
        >>> print(inputs.pixel_values.shape)
        torch.Size([1, 3, 336, 336])
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: int = 2,  # PIL.Image.BILINEAR (as used by OpenCLIP)
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size or {"height": 336, "width": 336}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # ImageNet normalization (same as OpenCLIP ViT-L-14-CLIPA-336)
        self.image_mean = image_mean or [0.485, 0.456, 0.406]
        self.image_std = image_std or [0.229, 0.224, 0.225]
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: dict[str, int],
        resample: int = 2,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Resize ``image`` to ``size`` via PIL and return it as a numpy array.

        NOTE(review): the uint8 cast assumes the input is raw pixel data in
        [0, 255] (i.e. resize is called *before* rescale/normalize, as
        ``preprocess`` does) — float inputs outside that range would wrap.
        ``data_format`` is accepted for API compatibility but unused here.
        """
        height, width = size["height"], size["width"]
        # Convert to PIL for resizing (PIL expects uint8 pixel data).
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image.astype(np.uint8))
        else:
            pil_image = image
        # PIL takes (width, height), the reverse of numpy's (H, W) order.
        resized = pil_image.resize((width, height), resample=resample)
        return np.array(resized)

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Multiply pixel values by ``scale``, returning a float32 array."""
        return image.astype(np.float32) * scale

    def normalize(
        self,
        image: np.ndarray,
        mean: list[float],
        std: list[float],
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Normalize a HWC image with per-channel ``mean`` and ``std``.

        Non-3D input is returned unchanged (as float32) — callers are
        expected to have produced an HWC image by this point.
        """
        mean = np.array(mean, dtype=np.float32)
        std = np.array(std, dtype=np.float32)
        # Ensure float math regardless of input dtype.
        image = image.astype(np.float32)
        # Broadcasting over the trailing channel axis assumes HWC layout.
        if image.ndim == 3:
            image = (image - mean) / std
        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: int | None = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | None = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs: Any,
    ) -> BatchFeature:
        """Preprocess images for the model.

        Args:
            images: Single image or list of images.
            do_resize: Override resize setting.
            size: Override target size.
            resample: Override resampling filter.
            do_rescale: Override rescale setting.
            rescale_factor: Override rescale factor.
            do_normalize: Override normalize setting.
            image_mean: Override mean.
            image_std: Override std.
            do_convert_rgb: Override RGB conversion.
            return_tensors: Output tensor format ("pt", "np", etc.).
            data_format: Channel dimension format.

        Returns:
            BatchFeature with ``pixel_values`` of shape (batch, 3, H, W)
            when ``data_format`` is ``ChannelDimension.FIRST``.

        Raises:
            ValueError: If ``images`` is not a valid image input.
        """
        # Per-call overrides fall back to the values set in __init__.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        # Normalize single image -> list of images.
        images = make_list_of_images(images)
        if not valid_images(images):
            raise ValueError("Invalid image input")

        processed_images = []
        for image in images:
            # Convert to RGB if needed.
            if do_convert_rgb:
                if isinstance(image, Image.Image):
                    image = image.convert("RGB")
                elif isinstance(image, np.ndarray):
                    # BUGFIX: check ndim before the channel count — a 2-D
                    # grayscale array of shape (H, 4) previously matched the
                    # RGBA branch (shape[-1] == 4) and was wrongly sliced,
                    # then crashed the CHW transpose below.
                    if image.ndim == 2:  # Grayscale -> replicate to 3 channels
                        image = np.stack([image] * 3, axis=-1)
                    elif image.ndim == 3 and image.shape[-1] == 4:  # RGBA -> drop alpha
                        image = image[..., :3]
            # Ensure a numpy array for the steps below.
            image = to_numpy_array(image)
            # Resize (operates on raw uint8 pixels, before rescale).
            if do_resize:
                image = self.resize(image, size, resample)
            # Rescale to [0, 1].
            if do_rescale:
                image = self.rescale(image, rescale_factor)
            # Normalize with ImageNet statistics.
            if do_normalize:
                image = self.normalize(image, image_mean, image_std)
            # Convert HWC -> CHW if the model expects channels-first.
            if data_format == ChannelDimension.FIRST:
                image = np.transpose(image, (2, 0, 1))
            processed_images.append(image)

        # Stack into a (batch, ...) array; BatchFeature handles tensor conversion.
        pixel_values = np.stack(processed_images, axis=0)
        data = {"pixel_values": pixel_values}
        return BatchFeature(data=data, tensor_type=return_tensors)