|
|
"""Villanova VLM Image Processor for HuggingFace. |
|
|
|
|
|
This is a standalone image processor file for use with trust_remote_code=True. |
|
|
It contains no imports from aithlas_trainer to ensure self-containment. |
|
|
""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature |
|
|
from transformers.image_utils import ( |
|
|
ChannelDimension, |
|
|
ImageInput, |
|
|
make_list_of_images, |
|
|
to_numpy_array, |
|
|
valid_images, |
|
|
) |
|
|
|
|
|
|
|
|
class VillanovaImageProcessor(BaseImageProcessor):
    """Image processor for Villanova VLM.

    Processes images for the ViT-L-14-CLIPA-336 vision encoder:
    - Resize to 336x336
    - Normalize with ImageNet statistics (as used by OpenCLIP CLIPA models)
    - Convert to RGB if needed

    Args:
        do_resize: Whether to resize images
        size: Target size {"height": 336, "width": 336}
        resample: PIL resampling filter (default: BILINEAR as used by OpenCLIP)
        do_rescale: Whether to rescale pixel values
        rescale_factor: Rescale factor (1/255)
        do_normalize: Whether to normalize
        image_mean: Normalization mean (ImageNet: [0.485, 0.456, 0.406])
        image_std: Normalization std (ImageNet: [0.229, 0.224, 0.225])
        do_convert_rgb: Convert to RGB if needed

    Example:
        >>> processor = VillanovaImageProcessor()
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(image, return_tensors="pt")
        >>> print(inputs.pixel_values.shape)
        torch.Size([1, 3, 336, 336])
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: int = 2,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        self.do_resize = do_resize
        # Default target matches the CLIPA-336 vision tower input resolution.
        self.size = size or {"height": 336, "width": 336}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # ImageNet statistics, as used by OpenCLIP CLIPA checkpoints.
        self.image_mean = image_mean or [0.485, 0.456, 0.406]
        self.image_std = image_std or [0.229, 0.224, 0.225]
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: dict[str, int],
        resample: int = 2,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Resize ``image`` to ``size`` via PIL and return an HWC ndarray.

        Args:
            image: Input image as an HWC ndarray (or a PIL image, which is
                used as-is).
            size: Target size as ``{"height": H, "width": W}``.
            resample: PIL resampling filter code (2 = BILINEAR).
            data_format: Unused; accepted for signature compatibility with
                the transformers base class.

        Returns:
            Resized image as an ndarray.
        """
        height, width = size["height"], size["width"]

        if isinstance(image, np.ndarray):
            # NOTE(review): the uint8 cast assumes resize runs on raw
            # [0, 255] pixel data (preprocess resizes before rescaling);
            # float inputs in [0, 1] would be truncated here.
            pil_image = Image.fromarray(image.astype(np.uint8))
        else:
            pil_image = image

        # PIL expects (width, height), opposite of the size dict order.
        resized = pil_image.resize((width, height), resample=resample)

        return np.array(resized)

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Multiply pixel values by ``scale`` (e.g. 1/255), returning float32."""
        return image.astype(np.float32) * scale

    def normalize(
        self,
        image: np.ndarray,
        mean: list[float],
        std: list[float],
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Normalize an HWC image with per-channel ``mean`` and ``std``.

        Images that are not 3-dimensional are returned unchanged (cast to
        float32), since per-channel statistics cannot be broadcast onto them.
        """
        mean_arr = np.array(mean, dtype=np.float32)
        std_arr = np.array(std, dtype=np.float32)

        image = image.astype(np.float32)

        # mean/std broadcast over the trailing channel axis of an HWC image.
        if image.ndim == 3:
            image = (image - mean_arr) / std_arr

        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: int | None = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | None = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs: Any,
    ) -> BatchFeature:
        """Preprocess images for the model.

        Args:
            images: Single image or list of images
            do_resize: Override resize setting
            size: Override target size
            resample: Override resampling filter
            do_rescale: Override rescale setting
            rescale_factor: Override rescale factor
            do_normalize: Override normalize setting
            image_mean: Override mean
            image_std: Override std
            do_convert_rgb: Override RGB conversion
            return_tensors: Output tensor format ("pt", "np", etc.)
            data_format: Channel dimension format

        Returns:
            BatchFeature with pixel_values

        Raises:
            ValueError: If ``images`` contains invalid image inputs.
        """
        # Per-call overrides fall back to the configured defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError("Invalid image input")

        processed_images = []
        for image in images:
            if do_convert_rgb:
                if isinstance(image, Image.Image):
                    image = image.convert("RGB")
                elif isinstance(image, np.ndarray):
                    # BUGFIX: check ndim before shape[-1]. The old order
                    # treated a 2-D grayscale array of width 4 as RGBA and
                    # sliced off a column; grayscale must be handled first.
                    if image.ndim == 2:
                        # Grayscale: replicate the single channel to RGB.
                        image = np.stack([image] * 3, axis=-1)
                    elif image.ndim == 3 and image.shape[-1] == 4:
                        # RGBA: drop the alpha channel.
                        image = image[..., :3]

            image = to_numpy_array(image)

            # Resize operates on raw pixel data, before rescale/normalize.
            if do_resize:
                image = self.resize(image, size, resample)

            if do_rescale:
                image = self.rescale(image, rescale_factor)

            if do_normalize:
                image = self.normalize(image, image_mean, image_std)

            # Convert HWC -> CHW when the model expects channels-first.
            if data_format == ChannelDimension.FIRST:
                image = np.transpose(image, (2, 0, 1))

            processed_images.append(image)

        # Stack into a (batch, C, H, W) array; BatchFeature handles the
        # optional conversion to the requested tensor framework.
        pixel_values = np.stack(processed_images, axis=0)

        data = {"pixel_values": pixel_values}

        return BatchFeature(data=data, tensor_type=return_tensors)
|
|
|