|
|
"""Villanova VLM Image Processor for HuggingFace. |
|
|
|
|
|
This is a standalone image processor file for use with trust_remote_code=True. |
|
|
It contains no imports from aithlas_trainer to ensure self-containment. |
|
|
""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature |
|
|
from transformers.image_utils import ( |
|
|
ChannelDimension, |
|
|
ImageInput, |
|
|
make_list_of_images, |
|
|
to_numpy_array, |
|
|
valid_images, |
|
|
) |
|
|
|
|
|
|
|
|
class VillanovaImageProcessor(BaseImageProcessor):
    """Image processor for Villanova VLM.

    Processes images for the ViT-L-14-CLIPA-336 vision encoder:
    - Resize to 336x336
    - Normalize with ImageNet statistics (as used by OpenCLIP CLIPA models)
    - Convert to RGB if needed

    Args:
        do_resize: Whether to resize images
        size: Target size {"height": 336, "width": 336}
        resample: PIL resampling filter (default: BILINEAR as used by OpenCLIP)
        do_rescale: Whether to rescale pixel values
        rescale_factor: Rescale factor (1/255)
        do_normalize: Whether to normalize
        image_mean: Normalization mean (ImageNet: [0.485, 0.456, 0.406])
        image_std: Normalization std (ImageNet: [0.229, 0.224, 0.225])
        do_convert_rgb: Convert to RGB if needed

    Example:
        >>> processor = VillanovaImageProcessor()
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(image, return_tensors="pt")
        >>> print(inputs.pixel_values.shape)
        torch.Size([1, 3, 336, 336])
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: int = 2,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        self.do_resize = do_resize
        # Default target matches the CLIPA-336 vision tower input resolution.
        self.size = size or {"height": 336, "width": 336}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # ImageNet statistics, as used by OpenCLIP CLIPA checkpoints.
        self.image_mean = image_mean or [0.485, 0.456, 0.406]
        self.image_std = image_std or [0.229, 0.224, 0.225]
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: dict[str, int],
        resample: int = 2,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Resize ``image`` to ``size`` via PIL and return an HWC ndarray.

        Args:
            image: Input image as an HWC ndarray (or a PIL image, which is
                used as-is).
            size: Target size as ``{"height": H, "width": W}``.
            resample: PIL resampling filter code (2 = BILINEAR).
            data_format: Unused; accepted for signature compatibility with
                the transformers base class.

        Returns:
            Resized image as an ndarray.
        """
        height, width = size["height"], size["width"]

        if isinstance(image, np.ndarray):
            # NOTE(review): the uint8 cast assumes resize runs on raw
            # [0, 255] pixel data (preprocess resizes before rescaling);
            # float inputs in [0, 1] would be truncated here.
            pil_image = Image.fromarray(image.astype(np.uint8))
        else:
            pil_image = image

        # PIL expects (width, height), opposite of the size dict order.
        resized = pil_image.resize((width, height), resample=resample)

        return np.array(resized)

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Multiply pixel values by ``scale`` (e.g. 1/255), returning float32."""
        return image.astype(np.float32) * scale

    def normalize(
        self,
        image: np.ndarray,
        mean: list[float],
        std: list[float],
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Normalize an HWC image with per-channel ``mean`` and ``std``.

        Images that are not 3-dimensional are returned unchanged (cast to
        float32), since per-channel statistics cannot be broadcast onto them.
        """
        mean_arr = np.array(mean, dtype=np.float32)
        std_arr = np.array(std, dtype=np.float32)

        image = image.astype(np.float32)

        # mean/std broadcast over the trailing channel axis of an HWC image.
        if image.ndim == 3:
            image = (image - mean_arr) / std_arr

        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: int | None = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | None = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs: Any,
    ) -> BatchFeature:
        """Preprocess images for the model.

        Args:
            images: Single image or list of images
            do_resize: Override resize setting
            size: Override target size
            resample: Override resampling filter
            do_rescale: Override rescale setting
            rescale_factor: Override rescale factor
            do_normalize: Override normalize setting
            image_mean: Override mean
            image_std: Override std
            do_convert_rgb: Override RGB conversion
            return_tensors: Output tensor format ("pt", "np", etc.)
            data_format: Channel dimension format

        Returns:
            BatchFeature with pixel_values

        Raises:
            ValueError: If ``images`` contains invalid image inputs.
        """
        # Per-call overrides fall back to the configured defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError("Invalid image input")

        processed_images = []
        for image in images:
            if do_convert_rgb:
                if isinstance(image, Image.Image):
                    image = image.convert("RGB")
                elif isinstance(image, np.ndarray):
                    # BUGFIX: check ndim before shape[-1]. The old order
                    # treated a 2-D grayscale array of width 4 as RGBA and
                    # sliced off a column; grayscale must be handled first.
                    if image.ndim == 2:
                        # Grayscale: replicate the single channel to RGB.
                        image = np.stack([image] * 3, axis=-1)
                    elif image.ndim == 3 and image.shape[-1] == 4:
                        # RGBA: drop the alpha channel.
                        image = image[..., :3]

            image = to_numpy_array(image)

            # Resize operates on raw pixel data, before rescale/normalize.
            if do_resize:
                image = self.resize(image, size, resample)

            if do_rescale:
                image = self.rescale(image, rescale_factor)

            if do_normalize:
                image = self.normalize(image, image_mean, image_std)

            # Convert HWC -> CHW when the model expects channels-first.
            if data_format == ChannelDimension.FIRST:
                image = np.transpose(image, (2, 0, 1))

            processed_images.append(image)

        # Stack into a (batch, C, H, W) array; BatchFeature handles the
        # optional conversion to the requested tensor framework.
        pixel_values = np.stack(processed_images, axis=0)

        data = {"pixel_values": pixel_values}

        return BatchFeature(data=data, tensor_type=return_tensors)
|
|
|