Villanova-2B-VL-2512-Preview / processing_villanova.py
matteogabburo's picture
Upload folder using huggingface_hub
46d882e verified
"""Villanova VLM Processor for HuggingFace.
This is a standalone processor file for use with trust_remote_code=True.
It contains no imports from aithlas_trainer to ensure self-containment.
"""
from typing import Any
from PIL import Image
from transformers import AutoTokenizer
from transformers.feature_extraction_utils import BatchFeature
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from .image_processing_villanova import VillanovaImageProcessor
class VillanovaProcessor:
    """Unified processor for Villanova VLM.

    Combines ``VillanovaImageProcessor`` and the LLM tokenizer for easy
    preprocessing of image-text pairs.

    Args:
        image_processor: ``VillanovaImageProcessor`` instance. A default
            instance is created when omitted.
        tokenizer: LLM tokenizer instance. May be ``None``, in which case
            any text-processing call raises a clear ``ValueError``.

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Class attributes mirroring transformers.ProcessorMixin conventions so
    # the auto-loading machinery can discover the sub-components.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the processor with its two sub-components.

        Args:
            image_processor: Image preprocessor; defaults to a fresh
                ``VillanovaImageProcessor`` when ``None``.
            tokenizer: LLM tokenizer; may be ``None``.
            **kwargs: Accepted (and ignored) for forward compatibility with
                ProcessorMixin-style construction paths that pass extras.
        """
        if image_processor is None:
            image_processor = VillanovaImageProcessor()
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Process images and/or text for the model.

        Args:
            images: Single image or list of images (PIL.Image, path, or URL)
            text: Single text or list of texts
            padding: Padding strategy forwarded to the tokenizer
            truncation: Whether to truncate
            max_length: Maximum sequence length
            return_tensors: Output tensor format ("pt", "np", etc.)
            **kwargs: Forwarded verbatim to BOTH the image processor and the
                tokenizer when the respective input is present — only pass
                keyword arguments that both components accept.

        Returns:
            BatchFeature with pixel_values, input_ids, attention_mask

        Raises:
            ValueError: If neither images nor text is provided, or if text is
                provided but no tokenizer is configured on this processor.
        """
        if images is None and text is None:
            raise ValueError("You must provide either images or text or both")
        # Fail early with an explicit message instead of an opaque
        # AttributeError from calling a None tokenizer further down.
        if text is not None and self.tokenizer is None:
            raise ValueError(
                "Cannot process text because no tokenizer was provided to this processor"
            )

        result = BatchFeature()

        # Process images
        if images is not None:
            image_features = self.image_processor(
                images,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(image_features)

        # Process text
        if text is not None:
            text_features = self.tokenizer(
                text,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(text_features)

        return result

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Decode batches of token IDs to text.

        Delegates to the tokenizer's ``batch_decode`` method.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Decode token IDs to text.

        Delegates to the tokenizer's ``decode`` method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Apply the tokenizer's chat template to a conversation.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt

        Returns:
            Formatted prompt string (``tokenize=False`` is forced, so the
            result is always text, never token IDs).

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        return self.tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            tokenize=False,
            **kwargs,
        )

    @property
    def model_input_names(self) -> list[str]:
        """Combined model input names, tokenizer first, duplicates removed.

        ``dict.fromkeys`` preserves first-seen order while deduplicating.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load processor from a pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path
            **kwargs: Forwarded to both the image-processor and tokenizer
                loaders (``trust_remote_code`` is stripped; see below).

        Returns:
            VillanovaProcessor instance

        Note:
            ``trust_remote_code=True`` is always passed to the tokenizer
            loader — this file is itself remote code, so the tokenizer it
            ships with is trusted by construction. The caller's own
            ``trust_remote_code`` kwarg is removed to avoid passing it twice.
        """
        kwargs.pop("trust_remote_code", None)
        image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )
        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save both sub-components to a directory.

        Args:
            save_directory: Directory to save to
            **kwargs: Forwarded to both components' ``save_pretrained``.
        """
        self.image_processor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Register this class for automatic loading.

        This is a no-op for custom processors loaded with
        ``trust_remote_code=True``, but required by the transformers
        auto-loading mechanism, which calls it unconditionally.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        # No-op - custom classes loaded via trust_remote_code don't need registration
        pass