|
|
"""Villanova VLM Processor for HuggingFace. |
|
|
|
|
|
This is a standalone processor file for use with trust_remote_code=True. |
|
|
It contains no imports from aithlas_trainer to ensure self-containment. |
|
|
""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
from PIL import Image |
|
|
from transformers import AutoTokenizer |
|
|
from transformers.feature_extraction_utils import BatchFeature |
|
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput |
|
|
|
|
|
from .image_processing_villanova import VillanovaImageProcessor |
|
|
|
|
|
|
|
|
class VillanovaProcessor:
    """Unified processor for Villanova VLM.

    Combines VillanovaImageProcessor and the LLM tokenizer for easy
    preprocessing of image-text pairs.

    Args:
        image_processor: VillanovaImageProcessor instance
        tokenizer: LLM tokenizer instance

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Class-level attributes mirroring the transformers ProcessorMixin
    # protocol so auto-loading machinery can introspect this processor.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the processor.

        Args:
            image_processor: Image processor; a default VillanovaImageProcessor
                is constructed when omitted.
            tokenizer: LLM tokenizer. May be None, but text processing will
                then raise a ValueError.
            **kwargs: Accepted and ignored for forward/backward compatibility
                with transformers loading code that passes extra config keys.
        """
        if image_processor is None:
            image_processor = VillanovaImageProcessor()

        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Process images and/or text for the model.

        Args:
            images: Single image or list of images (PIL.Image, path, or URL)
            text: Single text or list of texts
            padding: Padding strategy
            truncation: Whether to truncate
            max_length: Maximum sequence length
            return_tensors: Output tensor format ("pt", "np", etc.)

        Returns:
            BatchFeature with pixel_values, input_ids, attention_mask

        Raises:
            ValueError: If neither images nor text is provided, or if text is
                provided but the processor has no tokenizer.
        """
        if images is None and text is None:
            raise ValueError("You must provide either images or text or both")
        if text is not None and self.tokenizer is None:
            # Fail early with a clear message instead of an opaque
            # AttributeError from calling None below (the constructor
            # explicitly allows tokenizer=None).
            raise ValueError(
                "Cannot process text: this processor was created without a tokenizer"
            )

        result = BatchFeature()

        if images is not None:
            image_features = self.image_processor(
                images,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(image_features)

        if text is not None:
            text_features = self.tokenizer(
                text,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(text_features)

        return result

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Decode token IDs to text.

        Delegates to the tokenizer's batch_decode method.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Decode token IDs to text.

        Delegates to the tokenizer's decode method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Apply chat template to conversation.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt
            **kwargs: Forwarded to the tokenizer's apply_chat_template.
                ``tokenize`` defaults to False (return a string) but may be
                overridden by the caller.

        Returns:
            Formatted prompt string

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        # setdefault (instead of an explicit tokenize=False argument) avoids a
        # "got multiple values for keyword argument 'tokenize'" TypeError when
        # a caller passes tokenize=... themselves, while keeping the same
        # string-returning default behavior.
        kwargs.setdefault("tokenize", False)
        return self.tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )

    @property
    def model_input_names(self) -> list[str]:
        """Get model input names (tokenizer's first, then image processor's, deduplicated)."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys preserves order while dropping duplicates.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load processor from pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path

        Returns:
            VillanovaProcessor instance
        """
        # Drop any caller-supplied trust_remote_code: the tokenizer call below
        # sets it explicitly, and forwarding it too would raise a
        # duplicate-keyword TypeError.
        kwargs.pop("trust_remote_code", None)

        image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )

        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save processor to directory.

        Args:
            save_directory: Directory to save to
        """
        self.image_processor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Register this class for automatic loading.

        This is a no-op for custom processors loaded with trust_remote_code=True,
        but required by the transformers auto-loading mechanism.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        pass
|
|
|