"""Villanova VLM Processor for HuggingFace. This is a standalone processor file for use with trust_remote_code=True. It contains no imports from aithlas_trainer to ensure self-containment. """ from typing import Any from PIL import Image from transformers import AutoTokenizer from transformers.feature_extraction_utils import BatchFeature from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from .image_processing_villanova import VillanovaImageProcessor class VillanovaProcessor: """Unified processor for Villanova VLM. Combines VillanovaImageProcessor and the LLM tokenizer for easy preprocessing of image-text pairs. Args: image_processor: VillanovaImageProcessor instance tokenizer: LLM tokenizer instance Example: >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview") >>> image = Image.open("image.jpg") >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt") >>> print(inputs.keys()) dict_keys(['pixel_values', 'input_ids', 'attention_mask']) """ attributes = ["image_processor", "tokenizer"] image_processor_class = "VillanovaImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( self, image_processor: VillanovaImageProcessor | None = None, tokenizer: Any | None = None, **kwargs: Any, ) -> None: if image_processor is None: image_processor = VillanovaImageProcessor() self.image_processor = image_processor self.tokenizer = tokenizer def __call__( self, images: Image.Image | list[Image.Image] | None = None, text: TextInput | PreTokenizedInput | list[TextInput] | None = None, padding: bool | str = False, truncation: bool | None = None, max_length: int | None = None, return_tensors: str | None = None, **kwargs: Any, ) -> BatchFeature: """Process images and/or text for the model. Args: images: Single image or list of images (PIL.Image, path, or URL) text: Single text or list of texts padding: Padding strategy truncation: Whether to truncate max_length: Maximum sequence length return_tensors: Output tensor format ("pt", "np", etc.) Returns: BatchFeature with pixel_values, input_ids, attention_mask Raises: ValueError: If neither images nor text is provided """ if images is None and text is None: raise ValueError("You must provide either images or text or both") result = BatchFeature() # Process images if images is not None: image_features = self.image_processor( images, return_tensors=return_tensors, **kwargs, ) result.update(image_features) # Process text if text is not None: text_features = self.tokenizer( text, padding=padding, truncation=truncation, max_length=max_length, return_tensors=return_tensors, **kwargs, ) result.update(text_features) return result def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]: """Decode token IDs to text. Delegates to the tokenizer's batch_decode method. """ return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args: Any, **kwargs: Any) -> str: """Decode token IDs to text. Delegates to the tokenizer's decode method. """ return self.tokenizer.decode(*args, **kwargs) def apply_chat_template( self, conversation: list[dict], add_generation_prompt: bool = False, **kwargs: Any, ) -> str: """Apply chat template to conversation. Args: conversation: List of message dicts with "role" and "content" add_generation_prompt: Whether to add generation prompt Returns: Formatted prompt string Example: >>> messages = [{"role": "user", "content": "\\nDescribe this."}] >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True) """ return self.tokenizer.apply_chat_template( conversation, add_generation_prompt=add_generation_prompt, tokenize=False, **kwargs, ) @property def model_input_names(self) -> list[str]: """Get model input names.""" tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) @classmethod def from_pretrained( cls, pretrained_model_name_or_path: str, **kwargs: Any, ) -> "VillanovaProcessor": """Load processor from pretrained model. Args: pretrained_model_name_or_path: Model ID or local path Returns: VillanovaProcessor instance """ # Remove trust_remote_code from kwargs to avoid passing it twice kwargs.pop("trust_remote_code", None) image_processor = VillanovaImageProcessor.from_pretrained( pretrained_model_name_or_path, **kwargs, ) tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, trust_remote_code=True, **kwargs, ) return cls(image_processor=image_processor, tokenizer=tokenizer) def save_pretrained( self, save_directory: str, **kwargs: Any, ) -> None: """Save processor to directory. Args: save_directory: Directory to save to """ self.image_processor.save_pretrained(save_directory, **kwargs) self.tokenizer.save_pretrained(save_directory, **kwargs) @classmethod def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None: """Register this class for automatic loading. This is a no-op for custom processors loaded with trust_remote_code=True, but required by the transformers auto-loading mechanism. Args: auto_class: The auto class to register with (default: "AutoProcessor") """ # No-op - custom classes loaded via trust_remote_code don't need registration pass