File size: 6,698 Bytes
46d882e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""Villanova VLM Processor for HuggingFace.

This is a standalone processor file for use with trust_remote_code=True.
It contains no imports from aithlas_trainer to ensure self-containment.
"""

from typing import Any

from PIL import Image
from transformers import AutoTokenizer
from transformers.feature_extraction_utils import BatchFeature
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

from .image_processing_villanova import VillanovaImageProcessor


class VillanovaProcessor:
    """Unified processor for Villanova VLM.

    Combines VillanovaImageProcessor and the LLM tokenizer for easy
    preprocessing of image-text pairs.

    Args:
        image_processor: VillanovaImageProcessor instance. A default-configured
            one is created if omitted.
        tokenizer: LLM tokenizer instance. Required for any text operation;
            may be None for image-only use.

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Mirrors the transformers ProcessorMixin conventions so auto-loading
    # machinery can introspect the component classes.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        # The image processor has a sensible zero-arg default; the tokenizer
        # does not, so it may legitimately stay None for image-only usage.
        # Text-handling methods guard against a missing tokenizer explicitly.
        if image_processor is None:
            image_processor = VillanovaImageProcessor()

        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def _require_tokenizer(self) -> Any:
        """Return the tokenizer, raising a clear error if none was configured.

        Without this guard, a processor constructed with ``tokenizer=None``
        failed inside text methods with an opaque
        ``AttributeError: 'NoneType' object has no attribute ...``.

        Raises:
            ValueError: If no tokenizer was provided at construction time.
        """
        if self.tokenizer is None:
            raise ValueError(
                "This VillanovaProcessor has no tokenizer configured. "
                "Pass `tokenizer=` to __init__ or load the processor with "
                "VillanovaProcessor.from_pretrained(...)."
            )
        return self.tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Process images and/or text for the model.

        Args:
            images: Single image or list of images (PIL.Image, path, or URL)
            text: Single text or list of texts
            padding: Padding strategy
            truncation: Whether to truncate
            max_length: Maximum sequence length
            return_tensors: Output tensor format ("pt", "np", etc.)
            **kwargs: Forwarded to BOTH the image processor and the tokenizer,
                so only pass options each active component understands.

        Returns:
            BatchFeature with pixel_values, input_ids, attention_mask

        Raises:
            ValueError: If neither images nor text is provided, or if text is
                given but no tokenizer is configured.
        """
        if images is None and text is None:
            raise ValueError("You must provide either images or text or both")

        result = BatchFeature()

        # Process images
        if images is not None:
            image_features = self.image_processor(
                images,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(image_features)

        # Process text
        if text is not None:
            text_features = self._require_tokenizer()(
                text,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(text_features)

        return result

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Decode batches of token IDs to text.

        Delegates to the tokenizer's batch_decode method.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        return self._require_tokenizer().batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Decode token IDs to text.

        Delegates to the tokenizer's decode method.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        return self._require_tokenizer().decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Apply chat template to conversation.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt

        Returns:
            Formatted prompt string

        Raises:
            ValueError: If no tokenizer is configured.

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        return self._require_tokenizer().apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            tokenize=False,  # always return a prompt string, never token IDs
            **kwargs,
        )

    @property
    def model_input_names(self) -> list[str]:
        """Combined input names from the tokenizer and image processor.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        tokenizer_input_names = self._require_tokenizer().model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load processor from pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path

        Returns:
            VillanovaProcessor instance
        """
        # Drop any caller-supplied trust_remote_code so the hardcoded
        # trust_remote_code=True below is not passed twice. Note this
        # deliberately overrides a caller's False: the tokenizer ships as
        # remote code and cannot load without it.
        kwargs.pop("trust_remote_code", None)

        image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )

        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save processor to directory.

        Saves both sub-components side by side; from_pretrained() reassembles
        them from the same directory.

        Args:
            save_directory: Directory to save to
        """
        self.image_processor.save_pretrained(save_directory, **kwargs)
        # NOTE(review): intentionally no tokenizer guard here? If image-only
        # processors are ever saved, self.tokenizer may be None — confirm.
        self.tokenizer.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Register this class for automatic loading.

        This is a no-op for custom processors loaded with trust_remote_code=True,
        but required by the transformers auto-loading mechanism.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        # No-op - custom classes loaded via trust_remote_code don't need registration
        pass