File size: 6,698 Bytes
46d882e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""Villanova VLM Processor for HuggingFace.

This is a standalone processor file for use with trust_remote_code=True.
It contains no imports from aithlas_trainer to ensure self-containment.
"""

from typing import Any

from PIL import Image
from transformers import AutoTokenizer
from transformers.feature_extraction_utils import BatchFeature
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

from .image_processing_villanova import VillanovaImageProcessor


class VillanovaProcessor:
    """Unified processor for Villanova VLM.

    Combines VillanovaImageProcessor and the LLM tokenizer for easy
    preprocessing of image-text pairs.

    Args:
        image_processor: VillanovaImageProcessor instance. A default-configured
            one is created if omitted.
        tokenizer: LLM tokenizer instance. Required for any text operation;
            may be None for image-only use.

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Mirrors the transformers ProcessorMixin conventions so auto-loading
    # machinery can introspect the component classes.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        # The image processor has a sensible zero-arg default; the tokenizer
        # does not, so it may legitimately stay None for image-only usage.
        # Text-handling methods guard against a missing tokenizer explicitly.
        if image_processor is None:
            image_processor = VillanovaImageProcessor()

        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def _require_tokenizer(self) -> Any:
        """Return the tokenizer, raising a clear error if none was configured.

        Without this guard, a processor constructed with ``tokenizer=None``
        failed inside text methods with an opaque
        ``AttributeError: 'NoneType' object has no attribute ...``.

        Raises:
            ValueError: If no tokenizer was provided at construction time.
        """
        if self.tokenizer is None:
            raise ValueError(
                "This VillanovaProcessor has no tokenizer configured. "
                "Pass `tokenizer=` to __init__ or load the processor with "
                "VillanovaProcessor.from_pretrained(...)."
            )
        return self.tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Process images and/or text for the model.

        Args:
            images: Single image or list of images (PIL.Image, path, or URL)
            text: Single text or list of texts
            padding: Padding strategy
            truncation: Whether to truncate
            max_length: Maximum sequence length
            return_tensors: Output tensor format ("pt", "np", etc.)
            **kwargs: Forwarded to BOTH the image processor and the tokenizer,
                so only pass options each active component understands.

        Returns:
            BatchFeature with pixel_values, input_ids, attention_mask

        Raises:
            ValueError: If neither images nor text is provided, or if text is
                given but no tokenizer is configured.
        """
        if images is None and text is None:
            raise ValueError("You must provide either images or text or both")

        result = BatchFeature()

        # Process images
        if images is not None:
            image_features = self.image_processor(
                images,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(image_features)

        # Process text
        if text is not None:
            text_features = self._require_tokenizer()(
                text,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(text_features)

        return result

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Decode batches of token IDs to text.

        Delegates to the tokenizer's batch_decode method.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        return self._require_tokenizer().batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Decode token IDs to text.

        Delegates to the tokenizer's decode method.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        return self._require_tokenizer().decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Apply chat template to conversation.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt

        Returns:
            Formatted prompt string

        Raises:
            ValueError: If no tokenizer is configured.

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        return self._require_tokenizer().apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            tokenize=False,  # always return a prompt string, never token IDs
            **kwargs,
        )

    @property
    def model_input_names(self) -> list[str]:
        """Combined input names from the tokenizer and image processor.

        Raises:
            ValueError: If no tokenizer is configured.
        """
        tokenizer_input_names = self._require_tokenizer().model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load processor from pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path

        Returns:
            VillanovaProcessor instance
        """
        # Drop any caller-supplied trust_remote_code so the hardcoded
        # trust_remote_code=True below is not passed twice. Note this
        # deliberately overrides a caller's False: the tokenizer ships as
        # remote code and cannot load without it.
        kwargs.pop("trust_remote_code", None)

        image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )

        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save processor to directory.

        Saves both sub-components side by side; from_pretrained() reassembles
        them from the same directory.

        Args:
            save_directory: Directory to save to
        """
        self.image_processor.save_pretrained(save_directory, **kwargs)
        # NOTE(review): intentionally no tokenizer guard here? If image-only
        # processors are ever saved, self.tokenizer may be None — confirm.
        self.tokenizer.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Register this class for automatic loading.

        This is a no-op for custom processors loaded with trust_remote_code=True,
        but required by the transformers auto-loading mechanism.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        # No-op - custom classes loaded via trust_remote_code don't need registration
        pass