|
|
"""Villanova VLM Processor for HuggingFace. |
|
|
|
|
|
This is a standalone processor file for use with trust_remote_code=True. |
|
|
It contains no imports from aithlas_trainer to ensure self-containment. |
|
|
""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
from PIL import Image |
|
|
from transformers import AutoTokenizer |
|
|
from transformers.feature_extraction_utils import BatchFeature |
|
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput |
|
|
|
|
|
from .image_processing_villanova import VillanovaImageProcessor |
|
|
|
|
|
|
|
|
class VillanovaProcessor:
    """Unified processor for Villanova VLM.

    Combines VillanovaImageProcessor and the LLM tokenizer for easy
    preprocessing of image-text pairs.

    Args:
        image_processor: VillanovaImageProcessor instance
        tokenizer: LLM tokenizer instance

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Class-level attributes mirroring the transformers ProcessorMixin
    # protocol so auto-loading machinery can introspect this processor.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the processor.

        Args:
            image_processor: Image processor; a default VillanovaImageProcessor
                is constructed when omitted.
            tokenizer: LLM tokenizer. May be None, but text processing will
                then raise a ValueError.
            **kwargs: Accepted and ignored for forward/backward compatibility
                with transformers loading code that passes extra config keys.
        """
        if image_processor is None:
            image_processor = VillanovaImageProcessor()

        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Process images and/or text for the model.

        Args:
            images: Single image or list of images (PIL.Image, path, or URL)
            text: Single text or list of texts
            padding: Padding strategy
            truncation: Whether to truncate
            max_length: Maximum sequence length
            return_tensors: Output tensor format ("pt", "np", etc.)

        Returns:
            BatchFeature with pixel_values, input_ids, attention_mask

        Raises:
            ValueError: If neither images nor text is provided, or if text is
                provided but the processor has no tokenizer.
        """
        if images is None and text is None:
            raise ValueError("You must provide either images or text or both")
        if text is not None and self.tokenizer is None:
            # Fail early with a clear message instead of an opaque
            # AttributeError from calling None below (the constructor
            # explicitly allows tokenizer=None).
            raise ValueError(
                "Cannot process text: this processor was created without a tokenizer"
            )

        result = BatchFeature()

        if images is not None:
            image_features = self.image_processor(
                images,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(image_features)

        if text is not None:
            text_features = self.tokenizer(
                text,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
            result.update(text_features)

        return result

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Decode token IDs to text.

        Delegates to the tokenizer's batch_decode method.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Decode token IDs to text.

        Delegates to the tokenizer's decode method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Apply chat template to conversation.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt
            **kwargs: Forwarded to the tokenizer's apply_chat_template.
                ``tokenize`` defaults to False (return a string) but may be
                overridden by the caller.

        Returns:
            Formatted prompt string

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        # setdefault (instead of an explicit tokenize=False argument) avoids a
        # "got multiple values for keyword argument 'tokenize'" TypeError when
        # a caller passes tokenize=... themselves, while keeping the same
        # string-returning default behavior.
        kwargs.setdefault("tokenize", False)
        return self.tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )

    @property
    def model_input_names(self) -> list[str]:
        """Get model input names (tokenizer's first, then image processor's, deduplicated)."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys preserves order while dropping duplicates.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load processor from pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path

        Returns:
            VillanovaProcessor instance
        """
        # Drop any caller-supplied trust_remote_code: the tokenizer call below
        # sets it explicitly, and forwarding it too would raise a
        # duplicate-keyword TypeError.
        kwargs.pop("trust_remote_code", None)

        image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )

        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save processor to directory.

        Args:
            save_directory: Directory to save to
        """
        self.image_processor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Register this class for automatic loading.

        This is a no-op for custom processors loaded with trust_remote_code=True,
        but required by the transformers auto-loading mechanism.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        pass
|
|
|