""" LongCLIP processor for preprocessing images and text. This module provides a processor that combines image and text preprocessing for LongCLIP models. """ from typing import List, Optional, Union from transformers import CLIPImageProcessor, CLIPTokenizer from transformers.processing_utils import ProcessorMixin class LongCLIPProcessor(ProcessorMixin): """ Processor for LongCLIP that combines image and text preprocessing. This processor wraps CLIPImageProcessor and CLIPTokenizer to provide a unified interface for preprocessing inputs for LongCLIP models. Args: image_processor (CLIPImageProcessor): Image processor for preprocessing images. tokenizer (CLIPTokenizer): Tokenizer for preprocessing text. Attributes: image_processor_class (str): Name of the image processor class. tokenizer_class (str): Name of the tokenizer class. Example: ```python >>> from long_clip_hf import LongCLIPProcessor >>> from transformers import CLIPImageProcessor, CLIPTokenizer >>> from PIL import Image >>> >>> # Initialize processor >>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") >>> processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer) >>> >>> # Process inputs >>> image = Image.open("path/to/image.jpg") >>> text = "a photo of a cat" >>> inputs = processor(text=text, images=image, return_tensors="pt", padding=True, max_length=248) >>> >>> # inputs contains both 'input_ids', 'attention_mask' and 'pixel_values' ``` """ attributes = ["image_processor", "tokenizer"] image_processor_class = "CLIPImageProcessor" tokenizer_class = "CLIPTokenizer" def __init__( self, image_processor: Optional[CLIPImageProcessor] = None, tokenizer: Optional[CLIPTokenizer] = None, **kwargs, ): if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") super().__init__(image_processor, tokenizer) def __call__( self, text: Union[str, List[str], None] = None, images=None, return_tensors: Optional[str] = "pt", padding: Union[bool, str] = True, max_length: Optional[int] = 248, truncation: Optional[bool] = True, **kwargs, ): """ Preprocess text and images for LongCLIP model. Args: text (str, List[str], optional): Text or list of texts to process. images: Image or list of images to process. Can be PIL Image, numpy array, or tensor. return_tensors (str, optional): Type of tensors to return ('pt' for PyTorch). padding (bool or str, optional): Padding strategy. Defaults to True. max_length (int, optional): Maximum sequence length. Defaults to 248 for LongCLIP. truncation (bool, optional): Whether to truncate sequences. Defaults to True. **kwargs: Additional keyword arguments. Returns: BatchEncoding: Dictionary containing processed inputs with keys: - input_ids: Tokenized text (if text provided) - attention_mask: Attention mask for text (if text provided) - pixel_values: Processed images (if images provided) """ # Process text if text is not None: text_inputs = self.tokenizer( text, return_tensors=return_tensors, padding=padding, max_length=max_length, truncation=truncation, **kwargs, ) else: text_inputs = {} # Process images if images is not None: image_inputs = self.image_processor( images, return_tensors=return_tensors, ) else: image_inputs = {} # Combine inputs return {**text_inputs, **image_inputs} def batch_decode(self, *args, **kwargs): """ Decode token IDs back to text. This method is forwarded to the tokenizer's batch_decode method. """ return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): """ Decode token IDs back to text. This method is forwarded to the tokenizer's decode method. """ return self.tokenizer.decode(*args, **kwargs) @property def model_input_names(self): """ Get the names of model inputs. Returns: List[str]: List of input names. """ tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))