creative-graphic-design
/

LongCLIP-B

@@ -1,20 +1,145 @@
-from typing import Union
-from transformers import CLIPProcessor, CLIPTokenizer, CLIPTokenizerFast
-class LongCLIPProcessor(CLIPProcessor):
-    tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast]
     def __call__(
-        self, text=None, short_text=None, images=None, return_tensors=None, **kwargs
     ):
-        encoding = super().__call__(text, images, return_tensors, **kwargs)
-        if short_text is not None:
-            short_text_encoding = self.tokenizer(
-                short_text, return_tensors=return_tensors, **kwargs
             )
-            encoding["short_input_ids"] = short_text_encoding.input_ids
-            encoding["short_attention_mask"] = short_text_encoding.attention_mask
-        return encoding

+"""
+LongCLIP processor for preprocessing images and text.
+This module provides a processor that combines image and text preprocessing
+for LongCLIP models.
+"""
+from typing import List, Optional, Union
+from transformers import BatchEncoding, CLIPImageProcessor, CLIPTokenizer
+from transformers.processing_utils import ProcessorMixin
+class LongCLIPProcessor(ProcessorMixin):
+    """
+    Processor for LongCLIP that combines image and text preprocessing.
+    This processor wraps CLIPImageProcessor and CLIPTokenizer to provide
+    a unified interface for preprocessing inputs for LongCLIP models.
+    Args:
+        image_processor (CLIPImageProcessor): Image processor for preprocessing images.
+        tokenizer (CLIPTokenizer): Tokenizer for preprocessing text.
+    Attributes:
+        image_processor_class (str): Name of the image processor class.
+        tokenizer_class (str): Name of the tokenizer class.
+    Example:
+        ```python
+        >>> from long_clip_hf import LongCLIPProcessor
+        >>> from transformers import CLIPImageProcessor, CLIPTokenizer
+        >>> from PIL import Image
+        >>>
+        >>> # Initialize processor
+        >>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        >>>
+        >>> # Process inputs
+        >>> image = Image.open("path/to/image.jpg")
+        >>> text = "a photo of a cat"
+        >>> inputs = processor(text=text, images=image, return_tensors="pt", padding=True, max_length=248)
+        >>>
+        >>> # inputs contains both 'input_ids', 'attention_mask' and 'pixel_values'
+        ```
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLIPImageProcessor"
+    tokenizer_class = "CLIPTokenizer"
+    def __init__(
+        self,
+        image_processor: Optional[CLIPImageProcessor] = None,
+        tokenizer: Optional[CLIPTokenizer] = None,
+        **kwargs,
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        super().__init__(image_processor, tokenizer)
     def __call__(
+        self,
+        text: Union[str, List[str], None] = None,
+        images=None,
+        return_tensors: Optional[str] = "pt",
+        padding: Union[bool, str] = True,
+        max_length: Optional[int] = 248,
+        truncation: Optional[bool] = True,
+        **kwargs,
     ):
+        """
+        Preprocess text and images for LongCLIP model.
+        Args:
+            text (str, List[str], optional): Text or list of texts to process.
+            images: Image or list of images to process. Can be PIL Image, numpy array, or tensor.
+            return_tensors (str, optional): Type of tensors to return ('pt' for PyTorch).
+            padding (bool or str, optional): Padding strategy. Defaults to True.
+            max_length (int, optional): Maximum sequence length. Defaults to 248 for LongCLIP.
+            truncation (bool, optional): Whether to truncate sequences. Defaults to True.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            BatchEncoding: Dictionary containing processed inputs with keys:
+                - input_ids: Tokenized text (if text provided)
+                - attention_mask: Attention mask for text (if text provided)
+                - pixel_values: Processed images (if images provided)
+        """
+        # Process text
+        if text is not None:
+            text_inputs = self.tokenizer(
+                text,
+                return_tensors=return_tensors,
+                padding=padding,
+                max_length=max_length,
+                truncation=truncation,
+                **kwargs,
             )
+        else:
+            text_inputs = {}
+        # Process images
+        if images is not None:
+            image_inputs = self.image_processor(
+                images,
+                return_tensors=return_tensors,
+            )
+        else:
+            image_inputs = {}
+        # Combine inputs
+        return {**text_inputs, **image_inputs}
+    def batch_decode(self, *args, **kwargs):
+        """
+        Decode token IDs back to text.
+        This method is forwarded to the tokenizer's batch_decode method.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        Decode token IDs back to text.
+        This method is forwarded to the tokenizer's decode method.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        """
+        Get the names of model inputs.
+        Returns:
+            List[str]: List of input names.
+        """
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))