BabyLM-community
/

babylm-multimodal-baseline-git

+from transformers import ProcessorMixin, AutoProcessor
+from transformers.models.auto.processing_auto import AutoProcessor
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding
+import json
+import os
class GITProcessor(ProcessorMixin):
    """
    Processor that wraps an image processor and a tokenizer behind one call.

    Text is routed to the tokenizer, images to the image processor, and the
    two results are merged into a single ``BatchEncoding``.
    """

    # Component names and the classes used for them.
    # NOTE(review): presumably consumed by ProcessorMixin when saving/loading
    # the processor -- confirm against the transformers documentation.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, **kwargs):
        """
        Tokenize text and/or preprocess images and merge the results.

        Args:
            text: Text input(s) handed to the tokenizer; skipped when None.
            images: Image input(s) handed to the image processor; skipped
                when None.
            **kwargs: Forwarded unchanged to BOTH the tokenizer and the
                image processor.

        Returns:
            A ``BatchEncoding`` holding the tokenizer's entries under their
            original keys, plus the image processor's entries. The image
            key ``pixel_values`` is kept as is; every other image key is
            stored as ``image_<key>`` so it cannot overwrite a text key.

        Raises:
            ValueError: If both ``text`` and ``images`` are None.
        """
        if images is None and text is None:
            raise ValueError("You need to specify either text or images")

        merged = {}

        # Text goes first so the tokenizer runs before the image processor.
        if text is not None:
            merged.update(self.tokenizer(text, **kwargs))

        if images is not None:
            image_outputs = self.image_processor(images, **kwargs)
            merged.update(
                {
                    (name if name == "pixel_values" else f"image_{name}"): value
                    for name, value in image_outputs.items()
                }
            )

        return BatchEncoding(merged)

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's ``batch_decode``.
        """
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's ``decode``.
        """
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)