OceanirAI
/

Oculus

"""
Oculus Processor

Handles image and text preprocessing for the Oculus model.
"""
import logging
from typing import Optional, Union, List, Dict, Any, Tuple

import numpy as np
from PIL import Image
from transformers import ProcessorMixin, BatchFeature
from transformers.image_utils import ImageInput
class OculusProcessor(ProcessorMixin):
    """
    Processor for Oculus model.

    Combines image processing and text tokenization, and prefixes every
    prompt with the output-mode token and (optionally) the thinking token.

    Usage:
        ```python
        processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")
        # Process inputs
        inputs = processor(
            images=image,
            text="What is in this image?",
            mode="text",
            return_tensors="pt"
        )
        ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        **kwargs
    ):
        """
        Build the processor from optional components.

        Args:
            image_processor: Image processor, or None to pass images through
                unprocessed.
            tokenizer: Tokenizer, or None to pass formatted prompts through
                as plain text.
            **kwargs: Optional overrides for the special tokens:
                "thinking_token", "thinking_end_token", "focus_token",
                "focus_end_token". Other keys are ignored.
        """
        # NOTE(review): ProcessorMixin validates the components it receives
        # and, depending on the installed transformers version, may reject a
        # missing (None) component. Tolerate that so the component-less
        # fallback this class supports can still be constructed; a failure
        # with both components supplied is a genuine error and is re-raised.
        try:
            super().__init__(image_processor, tokenizer)
        except (TypeError, ValueError):
            if image_processor is not None and tokenizer is not None:
                raise
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Special tokens
        self.thinking_token = kwargs.get("thinking_token", "<think>")
        self.thinking_end_token = kwargs.get("thinking_end_token", "</think>")
        self.focus_token = kwargs.get("focus_token", "<focus>")
        self.focus_end_token = kwargs.get("focus_end_token", "</focus>")

        # Output mode tokens
        self.mode_tokens = {
            "text": "<text>",
            "point": "<point>",
            "box": "<box>",
            "polygon": "<polygon>",
        }

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[str, List[str]] = None,
        mode: str = "text",
        think: bool = False,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and text for Oculus model.

        Args:
            images: Input image(s)
            text: Input text prompt(s)
            mode: Output mode ("text", "point", "box", "polygon")
            think: Enable reasoning mode
            return_tensors: Tensor format ("pt", "np", etc.), forwarded to the
                image processor and tokenizer
            **kwargs: Extra tokenizer arguments; may override the default
                ``padding=True`` / ``truncation=True``

        Returns:
            BatchFeature with the image features, the text features, and the
            plain-Python "mode" and "think" entries
        """
        # Process images
        if images is None:
            image_features = {}
        elif self.image_processor is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors)
        else:
            # Basic processing: hand the images through as a list
            if isinstance(images, Image.Image):
                images = [images]
            image_features = {"pixel_values": images}

        # Process text
        if text is None:
            text_features = {}
        else:
            # Add mode and thinking tokens
            processed_text = self._format_prompt(text, mode, think)
            if self.tokenizer is not None:
                # Defaults first so a caller-supplied padding/truncation wins
                # instead of raising "multiple values for keyword argument".
                tokenizer_kwargs = {"padding": True, "truncation": True, **kwargs}
                text_features = self.tokenizer(
                    processed_text,
                    return_tensors=return_tensors,
                    **tokenizer_kwargs
                )
            else:
                text_features = {"text": processed_text}

        # Combine features. The components above already honoured
        # return_tensors, so no tensor_type is passed here: converting again
        # would also try to turn the "mode" string (and, in the fallback
        # paths, raw images/text) into tensors.
        return BatchFeature(
            data={
                **image_features,
                **text_features,
                "mode": mode,
                "think": think,
            }
        )

    def _format_prompt(
        self,
        text: Union[str, List[str]],
        mode: str,
        think: bool
    ) -> Union[str, List[str]]:
        """
        Format prompt with special tokens.

        The mode token (only when ``mode`` is a known mode) and the thinking
        token (only when ``think`` is true) are placed before the prompt,
        separated by single spaces. A string input yields a string; any other
        iterable yields a list.
        """
        # The prefix is the same for every prompt in the batch; build it once.
        prefix = []
        if mode in self.mode_tokens:
            prefix.append(self.mode_tokens[mode])
        if think:
            prefix.append(self.thinking_token)

        def format_single(t: str) -> str:
            return " ".join([*prefix, t])

        if isinstance(text, str):
            return format_single(text)
        return [format_single(t) for t in text]

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> Tuple[str, Optional[str]]:
        """
        Decode token IDs to text and split off the thinking trace.

        Args:
            token_ids: Token IDs for a single sequence
            skip_special_tokens: Forwarded to the tokenizer
            **kwargs: Extra arguments forwarded to the tokenizer's decode

        Returns:
            ``(text, thinking_trace)``. When a thinking block is found,
            ``thinking_trace`` is its stripped content and ``text`` is what
            follows the closing token; otherwise ``thinking_trace`` is None
            and ``text`` is the full decoded string.
        """
        if self.tokenizer is not None:
            text = self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
        else:
            text = str(token_ids)

        # NOTE(review): if the thinking tokens are registered as special
        # tokens in the tokenizer, skip_special_tokens=True presumably removes
        # them before this parse runs - confirm against the tokenizer config.
        thinking_trace = None
        open_at = text.find(self.thinking_token)
        if open_at != -1:
            trace_start = open_at + len(self.thinking_token)
            # Search for the closer only after the opener so that a stray
            # closing token earlier in the text cannot produce a bogus slice.
            close_at = text.find(self.thinking_end_token, trace_start)
            if close_at != -1:
                thinking_trace = text[trace_start:close_at].strip()
                text = text[close_at + len(self.thinking_end_token):].strip()
        return text, thinking_trace

    def batch_decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> List[Tuple[str, Optional[str]]]:
        """
        Decode batch of token IDs.

        Returns:
            One ``(text, thinking_trace)`` pair per sequence, as produced by
            ``decode``.
        """
        return [
            self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
            for ids in token_ids
        ]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load processor from pretrained.

        Tries to load an image processor and a tokenizer from the given
        name/path. If anything in that process fails, a warning is logged and
        a basic processor without components is returned instead.

        Args:
            pretrained_model_name_or_path: Model id or directory
            **kwargs: Forwarded to both component loaders and to the
                constructor (where the special-token overrides are read)
        """
        try:
            from transformers import AutoImageProcessor, AutoTokenizer
            image_processor = AutoImageProcessor.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
        except Exception as err:
            # Best-effort fallback is kept, but KeyboardInterrupt/SystemExit
            # are no longer swallowed and the cause is made visible.
            logging.getLogger(__name__).warning(
                "Could not load image processor/tokenizer from %r (%s); "
                "falling back to a basic processor without HF components.",
                pretrained_model_name_or_path,
                err,
            )
            # Return basic processor without HF components
            return cls(**kwargs)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save processor to directory.

        Only the components that are present are written, each through its
        own ``save_pretrained``. The special-token settings held on this
        object are not written by this method, and ``**kwargs`` is accepted
        but not forwarded.
        """
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)