"""
Oculus Processor

Handles image and text preprocessing for the Oculus model.
"""

import logging
from typing import Optional, Union, List, Dict, Any, Tuple

from PIL import Image
import numpy as np

from transformers import ProcessorMixin, BatchFeature
from transformers.image_utils import ImageInput

logger = logging.getLogger(__name__)


class OculusProcessor(ProcessorMixin):
    """
    Processor for Oculus model.

    Combines image processing and text tokenization.

    Usage:
    ```python
    processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")

    # Process inputs
    inputs = processor(
        images=image,
        text="What is in this image?",
        mode="text",
        return_tensors="pt"
    )
    ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        **kwargs
    ):
        """
        Store the sub-processors and the special-token configuration.

        Args:
            image_processor: Object called on images in ``__call__``; may be None.
            tokenizer: Object called on formatted prompts in ``__call__`` and
                used by ``decode``; may be None.
            **kwargs: Optional overrides read with ``kwargs.get``:
                ``thinking_token``, ``thinking_end_token``, ``focus_token``
                and ``focus_end_token``. Each defaults to the empty string.
        """
        super().__init__(image_processor, tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Special tokens
        self.thinking_token = kwargs.get("thinking_token", "")
        self.thinking_end_token = kwargs.get("thinking_end_token", "")
        self.focus_token = kwargs.get("focus_token", "")
        self.focus_end_token = kwargs.get("focus_end_token", "")

        # Output mode tokens
        self.mode_tokens = {
            "text": "",
            "point": "",
            "box": "",
            "polygon": "",
        }

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[str, List[str]]] = None,
        mode: str = "text",
        think: bool = False,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and text for Oculus model.

        Args:
            images: Input image(s)
            text: Input text prompt(s)
            mode: Output mode ("text", "point", "box", "polygon")
            think: Enable reasoning mode
            return_tensors: Tensor format ("pt", "np", etc.)
            **kwargs: Extra keyword arguments forwarded to the tokenizer.
                ``padding`` and ``truncation`` default to True but can be
                overridden here.

        Returns:
            BatchFeature with processed inputs, plus the ``mode`` and
            ``think`` values under keys of the same name.
        """
        # Process images
        if images is None:
            image_features = {}
        elif self.image_processor is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors)
        else:
            # Basic processing: no image processor, pass the images through as a list
            if isinstance(images, Image.Image):
                images = [images]
            image_features = {"pixel_values": images}

        # Process text
        if text is None:
            text_features = {}
        else:
            # Add mode and thinking tokens
            processed_text = self._format_prompt(text, mode, think)
            if self.tokenizer is not None:
                # Defaults first so a caller-supplied padding/truncation wins
                # instead of raising "got multiple values for keyword argument".
                tokenizer_kwargs = {"padding": True, "truncation": True, **kwargs}
                text_features = self.tokenizer(
                    processed_text,
                    return_tensors=return_tensors,
                    **tokenizer_kwargs
                )
            else:
                text_features = {"text": processed_text}

        # Combine features
        features = BatchFeature(
            data={**image_features, **text_features},
            tensor_type=return_tensors
        )
        # Attached after construction: BatchFeature converts the values of
        # `data` to tensors when tensor_type is set, and these are plain
        # Python flags (a str mode cannot be turned into a torch tensor).
        features["mode"] = mode
        features["think"] = think
        return features

    def _format_prompt(
        self,
        text: Union[str, List[str]],
        mode: str,
        think: bool
    ) -> Union[str, List[str]]:
        """
        Format prompt with special tokens.

        The mode token (when ``mode`` is a key of ``self.mode_tokens``) and the
        thinking token (when ``think`` is true) are placed before the prompt,
        separated by single spaces. An unknown ``mode`` adds no token.

        Args:
            text: A single prompt or a list of prompts.
            mode: Output mode name looked up in ``self.mode_tokens``.
            think: Whether to prepend ``self.thinking_token``.

        Returns:
            A formatted string for a string input, otherwise a list with one
            formatted string per input prompt.
        """
        prefix_tokens = []
        # Add mode token
        if mode in self.mode_tokens:
            prefix_tokens.append(self.mode_tokens[mode])
        # Add thinking token if enabled
        if think:
            prefix_tokens.append(self.thinking_token)
        # Empty tokens are skipped so they do not leave stray leading spaces
        # on the prompt once joined.
        prefix_tokens = [token for token in prefix_tokens if token]

        def format_single(prompt: str) -> str:
            return " ".join([*prefix_tokens, prompt])

        if isinstance(text, str):
            return format_single(text)
        return [format_single(prompt) for prompt in text]

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> Tuple[str, Optional[str]]:
        """
        Decode token IDs to text and split off the thinking trace.

        Args:
            token_ids: Token IDs passed to ``self.tokenizer.decode``; when no
                tokenizer is set, ``str(token_ids)`` is used as the text.
            skip_special_tokens: Forwarded to the tokenizer.
            **kwargs: Extra keyword arguments forwarded to the tokenizer.

        Returns:
            ``(text, thinking_trace)``. When the text contains the thinking
            start token followed by the thinking end token, ``thinking_trace``
            is the stripped text between them and ``text`` is the stripped
            remainder after the end token. Otherwise ``text`` is returned
            unchanged and ``thinking_trace`` is None.
        """
        if self.tokenizer is not None:
            text = self.tokenizer.decode(
                token_ids, skip_special_tokens=skip_special_tokens, **kwargs
            )
        else:
            text = str(token_ids)

        # Parse thinking trace if present
        thinking_trace = None
        start_token = self.thinking_token
        end_token = self.thinking_end_token
        # An empty token is "found" in every string, so parsing only makes
        # sense when both delimiters are non-empty.
        if start_token and end_token:
            start_idx = text.find(start_token)
            if start_idx != -1:
                trace_start = start_idx + len(start_token)
                # Look for the end token only after the start token so an
                # earlier stray end token cannot produce an inverted slice.
                trace_end = text.find(end_token, trace_start)
                if trace_end != -1:
                    thinking_trace = text[trace_start:trace_end].strip()
                    text = text[trace_end + len(end_token):].strip()

        return text, thinking_trace

    def batch_decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> List[Tuple[str, Optional[str]]]:
        """
        Decode batch of token IDs.

        Args:
            token_ids: Iterable whose items are each passed to ``decode``.
            skip_special_tokens: Forwarded to ``decode``.
            **kwargs: Extra keyword arguments forwarded to ``decode``.

        Returns:
            A list with one ``(text, thinking_trace)`` tuple per item.
        """
        return [
            self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
            for ids in token_ids
        ]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load processor from pretrained.

        Loads an image processor and a tokenizer through ``AutoImageProcessor``
        and ``AutoTokenizer``. If any step fails, a warning is logged and a
        processor without those components is returned instead.

        Args:
            pretrained_model_name_or_path: Passed to both ``from_pretrained``
                calls.
            **kwargs: Forwarded to both ``from_pretrained`` calls and to the
                class constructor.

        Returns:
            An instance of ``cls``.
        """
        try:
            from transformers import AutoImageProcessor, AutoTokenizer

            image_processor = AutoImageProcessor.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
        except Exception as exc:
            # Best-effort fallback is kept, but KeyboardInterrupt/SystemExit
            # now propagate and the reason for the fallback is reported.
            logger.warning(
                "Could not load image processor/tokenizer from %r (%s); "
                "returning a basic processor without HF components.",
                pretrained_model_name_or_path,
                exc,
            )
            # Return basic processor without HF components
            return cls(**kwargs)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save processor to directory.

        Calls ``save_pretrained(save_directory)`` on the image processor and
        on the tokenizer, skipping whichever is None.

        Args:
            save_directory: Target directory handed to each component.
            **kwargs: Accepted for signature compatibility; not forwarded.
        """
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)