""" Oculus Configuration Oceanir-Oculus OO1 Architecture configuration. Hybrid-reasoning vision-language model with: - Reasoning via Thinking Traces - Perceptive Tool Calling + Focus (Zoom & Crop) - Structured Outputs - Complex OCR - Desktop UI Understanding """ from typing import Optional, Dict, Any, List from transformers import PretrainedConfig class OculusConfig(PretrainedConfig): """ Configuration class for Oculus vision-language model. Oceanir-Oculus OO1 Architecture - hybrid vision-language model optimized for visual reasoning on commodity GPUs and edge devices. """ model_type = "oculus" def __init__( self, # Architecture architecture_name: str = "Oceanir-Oculus OO1", # Vision encoder settings vision_hidden_size: int = 1024, vision_num_layers: int = 24, vision_num_heads: int = 16, image_size: int = 224, patch_size: int = 16, # Projector settings fused_vision_dim: int = 2176, projector_hidden_dim: int = 4352, num_vision_tokens: int = 64, # Language model settings lm_hidden_size: int = 1536, lm_num_layers: int = 16, lm_num_heads: int = 24, vocab_size: int = 131072, max_position_embeddings: int = 32768, # Reasoning / Thinking Traces reasoning_enabled: bool = True, thinking_token: str = "", thinking_end_token: str = "", max_thinking_tokens: int = 256, thinking_style: str = "structured", # Focus System (Perceptive Tool Calling) enable_focus: bool = True, focus_token: str = "", focus_end_token: str = "", max_focus_regions: int = 4, focus_min_size: int = 64, auto_focus_threshold: float = 0.7, # Structured Output structured_output_enabled: bool = True, json_token: str = "", json_end_token: str = "", box_token: str = "", box_end_token: str = "", point_token: str = "", point_end_token: str = "", # OCR Settings ocr_enabled: bool = True, ocr_languages: List[str] = None, ocr_confidence_threshold: float = 0.5, # Desktop UI Understanding ui_understanding_enabled: bool = True, ui_element_classes: int = 50, # Output mode settings output_mode: str = "text", num_detection_classes: int = 80, num_segmentation_classes: int = 150, # Generation settings max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95, **kwargs ): super().__init__(**kwargs) # Architecture self.architecture_name = architecture_name # Vision self.vision_hidden_size = vision_hidden_size self.vision_num_layers = vision_num_layers self.vision_num_heads = vision_num_heads self.image_size = image_size self.patch_size = patch_size # Projector self.fused_vision_dim = fused_vision_dim self.projector_hidden_dim = projector_hidden_dim self.num_vision_tokens = num_vision_tokens # Language model self.lm_hidden_size = lm_hidden_size self.lm_num_layers = lm_num_layers self.lm_num_heads = lm_num_heads self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings # Reasoning / Thinking Traces self.reasoning_enabled = reasoning_enabled self.thinking_token = thinking_token self.thinking_end_token = thinking_end_token self.max_thinking_tokens = max_thinking_tokens self.thinking_style = thinking_style # Focus System self.enable_focus = enable_focus self.focus_token = focus_token self.focus_end_token = focus_end_token self.max_focus_regions = max_focus_regions self.focus_min_size = focus_min_size self.auto_focus_threshold = auto_focus_threshold # Structured Output self.structured_output_enabled = structured_output_enabled self.json_token = json_token self.json_end_token = json_end_token self.box_token = box_token self.box_end_token = box_end_token self.point_token = point_token self.point_end_token = point_end_token # OCR self.ocr_enabled = ocr_enabled self.ocr_languages = ocr_languages or ["en"] self.ocr_confidence_threshold = ocr_confidence_threshold # Desktop UI self.ui_understanding_enabled = ui_understanding_enabled self.ui_element_classes = ui_element_classes # Output modes self.output_mode = output_mode self.num_detection_classes = num_detection_classes self.num_segmentation_classes = num_segmentation_classes # Generation self.max_new_tokens = max_new_tokens self.temperature = temperature self.top_p = top_p @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): """Load config from pretrained path.""" config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(config_dict, **kwargs) def to_dict(self) -> Dict[str, Any]: """Serialize config to dictionary.""" output = super().to_dict() return output OculusConfig.register_for_auto_class()