""" Oculus Configuration HuggingFace-compatible configuration for the unified Oculus model. """ from typing import Optional, Dict, Any, List from transformers import PretrainedConfig class OculusConfig(PretrainedConfig): """ Configuration class for Oculus vision-language model. Args: vision_config: Configuration for vision encoders projector_config: Configuration for vision-to-language projector text_config: Configuration for language model reasoning_enabled: Whether to enable thinking traces output_mode: Default output mode ("text", "point", "box", "polygon") """ model_type = "oculus" def __init__( self, # Vision encoder settings dinov3_model_id: str = "facebook/dinov2-large", siglip_model_id: str = "google/siglip-base-patch16-224", dinov3_hidden_size: int = 1280, # DINOv3 ViT-H/16+ output dim siglip_hidden_size: int = 768, # SigLIP2 base output dim # Projector settings projector_hidden_dim: int = 2048, num_vision_tokens: int = 64, # Language model settings text_model_id: str = "Salesforce/blip-image-captioning-base", lm_hidden_size: int = 1536, vocab_size: int = 131072, max_position_embeddings: int = 32768, # Reasoning settings reasoning_enabled: bool = True, thinking_token: str = "", thinking_end_token: str = "", max_thinking_tokens: int = 256, # Output mode settings output_mode: str = "text", # "text", "point", "box", "polygon" num_detection_classes: int = 80, num_segmentation_classes: int = 150, # Generation settings max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95, # Tool calling / Focus system enable_focus: bool = True, focus_token: str = "", focus_end_token: str = "", **kwargs ): super().__init__(**kwargs) # Vision self.dinov3_model_id = dinov3_model_id self.siglip_model_id = siglip_model_id self.dinov3_hidden_size = dinov3_hidden_size self.siglip_hidden_size = siglip_hidden_size self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size # Projector self.projector_hidden_dim = projector_hidden_dim self.num_vision_tokens = num_vision_tokens # Language model self.text_model_id = text_model_id self.lm_hidden_size = lm_hidden_size self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings # Reasoning self.reasoning_enabled = reasoning_enabled self.thinking_token = thinking_token self.thinking_end_token = thinking_end_token self.max_thinking_tokens = max_thinking_tokens # Output modes self.output_mode = output_mode self.num_detection_classes = num_detection_classes self.num_segmentation_classes = num_segmentation_classes # Generation self.max_new_tokens = max_new_tokens self.temperature = temperature self.top_p = top_p # Focus system self.enable_focus = enable_focus self.focus_token = focus_token self.focus_end_token = focus_end_token @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): """Load config from pretrained path.""" config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(config_dict, **kwargs) def to_dict(self) -> Dict[str, Any]: """Serialize config to dictionary.""" output = super().to_dict() return output # Register for auto-loading OculusConfig.register_for_auto_class()