# Oculus / oculus_unified_model/configuration_oculus.py
# Provenance: uploaded by kobiakor15 via huggingface_hub (revision af9b0dd, verified).
"""
Oculus Configuration
HuggingFace-compatible configuration for the unified Oculus model.
"""
from typing import Optional, Dict, Any, List
from transformers import PretrainedConfig
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Bundles settings for the dual vision encoders (DINO + SigLIP), the
    vision-to-language projector, the backbone language model, optional
    reasoning ("thinking") traces, structured output modes, generation
    defaults, and the tool-calling "focus" system.

    Args:
        dinov3_model_id: Hub id of the DINO-family vision encoder.
        siglip_model_id: Hub id of the SigLIP vision encoder.
        dinov3_hidden_size: Output dim of the DINO encoder.
        siglip_hidden_size: Output dim of the SigLIP encoder.
        projector_hidden_dim: Hidden dim of the vision-to-language projector.
        num_vision_tokens: Number of vision tokens fed to the language model.
        text_model_id: Hub id of the backbone language model.
        lm_hidden_size: Hidden size of the language model.
        vocab_size: Language-model vocabulary size.
        max_position_embeddings: Maximum supported sequence length.
        reasoning_enabled: Whether to emit thinking traces.
        thinking_token: Token string that opens a thinking trace.
        thinking_end_token: Token string that closes a thinking trace.
        max_thinking_tokens: Cap on thinking-trace length in tokens.
        output_mode: Default output mode ("text", "point", "box", "polygon").
        num_detection_classes: Number of detection classes for box output.
        num_segmentation_classes: Number of classes for polygon/mask output.
        max_new_tokens: Default generation length cap.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling threshold.
        enable_focus: Whether the focus (tool-calling) system is active.
        focus_token: Token string that opens a focus span.
        focus_end_token: Token string that closes a focus span.
        **kwargs: Forwarded to ``PretrainedConfig``.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Vision encoder settings.
        # NOTE(review): the dim comments below say DINOv3 ViT-H/16+ (1280)
        # and SigLIP2 base (768), but the default model ids point at
        # facebook/dinov2-large (a 1024-dim model) and siglip-base —
        # confirm that ids and hidden sizes agree before loading weights.
        dinov3_model_id: str = "facebook/dinov2-large",
        siglip_model_id: str = "google/siglip-base-patch16-224",
        dinov3_hidden_size: int = 1280,  # DINOv3 ViT-H/16+ output dim
        siglip_hidden_size: int = 768,  # SigLIP2 base output dim
        # Projector settings
        projector_hidden_dim: int = 2048,
        num_vision_tokens: int = 64,
        # Language model settings
        text_model_id: str = "Salesforce/blip-image-captioning-base",
        lm_hidden_size: int = 1536,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # Reasoning settings
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        # Output mode settings
        output_mode: str = "text",  # "text", "point", "box", "polygon"
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # Generation settings
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        # Tool calling / Focus system
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Vision
        self.dinov3_model_id = dinov3_model_id
        self.siglip_model_id = siglip_model_id
        self.dinov3_hidden_size = dinov3_hidden_size
        self.siglip_hidden_size = siglip_hidden_size
        # Concatenated feature dim when both encoders' outputs are fused.
        self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
        # Projector
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens
        # Language model
        self.text_model_id = text_model_id
        self.lm_hidden_size = lm_hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        # Reasoning
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens
        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes
        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        # Focus system
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load config from a pretrained path or Hub id.

        NOTE(review): this override reproduces only the dict-loading part of
        ``PretrainedConfig.from_pretrained`` and skips the parent's
        model_type consistency check; kept as-is to preserve behavior.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    # The previous ``to_dict`` override was a no-op pass-through of
    # ``super().to_dict()`` and has been removed; the inherited
    # implementation is identical.
# Register this config with transformers' auto-class machinery so that
# AutoConfig can resolve model_type="oculus" to OculusConfig when the
# model is loaded from the Hub (requires trust_remote_code=True).
OculusConfig.register_for_auto_class()