# Oculus-0.1 / oculus_unified_model/configuration_oculus.py
# Uploaded by kobiakor15 via huggingface_hub (commit ddd62f3, verified).
"""
Oculus Configuration
Oceanir-Oculus OO1 Architecture configuration.
Hybrid-reasoning vision-language model with:
- Reasoning via Thinking Traces
- Perceptive Tool Calling + Focus (Zoom & Crop)
- Structured Outputs
- Complex OCR
- Desktop UI Understanding
"""
from typing import Optional, Dict, Any, List
from transformers import PretrainedConfig
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Oceanir-Oculus OO1 Architecture - hybrid vision-language model
    optimized for visual reasoning on commodity GPUs and edge devices.

    Every constructor argument is stored verbatim as an attribute of the
    same name; the groups are:

    - Vision encoder: ``vision_hidden_size``, ``vision_num_layers``,
      ``vision_num_heads``, ``image_size``, ``patch_size``.
    - Projector: ``fused_vision_dim``, ``projector_hidden_dim``,
      ``num_vision_tokens``.
    - Language model: ``lm_hidden_size``, ``lm_num_layers``,
      ``lm_num_heads``, ``vocab_size``, ``max_position_embeddings``.
    - Reasoning / thinking traces: ``reasoning_enabled``,
      ``thinking_token``, ``thinking_end_token``, ``max_thinking_tokens``,
      ``thinking_style``.
    - Focus system (perceptive tool calling): ``enable_focus``,
      ``focus_token``, ``focus_end_token``, ``max_focus_regions``,
      ``focus_min_size``, ``auto_focus_threshold``.
    - Structured output markers: ``structured_output_enabled`` and the
      ``json``/``box``/``point`` start/end token pairs.
    - OCR: ``ocr_enabled``, ``ocr_languages`` (defaults to ``["en"]``),
      ``ocr_confidence_threshold``.
    - Desktop UI: ``ui_understanding_enabled``, ``ui_element_classes``.
    - Output modes: ``output_mode``, ``num_detection_classes``,
      ``num_segmentation_classes``.
    - Generation defaults: ``max_new_tokens``, ``temperature``, ``top_p``.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Architecture
        architecture_name: str = "Oceanir-Oculus OO1",
        # Vision encoder settings
        vision_hidden_size: int = 1024,
        vision_num_layers: int = 24,
        vision_num_heads: int = 16,
        image_size: int = 224,
        patch_size: int = 16,
        # Projector settings
        fused_vision_dim: int = 2176,
        projector_hidden_dim: int = 4352,
        num_vision_tokens: int = 64,
        # Language model settings
        lm_hidden_size: int = 1536,
        lm_num_layers: int = 16,
        lm_num_heads: int = 24,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # Reasoning / Thinking Traces
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        thinking_style: str = "structured",
        # Focus System (Perceptive Tool Calling)
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        max_focus_regions: int = 4,
        focus_min_size: int = 64,
        auto_focus_threshold: float = 0.7,
        # Structured Output
        structured_output_enabled: bool = True,
        json_token: str = "<json>",
        json_end_token: str = "</json>",
        box_token: str = "<box>",
        box_end_token: str = "</box>",
        point_token: str = "<point>",
        point_end_token: str = "</point>",
        # OCR Settings
        # NOTE: Optional[...] — None is the sentinel for "default to English";
        # a mutable list default would be shared across instances.
        ocr_languages: Optional[List[str]] = None,
        ocr_enabled: bool = True,
        ocr_confidence_threshold: float = 0.5,
        # Desktop UI Understanding
        ui_understanding_enabled: bool = True,
        ui_element_classes: int = 50,
        # Output mode settings
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # Generation settings
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        **kwargs,
    ):
        # Let PretrainedConfig consume/validate any framework-level kwargs
        # (name_or_path, torch_dtype, etc.) before we record our own fields.
        super().__init__(**kwargs)
        # Architecture
        self.architecture_name = architecture_name
        # Vision
        self.vision_hidden_size = vision_hidden_size
        self.vision_num_layers = vision_num_layers
        self.vision_num_heads = vision_num_heads
        self.image_size = image_size
        self.patch_size = patch_size
        # Projector
        self.fused_vision_dim = fused_vision_dim
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens
        # Language model
        self.lm_hidden_size = lm_hidden_size
        self.lm_num_layers = lm_num_layers
        self.lm_num_heads = lm_num_heads
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        # Reasoning / Thinking Traces
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens
        self.thinking_style = thinking_style
        # Focus System
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token
        self.max_focus_regions = max_focus_regions
        self.focus_min_size = focus_min_size
        self.auto_focus_threshold = auto_focus_threshold
        # Structured Output
        self.structured_output_enabled = structured_output_enabled
        self.json_token = json_token
        self.json_end_token = json_end_token
        self.box_token = box_token
        self.box_end_token = box_end_token
        self.point_token = point_token
        self.point_end_token = point_end_token
        # OCR (None → English-only default)
        self.ocr_enabled = ocr_enabled
        self.ocr_languages = ocr_languages or ["en"]
        self.ocr_confidence_threshold = ocr_confidence_threshold
        # Desktop UI
        self.ui_understanding_enabled = ui_understanding_enabled
        self.ui_element_classes = ui_element_classes
        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes
        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load config from a pretrained path or hub repo id.

        This mirrors the base ``PretrainedConfig.from_pretrained`` flow
        (fetch the config dict, then build via ``from_dict``); the override
        is kept explicit so custom loading hooks can be added here later.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the config to a plain dictionary.

        Delegates entirely to ``PretrainedConfig.to_dict``, which already
        includes all instance attributes set in ``__init__``.
        """
        return super().to_dict()
OculusConfig.register_for_auto_class()