|
|
""" |
|
|
Oculus Configuration |
|
|
|
|
|
Oceanir-Oculus OO1 Architecture configuration. |
|
|
Hybrid-reasoning vision-language model with: |
|
|
- Reasoning via Thinking Traces |
|
|
- Perceptive Tool Calling + Focus (Zoom & Crop) |
|
|
- Structured Outputs |
|
|
- Complex OCR |
|
|
- Desktop UI Understanding |
|
|
""" |
|
|
|
|
|
from typing import Optional, Dict, Any, List |
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
|
|
|
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Oceanir-Oculus OO1 Architecture - hybrid vision-language model
    optimized for visual reasoning on commodity GPUs and edge devices.

    Args:
        architecture_name: Human-readable architecture identifier.
        vision_hidden_size: Hidden dimension of the vision encoder.
        vision_num_layers: Number of transformer layers in the vision encoder.
        vision_num_heads: Number of attention heads in the vision encoder.
        image_size: Input image resolution (square, in pixels).
        patch_size: Vision patch size (square, in pixels).
        fused_vision_dim: Dimension of the fused vision features fed to the
            projector.
        projector_hidden_dim: Hidden dimension of the vision-to-language
            projector MLP.
        num_vision_tokens: Number of vision tokens injected into the LM.
        lm_hidden_size: Hidden dimension of the language model.
        lm_num_layers: Number of transformer layers in the language model.
        lm_num_heads: Number of attention heads in the language model.
        vocab_size: Size of the tokenizer vocabulary.
        max_position_embeddings: Maximum supported sequence length.
        reasoning_enabled: Whether thinking-trace reasoning is enabled.
        thinking_token: Opening marker for a thinking trace.
        thinking_end_token: Closing marker for a thinking trace.
        max_thinking_tokens: Budget (in tokens) for a thinking trace.
        thinking_style: Style of the thinking trace (e.g. ``"structured"``).
        enable_focus: Whether the focus (zoom & crop) tool is enabled.
        focus_token: Opening marker for a focus region.
        focus_end_token: Closing marker for a focus region.
        max_focus_regions: Maximum number of focus regions per image.
        focus_min_size: Minimum focus-region side length in pixels.
        auto_focus_threshold: Confidence threshold for automatic focusing.
        structured_output_enabled: Whether structured-output markers are used.
        json_token / json_end_token: Markers delimiting JSON output.
        box_token / box_end_token: Markers delimiting bounding-box output.
        point_token / point_end_token: Markers delimiting point output.
        ocr_enabled: Whether OCR support is enabled.
        ocr_languages: OCR language codes; defaults to ``["en"]`` when omitted.
        ocr_confidence_threshold: Minimum confidence for OCR results.
        ui_understanding_enabled: Whether desktop-UI understanding is enabled.
        ui_element_classes: Number of UI element classes.
        output_mode: Default output mode (e.g. ``"text"``).
        num_detection_classes: Number of object-detection classes.
        num_segmentation_classes: Number of segmentation classes.
        max_new_tokens: Default generation length limit.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling probability mass.
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "oculus"

    def __init__(
        self,
        architecture_name: str = "Oceanir-Oculus OO1",
        # --- Vision encoder ---
        vision_hidden_size: int = 1024,
        vision_num_layers: int = 24,
        vision_num_heads: int = 16,
        image_size: int = 224,
        patch_size: int = 16,
        # --- Vision fusion / projector ---
        fused_vision_dim: int = 2176,
        projector_hidden_dim: int = 4352,
        num_vision_tokens: int = 64,
        # --- Language model ---
        lm_hidden_size: int = 1536,
        lm_num_layers: int = 16,
        lm_num_heads: int = 24,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # --- Reasoning (thinking traces) ---
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        thinking_style: str = "structured",
        # --- Perceptive focus tool (zoom & crop) ---
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        max_focus_regions: int = 4,
        focus_min_size: int = 64,
        auto_focus_threshold: float = 0.7,
        # --- Structured outputs ---
        structured_output_enabled: bool = True,
        json_token: str = "<json>",
        json_end_token: str = "</json>",
        box_token: str = "<box>",
        box_end_token: str = "</box>",
        point_token: str = "<point>",
        point_end_token: str = "</point>",
        # --- OCR ---
        ocr_enabled: bool = True,
        # NOTE: default is None (resolved to ["en"] below) to avoid a
        # mutable default argument; annotated Optional accordingly.
        ocr_languages: Optional[List[str]] = None,
        ocr_confidence_threshold: float = 0.5,
        # --- Desktop UI understanding ---
        ui_understanding_enabled: bool = True,
        ui_element_classes: int = 50,
        # --- Output heads ---
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # --- Generation defaults ---
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.architecture_name = architecture_name

        # Vision encoder
        self.vision_hidden_size = vision_hidden_size
        self.vision_num_layers = vision_num_layers
        self.vision_num_heads = vision_num_heads
        self.image_size = image_size
        self.patch_size = patch_size

        # Vision fusion / projector
        self.fused_vision_dim = fused_vision_dim
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.lm_hidden_size = lm_hidden_size
        self.lm_num_layers = lm_num_layers
        self.lm_num_heads = lm_num_heads
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning (thinking traces)
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens
        self.thinking_style = thinking_style

        # Perceptive focus tool
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token
        self.max_focus_regions = max_focus_regions
        self.focus_min_size = focus_min_size
        self.auto_focus_threshold = auto_focus_threshold

        # Structured outputs
        self.structured_output_enabled = structured_output_enabled
        self.json_token = json_token
        self.json_end_token = json_end_token
        self.box_token = box_token
        self.box_end_token = box_end_token
        self.point_token = point_token
        self.point_end_token = point_end_token

        # OCR — fall back to English when no languages are supplied
        self.ocr_enabled = ocr_enabled
        self.ocr_languages = ocr_languages or ["en"]
        self.ocr_confidence_threshold = ocr_confidence_threshold

        # Desktop UI understanding
        self.ui_understanding_enabled = ui_understanding_enabled
        self.ui_element_classes = ui_element_classes

        # Output heads
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation defaults
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a config from a local path or hub identifier.

        Thin wrapper over the base-class helpers: fetches the raw config
        dict, then builds an instance from it.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the configuration (including base-class fields) to a dict."""
        return super().to_dict()
|
|
|
|
|
|
|
|
# Register the config with the transformers Auto API (default auto class is
# AutoConfig) — presumably so AutoConfig.from_pretrained can resolve
# model_type="oculus" via custom/remote code; confirm trust_remote_code usage.
OculusConfig.register_for_auto_class()
|
|
|