# NOTE: removed extraction residue (file-size banner, git-blame hashes,
# and a line-number gutter) that was not valid Python.
"""
Oculus Configuration
Oceanir-Oculus OO1 Architecture configuration.
Hybrid-reasoning vision-language model with:
- Reasoning via Thinking Traces
- Perceptive Tool Calling + Focus (Zoom & Crop)
- Structured Outputs
- Complex OCR
- Desktop UI Understanding
"""
from typing import Optional, Dict, Any, List
from transformers import PretrainedConfig
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Oceanir-Oculus OO1 Architecture - hybrid vision-language model
    optimized for visual reasoning on commodity GPUs and edge devices.

    Args:
        architecture_name: Human-readable architecture identifier.
        vision_hidden_size: Hidden dimension of the vision encoder.
        vision_num_layers: Number of vision encoder transformer layers.
        vision_num_heads: Number of attention heads in the vision encoder.
        image_size: Input image resolution (square, in pixels).
        patch_size: Vision patch size (square, in pixels).
        fused_vision_dim: Dimension of fused vision features entering the projector.
        projector_hidden_dim: Hidden dimension of the vision-to-language projector.
        num_vision_tokens: Number of vision tokens emitted by the projector.
        lm_hidden_size: Hidden dimension of the language model.
        lm_num_layers: Number of language model transformer layers.
        lm_num_heads: Number of attention heads in the language model.
        vocab_size: Size of the tokenizer vocabulary.
        max_position_embeddings: Maximum supported sequence length.
        reasoning_enabled: Whether thinking-trace reasoning is enabled.
        thinking_token: Opening marker for a thinking trace.
        thinking_end_token: Closing marker for a thinking trace.
        max_thinking_tokens: Maximum tokens allowed inside a thinking trace.
        thinking_style: Style of thinking traces (e.g. "structured").
        enable_focus: Whether the Focus (zoom & crop) tool-calling system is enabled.
        focus_token: Opening marker for a focus region request.
        focus_end_token: Closing marker for a focus region request.
        max_focus_regions: Maximum number of focus regions per image.
        focus_min_size: Minimum focus-region side length, in pixels.
        auto_focus_threshold: Confidence threshold for automatic focusing.
        structured_output_enabled: Whether structured (JSON/box/point) output is enabled.
        json_token: Opening marker for JSON output.
        json_end_token: Closing marker for JSON output.
        box_token: Opening marker for a bounding-box output.
        box_end_token: Closing marker for a bounding-box output.
        point_token: Opening marker for a point output.
        point_end_token: Closing marker for a point output.
        ocr_enabled: Whether OCR is enabled.
        ocr_languages: OCR language codes; defaults to ``["en"]`` when ``None``.
        ocr_confidence_threshold: Minimum confidence for OCR results.
        ui_understanding_enabled: Whether desktop-UI understanding is enabled.
        ui_element_classes: Number of UI element classes.
        output_mode: Default output mode (e.g. "text").
        num_detection_classes: Number of object-detection classes.
        num_segmentation_classes: Number of segmentation classes.
        max_new_tokens: Default maximum number of generated tokens.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling probability mass.
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Architecture
        architecture_name: str = "Oceanir-Oculus OO1",
        # Vision encoder settings
        vision_hidden_size: int = 1024,
        vision_num_layers: int = 24,
        vision_num_heads: int = 16,
        image_size: int = 224,
        patch_size: int = 16,
        # Projector settings
        fused_vision_dim: int = 2176,
        projector_hidden_dim: int = 4352,
        num_vision_tokens: int = 64,
        # Language model settings
        lm_hidden_size: int = 1536,
        lm_num_layers: int = 16,
        lm_num_heads: int = 24,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # Reasoning / Thinking Traces
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        thinking_style: str = "structured",
        # Focus System (Perceptive Tool Calling)
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        max_focus_regions: int = 4,
        focus_min_size: int = 64,
        auto_focus_threshold: float = 0.7,
        # Structured Output
        structured_output_enabled: bool = True,
        json_token: str = "<json>",
        json_end_token: str = "</json>",
        box_token: str = "<box>",
        box_end_token: str = "</box>",
        point_token: str = "<point>",
        point_end_token: str = "</point>",
        # OCR Settings
        # NOTE: default is None (not a mutable list) and normalized below;
        # the annotation is Optional accordingly.
        ocr_enabled: bool = True,
        ocr_languages: Optional[List[str]] = None,
        ocr_confidence_threshold: float = 0.5,
        # Desktop UI Understanding
        ui_understanding_enabled: bool = True,
        ui_element_classes: int = 50,
        # Output mode settings
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # Generation settings
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Architecture
        self.architecture_name = architecture_name
        # Vision
        self.vision_hidden_size = vision_hidden_size
        self.vision_num_layers = vision_num_layers
        self.vision_num_heads = vision_num_heads
        self.image_size = image_size
        self.patch_size = patch_size
        # Projector
        self.fused_vision_dim = fused_vision_dim
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens
        # Language model
        self.lm_hidden_size = lm_hidden_size
        self.lm_num_layers = lm_num_layers
        self.lm_num_heads = lm_num_heads
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        # Reasoning / Thinking Traces
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens
        self.thinking_style = thinking_style
        # Focus System
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token
        self.max_focus_regions = max_focus_regions
        self.focus_min_size = focus_min_size
        self.auto_focus_threshold = auto_focus_threshold
        # Structured Output
        self.structured_output_enabled = structured_output_enabled
        self.json_token = json_token
        self.json_end_token = json_end_token
        self.box_token = box_token
        self.box_end_token = box_end_token
        self.point_token = point_token
        self.point_end_token = point_end_token
        # OCR — normalize the None default to English-only.
        self.ocr_enabled = ocr_enabled
        self.ocr_languages = ocr_languages or ["en"]
        self.ocr_confidence_threshold = ocr_confidence_threshold
        # Desktop UI
        self.ui_understanding_enabled = ui_understanding_enabled
        self.ui_element_classes = ui_element_classes
        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes
        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a config from a pretrained path or model identifier.

        NOTE(review): this override delegates straight to
        ``get_config_dict`` + ``from_dict`` and duplicates the base-class
        behavior; it is kept for backward compatibility.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this config to a plain dictionary.

        Pure pass-through to the base-class serialization; all attributes
        set in ``__init__`` are instance attributes, so they are included.
        """
        return super().to_dict()
# Register the config with the transformers Auto-class machinery so that
# model_type="oculus" resolves to OculusConfig (used for custom-code models;
# see transformers' register_for_auto_class documentation).
OculusConfig.register_for_auto_class()