File size: 5,546 Bytes
ff21a21
 
 
ddd62f3
 
 
 
 
 
 
ff21a21
 
 
 
 
 
 
 
 
a028cbf
ddd62f3
 
ff21a21
a028cbf
ff21a21
a028cbf
ff21a21
 
ddd62f3
 
 
 
 
 
 
 
 
a028cbf
ff21a21
ddd62f3
a028cbf
ff21a21
a028cbf
ddd62f3
ff21a21
ddd62f3
 
ff21a21
 
a028cbf
 
ff21a21
 
 
 
ddd62f3
a028cbf
 
 
 
 
 
ddd62f3
 
a028cbf
 
 
 
 
ddd62f3
 
 
 
a028cbf
 
 
ddd62f3
a028cbf
 
 
 
ddd62f3
a028cbf
ff21a21
ddd62f3
ff21a21
 
a028cbf
ff21a21
 
 
 
a028cbf
ff21a21
 
 
a028cbf
ddd62f3
 
 
 
 
 
 
 
 
a028cbf
ff21a21
ddd62f3
ff21a21
 
a028cbf
ddd62f3
ff21a21
ddd62f3
 
ff21a21
 
a028cbf
 
ff21a21
 
 
 
a028cbf
 
 
 
 
 
 
 
 
 
 
 
 
 
ddd62f3
 
 
 
a028cbf
 
 
 
 
 
 
 
 
 
ff21a21
 
 
 
a028cbf
ff21a21
 
 
 
a028cbf
ff21a21
 
 
 
 
a028cbf
ff21a21
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""
Oculus Configuration

Oceanir-Oculus OO1 Architecture configuration.
Hybrid-reasoning vision-language model with:
- Reasoning via Thinking Traces
- Perceptive Tool Calling + Focus (Zoom & Crop)
- Structured Outputs
- Complex OCR
- Desktop UI Understanding
"""

from typing import Optional, Dict, Any, List
from transformers import PretrainedConfig


class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Oceanir-Oculus OO1 Architecture - hybrid vision-language model
    optimized for visual reasoning on commodity GPUs and edge devices.

    All constructor arguments are stored verbatim as attributes (grouped
    the same way as the parameters below); ``ocr_languages`` is the one
    exception — a ``None`` default is normalized to ``["en"]``.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Architecture
        architecture_name: str = "Oceanir-Oculus OO1",

        # Vision encoder settings (ViT-style: hidden size, depth, heads,
        # input resolution and patch granularity)
        vision_hidden_size: int = 1024,
        vision_num_layers: int = 24,
        vision_num_heads: int = 16,
        image_size: int = 224,
        patch_size: int = 16,

        # Projector settings (maps fused vision features into the LM space)
        fused_vision_dim: int = 2176,
        projector_hidden_dim: int = 4352,
        num_vision_tokens: int = 64,

        # Language model settings
        lm_hidden_size: int = 1536,
        lm_num_layers: int = 16,
        lm_num_heads: int = 24,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,

        # Reasoning / Thinking Traces — special tokens delimiting an
        # intermediate reasoning span, plus its budget and style.
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        thinking_style: str = "structured",

        # Focus System (Perceptive Tool Calling: zoom & crop regions)
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        max_focus_regions: int = 4,
        focus_min_size: int = 64,
        auto_focus_threshold: float = 0.7,

        # Structured Output — paired delimiter tokens for JSON, bounding
        # boxes, and points in the generated stream.
        structured_output_enabled: bool = True,
        json_token: str = "<json>",
        json_end_token: str = "</json>",
        box_token: str = "<box>",
        box_end_token: str = "</box>",
        point_token: str = "<point>",
        point_end_token: str = "</point>",

        # OCR settings. NOTE: default must stay None (not a mutable list);
        # None is replaced with ["en"] below.
        ocr_enabled: bool = True,
        ocr_languages: Optional[List[str]] = None,
        ocr_confidence_threshold: float = 0.5,

        # Desktop UI Understanding
        ui_understanding_enabled: bool = True,
        ui_element_classes: int = 50,

        # Output mode settings (e.g. "text"; detection/segmentation heads)
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,

        # Generation defaults
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,

        **kwargs
    ):
        # Forward any remaining HF-standard kwargs (pad/bos/eos ids, etc.)
        # to PretrainedConfig before recording our own fields.
        super().__init__(**kwargs)

        # Architecture
        self.architecture_name = architecture_name

        # Vision
        self.vision_hidden_size = vision_hidden_size
        self.vision_num_layers = vision_num_layers
        self.vision_num_heads = vision_num_heads
        self.image_size = image_size
        self.patch_size = patch_size

        # Projector
        self.fused_vision_dim = fused_vision_dim
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.lm_hidden_size = lm_hidden_size
        self.lm_num_layers = lm_num_layers
        self.lm_num_heads = lm_num_heads
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning / Thinking Traces
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens
        self.thinking_style = thinking_style

        # Focus System
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token
        self.max_focus_regions = max_focus_regions
        self.focus_min_size = focus_min_size
        self.auto_focus_threshold = auto_focus_threshold

        # Structured Output
        self.structured_output_enabled = structured_output_enabled
        self.json_token = json_token
        self.json_end_token = json_end_token
        self.box_token = box_token
        self.box_end_token = box_end_token
        self.point_token = point_token
        self.point_end_token = point_end_token

        # OCR — normalize the None sentinel to the English-only default.
        self.ocr_enabled = ocr_enabled
        self.ocr_languages = ocr_languages or ["en"]
        self.ocr_confidence_threshold = ocr_confidence_threshold

        # Desktop UI
        self.ui_understanding_enabled = ui_understanding_enabled
        self.ui_element_classes = ui_element_classes

        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load config from a pretrained path.

        Thin wrapper over the base-class machinery: resolve the raw config
        dict, then build an instance with ``from_dict`` so defaults defined
        above fill in any missing keys.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize config to a dictionary.

        Currently identical to ``PretrainedConfig.to_dict``; kept as an
        explicit override point for future Oculus-specific serialization.
        """
        output = super().to_dict()
        return output


# Register with transformers' Auto* machinery so remote-code checkpoints can
# resolve model_type "oculus" via AutoConfig (defaults to AutoConfig here).
OculusConfig.register_for_auto_class()