kobiakor15 committed on
Commit
ddd62f3
·
verified ·
1 Parent(s): c027646

Upload oculus_unified_model/configuration_oculus.py with huggingface_hub

Browse files
oculus_unified_model/configuration_oculus.py CHANGED
@@ -1,8 +1,13 @@
1
  """
2
  Oculus Configuration
3
 
4
- HuggingFace-compatible configuration for the unified Oculus model.
5
- Supports Isaac 0.2 features: Thinking Traces, Focus/Zoom, Structured Output, Complex OCR, Desktop UI.
 
 
 
 
 
6
  """
7
 
8
  from typing import Optional, Dict, Any, List
@@ -13,33 +18,33 @@ class OculusConfig(PretrainedConfig):
13
  """
14
  Configuration class for Oculus vision-language model.
15
 
16
- Architecture: DINOv3 + SigLIP2 + LFM2.5-1.2B
17
-
18
- Isaac 0.2 Features:
19
- - Reasoning via Thinking Traces
20
- - Perceptive Tool Calling + Focus (Zoom & Crop)
21
- - Structured Outputs (JSON)
22
- - Complex OCR
23
- - Desktop UI Understanding
24
  """
25
 
26
  model_type = "oculus"
27
 
28
  def __init__(
29
  self,
30
- # Vision encoder settings (DINOv3 + SigLIP2)
31
- dinov3_model_id: str = "facebook/dinov3-vitl16-pretrain-lvd1689m",
32
- siglip_model_id: str = "google/siglip2-so400m-patch16-naflex",
33
- dinov3_hidden_size: int = 1024, # DINOv3 ViT-L/16 output dim
34
- siglip_hidden_size: int = 1152, # SigLIP2 SO400M output dim
 
 
 
 
35
 
36
  # Projector settings
 
37
  projector_hidden_dim: int = 4352,
38
  num_vision_tokens: int = 64,
39
 
40
- # Language model settings (LFM2.5-1.2B)
41
- lm_model_id: str = "LiquidAI/LFM2.5-1.2B-Base",
42
  lm_hidden_size: int = 1536,
 
 
43
  vocab_size: int = 131072,
44
  max_position_embeddings: int = 32768,
45
 
@@ -48,32 +53,36 @@ class OculusConfig(PretrainedConfig):
48
  thinking_token: str = "<think>",
49
  thinking_end_token: str = "</think>",
50
  max_thinking_tokens: int = 256,
51
- thinking_style: str = "structured", # "structured", "verbose", "minimal"
52
 
53
  # Focus System (Perceptive Tool Calling)
54
  enable_focus: bool = True,
55
  focus_token: str = "<focus>",
56
  focus_end_token: str = "</focus>",
57
  max_focus_regions: int = 4,
58
- focus_min_size: int = 64, # Minimum crop size in pixels
59
- auto_focus_threshold: float = 0.7, # Confidence threshold to trigger focus
60
 
61
  # Structured Output
62
  structured_output_enabled: bool = True,
63
  json_token: str = "<json>",
64
  json_end_token: str = "</json>",
 
 
 
 
65
 
66
  # OCR Settings
67
  ocr_enabled: bool = True,
68
- ocr_languages: List[str] = None, # None = auto-detect
69
  ocr_confidence_threshold: float = 0.5,
70
 
71
  # Desktop UI Understanding
72
  ui_understanding_enabled: bool = True,
73
- ui_element_classes: int = 50, # button, text_field, checkbox, etc.
74
 
75
  # Output mode settings
76
- output_mode: str = "text", # "text", "point", "box", "polygon", "json"
77
  num_detection_classes: int = 80,
78
  num_segmentation_classes: int = 150,
79
 
@@ -86,20 +95,25 @@ class OculusConfig(PretrainedConfig):
86
  ):
87
  super().__init__(**kwargs)
88
 
89
- # Vision encoders
90
- self.dinov3_model_id = dinov3_model_id
91
- self.siglip_model_id = siglip_model_id
92
- self.dinov3_hidden_size = dinov3_hidden_size
93
- self.siglip_hidden_size = siglip_hidden_size
94
- self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
 
 
 
95
 
96
  # Projector
 
97
  self.projector_hidden_dim = projector_hidden_dim
98
  self.num_vision_tokens = num_vision_tokens
99
 
100
- # Language model (LFM2.5)
101
- self.lm_model_id = lm_model_id
102
  self.lm_hidden_size = lm_hidden_size
 
 
103
  self.vocab_size = vocab_size
104
  self.max_position_embeddings = max_position_embeddings
105
 
@@ -122,6 +136,10 @@ class OculusConfig(PretrainedConfig):
122
  self.structured_output_enabled = structured_output_enabled
123
  self.json_token = json_token
124
  self.json_end_token = json_end_token
 
 
 
 
125
 
126
  # OCR
127
  self.ocr_enabled = ocr_enabled
@@ -154,5 +172,4 @@ class OculusConfig(PretrainedConfig):
154
  return output
155
 
156
 
157
- # Register for auto-loading
158
  OculusConfig.register_for_auto_class()
 
1
  """
2
  Oculus Configuration
3
 
4
+ Oceanir-Oculus OO1 Architecture configuration.
5
+ Hybrid-reasoning vision-language model with:
6
+ - Reasoning via Thinking Traces
7
+ - Perceptive Tool Calling + Focus (Zoom & Crop)
8
+ - Structured Outputs
9
+ - Complex OCR
10
+ - Desktop UI Understanding
11
  """
12
 
13
  from typing import Optional, Dict, Any, List
 
18
  """
19
  Configuration class for Oculus vision-language model.
20
 
21
+ Oceanir-Oculus OO1 Architecture - hybrid vision-language model
22
+ optimized for visual reasoning on commodity GPUs and edge devices.
 
 
 
 
 
 
23
  """
24
 
25
  model_type = "oculus"
26
 
27
  def __init__(
28
  self,
29
+ # Architecture
30
+ architecture_name: str = "Oceanir-Oculus OO1",
31
+
32
+ # Vision encoder settings
33
+ vision_hidden_size: int = 1024,
34
+ vision_num_layers: int = 24,
35
+ vision_num_heads: int = 16,
36
+ image_size: int = 224,
37
+ patch_size: int = 16,
38
 
39
  # Projector settings
40
+ fused_vision_dim: int = 2176,
41
  projector_hidden_dim: int = 4352,
42
  num_vision_tokens: int = 64,
43
 
44
+ # Language model settings
 
45
  lm_hidden_size: int = 1536,
46
+ lm_num_layers: int = 16,
47
+ lm_num_heads: int = 24,
48
  vocab_size: int = 131072,
49
  max_position_embeddings: int = 32768,
50
 
 
53
  thinking_token: str = "<think>",
54
  thinking_end_token: str = "</think>",
55
  max_thinking_tokens: int = 256,
56
+ thinking_style: str = "structured",
57
 
58
  # Focus System (Perceptive Tool Calling)
59
  enable_focus: bool = True,
60
  focus_token: str = "<focus>",
61
  focus_end_token: str = "</focus>",
62
  max_focus_regions: int = 4,
63
+ focus_min_size: int = 64,
64
+ auto_focus_threshold: float = 0.7,
65
 
66
  # Structured Output
67
  structured_output_enabled: bool = True,
68
  json_token: str = "<json>",
69
  json_end_token: str = "</json>",
70
+ box_token: str = "<box>",
71
+ box_end_token: str = "</box>",
72
+ point_token: str = "<point>",
73
+ point_end_token: str = "</point>",
74
 
75
  # OCR Settings
76
  ocr_enabled: bool = True,
77
+ ocr_languages: List[str] = None,
78
  ocr_confidence_threshold: float = 0.5,
79
 
80
  # Desktop UI Understanding
81
  ui_understanding_enabled: bool = True,
82
+ ui_element_classes: int = 50,
83
 
84
  # Output mode settings
85
+ output_mode: str = "text",
86
  num_detection_classes: int = 80,
87
  num_segmentation_classes: int = 150,
88
 
 
95
  ):
96
  super().__init__(**kwargs)
97
 
98
+ # Architecture
99
+ self.architecture_name = architecture_name
100
+
101
+ # Vision
102
+ self.vision_hidden_size = vision_hidden_size
103
+ self.vision_num_layers = vision_num_layers
104
+ self.vision_num_heads = vision_num_heads
105
+ self.image_size = image_size
106
+ self.patch_size = patch_size
107
 
108
  # Projector
109
+ self.fused_vision_dim = fused_vision_dim
110
  self.projector_hidden_dim = projector_hidden_dim
111
  self.num_vision_tokens = num_vision_tokens
112
 
113
+ # Language model
 
114
  self.lm_hidden_size = lm_hidden_size
115
+ self.lm_num_layers = lm_num_layers
116
+ self.lm_num_heads = lm_num_heads
117
  self.vocab_size = vocab_size
118
  self.max_position_embeddings = max_position_embeddings
119
 
 
136
  self.structured_output_enabled = structured_output_enabled
137
  self.json_token = json_token
138
  self.json_end_token = json_end_token
139
+ self.box_token = box_token
140
+ self.box_end_token = box_end_token
141
+ self.point_token = point_token
142
+ self.point_end_token = point_end_token
143
 
144
  # OCR
145
  self.ocr_enabled = ocr_enabled
 
172
  return output
173
 
174
 
 
175
  OculusConfig.register_for_auto_class()