kobiakor15 committed on
Commit
a028cbf
·
verified ·
1 Parent(s): 7483603

Upload oculus_unified_model/configuration_oculus.py with huggingface_hub

Browse files
oculus_unified_model/configuration_oculus.py CHANGED
@@ -2,6 +2,7 @@
2
  Oculus Configuration
3
 
4
  HuggingFace-compatible configuration for the unified Oculus model.
 
5
  """
6
 
7
  from typing import Optional, Dict, Any, List
@@ -11,104 +12,142 @@ from transformers import PretrainedConfig
11
  class OculusConfig(PretrainedConfig):
12
  """
13
  Configuration class for Oculus vision-language model.
14
-
15
- Args:
16
- vision_config: Configuration for vision encoders
17
- projector_config: Configuration for vision-to-language projector
18
- text_config: Configuration for language model
19
- reasoning_enabled: Whether to enable thinking traces
20
- output_mode: Default output mode ("text", "point", "box", "polygon")
 
 
21
  """
22
-
23
  model_type = "oculus"
24
-
25
  def __init__(
26
  self,
27
- # Vision encoder settings
28
- dinov3_model_id: str = "facebook/dinov2-large",
29
- siglip_model_id: str = "google/siglip-base-patch16-224",
30
- dinov3_hidden_size: int = 1280, # DINOv3 ViT-H/16+ output dim
31
- siglip_hidden_size: int = 768, # SigLIP2 base output dim
32
-
33
  # Projector settings
34
- projector_hidden_dim: int = 2048,
35
  num_vision_tokens: int = 64,
36
-
37
- # Language model settings
38
- text_model_id: str = "Salesforce/blip-image-captioning-base",
39
  lm_hidden_size: int = 1536,
40
  vocab_size: int = 131072,
41
  max_position_embeddings: int = 32768,
42
-
43
- # Reasoning settings
44
  reasoning_enabled: bool = True,
45
  thinking_token: str = "<think>",
46
  thinking_end_token: str = "</think>",
47
  max_thinking_tokens: int = 256,
48
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Output mode settings
50
- output_mode: str = "text", # "text", "point", "box", "polygon"
51
  num_detection_classes: int = 80,
52
  num_segmentation_classes: int = 150,
53
-
54
  # Generation settings
55
  max_new_tokens: int = 512,
56
  temperature: float = 0.7,
57
  top_p: float = 0.95,
58
-
59
- # Tool calling / Focus system
60
- enable_focus: bool = True,
61
- focus_token: str = "<focus>",
62
- focus_end_token: str = "</focus>",
63
-
64
  **kwargs
65
  ):
66
  super().__init__(**kwargs)
67
-
68
- # Vision
69
  self.dinov3_model_id = dinov3_model_id
70
  self.siglip_model_id = siglip_model_id
71
  self.dinov3_hidden_size = dinov3_hidden_size
72
  self.siglip_hidden_size = siglip_hidden_size
73
  self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
74
-
75
  # Projector
76
  self.projector_hidden_dim = projector_hidden_dim
77
  self.num_vision_tokens = num_vision_tokens
78
-
79
- # Language model
80
- self.text_model_id = text_model_id
81
  self.lm_hidden_size = lm_hidden_size
82
  self.vocab_size = vocab_size
83
  self.max_position_embeddings = max_position_embeddings
84
-
85
- # Reasoning
86
  self.reasoning_enabled = reasoning_enabled
87
  self.thinking_token = thinking_token
88
  self.thinking_end_token = thinking_end_token
89
  self.max_thinking_tokens = max_thinking_tokens
90
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # Output modes
92
  self.output_mode = output_mode
93
  self.num_detection_classes = num_detection_classes
94
  self.num_segmentation_classes = num_segmentation_classes
95
-
96
  # Generation
97
  self.max_new_tokens = max_new_tokens
98
  self.temperature = temperature
99
  self.top_p = top_p
100
-
101
- # Focus system
102
- self.enable_focus = enable_focus
103
- self.focus_token = focus_token
104
- self.focus_end_token = focus_end_token
105
-
106
  @classmethod
107
  def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
108
  """Load config from pretrained path."""
109
  config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
  return cls.from_dict(config_dict, **kwargs)
111
-
112
  def to_dict(self) -> Dict[str, Any]:
113
  """Serialize config to dictionary."""
114
  output = super().to_dict()
 
2
  Oculus Configuration
3
 
4
  HuggingFace-compatible configuration for the unified Oculus model.
5
+ Supports Isaac 0.2 features: Thinking Traces, Focus/Zoom, Structured Output, Complex OCR, Desktop UI.
6
  """
7
 
8
  from typing import Optional, Dict, Any, List
 
12
  class OculusConfig(PretrainedConfig):
13
  """
14
  Configuration class for Oculus vision-language model.
15
+
16
+ Architecture: DINOv3 + SigLIP2 + LFM2.5-1.2B
17
+
18
+ Isaac 0.2 Features:
19
+ - Reasoning via Thinking Traces
20
+ - Perceptive Tool Calling + Focus (Zoom & Crop)
21
+ - Structured Outputs (JSON)
22
+ - Complex OCR
23
+ - Desktop UI Understanding
24
  """
25
+
26
  model_type = "oculus"
27
+
28
  def __init__(
29
  self,
30
+ # Vision encoder settings (DINOv3 + SigLIP2)
31
+ dinov3_model_id: str = "facebook/dinov3-vitl16-pretrain-lvd1689m",
32
+ siglip_model_id: str = "google/siglip2-so400m-patch16-naflex",
33
+ dinov3_hidden_size: int = 1024, # DINOv3 ViT-L/16 output dim
34
+ siglip_hidden_size: int = 1152, # SigLIP2 SO400M output dim
35
+
36
  # Projector settings
37
+ projector_hidden_dim: int = 4352,
38
  num_vision_tokens: int = 64,
39
+
40
+ # Language model settings (LFM2.5-1.2B)
41
+ lm_model_id: str = "LiquidAI/LFM2.5-1.2B-Base",
42
  lm_hidden_size: int = 1536,
43
  vocab_size: int = 131072,
44
  max_position_embeddings: int = 32768,
45
+
46
+ # Reasoning / Thinking Traces
47
  reasoning_enabled: bool = True,
48
  thinking_token: str = "<think>",
49
  thinking_end_token: str = "</think>",
50
  max_thinking_tokens: int = 256,
51
+ thinking_style: str = "structured", # "structured", "verbose", "minimal"
52
+
53
+ # Focus System (Perceptive Tool Calling)
54
+ enable_focus: bool = True,
55
+ focus_token: str = "<focus>",
56
+ focus_end_token: str = "</focus>",
57
+ max_focus_regions: int = 4,
58
+ focus_min_size: int = 64, # Minimum crop size in pixels
59
+ auto_focus_threshold: float = 0.7, # Confidence threshold to trigger focus
60
+
61
+ # Structured Output
62
+ structured_output_enabled: bool = True,
63
+ json_token: str = "<json>",
64
+ json_end_token: str = "</json>",
65
+
66
+ # OCR Settings
67
+ ocr_enabled: bool = True,
68
+ ocr_languages: List[str] = None, # None = auto-detect
69
+ ocr_confidence_threshold: float = 0.5,
70
+
71
+ # Desktop UI Understanding
72
+ ui_understanding_enabled: bool = True,
73
+ ui_element_classes: int = 50, # button, text_field, checkbox, etc.
74
+
75
  # Output mode settings
76
+ output_mode: str = "text", # "text", "point", "box", "polygon", "json"
77
  num_detection_classes: int = 80,
78
  num_segmentation_classes: int = 150,
79
+
80
  # Generation settings
81
  max_new_tokens: int = 512,
82
  temperature: float = 0.7,
83
  top_p: float = 0.95,
84
+
 
 
 
 
 
85
  **kwargs
86
  ):
87
  super().__init__(**kwargs)
88
+
89
+ # Vision encoders
90
  self.dinov3_model_id = dinov3_model_id
91
  self.siglip_model_id = siglip_model_id
92
  self.dinov3_hidden_size = dinov3_hidden_size
93
  self.siglip_hidden_size = siglip_hidden_size
94
  self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
95
+
96
  # Projector
97
  self.projector_hidden_dim = projector_hidden_dim
98
  self.num_vision_tokens = num_vision_tokens
99
+
100
+ # Language model (LFM2.5)
101
+ self.lm_model_id = lm_model_id
102
  self.lm_hidden_size = lm_hidden_size
103
  self.vocab_size = vocab_size
104
  self.max_position_embeddings = max_position_embeddings
105
+
106
+ # Reasoning / Thinking Traces
107
  self.reasoning_enabled = reasoning_enabled
108
  self.thinking_token = thinking_token
109
  self.thinking_end_token = thinking_end_token
110
  self.max_thinking_tokens = max_thinking_tokens
111
+ self.thinking_style = thinking_style
112
+
113
+ # Focus System
114
+ self.enable_focus = enable_focus
115
+ self.focus_token = focus_token
116
+ self.focus_end_token = focus_end_token
117
+ self.max_focus_regions = max_focus_regions
118
+ self.focus_min_size = focus_min_size
119
+ self.auto_focus_threshold = auto_focus_threshold
120
+
121
+ # Structured Output
122
+ self.structured_output_enabled = structured_output_enabled
123
+ self.json_token = json_token
124
+ self.json_end_token = json_end_token
125
+
126
+ # OCR
127
+ self.ocr_enabled = ocr_enabled
128
+ self.ocr_languages = ocr_languages or ["en"]
129
+ self.ocr_confidence_threshold = ocr_confidence_threshold
130
+
131
+ # Desktop UI
132
+ self.ui_understanding_enabled = ui_understanding_enabled
133
+ self.ui_element_classes = ui_element_classes
134
+
135
  # Output modes
136
  self.output_mode = output_mode
137
  self.num_detection_classes = num_detection_classes
138
  self.num_segmentation_classes = num_segmentation_classes
139
+
140
  # Generation
141
  self.max_new_tokens = max_new_tokens
142
  self.temperature = temperature
143
  self.top_p = top_p
144
+
 
 
 
 
 
145
  @classmethod
146
  def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
147
  """Load config from pretrained path."""
148
  config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
149
  return cls.from_dict(config_dict, **kwargs)
150
+
151
  def to_dict(self) -> Dict[str, Any]:
152
  """Serialize config to dictionary."""
153
  output = super().to_dict()