Upload oculus_unified_model/configuration_oculus.py with huggingface_hub
Browse files
oculus_unified_model/configuration_oculus.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
Oculus Configuration
|
| 3 |
|
| 4 |
HuggingFace-compatible configuration for the unified Oculus model.
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from typing import Optional, Dict, Any, List
|
|
@@ -11,104 +12,142 @@ from transformers import PretrainedConfig
|
|
| 11 |
class OculusConfig(PretrainedConfig):
|
| 12 |
"""
|
| 13 |
Configuration class for Oculus vision-language model.
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
-
|
| 23 |
model_type = "oculus"
|
| 24 |
-
|
| 25 |
def __init__(
|
| 26 |
self,
|
| 27 |
-
# Vision encoder settings
|
| 28 |
-
dinov3_model_id: str = "facebook/
|
| 29 |
-
siglip_model_id: str = "google/
|
| 30 |
-
dinov3_hidden_size: int =
|
| 31 |
-
siglip_hidden_size: int =
|
| 32 |
-
|
| 33 |
# Projector settings
|
| 34 |
-
projector_hidden_dim: int =
|
| 35 |
num_vision_tokens: int = 64,
|
| 36 |
-
|
| 37 |
-
# Language model settings
|
| 38 |
-
|
| 39 |
lm_hidden_size: int = 1536,
|
| 40 |
vocab_size: int = 131072,
|
| 41 |
max_position_embeddings: int = 32768,
|
| 42 |
-
|
| 43 |
-
# Reasoning
|
| 44 |
reasoning_enabled: bool = True,
|
| 45 |
thinking_token: str = "<think>",
|
| 46 |
thinking_end_token: str = "</think>",
|
| 47 |
max_thinking_tokens: int = 256,
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# Output mode settings
|
| 50 |
-
output_mode: str = "text", # "text", "point", "box", "polygon"
|
| 51 |
num_detection_classes: int = 80,
|
| 52 |
num_segmentation_classes: int = 150,
|
| 53 |
-
|
| 54 |
# Generation settings
|
| 55 |
max_new_tokens: int = 512,
|
| 56 |
temperature: float = 0.7,
|
| 57 |
top_p: float = 0.95,
|
| 58 |
-
|
| 59 |
-
# Tool calling / Focus system
|
| 60 |
-
enable_focus: bool = True,
|
| 61 |
-
focus_token: str = "<focus>",
|
| 62 |
-
focus_end_token: str = "</focus>",
|
| 63 |
-
|
| 64 |
**kwargs
|
| 65 |
):
|
| 66 |
super().__init__(**kwargs)
|
| 67 |
-
|
| 68 |
-
# Vision
|
| 69 |
self.dinov3_model_id = dinov3_model_id
|
| 70 |
self.siglip_model_id = siglip_model_id
|
| 71 |
self.dinov3_hidden_size = dinov3_hidden_size
|
| 72 |
self.siglip_hidden_size = siglip_hidden_size
|
| 73 |
self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
|
| 74 |
-
|
| 75 |
# Projector
|
| 76 |
self.projector_hidden_dim = projector_hidden_dim
|
| 77 |
self.num_vision_tokens = num_vision_tokens
|
| 78 |
-
|
| 79 |
-
# Language model
|
| 80 |
-
self.
|
| 81 |
self.lm_hidden_size = lm_hidden_size
|
| 82 |
self.vocab_size = vocab_size
|
| 83 |
self.max_position_embeddings = max_position_embeddings
|
| 84 |
-
|
| 85 |
-
# Reasoning
|
| 86 |
self.reasoning_enabled = reasoning_enabled
|
| 87 |
self.thinking_token = thinking_token
|
| 88 |
self.thinking_end_token = thinking_end_token
|
| 89 |
self.max_thinking_tokens = max_thinking_tokens
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# Output modes
|
| 92 |
self.output_mode = output_mode
|
| 93 |
self.num_detection_classes = num_detection_classes
|
| 94 |
self.num_segmentation_classes = num_segmentation_classes
|
| 95 |
-
|
| 96 |
# Generation
|
| 97 |
self.max_new_tokens = max_new_tokens
|
| 98 |
self.temperature = temperature
|
| 99 |
self.top_p = top_p
|
| 100 |
-
|
| 101 |
-
# Focus system
|
| 102 |
-
self.enable_focus = enable_focus
|
| 103 |
-
self.focus_token = focus_token
|
| 104 |
-
self.focus_end_token = focus_end_token
|
| 105 |
-
|
| 106 |
@classmethod
|
| 107 |
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
| 108 |
"""Load config from pretrained path."""
|
| 109 |
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
| 110 |
return cls.from_dict(config_dict, **kwargs)
|
| 111 |
-
|
| 112 |
def to_dict(self) -> Dict[str, Any]:
|
| 113 |
"""Serialize config to dictionary."""
|
| 114 |
output = super().to_dict()
|
|
|
|
| 2 |
Oculus Configuration
|
| 3 |
|
| 4 |
HuggingFace-compatible configuration for the unified Oculus model.
|
| 5 |
+
Supports Isaac 0.2 features: Thinking Traces, Focus/Zoom, Structured Output, Complex OCR, Desktop UI.
|
| 6 |
"""
|
| 7 |
|
| 8 |
from typing import Optional, Dict, Any, List
|
|
|
|
| 12 |
class OculusConfig(PretrainedConfig):
|
| 13 |
"""
|
| 14 |
Configuration class for Oculus vision-language model.
|
| 15 |
+
|
| 16 |
+
Architecture: DINOv3 + SigLIP2 + LFM2.5-1.2B
|
| 17 |
+
|
| 18 |
+
Isaac 0.2 Features:
|
| 19 |
+
- Reasoning via Thinking Traces
|
| 20 |
+
- Perceptive Tool Calling + Focus (Zoom & Crop)
|
| 21 |
+
- Structured Outputs (JSON)
|
| 22 |
+
- Complex OCR
|
| 23 |
+
- Desktop UI Understanding
|
| 24 |
"""
|
| 25 |
+
|
| 26 |
model_type = "oculus"
|
| 27 |
+
|
| 28 |
def __init__(
|
| 29 |
self,
|
| 30 |
+
# Vision encoder settings (DINOv3 + SigLIP2)
|
| 31 |
+
dinov3_model_id: str = "facebook/dinov3-vitl16-pretrain-lvd1689m",
|
| 32 |
+
siglip_model_id: str = "google/siglip2-so400m-patch16-naflex",
|
| 33 |
+
dinov3_hidden_size: int = 1024, # DINOv3 ViT-L/16 output dim
|
| 34 |
+
siglip_hidden_size: int = 1152, # SigLIP2 SO400M output dim
|
| 35 |
+
|
| 36 |
# Projector settings
|
| 37 |
+
projector_hidden_dim: int = 4352,
|
| 38 |
num_vision_tokens: int = 64,
|
| 39 |
+
|
| 40 |
+
# Language model settings (LFM2.5-1.2B)
|
| 41 |
+
lm_model_id: str = "LiquidAI/LFM2.5-1.2B-Base",
|
| 42 |
lm_hidden_size: int = 1536,
|
| 43 |
vocab_size: int = 131072,
|
| 44 |
max_position_embeddings: int = 32768,
|
| 45 |
+
|
| 46 |
+
# Reasoning / Thinking Traces
|
| 47 |
reasoning_enabled: bool = True,
|
| 48 |
thinking_token: str = "<think>",
|
| 49 |
thinking_end_token: str = "</think>",
|
| 50 |
max_thinking_tokens: int = 256,
|
| 51 |
+
thinking_style: str = "structured", # "structured", "verbose", "minimal"
|
| 52 |
+
|
| 53 |
+
# Focus System (Perceptive Tool Calling)
|
| 54 |
+
enable_focus: bool = True,
|
| 55 |
+
focus_token: str = "<focus>",
|
| 56 |
+
focus_end_token: str = "</focus>",
|
| 57 |
+
max_focus_regions: int = 4,
|
| 58 |
+
focus_min_size: int = 64, # Minimum crop size in pixels
|
| 59 |
+
auto_focus_threshold: float = 0.7, # Confidence threshold to trigger focus
|
| 60 |
+
|
| 61 |
+
# Structured Output
|
| 62 |
+
structured_output_enabled: bool = True,
|
| 63 |
+
json_token: str = "<json>",
|
| 64 |
+
json_end_token: str = "</json>",
|
| 65 |
+
|
| 66 |
+
# OCR Settings
|
| 67 |
+
ocr_enabled: bool = True,
|
| 68 |
+
ocr_languages: List[str] = None, # None (or an empty list) falls back to ["en"]
|
| 69 |
+
ocr_confidence_threshold: float = 0.5,
|
| 70 |
+
|
| 71 |
+
# Desktop UI Understanding
|
| 72 |
+
ui_understanding_enabled: bool = True,
|
| 73 |
+
ui_element_classes: int = 50, # button, text_field, checkbox, etc.
|
| 74 |
+
|
| 75 |
# Output mode settings
|
| 76 |
+
output_mode: str = "text", # "text", "point", "box", "polygon", "json"
|
| 77 |
num_detection_classes: int = 80,
|
| 78 |
num_segmentation_classes: int = 150,
|
| 79 |
+
|
| 80 |
# Generation settings
|
| 81 |
max_new_tokens: int = 512,
|
| 82 |
temperature: float = 0.7,
|
| 83 |
top_p: float = 0.95,
|
| 84 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
**kwargs
|
| 86 |
):
|
| 87 |
super().__init__(**kwargs)
|
| 88 |
+
|
| 89 |
+
# Vision encoders
|
| 90 |
self.dinov3_model_id = dinov3_model_id
|
| 91 |
self.siglip_model_id = siglip_model_id
|
| 92 |
self.dinov3_hidden_size = dinov3_hidden_size
|
| 93 |
self.siglip_hidden_size = siglip_hidden_size
|
| 94 |
self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
|
| 95 |
+
|
| 96 |
# Projector
|
| 97 |
self.projector_hidden_dim = projector_hidden_dim
|
| 98 |
self.num_vision_tokens = num_vision_tokens
|
| 99 |
+
|
| 100 |
+
# Language model (LFM2.5)
|
| 101 |
+
self.lm_model_id = lm_model_id
|
| 102 |
self.lm_hidden_size = lm_hidden_size
|
| 103 |
self.vocab_size = vocab_size
|
| 104 |
self.max_position_embeddings = max_position_embeddings
|
| 105 |
+
|
| 106 |
+
# Reasoning / Thinking Traces
|
| 107 |
self.reasoning_enabled = reasoning_enabled
|
| 108 |
self.thinking_token = thinking_token
|
| 109 |
self.thinking_end_token = thinking_end_token
|
| 110 |
self.max_thinking_tokens = max_thinking_tokens
|
| 111 |
+
self.thinking_style = thinking_style
|
| 112 |
+
|
| 113 |
+
# Focus System
|
| 114 |
+
self.enable_focus = enable_focus
|
| 115 |
+
self.focus_token = focus_token
|
| 116 |
+
self.focus_end_token = focus_end_token
|
| 117 |
+
self.max_focus_regions = max_focus_regions
|
| 118 |
+
self.focus_min_size = focus_min_size
|
| 119 |
+
self.auto_focus_threshold = auto_focus_threshold
|
| 120 |
+
|
| 121 |
+
# Structured Output
|
| 122 |
+
self.structured_output_enabled = structured_output_enabled
|
| 123 |
+
self.json_token = json_token
|
| 124 |
+
self.json_end_token = json_end_token
|
| 125 |
+
|
| 126 |
+
# OCR
|
| 127 |
+
self.ocr_enabled = ocr_enabled
|
| 128 |
+
self.ocr_languages = ocr_languages or ["en"]
|
| 129 |
+
self.ocr_confidence_threshold = ocr_confidence_threshold
|
| 130 |
+
|
| 131 |
+
# Desktop UI
|
| 132 |
+
self.ui_understanding_enabled = ui_understanding_enabled
|
| 133 |
+
self.ui_element_classes = ui_element_classes
|
| 134 |
+
|
| 135 |
# Output modes
|
| 136 |
self.output_mode = output_mode
|
| 137 |
self.num_detection_classes = num_detection_classes
|
| 138 |
self.num_segmentation_classes = num_segmentation_classes
|
| 139 |
+
|
| 140 |
# Generation
|
| 141 |
self.max_new_tokens = max_new_tokens
|
| 142 |
self.temperature = temperature
|
| 143 |
self.top_p = top_p
|
| 144 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
@classmethod
|
| 146 |
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
| 147 |
"""Load config from pretrained path."""
|
| 148 |
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
| 149 |
return cls.from_dict(config_dict, **kwargs)
|
| 150 |
+
|
| 151 |
def to_dict(self) -> Dict[str, Any]:
|
| 152 |
"""Serialize config to dictionary."""
|
| 153 |
output = super().to_dict()
|