Upload oculus_unified_model/modeling_oculus.py with huggingface_hub
Browse files- oculus_unified_model/modeling_oculus.py +180 -168
oculus_unified_model/modeling_oculus.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
Oculus Unified Model
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import os
|
|
@@ -23,14 +25,7 @@ import numpy as np
|
|
| 23 |
import torch
|
| 24 |
import torch.nn as nn
|
| 25 |
import torch.nn.functional as F
|
| 26 |
-
from transformers import
|
| 27 |
-
PreTrainedModel,
|
| 28 |
-
PretrainedConfig,
|
| 29 |
-
AutoImageProcessor,
|
| 30 |
-
AutoModel,
|
| 31 |
-
AutoTokenizer,
|
| 32 |
-
AutoModelForCausalLM,
|
| 33 |
-
)
|
| 34 |
from PIL import Image
|
| 35 |
|
| 36 |
from .configuration_oculus import OculusConfig
|
|
@@ -89,116 +84,90 @@ class OculusPolygonOutput(OculusOutput):
|
|
| 89 |
@dataclass
|
| 90 |
class OculusOCROutput(OculusOutput):
|
| 91 |
"""Output for OCR mode."""
|
| 92 |
-
text_blocks: Optional[List[Dict[str, Any]]] = None
|
| 93 |
full_text: Optional[str] = None
|
| 94 |
|
| 95 |
|
| 96 |
@dataclass
|
| 97 |
class OculusUIOutput(OculusOutput):
|
| 98 |
"""Output for UI element detection."""
|
| 99 |
-
elements: Optional[List[Dict[str, Any]]] = None
|
| 100 |
|
| 101 |
|
| 102 |
# ============================================================================
|
| 103 |
-
# Vision Encoder
|
| 104 |
# ============================================================================
|
| 105 |
|
| 106 |
class OculusVisionEncoder(nn.Module):
|
| 107 |
"""
|
| 108 |
-
|
| 109 |
|
| 110 |
-
|
| 111 |
-
SigLIP2: Strong at text/language alignment
|
| 112 |
"""
|
| 113 |
|
| 114 |
def __init__(self, config: OculusConfig):
|
| 115 |
super().__init__()
|
| 116 |
self.config = config
|
| 117 |
|
| 118 |
-
|
| 119 |
-
self.
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
self.config.dinov3_model_id
|
| 136 |
-
)
|
| 137 |
-
self.dinov3 = AutoModel.from_pretrained(
|
| 138 |
-
self.config.dinov3_model_id
|
| 139 |
-
).eval().to(device)
|
| 140 |
-
print(f" ✓ DINOv3: {self.config.dinov3_model_id}")
|
| 141 |
-
except Exception as e:
|
| 142 |
-
warnings.warn(f"Failed to load DINOv3: {e}")
|
| 143 |
-
self.dinov3_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")
|
| 144 |
-
self.dinov3 = AutoModel.from_pretrained("facebook/dinov2-large").eval().to(device)
|
| 145 |
-
print(" ✓ DINOv2-large (fallback)")
|
| 146 |
-
|
| 147 |
-
# SigLIP2
|
| 148 |
-
try:
|
| 149 |
-
self.siglip_processor = AutoImageProcessor.from_pretrained(
|
| 150 |
-
self.config.siglip_model_id
|
| 151 |
-
)
|
| 152 |
-
self.siglip = AutoModel.from_pretrained(
|
| 153 |
-
self.config.siglip_model_id
|
| 154 |
-
).eval().to(device)
|
| 155 |
-
print(f" ✓ SigLIP2: {self.config.siglip_model_id}")
|
| 156 |
-
except Exception as e:
|
| 157 |
-
warnings.warn(f"Failed to load SigLIP2: {e}")
|
| 158 |
-
from transformers import SiglipVisionModel
|
| 159 |
-
self.siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
|
| 160 |
-
self.siglip = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").eval().to(device)
|
| 161 |
-
print(" ✓ SigLIP-base (fallback)")
|
| 162 |
-
|
| 163 |
-
self._loaded = True
|
| 164 |
-
|
| 165 |
-
@torch.no_grad()
|
| 166 |
-
def forward(self, image: Union[Image.Image, torch.Tensor, np.ndarray]) -> torch.Tensor:
|
| 167 |
-
"""Encode image with both vision encoders and fuse features."""
|
| 168 |
-
if not self._loaded:
|
| 169 |
-
self.load_encoders()
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
elif isinstance(image, torch.Tensor):
|
| 174 |
-
image = Image.fromarray(image.cpu().numpy().astype(np.uint8))
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
| 178 |
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
#
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
d_out = self.dinov3(**d_inputs)
|
| 185 |
-
d_pooled = d_out.pooler_output if hasattr(d_out, 'pooler_output') and d_out.pooler_output is not None else d_out.last_hidden_state[:, 0]
|
| 186 |
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
s_inputs = {k: v.to(device) for k, v in s_inputs.items()}
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
else:
|
| 195 |
-
s_out = self.siglip(**s_inputs)
|
| 196 |
-
s_pooled = s_out.pooler_output if hasattr(s_out, 'pooler_output') else s_out.last_hidden_state[:, 0]
|
| 197 |
|
| 198 |
-
|
| 199 |
-
fused = torch.cat([d_pooled, s_pooled], dim=-1)
|
| 200 |
|
| 201 |
-
|
|
|
|
| 202 |
|
| 203 |
|
| 204 |
# ============================================================================
|
|
@@ -206,7 +175,7 @@ class OculusVisionEncoder(nn.Module):
|
|
| 206 |
# ============================================================================
|
| 207 |
|
| 208 |
class OculusProjector(nn.Module):
|
| 209 |
-
"""Projects
|
| 210 |
|
| 211 |
def __init__(self, config: OculusConfig):
|
| 212 |
super().__init__()
|
|
@@ -265,6 +234,73 @@ class OculusProjector(nn.Module):
|
|
| 265 |
return projector
|
| 266 |
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
# ============================================================================
|
| 269 |
# Task Heads
|
| 270 |
# ============================================================================
|
|
@@ -362,7 +398,7 @@ class OculusOCRHead(nn.Module):
|
|
| 362 |
self.text_detector = nn.Sequential(
|
| 363 |
nn.Linear(hidden_dim, hidden_dim),
|
| 364 |
nn.GELU(),
|
| 365 |
-
nn.Linear(hidden_dim, 5)
|
| 366 |
)
|
| 367 |
|
| 368 |
def forward(self, vision_tokens: torch.Tensor) -> torch.Tensor:
|
|
@@ -401,16 +437,18 @@ class OculusUIHead(nn.Module):
|
|
| 401 |
|
| 402 |
class OculusForConditionalGeneration(PreTrainedModel):
|
| 403 |
"""
|
| 404 |
-
Oculus:
|
| 405 |
|
| 406 |
-
|
| 407 |
|
| 408 |
-
|
| 409 |
- Reasoning via Thinking Traces
|
| 410 |
- Perceptive Tool Calling + Focus (Zoom & Crop)
|
| 411 |
-
- Structured Outputs (JSON)
|
| 412 |
- Complex OCR
|
| 413 |
- Desktop UI Understanding
|
|
|
|
|
|
|
| 414 |
"""
|
| 415 |
|
| 416 |
config_class = OculusConfig
|
|
@@ -423,13 +461,15 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 423 |
# Vision encoder
|
| 424 |
self.vision_encoder = OculusVisionEncoder(config)
|
| 425 |
|
| 426 |
-
# Vision adapter
|
| 427 |
-
self.vision_adapter =
|
| 428 |
-
self._actual_vision_dim = None
|
| 429 |
|
| 430 |
# Projector
|
| 431 |
self.projector = OculusProjector(config)
|
| 432 |
|
|
|
|
|
|
|
|
|
|
| 433 |
# Task-specific heads
|
| 434 |
self.detection_head = OculusDetectionHead(config)
|
| 435 |
self.point_head = OculusPointHead(config)
|
|
@@ -437,11 +477,6 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 437 |
self.ocr_head = OculusOCRHead(config)
|
| 438 |
self.ui_head = OculusUIHead(config)
|
| 439 |
|
| 440 |
-
# Language model (LFM2.5)
|
| 441 |
-
self.lm_tokenizer = None
|
| 442 |
-
self.lm_model = None
|
| 443 |
-
self._lm_loaded = False
|
| 444 |
-
|
| 445 |
# Special tokens
|
| 446 |
self.thinking_token = config.thinking_token
|
| 447 |
self.thinking_end_token = config.thinking_end_token
|
|
@@ -449,44 +484,35 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 449 |
self.focus_end_token = config.focus_end_token
|
| 450 |
self.json_token = config.json_token
|
| 451 |
self.json_end_token = config.json_end_token
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
-
def
|
| 454 |
-
"""Load LFM2.5 language model."""
|
| 455 |
-
if self._lm_loaded:
|
| 456 |
-
return
|
| 457 |
-
|
| 458 |
-
print("[Oculus] Loading language model...")
|
| 459 |
-
|
| 460 |
-
try:
|
| 461 |
-
self.lm_tokenizer = AutoTokenizer.from_pretrained(self.config.lm_model_id)
|
| 462 |
-
self.lm_model = AutoModelForCausalLM.from_pretrained(
|
| 463 |
-
self.config.lm_model_id
|
| 464 |
-
).to(device)
|
| 465 |
-
print(f" ✓ LFM2.5: {self.config.lm_model_id}")
|
| 466 |
-
self._lm_loaded = True
|
| 467 |
-
except Exception as e:
|
| 468 |
-
warnings.warn(f"Failed to load LFM2.5: {e}. Text generation unavailable.")
|
| 469 |
-
|
| 470 |
-
def encode_image(self, image: Union[Image.Image, str, np.ndarray]) -> torch.Tensor:
|
| 471 |
"""Encode image to vision tokens."""
|
| 472 |
if isinstance(image, str):
|
| 473 |
-
image = Image.open(image)
|
| 474 |
|
| 475 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
|
| 477 |
-
|
| 478 |
-
|
| 479 |
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
print(f" [Adapter] Creating vision adapter: {actual_dim} -> {expected_dim}")
|
| 483 |
-
self.vision_adapter = nn.Linear(actual_dim, expected_dim)
|
| 484 |
-
self._actual_vision_dim = actual_dim
|
| 485 |
-
nn.init.xavier_uniform_(self.vision_adapter.weight)
|
| 486 |
-
nn.init.zeros_(self.vision_adapter.bias)
|
| 487 |
|
| 488 |
-
|
|
|
|
| 489 |
|
|
|
|
| 490 |
vision_tokens = self.projector(vision_features)
|
| 491 |
|
| 492 |
return vision_tokens
|
|
@@ -499,9 +525,9 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 499 |
def _generate_thinking_trace(self, prompt: str, context: str = "") -> str:
|
| 500 |
"""Generate structured thinking trace."""
|
| 501 |
if self.config.thinking_style == "structured":
|
| 502 |
-
return f"Analyzing: {prompt[:50]}...
|
| 503 |
elif self.config.thinking_style == "verbose":
|
| 504 |
-
return f"Let me think step by step
|
| 505 |
else:
|
| 506 |
return ""
|
| 507 |
|
|
@@ -526,8 +552,6 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 526 |
think: Enable reasoning traces
|
| 527 |
focus: Enable zoom/crop for fine-grained perception
|
| 528 |
"""
|
| 529 |
-
self.vision_encoder.load_encoders()
|
| 530 |
-
|
| 531 |
if isinstance(image, str):
|
| 532 |
image = Image.open(image).convert('RGB')
|
| 533 |
elif isinstance(image, np.ndarray):
|
|
@@ -557,30 +581,12 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 557 |
raise ValueError(f"Unknown mode: {mode}")
|
| 558 |
|
| 559 |
def _generate_text(self, image, prompt, vision_tokens, thinking_trace, max_new_tokens, **kwargs) -> OculusTextOutput:
|
| 560 |
-
"""Generate text output
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
if self.lm_model is None:
|
| 565 |
-
return OculusTextOutput(
|
| 566 |
-
text="[Language model not available]",
|
| 567 |
-
thinking_trace=thinking_trace,
|
| 568 |
-
vision_tokens=vision_tokens
|
| 569 |
-
)
|
| 570 |
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
inputs = {k: v.to(self.lm_model.device) for k, v in inputs.items()}
|
| 574 |
-
|
| 575 |
-
with torch.no_grad():
|
| 576 |
-
outputs = self.lm_model.generate(
|
| 577 |
-
**inputs,
|
| 578 |
-
max_new_tokens=max_new_tokens or self.config.max_new_tokens,
|
| 579 |
-
temperature=self.config.temperature,
|
| 580 |
-
do_sample=True
|
| 581 |
-
)
|
| 582 |
-
|
| 583 |
-
text = self.lm_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 584 |
|
| 585 |
return OculusTextOutput(
|
| 586 |
text=text,
|
|
@@ -590,9 +596,15 @@ class OculusForConditionalGeneration(PreTrainedModel):
|
|
| 590 |
|
| 591 |
def _generate_json(self, image, prompt, vision_tokens, thinking_trace, **kwargs) -> OculusJSONOutput:
|
| 592 |
"""Generate structured JSON output."""
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
return OculusJSONOutput(
|
| 595 |
-
json_data=
|
|
|
|
| 596 |
thinking_trace=thinking_trace,
|
| 597 |
vision_tokens=vision_tokens
|
| 598 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
Oculus Unified Model
|
| 3 |
|
| 4 |
+
Oceanir-Oculus OO1 Architecture - Hybrid-reasoning vision-language model.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Reasoning via Thinking Traces
|
| 8 |
+
- Perceptive Tool Calling + Focus (Zoom & Crop)
|
| 9 |
+
- Structured Outputs (JSON, Box, Point)
|
| 10 |
+
- Complex OCR
|
| 11 |
+
- Desktop UI Understanding
|
| 12 |
+
|
| 13 |
+
Small models that outperform systems 10x larger on visual reasoning
|
| 14 |
+
and perception tasks, running on commodity GPUs or edge devices.
|
| 15 |
"""
|
| 16 |
|
| 17 |
import os
|
|
|
|
| 25 |
import torch
|
| 26 |
import torch.nn as nn
|
| 27 |
import torch.nn.functional as F
|
| 28 |
+
from transformers import PreTrainedModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
from PIL import Image
|
| 30 |
|
| 31 |
from .configuration_oculus import OculusConfig
|
|
|
|
| 84 |
@dataclass
class OculusOCROutput(OculusOutput):
    """
    Result container returned in OCR mode.

    Attributes:
        text_blocks: Optional list of dictionaries with string keys and
            arbitrary values; defaults to ``None``. Presumably one entry per
            recognised text block -- the schema is not defined in this class.
        full_text: Optional string; defaults to ``None``.
    """

    text_blocks: Optional[List[Dict[str, Any]]] = None
    full_text: Optional[str] = None
|
| 89 |
|
| 90 |
|
| 91 |
@dataclass
class OculusUIOutput(OculusOutput):
    """
    Result container returned by UI element detection.

    Attributes:
        elements: Optional list of dictionaries with string keys and
            arbitrary values; defaults to ``None``. Presumably one entry per
            detected UI element -- the schema is not defined in this class.
    """

    elements: Optional[List[Dict[str, Any]]] = None
|
| 95 |
|
| 96 |
|
| 97 |
# ============================================================================
|
| 98 |
+
# Vision Encoder
|
| 99 |
# ============================================================================
|
| 100 |
|
| 101 |
class OculusVisionEncoder(nn.Module):
    """
    Oceanir-Oculus OO1 Vision Encoder.

    The image is cut into non-overlapping patches by a strided convolution,
    a learnable CLS token is prepended, learned position embeddings are
    added, and the sequence is passed through a stack of
    ``nn.TransformerEncoderLayer`` blocks followed by a LayerNorm. The
    normalised CLS token is the output.
    """

    def __init__(self, config: OculusConfig):
        super().__init__()
        self.config = config

        width = config.vision_hidden_size
        patch = config.patch_size

        # Kernel size == stride, so every patch is embedded exactly once.
        self.patch_embed = nn.Conv2d(3, width, kernel_size=patch, stride=patch)

        # One position per patch plus one extra slot for the CLS token.
        patches_per_side = config.image_size // patch
        self.pos_embed = nn.Parameter(
            torch.zeros(1, patches_per_side ** 2 + 1, width)
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, width))

        blocks = []
        for _ in range(config.vision_num_layers):
            blocks.append(
                nn.TransformerEncoderLayer(
                    d_model=width,
                    nhead=config.vision_num_heads,
                    dim_feedforward=width * 4,
                    batch_first=True,
                )
            )
        self.layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(width)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Encode images to vision features.

        Args:
            pixel_values: [batch, 3, H, W]

        Returns:
            Vision features [batch, hidden_size] (the normalised CLS token).
        """
        # [batch, hidden, h, w] -> [batch, h * w, hidden]
        tokens = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)

        # Prepend one CLS token per sample.
        cls = self.cls_token.expand(pixel_values.shape[0], -1, -1)
        tokens = torch.cat((cls, tokens), dim=1)

        # Position table is sliced to the actual sequence length.
        tokens = tokens + self.pos_embed[:, :tokens.shape[1], :]

        for block in self.layers:
            tokens = block(tokens)

        return self.norm(tokens)[:, 0]
|
| 171 |
|
| 172 |
|
| 173 |
# ============================================================================
|
|
|
|
| 175 |
# ============================================================================
|
| 176 |
|
| 177 |
class OculusProjector(nn.Module):
|
| 178 |
+
"""Projects vision features to language model token space."""
|
| 179 |
|
| 180 |
def __init__(self, config: OculusConfig):
|
| 181 |
super().__init__()
|
|
|
|
| 234 |
return projector
|
| 235 |
|
| 236 |
|
| 237 |
+
# ============================================================================
|
| 238 |
+
# Language Model
|
| 239 |
+
# ============================================================================
|
| 240 |
+
|
| 241 |
+
class OculusLanguageModel(nn.Module):
    """
    Oceanir-Oculus OO1 Language Model.

    Token and learned position embeddings feed a stack of
    ``nn.TransformerDecoderLayer`` blocks; each block receives the current
    hidden sequence as both target and memory. A LayerNorm and a bias-free
    linear head produce vocabulary logits.

    NOTE(review): no causal mask is applied, so every position can attend to
    later positions. Confirm whether that is intended before using the logits
    for autoregressive training or generation.
    """

    def __init__(self, config: OculusConfig):
        super().__init__()
        self.config = config

        self.embed_tokens = nn.Embedding(config.vocab_size, config.lm_hidden_size)
        self.pos_embed = nn.Embedding(config.max_position_embeddings, config.lm_hidden_size)

        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(
                d_model=config.lm_hidden_size,
                nhead=config.lm_num_heads,
                dim_feedforward=config.lm_hidden_size * 4,
                batch_first=True
            )
            for _ in range(config.lm_num_layers)
        ])

        self.norm = nn.LayerNorm(config.lm_hidden_size)
        self.lm_head = nn.Linear(config.lm_hidden_size, config.vocab_size, bias=False)

    @staticmethod
    def _build_padding_mask(
        attention_mask: Optional[torch.Tensor],
        seq_len: int,
        num_vision: int,
    ) -> Optional[torch.Tensor]:
        """Convert a keep-mask (non-zero = attend) into a bool key-padding mask (True = ignore)."""
        if attention_mask is None:
            return None
        if attention_mask.dim() != 2:
            raise ValueError(
                f"attention_mask must be 2-D [batch, seq], got shape {tuple(attention_mask.shape)}"
            )

        keep = attention_mask != 0
        if num_vision and keep.shape[1] == seq_len:
            # Mask covers text only: vision tokens are always attended to.
            vision_keep = keep.new_ones((keep.shape[0], num_vision))
            keep = torch.cat([vision_keep, keep], dim=1)
        elif keep.shape[1] != seq_len + num_vision:
            raise ValueError(
                f"attention_mask length {keep.shape[1]} matches neither the text length "
                f"{seq_len} nor the full sequence length {seq_len + num_vision}"
            )
        return ~keep

    def forward(
        self,
        input_ids: torch.Tensor,
        vision_tokens: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Generate logits from input tokens.

        Args:
            input_ids: Token ids, [batch, seq_len].
            vision_tokens: Optional embeddings prepended to the text sequence
                along dim 1; assumed [batch, num_vision, lm_hidden_size].
            attention_mask: Optional mask where non-zero marks real tokens and
                zero marks padding. May cover the text tokens only
                ([batch, seq_len]) or the full vision + text sequence. Padded
                positions are excluded as attention keys. A row that masks
                every position is not supported.

        Returns:
            Logits for the text positions only, [batch, seq_len, vocab_size].

        Raises:
            ValueError: If ``attention_mask`` is not 2-D or its length matches
                neither the text length nor the full sequence length.
        """
        batch_size, seq_len = input_ids.shape
        device = input_ids.device

        # Token embeddings
        hidden = self.embed_tokens(input_ids)

        # Position embeddings (text tokens only, positions start at 0)
        positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
        hidden = hidden + self.pos_embed(positions)

        # Prepend vision tokens if provided
        num_vision = 0
        if vision_tokens is not None:
            num_vision = vision_tokens.shape[1]
            hidden = torch.cat([vision_tokens, hidden], dim=1)

        # Previously the mask was accepted but silently ignored.
        padding_mask = self._build_padding_mask(attention_mask, seq_len, num_vision)

        # Transformer layers: the sequence serves as both target and memory,
        # so the same padding mask applies to self- and cross-attention keys.
        for layer in self.layers:
            hidden = layer(
                hidden,
                hidden,
                tgt_key_padding_mask=padding_mask,
                memory_key_padding_mask=padding_mask,
            )

        hidden = self.norm(hidden)

        # Only return logits for text tokens
        if vision_tokens is not None:
            hidden = hidden[:, num_vision:, :]

        logits = self.lm_head(hidden)

        return logits
|
| 302 |
+
|
| 303 |
+
|
| 304 |
# ============================================================================
|
| 305 |
# Task Heads
|
| 306 |
# ============================================================================
|
|
|
|
| 398 |
self.text_detector = nn.Sequential(
|
| 399 |
nn.Linear(hidden_dim, hidden_dim),
|
| 400 |
nn.GELU(),
|
| 401 |
+
nn.Linear(hidden_dim, 5)
|
| 402 |
)
|
| 403 |
|
| 404 |
def forward(self, vision_tokens: torch.Tensor) -> torch.Tensor:
|
|
|
|
| 437 |
|
| 438 |
class OculusForConditionalGeneration(PreTrainedModel):
|
| 439 |
"""
|
| 440 |
+
Oculus: Hybrid-Reasoning Vision-Language Model
|
| 441 |
|
| 442 |
+
Oceanir-Oculus OO1 Architecture
|
| 443 |
|
| 444 |
+
Features:
|
| 445 |
- Reasoning via Thinking Traces
|
| 446 |
- Perceptive Tool Calling + Focus (Zoom & Crop)
|
| 447 |
+
- Structured Outputs (JSON, Box, Point)
|
| 448 |
- Complex OCR
|
| 449 |
- Desktop UI Understanding
|
| 450 |
+
|
| 451 |
+
Small models that outperform systems 10x larger on visual reasoning.
|
| 452 |
"""
|
| 453 |
|
| 454 |
config_class = OculusConfig
|
|
|
|
| 461 |
# Vision encoder
|
| 462 |
self.vision_encoder = OculusVisionEncoder(config)
|
| 463 |
|
| 464 |
+
# Vision adapter for dimension matching
|
| 465 |
+
self.vision_adapter = nn.Linear(config.vision_hidden_size, config.fused_vision_dim)
|
|
|
|
| 466 |
|
| 467 |
# Projector
|
| 468 |
self.projector = OculusProjector(config)
|
| 469 |
|
| 470 |
+
# Language model
|
| 471 |
+
self.language_model = OculusLanguageModel(config)
|
| 472 |
+
|
| 473 |
# Task-specific heads
|
| 474 |
self.detection_head = OculusDetectionHead(config)
|
| 475 |
self.point_head = OculusPointHead(config)
|
|
|
|
| 477 |
self.ocr_head = OculusOCRHead(config)
|
| 478 |
self.ui_head = OculusUIHead(config)
|
| 479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
# Special tokens
|
| 481 |
self.thinking_token = config.thinking_token
|
| 482 |
self.thinking_end_token = config.thinking_end_token
|
|
|
|
| 484 |
self.focus_end_token = config.focus_end_token
|
| 485 |
self.json_token = config.json_token
|
| 486 |
self.json_end_token = config.json_end_token
|
| 487 |
+
self.box_token = config.box_token
|
| 488 |
+
self.box_end_token = config.box_end_token
|
| 489 |
+
self.point_token = config.point_token
|
| 490 |
+
self.point_end_token = config.point_end_token
|
| 491 |
|
| 492 |
+
def encode_image(self, image: Union[Image.Image, str, np.ndarray, torch.Tensor]) -> torch.Tensor:
    """
    Encode image to vision tokens.

    Args:
        image: One of
            - a file path, opened with PIL and converted to RGB;
            - a PIL image, converted to RGB and resized to
              ``config.image_size`` x ``config.image_size``;
            - a NumPy array, converted to float and divided by 255; a 3-D
              array is treated as H x W x C and becomes [1, C, H, W]
              (arrays are not resized);
            - a ``torch.Tensor``, used as-is except that a 3-D tensor gets a
              leading batch dimension (assumed C x H x W -- the encoder's
              convolution expects channels first). Tensors are not rescaled.

    Returns:
        The output of ``self.projector`` applied to the adapted vision
        features.
    """
    if isinstance(image, str):
        # Context manager closes the file; convert() forces the pixel load.
        with Image.open(image) as opened:
            image = opened.convert('RGB')

    if isinstance(image, Image.Image):
        # Grayscale / RGBA / palette inputs would otherwise reach the
        # 3-channel patch embedding with the wrong channel count.
        if image.mode != 'RGB':
            image = image.convert('RGB')
        side = self.config.image_size
        image = np.array(image.resize((side, side)))

    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image).float()
        if image.dim() == 3:
            image = image.permute(2, 0, 1).unsqueeze(0)
        image = image / 255.0
    elif isinstance(image, torch.Tensor) and image.dim() == 3:
        # Unbatched tensor: add the batch dimension the encoder indexes on.
        image = image.unsqueeze(0)

    device = next(self.parameters()).device
    image = image.to(device)

    # Encode with vision encoder
    vision_features = self.vision_encoder(image)

    # Adapt dimensions
    vision_features = self.vision_adapter(vision_features)

    # Project to language space
    vision_tokens = self.projector(vision_features)

    return vision_tokens
|
|
|
|
| 525 |
def _generate_thinking_trace(self, prompt: str, context: str = "") -> str:
|
| 526 |
"""Generate structured thinking trace."""
|
| 527 |
if self.config.thinking_style == "structured":
|
| 528 |
+
return f"{self.thinking_token}Analyzing: {prompt[:50]}...{self.thinking_end_token}"
|
| 529 |
elif self.config.thinking_style == "verbose":
|
| 530 |
+
return f"{self.thinking_token}Let me think step by step: {prompt}{self.thinking_end_token}"
|
| 531 |
else:
|
| 532 |
return ""
|
| 533 |
|
|
|
|
| 552 |
think: Enable reasoning traces
|
| 553 |
focus: Enable zoom/crop for fine-grained perception
|
| 554 |
"""
|
|
|
|
|
|
|
| 555 |
if isinstance(image, str):
|
| 556 |
image = Image.open(image).convert('RGB')
|
| 557 |
elif isinstance(image, np.ndarray):
|
|
|
|
| 581 |
raise ValueError(f"Unknown mode: {mode}")
|
| 582 |
|
| 583 |
def _generate_text(self, image, prompt, vision_tokens, thinking_trace, max_new_tokens, **kwargs) -> OculusTextOutput:
|
| 584 |
+
"""Generate text output."""
|
| 585 |
+
# Placeholder - full implementation would do autoregressive generation
|
| 586 |
+
text = f"[Generated response for: {prompt[:50]}...]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
+
if thinking_trace:
|
| 589 |
+
text = f"{thinking_trace} {text}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
|
| 591 |
return OculusTextOutput(
|
| 592 |
text=text,
|
|
|
|
| 596 |
|
| 597 |
def _generate_json(self, image, prompt, vision_tokens, thinking_trace, **kwargs) -> OculusJSONOutput:
    """
    Generate structured JSON output.

    Builds a fixed payload holding the prompt, a constant ``"generated"``
    response and an empty object list, and returns it both as a dict and
    as JSON text wrapped in the JSON start/end tokens. ``image`` and extra
    keyword arguments are not used.
    """
    payload = dict(prompt=prompt, response="generated", objects=[])
    serialized = json.dumps(payload)
    wrapped = f"{self.json_token}{serialized}{self.json_end_token}"

    return OculusJSONOutput(
        json_data=payload,
        text=wrapped,
        thinking_trace=thinking_trace,
        vision_tokens=vision_tokens,
    )
|