|
|
|
|
|
""" |
|
|
Oculus 0.2 Unified Demo |
|
|
|
|
|
Demonstrates all features of the unified Oculus model: |
|
|
- Text mode (captioning, VQA) |
|
|
- Point mode (counting objects) |
|
|
- Box mode (detection with bounding boxes) |
|
|
- Polygon mode (segmentation) |
|
|
- Optional reasoning with thinking traces |
|
|
- Focus system for fine-grained perception |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import requests |
|
|
from pathlib import Path |
|
|
from io import BytesIO |
|
|
|
|
|
from PIL import Image |
|
|
import torch |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent)) |
|
|
|
|
|
from oculus_unified_model import OculusForConditionalGeneration, OculusConfig |
|
|
|
|
|
|
|
|
def download_image(url: str) -> Image.Image:
    """Fetch *url* over HTTP and return its payload as an RGB PIL image.

    Raises:
        requests.HTTPError: if the server answers with a non-2xx status.
        requests.Timeout: if no response arrives within 10 seconds.
    """
    # Some image hosts (e.g. Wikimedia) refuse requests without a browser-like UA.
    ua = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=ua, timeout=10)
    resp.raise_for_status()
    payload = BytesIO(resp.content)
    # Normalize to RGB so downstream code never sees palette/grayscale modes.
    return Image.open(payload).convert('RGB')
|
|
|
|
|
|
|
|
def print_header(title: str):
    """Print *title* between two 70-character '=' rules, preceded by a blank line."""
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"๐ฎ {title}")
    print(rule)
|
|
|
|
|
|
|
|
def print_section(title: str):
    """Print *title* (indented one space) between two 70-character rules."""
    rule = "โ" * 70
    print("\n" + rule)
    print(" " + title)
    print(rule)
|
|
|
|
|
|
|
|
def _load_model():
    """Load trained weights if a checkpoint exists, else build a default-config model.

    Returns the ready-to-use OculusForConditionalGeneration instance.
    """
    print("\n[1] Loading Oculus Model...")

    weights_path = Path(__file__).parent / "checkpoints" / "oculus_coco" / "final"

    if weights_path.exists():
        print(f" Found trained weights at: {weights_path}")
        model = OculusForConditionalGeneration.from_pretrained(weights_path)
    else:
        # No checkpoint on disk: fall back to a freshly-initialized model so the
        # demo still runs end-to-end (outputs will be untrained noise).
        print(" Using default configuration")
        config = OculusConfig(
            reasoning_enabled=True,
            enable_focus=True,
        )
        model = OculusForConditionalGeneration(config)

    print(" โ Model loaded!")
    return model


def _demo_text_modes(model, image):
    """Exercise the text head: captioning, reasoning trace, and VQA."""
    print_section("๐ TEXT MODE - Captioning")
    output = model.generate(
        image=image,
        prompt="Describe this image in detail",
        mode="text",
        think=False
    )
    print(f" Caption: \"{output.text}\"")

    print_section("๐ง TEXT MODE - With Reasoning")
    output = model.generate(
        image=image,
        prompt="What is the main subject of this image?",
        mode="text",
        think=True
    )
    if output.thinking_trace:
        # Traces can be long; show only the first 200 characters.
        print(f" ๐ญ Thinking: {output.thinking_trace[:200]}...")
    print(f" Answer: \"{output.text}\"")

    print_section("โ TEXT MODE - VQA")
    questions = [
        "What colors are visible in this image?",
        "Is this indoors or outdoors?",
    ]
    for q in questions:
        output = model.generate(
            image=image,
            prompt=q,
            mode="text"
        )
        print(f" Q: {q}")
        print(f" A: {output.text}")


def _demo_point_mode(model, image):
    """Exercise point mode: object counting / point localization."""
    print_section("๐ POINT MODE - Object Counting")
    output = model.generate(
        image=image,
        prompt="Find objects",
        mode="point"
    )
    print(f" Detected {len(output.points)} points")
    # Show at most the first 5 predictions to keep the console readable.
    for i, (pt, label, conf) in enumerate(zip(
        output.points[:5],
        output.labels[:5],
        output.confidences[:5]
    )):
        print(f" Point {i+1}: {pt} (class={label}, conf={conf:.2f})")


def _demo_box_mode(model, image):
    """Exercise box mode: object detection with bounding boxes."""
    print_section("๐ฆ BOX MODE - Object Detection")
    output = model.generate(
        image=image,
        prompt="Detect all objects",
        mode="box"
    )
    print(f" Detected {len(output.boxes)} boxes")
    for i, (box, label, conf) in enumerate(zip(
        output.boxes[:5],
        output.labels[:5],
        output.confidences[:5]
    )):
        print(f" Box {i+1}: {[f'{b:.2f}' for b in box]} (class={label}, conf={conf:.2f})")


def _demo_polygon_mode(model, image):
    """Exercise polygon mode: segmentation masks and region polygons."""
    print_section("๐ท POLYGON MODE - Segmentation")
    output = model.generate(
        image=image,
        prompt="Segment the scene",
        mode="polygon"
    )
    # output.mask may legitimately be None when no regions were found.
    print(f" Segmentation mask shape: {output.mask.shape if output.mask is not None else 'N/A'}")
    print(f" Detected {len(output.polygons)} regions")
    for i, (poly, label) in enumerate(zip(
        output.polygons[:3],
        output.labels[:3]
    )):
        print(f" Region {i+1}: class={label}, vertices={len(poly)}")


def demo():
    """Run the full Oculus 0.2 demo.

    Loads the model (trained checkpoint if present, default config otherwise),
    then exercises every generation mode — text, point, box, polygon — on a
    small set of downloaded test images, and prints a usage summary.

    A failure on one image is caught, reported, and does not stop the demo
    from continuing with the remaining images.
    """
    print_header("OCULUS 0.2 UNIFIED MODEL DEMO")

    model = _load_model()

    test_images = [
        {
            "name": "Cat on Couch",
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"
        },
        {
            "name": "Golden Gate Bridge",
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/GoldenGateBridge-001.jpg/1200px-GoldenGateBridge-001.jpg"
        },
    ]

    for test in test_images:
        print_header(f"Testing: {test['name']}")

        try:
            print("\n[Downloading image...]")
            image = download_image(test["url"])
            print(f" Image size: {image.size}")

            _demo_text_modes(model, image)
            _demo_point_mode(model, image)
            _demo_box_mode(model, image)
            _demo_polygon_mode(model, image)

            print("\n โ All modes successful!")

        except Exception as e:
            # Best-effort demo: report the failure and move on to the next image.
            print(f"\n โ Error: {e}")
            import traceback
            traceback.print_exc()

    print_header("DEMO COMPLETE")

    print("""
Oculus 0.2 supports:

๐ TEXT MODE
 - Image captioning
 - Visual question answering
 - With optional reasoning traces

๐ POINT MODE
 - Object counting
 - Point localization

๐ฆ BOX MODE
 - Object detection
 - Bounding box prediction

๐ท POLYGON MODE
 - Semantic segmentation
 - Instance segmentation

๐ง REASONING
 - Optional thinking traces
 - Multi-step reasoning

๐ FOCUS SYSTEM
 - Zoom & crop for fine-grained perception
 - Automatic region detection

Usage:
```python
from oculus_unified_model import OculusForConditionalGeneration

model = OculusForConditionalGeneration.from_pretrained("./checkpoints/oculus_coco/final")

# Caption
output = model.generate(image, mode="text", prompt="Describe this")

# VQA with reasoning
output = model.generate(image, mode="text", prompt="What color is it?", think=True)

# Detection
output = model.generate(image, mode="box", prompt="Find cars")

# Segmentation
output = model.generate(image, mode="polygon")
```
""")
|
|
|
|
|
|
|
|
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    demo()
|
|
|