OceanirAI
/

Oculus

+#!/usr/bin/env python3
+"""
+Oculus 0.2 Unified Demo
+Demonstrates all features of the unified Oculus model:
+- Text mode (captioning, VQA)
+- Point mode (counting objects)
+- Box mode (detection with bounding boxes)
+- Polygon mode (segmentation)
+- Optional reasoning with thinking traces
+- Focus system for fine-grained perception
+"""
+import os
+import sys
+import requests
+from pathlib import Path
+from io import BytesIO
+from PIL import Image
+import torch
+# Add parent to path
+sys.path.insert(0, str(Path(__file__).parent))
+from oculus_unified_model import OculusForConditionalGeneration, OculusConfig
def download_image(url: str, timeout: float = 10) -> Image.Image:
    """Fetch an image over HTTP(S) and return it as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10, the value this function always used.

    Returns:
        The downloaded image converted to ``'RGB'`` mode.

    Raises:
        requests.HTTPError: If the server replies with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For other request failures such as
            connection errors or timeouts.
    """
    # NOTE(review): presumably some hosts reject the default client
    # User-Agent, hence the browser-like value — confirm before changing.
    headers = {'User-Agent': 'Mozilla/5.0'}

    # The context manager releases the connection even when
    # raise_for_status() raises.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        payload = response.content

    # convert() returns a new, fully loaded image, so the lazily opened
    # source image can be closed as soon as the conversion is done.
    with Image.open(BytesIO(payload)) as source:
        return source.convert('RGB')
def print_header(title: str, *, width: int = 70) -> None:
    """Print *title* as a top-level banner framed by two rules of ``=``.

    Args:
        title: Text shown between the rules, prefixed with the 🔮 marker.
        width: Number of ``=`` characters in each rule. Defaults to 70.
    """
    rule = "=" * width
    # The leading newline separates the banner from earlier output.
    print("\n" + rule)
    print(f"🔮 {title}")
    print(rule)
def print_section(title: str, *, width: int = 70) -> None:
    """Print *title* as a sub-section heading framed by two thin rules.

    Args:
        title: Text shown between the rules, indented by three spaces.
        width: Number of ``─`` characters in each rule. Defaults to 70.
    """
    rule = "─" * width
    # The leading newline separates the heading from earlier output.
    print(f"\n{rule}")
    print(f"   {title}")
    print(rule)
# Images exercised by the demo: display name plus download URL.
_TEST_IMAGES = (
    {
        "name": "Cat on Couch",
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
    },
    {
        "name": "Golden Gate Bridge",
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/GoldenGateBridge-001.jpg/1200px-GoldenGateBridge-001.jpg",
    },
)

# Questions asked in the VQA part of the text-mode demo.
_VQA_QUESTIONS = (
    "What colors are visible in this image?",
    "Is this indoors or outdoors?",
)

# How much of each result is echoed to the console.
_THINKING_PREVIEW_CHARS = 200
_MAX_LISTED_DETECTIONS = 5
_MAX_LISTED_REGIONS = 3

# Closing text printed once after all images have been processed.
_CLOSING_SUMMARY = """
    Oculus 0.2 supports:
    📝 TEXT MODE
       - Image captioning
       - Visual question answering
       - With optional reasoning traces
    📍 POINT MODE
       - Object counting
       - Point localization
    📦 BOX MODE
       - Object detection
       - Bounding box prediction
    🔷 POLYGON MODE
       - Semantic segmentation
       - Instance segmentation
    🧠 REASONING
       - Optional thinking traces
       - Multi-step reasoning
    🔍 FOCUS SYSTEM
       - Zoom & crop for fine-grained perception
       - Automatic region detection
    Usage:
    ```python
    from oculus_unified_model import OculusForConditionalGeneration
    model = OculusForConditionalGeneration.from_pretrained("./checkpoints/oculus_coco/final")
    # Caption
    output = model.generate(image, mode="text", prompt="Describe this")
    # VQA with reasoning
    output = model.generate(image, mode="text", prompt="What color is it?", think=True)
    # Detection
    output = model.generate(image, mode="box", prompt="Find cars")
    # Segmentation
    output = model.generate(image, mode="polygon")
    ```
    """


def _preview(text, limit=_THINKING_PREVIEW_CHARS):
    """Return *text* cut to *limit* items, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _load_model():
    """Build the model from the local checkpoint if present, else from a default config."""
    weights_path = Path(__file__).parent / "checkpoints" / "oculus_coco" / "final"
    if weights_path.exists():
        print(f"    Found trained weights at: {weights_path}")
        return OculusForConditionalGeneration.from_pretrained(weights_path)

    print("    Using default configuration")
    config = OculusConfig(
        reasoning_enabled=True,
        enable_focus=True,
    )
    return OculusForConditionalGeneration(config)


def _demo_captioning(model, image):
    """Run text mode without thinking and print the caption."""
    print_section("📝 TEXT MODE - Captioning")
    output = model.generate(
        image=image,
        prompt="Describe this image in detail",
        mode="text",
        think=False,
    )
    print(f"    Caption: \"{output.text}\"")


def _demo_reasoning(model, image):
    """Run text mode with thinking enabled and print the trace preview and answer."""
    print_section("🧠 TEXT MODE - With Reasoning")
    output = model.generate(
        image=image,
        prompt="What is the main subject of this image?",
        mode="text",
        think=True,  # Enable thinking traces
    )
    if output.thinking_trace:
        print(f"    💭 Thinking: {_preview(output.thinking_trace)}")
    print(f"    Answer: \"{output.text}\"")


def _demo_vqa(model, image):
    """Ask each VQA question in text mode and print question/answer pairs."""
    print_section("❓ TEXT MODE - VQA")
    for question in _VQA_QUESTIONS:
        output = model.generate(
            image=image,
            prompt=question,
            mode="text",
        )
        print(f"    Q: {question}")
        print(f"    A: {output.text}")


def _demo_points(model, image):
    """Run point mode and list the first few points with label and confidence."""
    print_section("📍 POINT MODE - Object Counting")
    output = model.generate(
        image=image,
        prompt="Find objects",
        mode="point",
    )
    print(f"    Detected {len(output.points)} points")
    shown = zip(
        output.points[:_MAX_LISTED_DETECTIONS],
        output.labels[:_MAX_LISTED_DETECTIONS],
        output.confidences[:_MAX_LISTED_DETECTIONS],
    )
    for number, (pt, label, conf) in enumerate(shown, start=1):
        print(f"      Point {number}: {pt} (class={label}, conf={conf:.2f})")


def _demo_boxes(model, image):
    """Run box mode and list the first few boxes with label and confidence."""
    print_section("📦 BOX MODE - Object Detection")
    output = model.generate(
        image=image,
        prompt="Detect all objects",
        mode="box",
    )
    print(f"    Detected {len(output.boxes)} boxes")
    shown = zip(
        output.boxes[:_MAX_LISTED_DETECTIONS],
        output.labels[:_MAX_LISTED_DETECTIONS],
        output.confidences[:_MAX_LISTED_DETECTIONS],
    )
    for number, (box, label, conf) in enumerate(shown, start=1):
        coords = [f'{b:.2f}' for b in box]
        print(f"      Box {number}: {coords} (class={label}, conf={conf:.2f})")


def _demo_polygons(model, image):
    """Run polygon mode and print the mask shape and the first few regions."""
    print_section("🔷 POLYGON MODE - Segmentation")
    output = model.generate(
        image=image,
        prompt="Segment the scene",
        mode="polygon",
    )
    mask_shape = output.mask.shape if output.mask is not None else 'N/A'
    print(f"    Segmentation mask shape: {mask_shape}")
    print(f"    Detected {len(output.polygons)} regions")
    shown = zip(
        output.polygons[:_MAX_LISTED_REGIONS],
        output.labels[:_MAX_LISTED_REGIONS],
    )
    for number, (poly, label) in enumerate(shown, start=1):
        print(f"      Region {number}: class={label}, vertices={len(poly)}")


# Order in which the individual mode demos run for every image.
_MODE_DEMOS = (
    _demo_captioning,
    _demo_reasoning,
    _demo_vqa,
    _demo_points,
    _demo_boxes,
    _demo_polygons,
)


def demo():
    """Run the full console demo: load the model, exercise every mode, print a summary.

    For each test image the image is downloaded and passed through the
    text (captioning, reasoning, VQA), point, box and polygon demos.
    A failure while handling one image is reported with its traceback
    and does not stop the remaining images from being processed.
    """
    print_header("OCULUS 0.2 UNIFIED MODEL DEMO")

    print("\n[1] Loading Oculus Model...")
    model = _load_model()
    print("    ✓ Model loaded!")

    for test in _TEST_IMAGES:
        print_header(f"Testing: {test['name']}")
        try:
            print("\n[Downloading image...]")
            image = download_image(test["url"])
            print(f"    Image size: {image.size}")

            for run_mode in _MODE_DEMOS:
                run_mode(model, image)

            print("\n    ✅ All modes successful!")
        except Exception as e:
            # Deliberately broad: this is the demo's top-level boundary, and
            # one bad image (network or model error) should not abort the rest.
            print(f"\n    ❌ Error: {e}")
            import traceback
            traceback.print_exc()

    print_header("DEMO COMPLETE")
    print(_CLOSING_SUMMARY)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo()