OceanirAI
/

Oculus

+#!/usr/bin/env python3
+"""
+Oculus Car Part Detection Demo
+Demonstrates detection on car images using the extended training model.
+"""
+import sys
+import requests
+from io import BytesIO
+from PIL import Image, ImageDraw, ImageFont
+import torch
+import numpy as np
+# Add parent to path
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent))
+from oculus_unified_model import OculusForConditionalGeneration
def visualize_results(image, output, filename="output_car_parts.png"):
    """Draw detection boxes and labels on ``image`` and save the result.

    Args:
        image: PIL image to annotate. It is drawn on in place.
        output: Detection result exposing parallel ``boxes``, ``labels`` and
            ``confidences`` sequences. Each box is unpacked as
            ``(x1, y1, x2, y2)`` and treated as normalized to ``[0, 1]``.
        filename: Path the annotated image is written to.

    Boxes that are empty or inverted after clamping are skipped. Labels that
    are not a valid index into the COCO class list are rendered verbatim.
    """
    draw = ImageDraw.Draw(image)

    # The Helvetica path only exists on macOS; fall back to PIL's built-in
    # font when the file cannot be read.
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
    except OSError:
        font = ImageFont.load_default()

    width, height = image.size

    coco_classes = (
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush',
    )

    for box, label, conf in zip(output.boxes, output.labels, output.confidences):
        # Convert to plain floats before clamping/scaling. If the elements are
        # tensor or array views (not confirmed from this file), an in-place
        # ``*=`` would otherwise write back into ``output.boxes``.
        x1, y1, x2, y2 = (max(0.0, min(1.0, float(v))) for v in box)

        # Skip degenerate boxes (zero area or inverted corners).
        if x2 <= x1 or y2 <= y1:
            continue

        left, top = x1 * width, y1 * height
        right, bottom = x2 * width, y2 * height

        conf = float(conf)
        color = "red" if conf < 0.5 else "green"
        draw.rectangle([left, top, right, bottom], outline=color, width=3)

        # Resolve the class name. Negative indices are rejected explicitly so
        # that e.g. -1 is not silently reported as the last class.
        try:
            class_index = int(label)
        except (TypeError, ValueError):
            class_index = -1
        if 0 <= class_index < len(coco_classes):
            class_name = coco_classes[class_index]
        else:
            class_name = str(label)
        label_text = f"{class_name} ({conf:.2f})"

        # Filled background behind the text keeps the label readable.
        text_bbox = draw.textbbox((left, top), label_text, font=font)
        draw.rectangle(text_bbox, fill=color)
        draw.text((left, top), label_text, fill="white", font=font)

    image.save(filename)
    print(f"Saved visualization to {filename}")
def _find_latest_step_dir(checkpoint_dir):
    """Return the ``step_<N>`` subdirectory with the highest N, or None."""
    if not checkpoint_dir.is_dir():
        return None

    numbered = []
    for entry in checkpoint_dir.iterdir():
        if not (entry.is_dir() and entry.name.startswith("step_")):
            continue
        try:
            step = int(entry.name.split("_")[1])
        except ValueError:
            # Not a numbered checkpoint (e.g. "step_final"); ignore it.
            continue
        numbered.append((step, entry))

    if not numbered:
        return None
    return max(numbered, key=lambda item: item[0])[1]


def _load_image(image_path):
    """Load ``image_path`` as RGB, downloading a sample image if it is missing."""
    if Path(image_path).exists():
        # convert() returns a new in-memory image, so the file handle can be
        # closed as soon as the conversion is done.
        with Image.open(image_path) as img:
            return img.convert('RGB')

    # Fallback to an online sample image.
    url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/8d/President_Barack_Obama.jpg/800px-President_Barack_Obama.jpg"
    print(f"Image not found, downloading sample {url}...")
    # A timeout prevents hanging forever on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of handing an error
    # page to PIL.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')


def main():
    """Command-line entry point for the Oculus demo.

    Parses the command-line options, loads the newest available checkpoint
    (falling back to the V1 detection checkpoint), loads the input image and
    runs the model in the selected mode:

    * ``box``: runs detection and saves an annotated image to ``--output``.
    * ``caption``: prints a generated caption.
    * ``vqa``: prints the answer to ``--prompt`` (or a default question).

    Model-loading and image-processing errors are reported on stdout; the
    function then returns without raising.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Oculus General Object Detection Demo")
    parser.add_argument("--image", type=str, help="Path to image file to test")
    parser.add_argument("--prompt", type=str, default="Detect objects", help="Text prompt for the model")
    parser.add_argument("--mode", type=str, default="box", choices=["box", "vqa", "caption"], help="Inference mode")
    parser.add_argument("--threshold", type=float, default=0.2, help="Detection threshold")
    parser.add_argument("--output", type=str, default="detection_result.png", help="Output filename")
    args = parser.parse_args()

    # Prefer the highest-numbered step checkpoint, then the "final" folder.
    checkpoint_dir = Path("checkpoints/oculus_detection_v2")
    latest_step_dir = _find_latest_step_dir(checkpoint_dir)
    if latest_step_dir is not None:
        model_path = str(latest_step_dir)
        print(f"✨ Found latest checkpoint: {model_path}")
    else:
        model_path = str(checkpoint_dir / "final")

    # Fallback to initial detection checkpoint if extended one isn't ready
    if not Path(model_path).exists():
        model_path = "checkpoints/oculus_detection/final"
        print(f"⚠️ Extended V2 model not found, falling back to V1: {model_path}")

    print(f"Loading model from {model_path}...")
    try:
        model = OculusForConditionalGeneration.from_pretrained(model_path)
        # Detection head weights are stored separately from the base model.
        heads_path = Path(model_path) / "heads.pth"
        if heads_path.exists():
            # NOTE(review): torch.load unpickles the file, so only load
            # heads.pth from trusted checkpoints (consider weights_only=True
            # on torch versions that support it).
            heads = torch.load(heads_path, map_location="cpu")
            model.detection_head.load_state_dict(heads['detection'])
            print("✓ Loaded detection heads")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    if args.image:
        image_path = args.image
        print(f"\nProcessing Custom Image: {image_path}...")
    else:
        # Default to a local COCO sample; _load_image downloads a sample
        # image when this file is not present.
        image_path = "data/coco/images/000000071345.jpg"
        print(f"\nProcessing Default Image: {image_path}...")

    try:
        image = _load_image(image_path)

        if args.mode == "box":
            print(f"Running detection with prompt: '{args.prompt}'...")
            output = model.generate(
                image,
                mode="box",
                prompt=args.prompt,
                threshold=args.threshold
            )
            print(f"Found {len(output.boxes)} objects")
            visualize_results(image, output, args.output)
        elif args.mode == "caption":
            print("Generating caption...")
            output = model.generate(image, mode="text", prompt="A photo of")
            print(f"\n📝 Caption: {output.text}\n")
        elif args.mode == "vqa":
            # The detection default prompt makes no sense as a question, so
            # substitute a generic one when the user did not supply a prompt.
            question = args.prompt if args.prompt != "Detect objects" else "What is in this image?"
            print(f"Thinking about question: '{question}'...")
            output = model.generate(image, mode="text", prompt=question)
            print(f"\n🤔 Answer: {output.text}\n")
    except Exception as e:
        print(f"Error processing image: {e}")


if __name__ == "__main__":
    main()