File size: 7,413 Bytes
8d515d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
"""
Oculus Car Part Detection Demo

Demonstrates detection on car images using the extended training model.
"""

import sys
import requests
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import torch
import numpy as np

# Add parent to path
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

from oculus_unified_model import OculusForConditionalGeneration

def visualize_results(image, output, filename="output_car_parts.png"):
    """Draw bounding boxes and class labels on an image and save it to disk.

    Args:
        image: PIL Image (RGB); drawn on in place.
        output: model output with parallel sequences `.boxes` (normalized
            [x1, y1, x2, y2] per box), `.labels` (COCO class indices), and
            `.confidences` (floats).
            NOTE(review): assumed equal-length sequences — confirm upstream.
        filename: path the annotated image is saved to.
    """
    draw = ImageDraw.Draw(image)
    
    # Prefer a real TrueType font (macOS system path); fall back to
    # Pillow's built-in bitmap font if it isn't available.
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
    except OSError:
        font = ImageFont.load_default()
    
    width, height = image.size
    
    # COCO 80-class names, indexed by class id.
    COCO_CLASSES = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush'
    ]

    # Draw each detection.
    for box, label, conf in zip(output.boxes, output.labels, output.confidences):
        # Box is [x1, y1, x2, y2] in normalized [0, 1] coordinates.
        x1, y1, x2, y2 = box
        
        # Clamp to the valid normalized range.
        x1 = max(0.0, min(1.0, x1))
        y1 = max(0.0, min(1.0, y1))
        x2 = max(0.0, min(1.0, x2))
        y2 = max(0.0, min(1.0, y2))
        
        # Skip degenerate boxes (zero/negative area after clamping).
        if x2 <= x1 or y2 <= y1:
            continue
            
        # Scale to pixel coordinates.
        x1 *= width
        y1 *= height
        x2 *= width
        y2 *= height
        
        # Low-confidence detections in red, confident ones in green.
        color = "red" if conf < 0.5 else "green"
        
        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
        
        # Map class index to a name; fall back to the raw label value.
        try:
            class_name = COCO_CLASSES[int(label)]
        except (IndexError, ValueError, TypeError):
            class_name = str(label)
            
        label_text = f"{class_name} ({conf:.2f})"
        
        # Filled background behind the text for readability.
        text_bbox = draw.textbbox((x1, y1), label_text, font=font)
        draw.rectangle(text_bbox, fill=color)
        draw.text((x1, y1), label_text, fill="white", font=font)
        
    image.save(filename)
    # BUG FIX: original printed the literal string "(unknown)" instead of
    # interpolating the actual output path.
    print(f"Saved visualization to {filename}")

def _find_checkpoint():
    """Locate the checkpoint directory to load, preferring the newest V2 step.

    Search order:
      1. Highest-numbered ``step_N`` folder under ``checkpoints/oculus_detection_v2``.
      2. ``checkpoints/oculus_detection_v2/final``.
      3. ``checkpoints/oculus_detection/final`` (V1 fallback) if the V2 path
         does not exist on disk.

    Returns:
        str: path to the chosen checkpoint directory (may not exist if even
        the V1 fallback is missing; the caller handles load failures).
    """
    checkpoint_dir = Path("checkpoints/oculus_detection_v2")
    model_path = None

    if checkpoint_dir.exists():
        # Collect (step_number, dir) for every step_* folder.
        steps = []
        for d in checkpoint_dir.iterdir():
            if d.is_dir() and d.name.startswith("step_"):
                try:
                    steps.append((int(d.name.split("_")[1]), d))
                except (ValueError, IndexError):
                    # Folder name doesn't follow step_<int>; skip it.
                    pass

        # Sort and pick latest
        if steps:
            steps.sort(key=lambda x: x[0], reverse=True)
            model_path = str(steps[0][1])
            print(f"✨ Found latest checkpoint: {model_path}")

    if model_path is None:
        model_path = str(checkpoint_dir / "final")

    # Fallback to initial detection checkpoint if extended one isn't ready
    if not Path(model_path).exists():
        model_path = "checkpoints/oculus_detection/final"
        print(f"⚠️ Extended V2 model not found, falling back to V1: {model_path}")
    return model_path

def main():
    """Parse CLI args, load the latest checkpoint, and run one inference pass.

    Modes: ``box`` (detection + visualization), ``caption`` (image
    captioning), ``vqa`` (visual question answering via the prompt).
    """
    import argparse
    parser = argparse.ArgumentParser(description="Oculus General Object Detection Demo")
    parser.add_argument("--image", type=str, help="Path to image file to test")
    parser.add_argument("--prompt", type=str, default="Detect objects", help="Text prompt for the model")
    parser.add_argument("--mode", type=str, default="box", choices=["box", "vqa", "caption"], help="Inference mode")
    parser.add_argument("--threshold", type=float, default=0.2, help="Detection threshold")
    parser.add_argument("--output", type=str, default="detection_result.png", help="Output filename")
    args = parser.parse_args()

    model_path = _find_checkpoint()

    print(f"Loading model from {model_path}...")
    try:
        model = OculusForConditionalGeneration.from_pretrained(model_path)

        # Load the task-specific head weights saved alongside the backbone.
        heads_path = Path(model_path) / "heads.pth"
        if heads_path.exists():
            heads = torch.load(heads_path, map_location="cpu")
            model.detection_head.load_state_dict(heads['detection'])
            print("✓ Loaded detection heads")
    except Exception as e:
        # Abort the demo cleanly on a missing/corrupt checkpoint.
        print(f"Error loading model: {e}")
        return

    # Image logic
    if args.image:
        image_path = args.image
        print(f"\nProcessing Custom Image: {image_path}...")
    else:
        # Default to a bundled COCO sample.
        image_path = "data/coco/images/000000071345.jpg" 
        print(f"\nProcessing Default Image: {image_path}...")
    
    try:
        if Path(image_path).exists():
            image = Image.open(image_path).convert('RGB')
        else:
            # Fallback: download a sample image from the web.
            url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/8d/President_Barack_Obama.jpg/800px-President_Barack_Obama.jpg"
            print(f"Image not found, downloading sample {url}...")
            # timeout added so a dead connection can't hang the demo forever.
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
            image = Image.open(BytesIO(response.content)).convert('RGB')

        # Mode selection
        if args.mode == "box":
            print(f"Running detection with prompt: '{args.prompt}'...")
            output = model.generate(
                image, 
                mode="box", 
                prompt=args.prompt,
                threshold=args.threshold
            )
            print(f"Found {len(output.boxes)} objects")
            visualize_results(image, output, args.output)
            
        elif args.mode == "caption":
            print("Generating caption...")
            output = model.generate(image, mode="text", prompt="A photo of")
            print(f"\n📝 Caption: {output.text}\n")
            
        elif args.mode == "vqa":
            # Default prompt doubles as "no question given" sentinel.
            question = args.prompt if args.prompt != "Detect objects" else "What is in this image?"
            print(f"Thinking about question: '{question}'...")
            output = model.generate(image, mode="text", prompt=question)
            print(f"\n🤔 Answer: {output.text}\n")
            
    except Exception as e:
        print(f"Error processing image: {e}")

# Script entry point: run the demo only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()