kobiakor15 committed on
Commit
d6e0b94
ยท
verified ยท
1 Parent(s): e931398

Upload benchmark_vlm.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_vlm.py +519 -0
benchmark_vlm.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Oculus VLM Benchmark Suite
4
+
5
+ Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream.
6
+ Benchmarks:
7
+ 1. VQA v2 (subset)
8
+ 2. RefCOCO Grounding
9
+ 3. Counting (CVBench-style)
10
+ 4. COCO Detection (mAP)
11
+ 5. Captioning (BLEU)
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import random
18
+ import time
19
+ from pathlib import Path
20
+ from dataclasses import dataclass
21
+ from typing import List, Dict, Optional
22
+ from collections import defaultdict
23
+
24
+ import numpy as np
25
+ import torch
26
+ from PIL import Image
27
+ from tqdm import tqdm
28
+
29
+ OCULUS_ROOT = Path(__file__).parent
30
+ sys.path.insert(0, str(OCULUS_ROOT))
31
+
32
+ from oculus_inference import OculusPredictor
33
+
34
+
35
+ # ============================================================================
36
+ # Benchmark Utilities
37
+ # ============================================================================
38
+
39
def compute_iou(box1, box2):
    """Compute IoU between two boxes [x1, y1, x2, y2]."""
    # Intersection rectangle edges.
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp to zero when the boxes do not overlap.
    overlap = max(0, right - left) * max(0, bottom - top)
    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])

    # Epsilon guards against division by zero for degenerate boxes.
    union = area_a + area_b - overlap + 1e-8
    return overlap / union
51
+
52
+
53
+ # ============================================================================
54
+ # Benchmark 1: VQA v2 Style
55
+ # ============================================================================
56
+
57
class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    Since no real VQA v2 annotation file is assumed locally, yes/no and
    identification questions are synthesized from COCO instance annotations.
    """

    def __init__(self, data_dir="data/coco", max_samples=200):
        """Build up to *max_samples* (image, question, answer) triples.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        # Load COCO instance annotations to generate VQA-style questions.
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"

        if not ann_file.exists():
            print("โš ๏ธ COCO annotations not found")
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_cats = defaultdict(set)

        # Collect the set of category names present in each image.
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        # Generate one QA pair per image from a randomly chosen category.
        for img in instances['images']:
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            cats = list(img_cats.get(img['id'], []))
            if not cats:
                continue

            cat = random.choice(cats)

            # Question templates: presence (yes/no) and identification.
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]

            q, a = random.choice(templates)
            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat
            })

            if len(self.samples) >= max_samples:
                break

        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run VQA evaluation.

        Accuracy is computed over queries that completed; inference failures
        are counted separately instead of being silently swallowed.

        Returns:
            Dict with 'accuracy', 'correct', 'total', and 'failures'.
        """
        print("\n๐Ÿ“Š VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        total = 0
        failures = 0  # inference calls that raised instead of answering

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])

                # Lenient scoring: the reference answer just has to appear
                # as a substring of the model's reply.
                if sample['answer'] in answer.lower():
                    correct += 1
                total += 1
            except Exception:
                # Track failed calls rather than dropping them silently.
                failures += 1

        accuracy = correct / total if total > 0 else 0
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        if failures:
            print(f" Failures: {failures}")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'failures': failures
        }
139
+
140
+
141
+ # ============================================================================
142
+ # Benchmark 2: RefCOCO Grounding
143
+ # ============================================================================
144
+
145
class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    Synthesizes simple referring expressions ("the <category>") with the
    matching ground-truth box, one randomly chosen object per image.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to *max_samples* (image, expression, gt_box) samples.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations by image
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            if not anns:
                continue

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick a random object to refer to.
            ann = random.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            # COCO bbox is [x, y, w, h] in pixels; normalize to
            # [x1, y1, x2, y2] in [0, 1].
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height']
            ]

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box
            })

            if len(self.samples) >= max_samples:
                break

        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run grounding evaluation.

        Returns:
            Dict with 'mean_iou', 'accuracy_50' (IoU >= 0.5 hit rate over
            all samples), and 'num_samples'.
        """
        print("\n๐Ÿ“Š RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0

        for sample in tqdm(self.samples, desc="RefCOCO"):
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")

                if len(results['boxes']) > 0:
                    # Take the first box; assumed to be the highest-confidence
                    # prediction — TODO confirm detector sorts by score.
                    pred_box = results['boxes'][0]
                    iou = compute_iou(sample['gt_box'], pred_box)
                    ious.append(iou)
                    if iou >= 0.5:
                        acc_50 += 1
                else:
                    ious.append(0)
            except Exception:
                # A failed inference call scores IoU 0. (Was a bare `except:`
                # that also swallowed KeyboardInterrupt/SystemExit.)
                ious.append(0)

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples)
        }
237
+
238
+
239
+ # ============================================================================
240
+ # Benchmark 3: Counting (CountBench Style)
241
+ # ============================================================================
242
+
243
class CountBenchmark:
    """Object counting benchmark (CountBench style, built from COCO)."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Collect images containing 2-10 instances of some category.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Count non-crowd objects per image per category; crowd regions have
        # no reliable instance count.
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick category with 2-10 objects (reasonable counting range)
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count
                    })
                    break

            if len(self.samples) >= max_samples:
                break

        print(f" CountBench: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run counting evaluation.

        Asks the model for a count, parses the first numeric (or number-word)
        token from the reply, and scores exact accuracy, within-one accuracy,
        and MAE. Accuracies are over all samples; MAE over parsed answers.

        Returns:
            Dict with 'exact_accuracy', 'within_one_accuracy', 'mae', 'total'.
        """
        print("\n๐Ÿ“Š CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []

        word_to_num = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                       'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

        for sample in tqdm(self.samples, desc="Counting"):
            try:
                # NOTE(review): naive pluralization ("{cat}s") yields e.g.
                # "persons"/"sheeps" for some COCO categories; prompt kept
                # as-is for compatibility — confirm the model tolerates it.
                question = f"How many {sample['category']}s are in this image? Answer with a number."
                answer = model.ask(sample['path'], question).lower()
            except Exception:
                # Failed inference: sample still counts in the denominator.
                continue

            # Parse the first numeric token; strip adjacent punctuation so
            # replies like "3." or "three!" still parse.
            pred = None
            for word in answer.split():
                token = word.strip('.,!?:;')
                try:
                    pred = int(token)
                    break
                except ValueError:
                    if token in word_to_num:
                        pred = word_to_num[token]
                        break

            if pred is not None:
                gt = sample['count']
                if pred == gt:
                    exact += 1
                if abs(pred - gt) <= 1:
                    within_one += 1
                errors.append(abs(pred - gt))

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total
        }
343
+
344
+
345
+ # ============================================================================
346
+ # Benchmark 4: COCO Detection (mAP)
347
+ # ============================================================================
348
+
349
class DetectionBenchmark:
    """Object Detection benchmark."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Load up to *max_samples* images with normalized GT boxes/labels.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of loaded images.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        # Map COCO category ids to contiguous indices (the model's assumed
        # label space — TODO confirm it matches OculusPredictor's mapping).
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # COCO bbox is [x, y, w, h] in pixels; normalize to
                # [x1, y1, x2, y2] in [0, 1].
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height']
                ])
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels
                })

            if len(self.samples) >= max_samples:
                break

        print(f" Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run detection evaluation.

        NOTE(review): the reported "mAP@0.5" is really per-GT-box recall at
        IoU >= 0.5 with matching label (no confidence ranking or PR curve),
        i.e. a proxy for true COCO mAP. Key name kept for compatibility.

        Returns:
            Dict with 'mean_iou', 'map_50', and 'num_samples'.
        """
        print("\n๐Ÿ“Š COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []

        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])

                pred_boxes = results['boxes']
                pred_labels = [int(lbl) for lbl in results['labels']]

                # Score each GT box by its best-IoU prediction; 'correct'
                # reflects whether that best match also has the right label.
                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    correct = False

                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = compute_iou(gt_box, list(pred_box))
                        if iou > best_iou:
                            best_iou = iou
                            correct = (iou >= 0.5) and (pred_label == gt_label)

                    all_ious.append(best_iou)
                    all_correct.append(correct)
            except Exception:
                # Best-effort: a failed image contributes nothing. (Was a
                # bare `except:` that also swallowed KeyboardInterrupt.)
                continue

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples)
        }
446
+
447
+
448
+ # ============================================================================
449
+ # Main Runner
450
+ # ============================================================================
451
+
452
def run_all_benchmarks():
    """Run complete benchmark suite.

    Loads the Oculus model once, runs the four benchmarks (VQA, RefCOCO
    grounding, counting, detection), prints a summary table, and writes the
    aggregated metrics to benchmark_results.json next to this script.

    Returns:
        Dict mapping benchmark name to its metrics dict.
    """
    print("=" * 60)
    print("๐Ÿ”ฎ OCULUS VLM BENCHMARK SUITE")
    print("=" * 60)

    # Initialize model (shared across all benchmarks).
    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    results = {}

    # Run benchmarks; each benchmark loads its own COCO-derived samples.
    print("\n[Running Benchmarks]")

    # 1. VQA
    vqa = VQABenchmark(max_samples=200)
    results['vqa_v2'] = vqa.evaluate(model)

    # 2. RefCOCO
    refcoco = RefCOCOBenchmark(max_samples=100)
    results['refcoco'] = refcoco.evaluate(model)

    # 3. Counting
    counting = CountBenchmark(max_samples=100)
    results['countbench'] = counting.evaluate(model)

    # 4. Detection
    detection = DetectionBenchmark(max_samples=100)
    results['coco_detection'] = detection.evaluate(model)

    # Summary table (assumes every benchmark returned its metrics dict;
    # raises KeyError if a benchmark was skipped for missing data).
    print("\n" + "=" * 60)
    print("๐Ÿ“Š BENCHMARK RESULTS SUMMARY")
    print("=" * 60)

    print(f"""
โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
โ•‘ OCULUS BENCHMARKS โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ VQA v2 (Style) โ•‘
โ•‘ Accuracy: {results['vqa_v2']['accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ RefCOCO Grounding โ•‘
โ•‘ Mean IoU: {results['refcoco']['mean_iou']:.4f} โ•‘
โ•‘ Acc@0.5: {results['refcoco']['accuracy_50']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ CountBench โ•‘
โ•‘ Exact Accuracy: {results['countbench']['exact_accuracy']:.2%} โ•‘
โ•‘ Within-1 Acc: {results['countbench']['within_one_accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ COCO Detection โ•‘
โ•‘ Mean IoU: {results['coco_detection']['mean_iou']:.4f} โ•‘
โ•‘ mAP@0.5: {results['coco_detection']['map_50']:.4f} โ•‘
โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
""")

    # Save results as JSON next to this script.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"๐Ÿ’พ Results saved to: {output_path}")

    return results
516
+
517
+
518
# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    run_all_benchmarks()