#!/usr/bin/env python3
"""
Oculus VLM Benchmark Suite

Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream.

Benchmarks:
1. VQA v2 (subset)
2. RefCOCO Grounding
3. Counting (CVBench-style)
4. COCO Detection (reported as "mAP@0.5"; computed here as a per-ground-truth
   hit rate at IoU >= 0.5, not the COCO mAP protocol)
5. Captioning (BLEU) -- listed for parity; not implemented in this file

All benchmarks derive their samples from the COCO ``instances_train2017.json``
annotation file and the images found under ``<data_dir>/images``.
"""

import os
import sys
import json
import random
import re
import time
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional
from collections import defaultdict

import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

OCULUS_ROOT = Path(__file__).parent
sys.path.insert(0, str(OCULUS_ROOT))

from oculus_inference import OculusPredictor


# ============================================================================
# Benchmark Utilities
# ============================================================================

# Spelled-out numbers accepted when parsing a counting answer.
_WORD_TO_NUM = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
}

# Splits an answer into digit runs and alphabetic words, dropping punctuation.
_COUNT_TOKEN_RE = re.compile(r"[0-9]+|[a-z]+")


def compute_iou(box1, box2):
    """Compute IoU between two boxes [x1, y1, x2, y2].

    Both boxes must use the same coordinate system. A small epsilon in the
    denominator keeps the result finite when both boxes have zero area.
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

    return inter / (area1 + area2 - inter + 1e-8)


def _load_coco_instances(data_dir) -> Optional[Dict]:
    """Load ``instances_train2017.json`` from *data_dir*, or return None if absent."""
    ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
    if not ann_file.exists():
        print(f"⚠️ COCO annotations not found: {ann_file}")
        return None
    with open(ann_file, encoding="utf-8") as f:
        return json.load(f)


def _make_rng(seed: Optional[int]):
    """Return a seeded ``random.Random``, or the global ``random`` module when seed is None."""
    return random if seed is None else random.Random(seed)


def _normalized_xyxy(bbox, img) -> List[float]:
    """Convert a COCO ``[x, y, w, h]`` pixel box to ``[x1, y1, x2, y2]`` in 0-1 units."""
    x, y, w, h = bbox
    return [
        x / img['width'],
        y / img['height'],
        (x + w) / img['width'],
        (y + h) / img['height'],
    ]


def _parse_count(answer: str) -> Optional[int]:
    """Return the first number (digits or a word zero..ten) found in *answer*.

    Punctuation is ignored, so "There are 3." and "three, I think" both parse.
    Returns None when no number is present.
    """
    for token in _COUNT_TOKEN_RE.findall(answer.lower()):
        if token[0].isdigit():
            return int(token)
        if token in _WORD_TO_NUM:
            return _WORD_TO_NUM[token]
    return None


def _report_failures(failed: int, last_error) -> None:
    """Print a one-line warning when any sample raised during evaluation."""
    if failed:
        print(f" ⚠️ {failed} sample(s) raised an error and were scored as misses "
              f"(last error: {last_error!r})")


# ============================================================================
# Benchmark 1: VQA v2 Style
# ============================================================================

class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions."""

    def __init__(self, data_dir="data/coco", max_samples=200, seed=None):
        """Build up to *max_samples* question/answer pairs from COCO categories.

        Args:
            data_dir: Directory holding ``annotations/`` and ``images/``.
            max_samples: Maximum number of samples to load.
            seed: Optional seed for reproducible category/template choices.
                When None the global ``random`` state is used.
        """
        self.samples = []

        # Load COCO annotations to generate VQA-style questions
        instances = _load_coco_instances(data_dir)
        if instances is None:
            return

        rng = _make_rng(seed)
        cat_map = {c['id']: c['name'] for c in instances['categories']}

        img_cats = defaultdict(set)
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        # Generate VQA samples
        for img in instances['images']:
            if len(self.samples) >= max_samples:
                break

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Sorted so the choice does not depend on per-process set ordering.
            cats = sorted(img_cats.get(img['id'], ()))
            if not cats:
                continue

            cat = rng.choice(cats)

            # Create different question types
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]
            q, a = rng.choice(templates)

            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat,
            })

        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run VQA evaluation.

        A sample is correct when the expected answer appears as a substring of
        the lower-cased model response. Samples on which the model raises are
        counted in the total and scored as incorrect.

        Returns:
            Dict with ``accuracy``, ``correct``, ``total`` and ``failed``.
        """
        print("\n📊 VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        failed = 0
        last_error = None

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])
                # Check if expected answer is in response
                if sample['answer'] in answer.lower():
                    correct += 1
            except Exception as exc:
                failed += 1
                last_error = exc

        total = len(self.samples)
        accuracy = correct / total if total > 0 else 0
        _report_failures(failed, last_error)
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'failed': failed,
        }


# ============================================================================
# Benchmark 2: RefCOCO Grounding
# ============================================================================

class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes."""

    def __init__(self, data_dir="data/coco", max_samples=100, seed=None):
        """Build up to *max_samples* grounding samples, one random object per image.

        Args:
            data_dir: Directory holding ``annotations/`` and ``images/``.
            max_samples: Maximum number of samples to load.
            seed: Optional seed for reproducible object choice. When None the
                global ``random`` state is used.
        """
        self.samples = []

        instances = _load_coco_instances(data_dir)
        if instances is None:
            return

        rng = _make_rng(seed)
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations by image
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick a random object
            ann = rng.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': _normalized_xyxy(ann['bbox'], img),
            })

        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run grounding evaluation.

        Scores the first box returned by ``model.detect`` against the ground
        truth. Samples with no prediction, or on which the model raises, score
        an IoU of 0.

        Returns:
            Dict with ``mean_iou``, ``accuracy_50``, ``num_samples`` and ``failed``.
        """
        print("\n📊 RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0
        failed = 0
        last_error = None

        for sample in tqdm(self.samples, desc="RefCOCO"):
            iou = 0.0
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")
                if len(results['boxes']) > 0:
                    # Take the first box; assumes the model returns boxes sorted
                    # by confidence in normalized xyxy form -- TODO confirm.
                    iou = float(compute_iou(sample['gt_box'], results['boxes'][0]))
            except Exception as exc:
                failed += 1
                last_error = exc

            ious.append(iou)
            if iou >= 0.5:
                acc_50 += 1

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        _report_failures(failed, last_error)
        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed,
        }


# ============================================================================
# Benchmark 3: Counting (CountBench Style)
# ============================================================================

class CountBenchmark:
    """Object counting benchmark."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to *max_samples* counting samples from non-crowd COCO annotations.

        For each image the first category with 2-10 instances is used.
        """
        self.samples = []

        instances = _load_coco_instances(data_dir)
        if instances is None:
            return

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Count objects per image per category
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick category with 2-10 objects (reasonable counting range)
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count,
                    })
                    break

        print(f" CountBench: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run counting evaluation.

        Accuracies are computed over all samples, so unparseable answers and
        model errors count as wrong. MAE is computed only over answers from
        which a number could be parsed.

        Returns:
            Dict with ``exact_accuracy``, ``within_one_accuracy``, ``mae``,
            ``total`` and ``failed``.
        """
        print("\n📊 CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []
        failed = 0
        last_error = None

        for sample in tqdm(self.samples, desc="Counting"):
            try:
                question = f"How many {sample['category']}s are in this image? Answer with a number."
                pred = _parse_count(model.ask(sample['path'], question))
            except Exception as exc:
                failed += 1
                last_error = exc
                continue

            if pred is None:
                continue

            diff = abs(pred - sample['count'])
            if diff == 0:
                exact += 1
            if diff <= 1:
                within_one += 1
            errors.append(diff)

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        _report_failures(failed, last_error)
        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total,
            'failed': failed,
        }


# ============================================================================
# Benchmark 4: COCO Detection (mAP)
# ============================================================================

class DetectionBenchmark:
    """Object Detection benchmark."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to *max_samples* images with normalized boxes and class indices.

        Labels are the position of the annotation's category within the
        annotation file's ``categories`` list (unknown ids map to 0).
        """
        self.samples = []

        instances = _load_coco_instances(data_dir)
        if instances is None:
            return

        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            if len(self.samples) >= max_samples:
                break

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                boxes.append(_normalized_xyxy(ann['bbox'], img))
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels,
                })

        print(f" Detection: Loaded {len(self.samples)} samples")

    @staticmethod
    def _score_image(gt_boxes, gt_labels, pred_boxes, pred_labels):
        """Match each ground-truth box to its highest-IoU prediction.

        Returns two parallel lists, one entry per ground-truth box: the best
        IoU, and whether that best prediction has IoU >= 0.5 and the same label.
        """
        ious = []
        hits = []
        for gt_box, gt_label in zip(gt_boxes, gt_labels):
            best_iou = 0
            correct = False
            for pred_box, pred_label in zip(pred_boxes, pred_labels):
                iou = float(compute_iou(gt_box, list(pred_box)))
                if iou > best_iou:
                    best_iou = iou
                    correct = (iou >= 0.5) and (pred_label == gt_label)
            ious.append(best_iou)
            hits.append(correct)
        return ious, hits

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run detection evaluation.

        NOTE: the value reported as ``map_50`` / "mAP@0.5" is the fraction of
        ground-truth boxes whose best-overlapping prediction has IoU >= 0.5 and
        a matching label. It is not the COCO mean-average-precision metric; the
        key name is kept for compatibility with existing result files.

        Images on which the model raises contribute a miss (IoU 0) for every
        ground-truth box.

        Returns:
            Dict with ``mean_iou``, ``map_50``, ``num_samples`` and ``failed``.
        """
        print("\n📊 COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []
        failed = 0
        last_error = None

        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])
                pred_boxes = results['boxes']
                pred_labels = [int(label) for label in results['labels']]
                ious, hits = self._score_image(
                    sample['boxes'], sample['labels'], pred_boxes, pred_labels
                )
            except Exception as exc:
                failed += 1
                last_error = exc
                ious = [0] * len(sample['boxes'])
                hits = [False] * len(sample['boxes'])

            all_ious.extend(ious)
            all_correct.extend(hits)

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        _report_failures(failed, last_error)
        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed,
        }


# ============================================================================
# Main Runner
# ============================================================================

def _format_summary(results: Dict) -> str:
    """Render the results as a fixed-width box so rows stay aligned for any value width."""
    width = 59
    rule = "═" * width

    def row(text=""):
        return "║" + text.ljust(width) + "║"

    vqa = results['vqa_v2']
    refcoco = results['refcoco']
    count = results['countbench']
    det = results['coco_detection']

    lines = [
        "╔" + rule + "╗",
        row("OCULUS BENCHMARKS".center(width)),
        "╠" + rule + "╣",
        row(" VQA v2 (Style)"),
        row(f"   Accuracy: {vqa['accuracy']:.2%}"),
        "╠" + rule + "╣",
        row(" RefCOCO Grounding"),
        row(f"   Mean IoU: {refcoco['mean_iou']:.4f}"),
        row(f"   Acc@0.5: {refcoco['accuracy_50']:.2%}"),
        "╠" + rule + "╣",
        row(" CountBench"),
        row(f"   Exact Accuracy: {count['exact_accuracy']:.2%}"),
        row(f"   Within-1 Acc: {count['within_one_accuracy']:.2%}"),
        "╠" + rule + "╣",
        row(" COCO Detection"),
        row(f"   Mean IoU: {det['mean_iou']:.4f}"),
        row(f"   mAP@0.5: {det['map_50']:.4f}"),
        "╚" + rule + "╝",
    ]
    return "\n".join(lines)


def run_all_benchmarks(data_dir="data/coco", seed=None):
    """Run complete benchmark suite.

    Loads the Oculus model, runs the VQA, RefCOCO, counting and detection
    benchmarks, prints a summary, and writes the results to
    ``benchmark_results.json`` next to this file.

    Args:
        data_dir: COCO data directory passed to every benchmark.
        seed: Optional seed forwarded to the benchmarks that sample randomly.

    Returns:
        Dict mapping benchmark name to its metrics dict.
    """
    print("=" * 60)
    print("🔮 OCULUS VLM BENCHMARK SUITE")
    print("=" * 60)

    # Initialize model
    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    results = {}

    # Run benchmarks
    print("\n[Running Benchmarks]")

    # 1. VQA
    vqa = VQABenchmark(data_dir=data_dir, max_samples=200, seed=seed)
    results['vqa_v2'] = vqa.evaluate(model)

    # 2. RefCOCO
    refcoco = RefCOCOBenchmark(data_dir=data_dir, max_samples=100, seed=seed)
    results['refcoco'] = refcoco.evaluate(model)

    # 3. Counting
    counting = CountBenchmark(data_dir=data_dir, max_samples=100)
    results['countbench'] = counting.evaluate(model)

    # 4. Detection
    detection = DetectionBenchmark(data_dir=data_dir, max_samples=100)
    results['coco_detection'] = detection.evaluate(model)

    # Summary
    print("\n" + "=" * 60)
    print("📊 BENCHMARK RESULTS SUMMARY")
    print("=" * 60)
    print("\n" + _format_summary(results) + "\n")

    # Save results
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"💾 Results saved to: {output_path}")

    return results


if __name__ == "__main__":
    run_all_benchmarks()