# Oculus / benchmark_vlm.py
# Uploaded by kobiakor15 via huggingface_hub (revision d6e0b94, verified)
#!/usr/bin/env python3
"""
Oculus VLM Benchmark Suite
Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream.
Benchmarks:
1. VQA v2 (subset)
2. RefCOCO Grounding
3. Counting (CVBench-style)
4. COCO Detection (mAP)
5. Captioning (BLEU)
"""
import os
import sys
import json
import random
import time
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional
from collections import defaultdict
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
OCULUS_ROOT = Path(__file__).parent
sys.path.insert(0, str(OCULUS_ROOT))
from oculus_inference import OculusPredictor
# ============================================================================
# Benchmark Utilities
# ============================================================================
def compute_iou(box1, box2):
    """Compute IoU between two boxes [x1, y1, x2, y2]."""
    # Intersection rectangle corners (may be degenerate if boxes are disjoint).
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    intersection = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_a + area_b - intersection
    # Epsilon guards against division by zero for zero-area boxes.
    return intersection / (union + 1e-8)
# ============================================================================
# Benchmark 1: VQA v2 Style
# ============================================================================
class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    Builds synthetic VQA-style (question, answer) pairs from COCO instance
    annotations: each sample asks about a category known to be present in
    the image, so the reference answer is correct by construction.
    """

    def __init__(self, data_dir="data/coco", max_samples=200):
        """Load COCO annotations and generate up to ``max_samples`` QA pairs.

        Args:
            data_dir: Root of a COCO-style layout (``annotations/`` + ``images/``).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            print("โš ๏ธ COCO annotations not found")
            return
        with open(ann_file) as f:
            instances = json.load(f)
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        # image id -> set of category names present in that image
        img_cats = defaultdict(set)
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))
        # Generate VQA samples only for images that exist on disk.
        for img in instances['images']:
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue
            cats = list(img_cats.get(img['id'], []))
            if not cats:
                continue
            cat = random.choice(cats)
            # Different question templates; "yes" answers are correct because
            # the category is guaranteed present in the image.
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]
            q, a = random.choice(templates)
            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat
            })
            if len(self.samples) >= max_samples:
                break
        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run VQA evaluation.

        Args:
            model: Predictor exposing ``ask(image_path, question) -> str``.

        Returns:
            Dict with ``accuracy`` (float), ``correct`` and ``total`` counts.
        """
        print("\n๐Ÿ“Š VQA v2 Style Benchmark")
        print("-" * 50)
        correct = 0
        total = 0
        failures = 0
        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])
            except Exception:
                # Failed inference is excluded from the denominator (original
                # behavior) but is now tracked instead of silently swallowed.
                failures += 1
                continue
            # Lenient scoring: expected answer as a substring of the response.
            if sample['answer'] in answer.lower():
                correct += 1
            total += 1
        if failures:
            print(f" โš ๏ธ {failures} samples failed during inference")
        accuracy = correct / total if total > 0 else 0
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total
        }
# ============================================================================
# Benchmark 2: RefCOCO Grounding
# ============================================================================
class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    Each sample pairs an image with a simple referring expression
    ("the <category>") and the normalized xyxy ground-truth box of one
    randomly chosen annotated object.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` grounding samples from COCO annotations."""
        self.samples = []
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return
        with open(ann_file) as f:
            instances = json.load(f)
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}
        # Group annotations by image so one referent can be sampled per image.
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)
        for img_id, anns in img_anns.items():
            if len(anns) < 1:
                continue
            img = img_map.get(img_id)
            if not img:
                continue
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue
            # Pick a random object to refer to.
            ann = random.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')
            # Convert COCO pixel xywh to normalized xyxy.
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height']
            ]
            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box
            })
            if len(self.samples) >= max_samples:
                break
        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run grounding evaluation.

        Args:
            model: Predictor exposing ``detect(path, prompt=...) -> {'boxes': ...}``.

        Returns:
            Dict with ``mean_iou``, ``accuracy_50`` (Acc@IoU0.5) and sample count.
        """
        print("\n๐Ÿ“Š RefCOCO Grounding Benchmark")
        print("-" * 50)
        ious = []
        acc_50 = 0
        for sample in tqdm(self.samples, desc="RefCOCO"):
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")
                if len(results['boxes']) > 0:
                    # Boxes are assumed ordered by confidence; score the top one.
                    pred_box = results['boxes'][0]
                    iou = compute_iou(sample['gt_box'], pred_box)
                    ious.append(iou)
                    if iou >= 0.5:
                        acc_50 += 1
                else:
                    ious.append(0)
            except Exception:
                # Narrowed from a bare except: inference failure scores 0 IoU,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                ious.append(0)
        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0
        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")
        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples)
        }
# ============================================================================
# Benchmark 3: Counting (CountBench Style)
# ============================================================================
class CountBenchmark:
    """Object counting benchmark (CountBench style)."""

    # Spelled-out numbers the model may answer with instead of digits.
    _WORD_TO_NUM = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` counting samples from COCO annotations."""
        self.samples = []
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return
        with open(ann_file) as f:
            instances = json.load(f)
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}
        # Per-image, per-category object counts (crowd annotations excluded).
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1
        for img_id, counts in img_counts.items():
            img = img_map.get(img_id)
            if not img:
                continue
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue
            # Keep one category per image with 2-10 instances — a range where
            # counting is neither trivial nor hopeless.
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count
                    })
                    break
            if len(self.samples) >= max_samples:
                break
        print(f" CountBench: Loaded {len(self.samples)} samples")

    @staticmethod
    def _parse_count(answer):
        """Extract the first number (digit or spelled-out word) from *answer*.

        Tokens are stripped of surrounding punctuation so replies like
        "There are 3." parse correctly (the old int() attempt failed on "3.").
        Returns None when no number is found. Expects lowercased input for
        word matching.
        """
        for word in answer.split():
            token = word.strip('.,!?;:()')
            if token.isdigit():
                return int(token)
            if token in CountBenchmark._WORD_TO_NUM:
                return CountBenchmark._WORD_TO_NUM[token]
        return None

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run counting evaluation.

        Args:
            model: Predictor exposing ``ask(image_path, question) -> str``.

        Returns:
            Dict with exact / within-1 accuracy, MAE over parsed answers,
            and the total sample count.
        """
        print("\n๐Ÿ“Š CountBench Benchmark")
        print("-" * 50)
        exact = 0
        within_one = 0
        errors = []
        for sample in tqdm(self.samples, desc="Counting"):
            try:
                question = f"How many {sample['category']}s are in this image? Answer with a number."
                answer = model.ask(sample['path'], question).lower()
            except Exception:
                # Inference failure: skip this sample (was a silent bare except).
                continue
            pred = self._parse_count(answer)
            if pred is not None:
                gt = sample['count']
                if pred == gt:
                    exact += 1
                if abs(pred - gt) <= 1:
                    within_one += 1
                errors.append(abs(pred - gt))
        total = len(self.samples)
        # Unparseable answers count against accuracy (denominator is total).
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0
        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")
        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total
        }
# ============================================================================
# Benchmark 4: COCO Detection (mAP)
# ============================================================================
class DetectionBenchmark:
    """Object Detection benchmark over COCO-style annotations."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` detection samples with normalized boxes.

        Labels are contiguous indices into the categories list (not raw COCO
        category ids); unknown category ids fall back to index 0.
        """
        self.samples = []
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return
        with open(ann_file) as f:
            instances = json.load(f)
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        # COCO category id -> contiguous class index.
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)
        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue
            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # Convert COCO pixel xywh to normalized xyxy.
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height']
                ])
                labels.append(cat_idx.get(ann['category_id'], 0))
            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels
                })
            if len(self.samples) >= max_samples:
                break
        print(f" Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run detection evaluation.

        For every ground-truth box, the best-IoU prediction is found; it counts
        as correct when IoU >= 0.5 and the predicted label matches.

        Returns:
            Dict with ``mean_iou``, ``map_50`` and the sample count.
        """
        print("\n๐Ÿ“Š COCO Detection Benchmark")
        print("-" * 50)
        all_ious = []
        all_correct = []
        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])
                pred_boxes = results['boxes']
                pred_labels = [int(l) for l in results['labels']]
                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    correct = False
                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = compute_iou(gt_box, list(pred_box))
                        if iou > best_iou:
                            best_iou = iou
                            # Correctness tracks the current best-IoU match.
                            correct = (iou >= 0.5) and (pred_label == gt_label)
                    all_ious.append(best_iou)
                    all_correct.append(correct)
            except Exception:
                # Narrowed from a bare except: failed images are skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0
        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")
        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples)
        }
# ============================================================================
# Main Runner
# ============================================================================
def run_all_benchmarks():
    """Run the complete benchmark suite.

    Loads the Oculus model once, runs all four benchmarks sequentially,
    prints a formatted summary table, and writes the raw metrics to
    ``benchmark_results.json`` next to this script.

    Returns:
        dict: benchmark name -> metrics dict, as produced by each
        benchmark's ``evaluate`` method.
    """
    print("=" * 60)
    print("๐Ÿ”ฎ OCULUS VLM BENCHMARK SUITE")
    print("=" * 60)
    # Initialize model once; it is shared by every benchmark below.
    print("\n[Loading Oculus Model]")
    model = OculusPredictor()
    results = {}
    # Run benchmarks
    print("\n[Running Benchmarks]")
    # 1. VQA
    vqa = VQABenchmark(max_samples=200)
    results['vqa_v2'] = vqa.evaluate(model)
    # 2. RefCOCO
    refcoco = RefCOCOBenchmark(max_samples=100)
    results['refcoco'] = refcoco.evaluate(model)
    # 3. Counting
    counting = CountBenchmark(max_samples=100)
    results['countbench'] = counting.evaluate(model)
    # 4. Detection
    detection = DetectionBenchmark(max_samples=100)
    results['coco_detection'] = detection.evaluate(model)
    # Summary
    print("\n" + "=" * 60)
    print("๐Ÿ“Š BENCHMARK RESULTS SUMMARY")
    print("=" * 60)
    print(f"""
โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
โ•‘ OCULUS BENCHMARKS โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ VQA v2 (Style) โ•‘
โ•‘ Accuracy: {results['vqa_v2']['accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ RefCOCO Grounding โ•‘
โ•‘ Mean IoU: {results['refcoco']['mean_iou']:.4f} โ•‘
โ•‘ Acc@0.5: {results['refcoco']['accuracy_50']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ CountBench โ•‘
โ•‘ Exact Accuracy: {results['countbench']['exact_accuracy']:.2%} โ•‘
โ•‘ Within-1 Acc: {results['countbench']['within_one_accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ COCO Detection โ•‘
โ•‘ Mean IoU: {results['coco_detection']['mean_iou']:.4f} โ•‘
โ•‘ mAP@0.5: {results['coco_detection']['map_50']:.4f} โ•‘
โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
""")
    # Save results as JSON alongside this script.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"๐Ÿ’พ Results saved to: {output_path}")
    return results
# Script entry point: run the full suite when executed directly.
if __name__ == "__main__":
    run_all_benchmarks()