|
|
|
|
|
""" |
|
|
Oculus VLM Benchmark Suite |
|
|
|
|
|
Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream. |
|
|
Benchmarks: |
|
|
1. VQA v2 (subset) |
|
|
2. RefCOCO Grounding |
|
|
3. Counting (CVBench-style) |
|
|
4. COCO Detection (mAP) |
|
|
5. Captioning (BLEU) |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import json |
|
|
import random |
|
|
import time |
|
|
from pathlib import Path |
|
|
from dataclasses import dataclass |
|
|
from typing import List, Dict, Optional |
|
|
from collections import defaultdict |
|
|
|
|
|
import numpy as np |
|
|
import torch |
|
|
from PIL import Image |
|
|
from tqdm import tqdm |
|
|
|
|
|
OCULUS_ROOT = Path(__file__).parent |
|
|
sys.path.insert(0, str(OCULUS_ROOT)) |
|
|
|
|
|
from oculus_inference import OculusPredictor |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_iou(box1, box2):
    """Compute IoU between two boxes [x1, y1, x2, y2].

    Boxes are corner-format (x1, y1, x2, y2); the small epsilon in the
    denominator guards against division by zero for degenerate boxes.
    """
    # Corners of the intersection rectangle.
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])

    # Clamp width/height to zero when the boxes do not overlap.
    intersection = max(0, ix2 - ix1) * max(0, iy2 - iy1)

    union = (
        (box1[2] - box1[0]) * (box1[3] - box1[1])
        + (box2[2] - box2[0]) * (box2[3] - box2[1])
        - intersection
    )
    return intersection / (union + 1e-8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    Builds templated yes/no and category questions from COCO instance
    annotations, then scores a model by substring match on its answer.
    """

    def __init__(self, data_dir="data/coco", max_samples=200):
        """Load up to ``max_samples`` (image, question, answer) triples.

        Args:
            data_dir: Root of a COCO-style layout containing
                ``annotations/`` and ``images/`` subdirectories.
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"

        if not ann_file.exists():
            print("โ ๏ธ COCO annotations not found")
            return

        with open(ann_file) as f:
            instances = json.load(f)

        # Map category id -> name, then collect the set of category names
        # present in each image so questions reference real content.
        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_cats = defaultdict(set)

        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        for img in instances['images']:
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            cats = list(img_cats.get(img['id'], []))
            if not cats:
                continue

            cat = random.choice(cats)

            # Question templates; answers are lowercased for lenient matching.
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                (f"What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]

            q, a = random.choice(templates)
            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat
            })

            if len(self.samples) >= max_samples:
                break

        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run VQA evaluation.

        Returns:
            Dict with ``accuracy``, ``correct``, ``total`` and ``errors``
            (count of samples whose inference raised; these used to be
            silently swallowed by an ``except ...: pass``).
        """
        print("\n๐ VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        total = 0
        errors = 0

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])
            except Exception:
                # Count inference failures instead of ignoring them, so a
                # broken model cannot masquerade as merely inaccurate.
                errors += 1
                continue

            # Lenient scoring: the reference answer just has to appear
            # somewhere in the model's lowercased free-form reply.
            if sample['answer'] in answer.lower():
                correct += 1
            total += 1

        if errors:
            print(f" Errors: {errors} samples failed")

        accuracy = correct / total if total > 0 else 0
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'errors': errors
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    Each sample pairs an image with a simple "the <category>" expression
    and the normalized ground-truth box of one instance of that category.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` grounding samples from COCO."""
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations per image so one referent can be sampled each.
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            if len(anns) < 1:
                continue

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # One random instance per image becomes the referent.
            ann = random.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            # COCO bbox is (x, y, w, h) in pixels; convert to normalized
            # (x1, y1, x2, y2) corners.
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height']
            ]

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box
            })

            if len(self.samples) >= max_samples:
                break

        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run grounding evaluation.

        Scores only the model's first returned box against the ground
        truth (standard top-1 referring-expression protocol).
        """
        print("\n๐ RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0

        for sample in tqdm(self.samples, desc="RefCOCO"):
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")
                pred_boxes = results['boxes']
            except Exception:
                # Narrowed from a bare ``except`` (which also swallowed
                # KeyboardInterrupt); a failed sample scores zero IoU.
                ious.append(0)
                continue

            if len(pred_boxes) > 0:
                iou = compute_iou(sample['gt_box'], pred_boxes[0])
                ious.append(iou)
                if iou >= 0.5:
                    acc_50 += 1
            else:
                ious.append(0)

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples)
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CountBenchmark:
    """Object counting benchmark.

    Samples images where some category appears 2-10 times (crowd
    annotations excluded) and asks the model "how many".
    """

    # Number words the answer parser recognizes (lowercase).
    _WORD_TO_NUM = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to ``max_samples`` counting samples from COCO."""
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Per-image instance counts by category; crowd regions excluded
        # because they do not have a well-defined count.
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Keep counts in [2, 10]: 1 is trivial, >10 is unreliable GT.
            # At most one (image, category) pair per image.
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count
                    })
                    break

            if len(self.samples) >= max_samples:
                break

        print(f" CountBench: Loaded {len(self.samples)} samples")

    @staticmethod
    def _parse_count(answer):
        """Return the first count found in *answer* (lowercased text), or None.

        Tokens are stripped of surrounding punctuation so replies like
        "5." or "five," parse correctly — the previous parser failed on
        these because ``int("5.")`` raises and the word lookup saw the
        punctuation.
        """
        for word in answer.split():
            tok = word.strip(".,!?;:()\"'")
            if tok.isdigit():
                return int(tok)
            if tok in CountBenchmark._WORD_TO_NUM:
                return CountBenchmark._WORD_TO_NUM[tok]
        return None

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run counting evaluation.

        Reports exact accuracy, within-1 accuracy and MAE (MAE only over
        samples whose answer parsed to a number, as before).
        """
        print("\n๐ CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []

        for sample in tqdm(self.samples, desc="Counting"):
            question = f"How many {sample['category']}s are in this image? Answer with a number."
            try:
                answer = model.ask(sample['path'], question).lower()
            except Exception:
                # Inference failure: skip the sample (previously hidden
                # behind a bare ``except``).
                continue

            pred = self._parse_count(answer)
            if pred is None:
                continue

            gt = sample['count']
            if pred == gt:
                exact += 1
            if abs(pred - gt) <= 1:
                within_one += 1
            errors.append(abs(pred - gt))

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DetectionBenchmark:
    """Object Detection benchmark.

    Samples COCO images with their full sets of normalized ground-truth
    boxes/labels and scores a model's open detections against them.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Load up to ``max_samples`` images with ground-truth boxes."""
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        # Contiguous category index (0..C-1) used as the integer label id.
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # COCO bbox is (x, y, w, h) in pixels; store normalized
                # (x1, y1, x2, y2) corners.
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height']
                ])
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels
                })

            if len(self.samples) >= max_samples:
                break

        print(f" Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: "OculusPredictor") -> Dict:
        """Run detection evaluation.

        For each ground-truth box, the best-IoU prediction is found; the
        box counts as correct when that IoU >= 0.5 and the label matches.
        NOTE(review): this is per-GT matching accuracy, not true COCO mAP
        (no confidence ranking / PR integration); the printed "mAP@0.5"
        label is kept for output compatibility.
        """
        print("\n๐ COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []

        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])
                pred_boxes = results['boxes']
                pred_labels = [int(l) for l in results['labels']]
            except Exception:
                # Narrowed from a bare ``except: pass`` so Ctrl-C still
                # aborts; a failed image is skipped, as before.
                continue

            for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                best_iou = 0
                correct = False

                for pred_box, pred_label in zip(pred_boxes, pred_labels):
                    iou = compute_iou(gt_box, list(pred_box))
                    if iou > best_iou:
                        best_iou = iou
                        # Correctness tracks the current best-matching box.
                        correct = (iou >= 0.5) and (pred_label == gt_label)

                all_ious.append(best_iou)
                all_correct.append(correct)

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples)
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_all_benchmarks():
    """Run complete benchmark suite.

    Loads the model once, runs all four benchmarks against it, prints a
    summary table, and writes the raw metrics to
    ``benchmark_results.json`` next to this script.

    Returns:
        Dict mapping benchmark name ('vqa_v2', 'refcoco', 'countbench',
        'coco_detection') to that benchmark's metrics dict.
    """
    print("=" * 60)
    print("๐ฎ OCULUS VLM BENCHMARK SUITE")
    print("=" * 60)

    # Load the model once and share it across all benchmarks.
    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    results = {}

    print("\n[Running Benchmarks]")

    # 1. VQA-style question answering.
    vqa = VQABenchmark(max_samples=200)
    results['vqa_v2'] = vqa.evaluate(model)

    # 2. Referring-expression grounding.
    refcoco = RefCOCOBenchmark(max_samples=100)
    results['refcoco'] = refcoco.evaluate(model)

    # 3. Object counting.
    counting = CountBenchmark(max_samples=100)
    results['countbench'] = counting.evaluate(model)

    # 4. Open object detection.
    detection = DetectionBenchmark(max_samples=100)
    results['coco_detection'] = detection.evaluate(model)

    # Summary table (box-drawing characters in the literal below).
    print("\n" + "=" * 60)
    print("๐ BENCHMARK RESULTS SUMMARY")
    print("=" * 60)

    print(f"""
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ OCULUS BENCHMARKS โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ VQA v2 (Style) โ
โ Accuracy: {results['vqa_v2']['accuracy']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ RefCOCO Grounding โ
โ Mean IoU: {results['refcoco']['mean_iou']:.4f} โ
โ Acc@0.5: {results['refcoco']['accuracy_50']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ CountBench โ
โ Exact Accuracy: {results['countbench']['exact_accuracy']:.2%} โ
โ Within-1 Acc: {results['countbench']['within_one_accuracy']:.2%} โ
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ
โ COCO Detection โ
โ Mean IoU: {results['coco_detection']['mean_iou']:.4f} โ
โ mAP@0.5: {results['coco_detection']['map_50']:.4f} โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
    """)

    # Persist raw metrics next to this script for later comparison runs.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"๐พ Results saved to: {output_path}")

    return results
|
|
|
|
|
|
|
|
# Script entry point: run the full suite when executed directly.
if __name__ == "__main__":
    run_all_benchmarks()
|
|
|