kobiakor15 committed on
Commit
d6e0b94
ยท
verified ยท
1 Parent(s): e931398

Upload benchmark_vlm.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_vlm.py +519 -0
benchmark_vlm.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Oculus VLM Benchmark Suite
4
+
5
+ Evaluates Oculus on industry-standard VLM benchmarks similar to Isaac/Moondream.
6
+ Benchmarks:
7
+ 1. VQA v2 (subset)
8
+ 2. RefCOCO Grounding
9
+ 3. Counting (CVBench-style)
10
+ 4. COCO Detection (mAP)
11
+ 5. Captioning (BLEU)
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import random
18
+ import time
19
+ from pathlib import Path
20
+ from dataclasses import dataclass
21
+ from typing import List, Dict, Optional
22
+ from collections import defaultdict
23
+
24
+ import numpy as np
25
+ import torch
26
+ from PIL import Image
27
+ from tqdm import tqdm
28
+
29
+ OCULUS_ROOT = Path(__file__).parent
30
+ sys.path.insert(0, str(OCULUS_ROOT))
31
+
32
+ from oculus_inference import OculusPredictor
33
+
34
+
35
+ # ============================================================================
36
+ # Benchmark Utilities
37
+ # ============================================================================
38
+
39
def compute_iou(box1, box2):
    """Compute IoU between two boxes [x1, y1, x2, y2]."""
    # Intersection rectangle edges.
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp to zero when the boxes do not overlap.
    overlap = max(0, right - left) * max(0, bottom - top)
    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])

    # Epsilon guards against division by zero for degenerate boxes.
    union = area_a + area_b - overlap + 1e-8
    return overlap / union
51
+
52
+
53
+ # ============================================================================
54
+ # Benchmark 1: VQA v2 Style
55
+ # ============================================================================
56
+
57
class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    Since no real VQA v2 annotation file is assumed locally, yes/no and
    identification questions are synthesized from COCO instance annotations.
    """

    def __init__(self, data_dir="data/coco", max_samples=200):
        """Build up to *max_samples* (image, question, answer) triples.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        # Load COCO instance annotations to generate VQA-style questions.
        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"

        if not ann_file.exists():
            print("โš ๏ธ COCO annotations not found")
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_cats = defaultdict(set)

        # Collect the set of category names present in each image.
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        # Generate one QA pair per image from a randomly chosen category.
        for img in instances['images']:
            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            cats = list(img_cats.get(img['id'], []))
            if not cats:
                continue

            cat = random.choice(cats)

            # Question templates: presence (yes/no) and identification.
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]

            q, a = random.choice(templates)
            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat
            })

            if len(self.samples) >= max_samples:
                break

        print(f" VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run VQA evaluation.

        Accuracy is computed over queries that completed; inference failures
        are counted separately instead of being silently swallowed.

        Returns:
            Dict with 'accuracy', 'correct', 'total', and 'failures'.
        """
        print("\n๐Ÿ“Š VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        total = 0
        failures = 0  # inference calls that raised instead of answering

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])

                # Lenient scoring: the reference answer just has to appear
                # as a substring of the model's reply.
                if sample['answer'] in answer.lower():
                    correct += 1
                total += 1
            except Exception:
                # Track failed calls rather than dropping them silently.
                failures += 1

        accuracy = correct / total if total > 0 else 0
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        if failures:
            print(f" Failures: {failures}")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'failures': failures
        }
139
+
140
+
141
+ # ============================================================================
142
+ # Benchmark 2: RefCOCO Grounding
143
+ # ============================================================================
144
+
145
class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    Synthesizes simple referring expressions ("the <category>") with the
    matching ground-truth box, one randomly chosen object per image.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build up to *max_samples* (image, expression, gt_box) samples.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations by image
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            if not anns:
                continue

            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick a random object to refer to.
            ann = random.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            # COCO bbox is [x, y, w, h] in pixels; normalize to
            # [x1, y1, x2, y2] in [0, 1].
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height']
            ]

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box
            })

            if len(self.samples) >= max_samples:
                break

        print(f" RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run grounding evaluation.

        Returns:
            Dict with 'mean_iou', 'accuracy_50' (IoU >= 0.5 hit rate over
            all samples), and 'num_samples'.
        """
        print("\n๐Ÿ“Š RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0

        for sample in tqdm(self.samples, desc="RefCOCO"):
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")

                if len(results['boxes']) > 0:
                    # Take the first box; assumed to be the highest-confidence
                    # prediction — TODO confirm detector sorts by score.
                    pred_box = results['boxes'][0]
                    iou = compute_iou(sample['gt_box'], pred_box)
                    ious.append(iou)
                    if iou >= 0.5:
                        acc_50 += 1
                else:
                    ious.append(0)
            except Exception:
                # A failed inference call scores IoU 0. (Was a bare `except:`
                # that also swallowed KeyboardInterrupt/SystemExit.)
                ious.append(0)

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Acc@0.5: {accuracy:.2%}")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples)
        }
237
+
238
+
239
+ # ============================================================================
240
+ # Benchmark 3: Counting (CountBench Style)
241
+ # ============================================================================
242
+
243
class CountBenchmark:
    """Object counting benchmark (CountBench style, built from COCO)."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Collect images containing 2-10 instances of some category.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of generated samples.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Count non-crowd objects per image per category; crowd regions have
        # no reliable instance count.
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick category with 2-10 objects (reasonable counting range)
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count
                    })
                    break

            if len(self.samples) >= max_samples:
                break

        print(f" CountBench: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run counting evaluation.

        Asks the model for a count, parses the first numeric (or number-word)
        token from the reply, and scores exact accuracy, within-one accuracy,
        and MAE. Accuracies are over all samples; MAE over parsed answers.

        Returns:
            Dict with 'exact_accuracy', 'within_one_accuracy', 'mae', 'total'.
        """
        print("\n๐Ÿ“Š CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []

        word_to_num = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                       'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

        for sample in tqdm(self.samples, desc="Counting"):
            try:
                # NOTE(review): naive pluralization ("{cat}s") yields e.g.
                # "persons"/"sheeps" for some COCO categories; prompt kept
                # as-is for compatibility — confirm the model tolerates it.
                question = f"How many {sample['category']}s are in this image? Answer with a number."
                answer = model.ask(sample['path'], question).lower()
            except Exception:
                # Failed inference: sample still counts in the denominator.
                continue

            # Parse the first numeric token; strip adjacent punctuation so
            # replies like "3." or "three!" still parse.
            pred = None
            for word in answer.split():
                token = word.strip('.,!?:;')
                try:
                    pred = int(token)
                    break
                except ValueError:
                    if token in word_to_num:
                        pred = word_to_num[token]
                        break

            if pred is not None:
                gt = sample['count']
                if pred == gt:
                    exact += 1
                if abs(pred - gt) <= 1:
                    within_one += 1
                errors.append(abs(pred - gt))

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        print(f" Exact Accuracy: {exact_acc:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" MAE: {mae:.2f}")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total
        }
343
+
344
+
345
+ # ============================================================================
346
+ # Benchmark 4: COCO Detection (mAP)
347
+ # ============================================================================
348
+
349
class DetectionBenchmark:
    """Object Detection benchmark."""

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Load up to *max_samples* images with normalized GT boxes/labels.

        Args:
            data_dir: Root of a COCO-style layout (annotations/, images/).
            max_samples: Cap on the number of loaded images.
        """
        self.samples = []

        ann_file = Path(data_dir) / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        with open(ann_file) as f:
            instances = json.load(f)

        # Map COCO category ids to contiguous indices (the model's assumed
        # label space — TODO confirm it matches OculusPredictor's mapping).
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # COCO bbox is [x, y, w, h] in pixels; normalize to
                # [x1, y1, x2, y2] in [0, 1].
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height']
                ])
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels
                })

            if len(self.samples) >= max_samples:
                break

        print(f" Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Run detection evaluation.

        NOTE(review): the reported "mAP@0.5" is really per-GT-box recall at
        IoU >= 0.5 with matching label (no confidence ranking or PR curve),
        i.e. a proxy for true COCO mAP. Key name kept for compatibility.

        Returns:
            Dict with 'mean_iou', 'map_50', and 'num_samples'.
        """
        print("\n๐Ÿ“Š COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []

        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])

                pred_boxes = results['boxes']
                pred_labels = [int(lbl) for lbl in results['labels']]

                # Score each GT box by its best-IoU prediction; 'correct'
                # reflects whether that best match also has the right label.
                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    correct = False

                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = compute_iou(gt_box, list(pred_box))
                        if iou > best_iou:
                            best_iou = iou
                            correct = (iou >= 0.5) and (pred_label == gt_label)

                    all_ious.append(best_iou)
                    all_correct.append(correct)
            except Exception:
                # Best-effort: a failed image contributes nothing. (Was a
                # bare `except:` that also swallowed KeyboardInterrupt.)
                continue

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" mAP@0.5: {accuracy:.4f}")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples)
        }
446
+
447
+
448
+ # ============================================================================
449
+ # Main Runner
450
+ # ============================================================================
451
+
452
def run_all_benchmarks():
    """Run complete benchmark suite.

    Loads the Oculus model once, runs the four benchmarks (VQA, RefCOCO
    grounding, counting, detection), prints a summary table, and writes the
    aggregated metrics to benchmark_results.json next to this script.

    Returns:
        Dict mapping benchmark name to its metrics dict.
    """
    print("=" * 60)
    print("๐Ÿ”ฎ OCULUS VLM BENCHMARK SUITE")
    print("=" * 60)

    # Initialize model (shared across all benchmarks).
    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    results = {}

    # Run benchmarks; each benchmark loads its own COCO-derived samples.
    print("\n[Running Benchmarks]")

    # 1. VQA
    vqa = VQABenchmark(max_samples=200)
    results['vqa_v2'] = vqa.evaluate(model)

    # 2. RefCOCO
    refcoco = RefCOCOBenchmark(max_samples=100)
    results['refcoco'] = refcoco.evaluate(model)

    # 3. Counting
    counting = CountBenchmark(max_samples=100)
    results['countbench'] = counting.evaluate(model)

    # 4. Detection
    detection = DetectionBenchmark(max_samples=100)
    results['coco_detection'] = detection.evaluate(model)

    # Summary table (assumes every benchmark returned its metrics dict;
    # raises KeyError if a benchmark was skipped for missing data).
    print("\n" + "=" * 60)
    print("๐Ÿ“Š BENCHMARK RESULTS SUMMARY")
    print("=" * 60)

    print(f"""
โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
โ•‘ OCULUS BENCHMARKS โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ VQA v2 (Style) โ•‘
โ•‘ Accuracy: {results['vqa_v2']['accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ RefCOCO Grounding โ•‘
โ•‘ Mean IoU: {results['refcoco']['mean_iou']:.4f} โ•‘
โ•‘ Acc@0.5: {results['refcoco']['accuracy_50']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ CountBench โ•‘
โ•‘ Exact Accuracy: {results['countbench']['exact_accuracy']:.2%} โ•‘
โ•‘ Within-1 Acc: {results['countbench']['within_one_accuracy']:.2%} โ•‘
โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ
โ•‘ COCO Detection โ•‘
โ•‘ Mean IoU: {results['coco_detection']['mean_iou']:.4f} โ•‘
โ•‘ mAP@0.5: {results['coco_detection']['map_50']:.4f} โ•‘
โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
""")

    # Save results as JSON next to this script.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"๐Ÿ’พ Results saved to: {output_path}")

    return results
516
+
517
+
518
# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    run_all_benchmarks()