kobiakor15 committed on
Commit
a040425
·
verified ·
1 Parent(s): d6e0b94

Upload eval_benchmarks.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval_benchmarks.py +601 -0
eval_benchmarks.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCULUS Benchmark Evaluation Suite
4
+
5
+ Evaluates Oculus on multiple vision-language benchmarks:
6
+ 1. COCO Detection (mean best-match IoU and accuracy at IoU >= 0.5)
7
+ 2. Car Part Damage Detection
8
+ 3. Counting (Pixmo-style)
9
+ 4. VQA Accuracy
10
+ 5. RefCOCO Grounding (IoU) -- planned; not implemented in this file
11
+
12
+ Inspired by Isaac model benchmarks.
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import json
18
+ import time
19
+ import random
20
+ from pathlib import Path
21
+ from dataclasses import dataclass, field
22
+ from typing import List, Dict, Tuple, Optional
23
+ from collections import defaultdict
24
+
25
+ import numpy as np
26
+ import torch
27
+ from PIL import Image
28
+
29
# Directory containing this script. Resolved to an absolute path so the
# sys.path entry below stays valid even if the working directory changes
# after import (a relative ``__file__`` would otherwise yield a relative entry).
OCULUS_ROOT = Path(__file__).resolve().parent
# Put the script directory first so the local ``oculus_unified_model`` module
# takes precedence when it is imported.
sys.path.insert(0, str(OCULUS_ROOT))
31
+
32
+ from oculus_unified_model import OculusForConditionalGeneration
33
+
34
+
35
+ # ============================================================================
36
+ # Metrics
37
+ # ============================================================================
38
+
39
def compute_iou(box1: List[float], box2: List[float]) -> float:
    """Compute the intersection-over-union (IoU) of two axis-aligned boxes.

    Args:
        box1: Box given as ``[x1, y1, x2, y2]``.
        box2: Box in the same format and coordinate system as ``box1``.

    Returns:
        The IoU as a Python float in ``[0.0, 1.0]``. ``0.0`` is returned when
        the boxes do not overlap or when the union has no area (for example
        two degenerate, zero-area boxes).
    """
    inter_w = max(0.0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    inter_h = max(0.0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    inter_area = inter_w * inter_h

    # Clamp each side length: a box that is inverted on one axis would
    # otherwise contribute a negative area and push the IoU outside [0, 1].
    area1 = max(0.0, box1[2] - box1[0]) * max(0.0, box1[3] - box1[1])
    area2 = max(0.0, box2[2] - box2[0]) * max(0.0, box2[3] - box2[1])

    union_area = area1 + area2 - inter_area
    # Explicit zero check instead of an additive epsilon, so identical boxes
    # score exactly 1.0 and small (normalized) boxes are not biased downwards.
    if union_area <= 0.0:
        return 0.0

    return float(inter_area / union_area)
54
+
55
+
56
def compute_ap(recalls: List[float], precisions: List[float]) -> float:
    """Compute Average Precision as the area under a precision/recall curve.

    The curve is padded with the points (recall=0) and (recall=1, precision=0),
    the precision values are replaced by their running maximum from the right
    (the usual interpolated-precision envelope), and the area is summed as
    rectangles between consecutive recall values.

    Args:
        recalls: Recall values, expected in non-decreasing order.
        precisions: Precision values, one per entry of ``recalls``.

    Returns:
        The average precision as a float; ``0.0`` for empty input.

    Raises:
        ValueError: If ``recalls`` and ``precisions`` differ in length.
    """
    recalls = list(recalls)
    precisions = list(precisions)
    if len(recalls) != len(precisions):
        raise ValueError(
            f"recalls and precisions must have the same length "
            f"(got {len(recalls)} and {len(precisions)})"
        )

    rec = [0.0, *recalls, 1.0]
    prec = [0.0, *precisions, 0.0]

    # Make precision monotonically non-increasing (sweep right to left)
    for i in range(len(prec) - 2, -1, -1):
        prec[i] = max(prec[i], prec[i + 1])

    # Area under the stepwise curve
    return float(sum((rec[i] - rec[i - 1]) * prec[i] for i in range(1, len(rec))))
71
+
72
+
73
+ # ============================================================================
74
+ # Benchmark 1: COCO Detection (mAP)
75
+ # ============================================================================
76
+
77
class COCODetectionBenchmark:
    """COCO detection benchmark (mean best-match IoU and accuracy at IoU >= 0.5).

    For every ground-truth box, the predicted box with the highest IoU is
    selected. The benchmark reports the mean of those best IoUs and the
    fraction of ground-truth boxes whose best match has IoU >= 0.5 *and* the
    same class index. No precision/recall curve is built, so the reported
    numbers are not a true mAP.
    """

    def __init__(self, data_dir: str = "data/coco", max_samples: int = 500):
        """Index up to ``max_samples`` annotated images from the COCO train split.

        Args:
            data_dir: Directory holding ``annotations/instances_train2017.json``
                and an ``images/`` folder with the referenced image files.
            max_samples: Maximum number of images to keep.

        Raises:
            FileNotFoundError: If the annotation file does not exist.
        """
        self.data_dir = Path(data_dir)
        self.max_samples = max_samples

        # Annotations are read from the train split; there is no
        # validation-split fallback in this benchmark.
        ann_file = self.data_dir / "annotations" / "instances_train2017.json"

        with open(ann_file, encoding="utf-8") as f:
            coco = json.load(f)

        # Build index
        self.cat_id_to_name = {c['id']: c['name'] for c in coco['categories']}
        # Class index = position of the category in the annotation file
        self.cat_id_to_idx = {c['id']: i for i, c in enumerate(coco['categories'])}

        # Group non-crowd annotations by image
        img_to_anns = defaultdict(list)
        for ann in coco['annotations']:
            if ann.get('iscrowd', 0):
                continue
            img_to_anns[ann['image_id']].append(ann)

        self.samples = []
        for img in coco['images']:
            # Checked before appending so that max_samples=0 yields no samples
            if len(self.samples) >= max_samples:
                break

            if img['id'] not in img_to_anns:
                continue

            img_path = self.data_dir / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in img_to_anns[img['id']]:
                if 'bbox' not in ann:
                    continue
                x, y, w, h = ann['bbox']
                # COCO stores [x, y, w, h] in pixels; convert to corner
                # format normalized to [0, 1]
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height'],
                ])
                labels.append(self.cat_id_to_idx[ann['category_id']])

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels,
                })

        print(f" Loaded {len(self.samples)} COCO samples")

    def evaluate(self, model: "OculusForConditionalGeneration") -> Dict:
        """Run the model on every sample and score its boxes against ground truth.

        Args:
            model: Object exposing ``generate(image, mode="box", prompt=...)``
                whose result has ``boxes`` and ``labels`` attributes.
                NOTE(review): predicted boxes are assumed to be normalized
                ``[x1, y1, x2, y2]`` like the ground truth -- confirm against
                the model implementation.

        Returns:
            Dict with ``mean_iou``, ``accuracy``, ``num_samples`` and
            ``num_failed`` (samples that raised during loading or inference
            and were therefore left out of the averages).
        """
        print("\n📦 COCO Detection Benchmark")
        print("-" * 40)

        all_ious = []
        all_correct = []
        num_failed = 0

        for i, sample in enumerate(self.samples):
            if i % 50 == 0:
                print(f" Progress: {i}/{len(self.samples)}")

            # Collected per sample and committed only on success, so a failure
            # half-way through a sample cannot leave partial entries behind.
            sample_ious = []
            sample_correct = []
            try:
                # Context manager closes the underlying file handle promptly
                with Image.open(sample['path']) as raw:
                    image = raw.convert('RGB')
                output = model.generate(image, mode="box", prompt="Detect objects")

                pred_boxes = [list(box) for box in output.boxes]
                pred_labels = [int(label) for label in output.labels]

                # Match each ground-truth box to its highest-IoU prediction
                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    is_correct = False

                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = compute_iou(gt_box, pred_box)
                        if iou > best_iou:
                            best_iou = iou
                            # Only the best-overlapping prediction decides
                            # whether this ground-truth box counts as correct
                            is_correct = (iou >= 0.5) and (pred_label == gt_label)

                    sample_ious.append(float(best_iou))
                    sample_correct.append(bool(is_correct))

            except Exception as e:
                # Best-effort: keep going, but make the failure visible
                num_failed += 1
                print(f" ⚠️ Sample {i} failed ({sample['path']}): {e}")
                continue

            all_ious.extend(sample_ious)
            all_correct.extend(sample_correct)

        mean_iou = float(np.mean(all_ious)) if all_ious else 0.0
        accuracy = float(np.mean(all_correct)) if all_correct else 0.0

        results = {
            'mean_iou': mean_iou,
            'accuracy': accuracy,
            'num_samples': len(self.samples),
            'num_failed': num_failed,
        }

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Accuracy (IoU>0.5 + correct class): {accuracy:.4f}")
        if num_failed:
            print(f" ⚠️ {num_failed} sample(s) failed and were excluded")

        return results
188
+
189
+
190
+ # ============================================================================
191
+ # Benchmark 2: Car Part Damage Detection
192
+ # ============================================================================
193
+
194
class CarDamageBenchmark:
    """Car part damage detection benchmark backed by a HuggingFace dataset.

    Scores box localization only: for each ground-truth box the best IoU over
    all predicted boxes is taken, and a box counts as found when that IoU is
    at least 0.5. Category labels are stored with each sample but are not used
    for scoring.
    """

    # Part names for this task. Not referenced by the scoring code in this class.
    CAR_PART_LABELS = [
        'Back-bumper', 'Back-door', 'Back-wheel', 'Back-window', 'Back-windshield',
        'Fender', 'Front-bumper', 'Front-door', 'Front-wheel', 'Front-window',
        'Grille', 'Headlight', 'Hood', 'License-plate', 'Mirror', 'Quarter-panel',
        'Rocker-panel', 'Roof', 'Tail-light', 'Trunk', 'Windshield'
    ]

    def __init__(self, max_samples: int = 50):
        """Load up to ``max_samples`` items of ``moondream/car_part_damage`` (test split).

        Loading is best-effort: if the ``datasets`` package or the dataset
        itself is unavailable, a warning is printed and ``self.samples`` is
        left with whatever was loaded up to that point (usually empty).

        Args:
            max_samples: Maximum number of dataset items to keep.
        """
        self.max_samples = max_samples
        self.samples = []

        try:
            from datasets import load_dataset
            print(" Loading car_part_damage dataset...")
            ds = load_dataset("moondream/car_part_damage", split="test")

            for i, item in enumerate(ds):
                if i >= max_samples:
                    break

                boxes = []
                labels = []
                for ann in item['annotations']:
                    bbox = ann['bbox']
                    # Normalize to [0, 1].
                    # NOTE(review): bbox[2]/bbox[3] are treated as x2/y2 here.
                    # If the dataset stores [x, y, w, h] instead, these should
                    # be (bbox[0] + bbox[2]) and (bbox[1] + bbox[3]) -- confirm
                    # against the dataset card.
                    boxes.append([
                        bbox[0] / item['width'],
                        bbox[1] / item['height'],
                        bbox[2] / item['width'],
                        bbox[3] / item['height'],
                    ])
                    labels.append(ann['category'])

                self.samples.append({
                    'image': item['image'],
                    'boxes': boxes,
                    'labels': labels,
                    'width': item['width'],
                    'height': item['height'],
                })

            print(f" Loaded {len(self.samples)} car damage samples")

        except Exception as e:
            # Deliberate best-effort: the suite should still run without this dataset
            print(f" ⚠️ Could not load dataset: {e}")

    def evaluate(self, model: "OculusForConditionalGeneration") -> Dict:
        """Score the model's predicted boxes against the ground-truth part boxes.

        Args:
            model: Object exposing ``generate(image, mode="box", prompt=...)``
                whose result has a ``boxes`` attribute.

        Returns:
            ``{'error': 'Dataset not loaded'}`` when no samples are available;
            otherwise a dict with ``mean_iou``, ``recall@0.5``,
            ``correct_parts``, ``total_parts`` and ``num_failed`` (samples
            that raised during inference and were left out of the totals).
        """
        print("\n🚗 Car Part Damage Benchmark")
        print("-" * 40)

        if not self.samples:
            return {'error': 'Dataset not loaded'}

        all_ious = []
        correct_parts = 0
        total_parts = 0
        num_failed = 0

        for i, sample in enumerate(self.samples):
            if i % 10 == 0:
                print(f" Progress: {i}/{len(self.samples)}")

            try:
                output = model.generate(
                    sample['image'], mode="box", prompt="Detect car parts and damage"
                )
                pred_boxes = [list(box) for box in output.boxes]

                # Best IoU over all predictions for every ground-truth box.
                # Computed fully before touching the counters so a failure
                # cannot leave them half-updated.
                sample_ious = [
                    max((compute_iou(gt_box, pred_box) for pred_box in pred_boxes), default=0)
                    for gt_box in sample['boxes']
                ]
            except Exception as e:
                # Best-effort: keep going, but make the failure visible
                num_failed += 1
                print(f" ⚠️ Sample {i} failed: {e}")
                continue

            all_ious.extend(float(iou) for iou in sample_ious)
            total_parts += len(sample_ious)
            correct_parts += sum(1 for iou in sample_ious if iou >= 0.5)

        mean_iou = float(np.mean(all_ious)) if all_ious else 0.0
        recall = correct_parts / total_parts if total_parts > 0 else 0.0

        results = {
            'mean_iou': mean_iou,
            'recall@0.5': float(recall),
            'correct_parts': correct_parts,
            'total_parts': total_parts,
            'num_failed': num_failed,
        }

        print(f" Mean IoU: {mean_iou:.4f}")
        print(f" Recall@0.5: {recall:.4f} ({correct_parts}/{total_parts})")
        if num_failed:
            print(f" ⚠️ {num_failed} sample(s) failed and were excluded")

        return results
294
+
295
+
296
+ # ============================================================================
297
+ # Benchmark 3: Counting (Pixmo-style)
298
+ # ============================================================================
299
+
300
class CountingBenchmark:
    """Object counting benchmark built from COCO instance annotations.

    For each image, the category with the most (non-crowd) instances is
    chosen; images where that count lies between 2 and 10 become samples. The
    model is asked how many objects of that category are present and the
    number parsed from its text answer is compared with the annotation count.
    """

    # Number words recognized when the answer contains no digits
    _WORD_TO_NUM = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10
    }

    def __init__(self, data_dir: str = "data/coco", max_samples: int = 200):
        """Build up to ``max_samples`` counting samples.

        Args:
            data_dir: Directory holding ``annotations/`` and ``images/``. The
                val2017 annotation file is used when present, otherwise
                train2017.
            max_samples: Maximum number of samples to keep.

        Raises:
            FileNotFoundError: If neither annotation file exists.
        """
        self.data_dir = Path(data_dir)
        self.samples = []

        # Load COCO annotations for counting (val split preferred)
        ann_file = self.data_dir / "annotations" / "instances_val2017.json"
        if not ann_file.exists():
            ann_file = self.data_dir / "annotations" / "instances_train2017.json"

        with open(ann_file, encoding="utf-8") as f:
            coco = json.load(f)

        self.cat_id_to_name = {c['id']: c['name'] for c in coco['categories']}

        # image_id -> category_id -> number of non-crowd instances
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in coco['annotations']:
            if not ann.get('iscrowd', 0):
                img_counts[ann['image_id']][ann['category_id']] += 1

        for img in coco['images']:
            # Checked before appending so that max_samples=0 yields no samples
            if len(self.samples) >= max_samples:
                break

            if img['id'] not in img_counts:
                continue

            img_path = self.data_dir / "images" / img['file_name']
            if not img_path.exists():
                continue

            counts = img_counts[img['id']]
            # Pick the most common category
            most_common_cat = max(counts, key=counts.get)
            count = counts[most_common_cat]

            if 2 <= count <= 10:  # Reasonable counting range
                self.samples.append({
                    'path': str(img_path),
                    'category': self.cat_id_to_name[most_common_cat],
                    'count': count,
                })

        print(f" Loaded {len(self.samples)} counting samples")

    @staticmethod
    def _parse_count(response: str) -> Optional[int]:
        """Extract a count from a free-text answer, or ``None`` if there is none.

        A standalone run of digits wins (so ``"3."`` parses as 3); otherwise
        the first whole-word number name from zero to ten is used. Matching
        whole words avoids false hits such as ``"one"`` inside ``"phones"``
        or ``"ten"`` inside ``"often"``.
        """
        import re

        text = response.lower()

        digits = re.search(r"\b\d+\b", text)
        if digits:
            return int(digits.group())

        for word in re.findall(r"[a-z]+", text):
            if word in CountingBenchmark._WORD_TO_NUM:
                return CountingBenchmark._WORD_TO_NUM[word]

        return None

    def evaluate(self, model: "OculusForConditionalGeneration") -> Dict:
        """Ask the model to count objects and compare with the annotation counts.

        Args:
            model: Object exposing ``generate(image, mode="text", prompt=...)``
                whose result has a ``text`` attribute.

        Returns:
            Dict with ``exact_accuracy``, ``within_one_accuracy``, ``mae`` and
            ``total`` (answers from which a number could be parsed; the three
            metrics are computed over these only), plus ``num_unparsed``
            (answers without a recognizable number) and ``num_failed``
            (samples that raised during loading or inference).
        """
        print("\n🔢 Counting Benchmark")
        print("-" * 40)

        exact_matches = 0
        within_one = 0
        total = 0
        num_unparsed = 0
        num_failed = 0
        errors = []

        for i, sample in enumerate(self.samples):
            if i % 25 == 0:
                print(f" Progress: {i}/{len(self.samples)}")

            try:
                # Context manager closes the underlying file handle promptly
                with Image.open(sample['path']) as raw:
                    image = raw.convert('RGB')
                question = f"How many {sample['category']}s are in this image?"

                output = model.generate(image, mode="text", prompt=question)
                pred_count = self._parse_count(output.text)
            except Exception as e:
                # Best-effort: keep going, but make the failure visible
                num_failed += 1
                print(f" ⚠️ Sample {i} failed ({sample['path']}): {e}")
                continue

            if pred_count is None:
                num_unparsed += 1
                continue

            abs_error = abs(pred_count - sample['count'])
            total += 1
            if abs_error == 0:
                exact_matches += 1
            if abs_error <= 1:
                within_one += 1
            errors.append(abs_error)

        accuracy = exact_matches / total if total > 0 else 0.0
        within1_acc = within_one / total if total > 0 else 0.0
        mae = float(np.mean(errors)) if errors else 0.0

        results = {
            'exact_accuracy': float(accuracy),
            'within_one_accuracy': float(within1_acc),
            'mae': mae,
            'total': total,
            'num_unparsed': num_unparsed,
            'num_failed': num_failed,
        }

        print(f" Exact Accuracy: {accuracy:.2%}")
        print(f" Within-1 Accuracy: {within1_acc:.2%}")
        print(f" Mean Absolute Error: {mae:.2f}")
        if num_unparsed or num_failed:
            print(f" ⚠️ {num_unparsed} unparsed answer(s), {num_failed} failed sample(s) excluded")

        return results
419
+
420
+
421
+ # ============================================================================
422
+ # Benchmark 4: VQA
423
+ # ============================================================================
424
+
425
class VQABenchmark:
    """Simple yes/no Visual Question Answering benchmark derived from COCO.

    For every usable image, one annotated category is picked and the question
    "Is there a <category> in this image?" is generated with the expected
    answer "yes". An answer counts as correct when the expected answer occurs
    in the model's response as a whole word.
    """

    def __init__(self, data_dir: str = "data/coco", max_samples: int = 200,
                 seed: int = 0):
        """Build up to ``max_samples`` question/answer samples.

        Args:
            data_dir: Directory holding ``annotations/`` and ``images/``. The
                val2017 annotation file is used when present, otherwise
                train2017.
            max_samples: Maximum number of samples to keep.
            seed: Seed for the private random generator that picks the
                category asked about, so repeated runs build the same samples.

        Raises:
            FileNotFoundError: If neither annotation file exists.
        """
        self.data_dir = Path(data_dir)

        # Create simple VQA questions from COCO
        self.samples = []

        ann_file = self.data_dir / "annotations" / "instances_val2017.json"
        if not ann_file.exists():
            ann_file = self.data_dir / "annotations" / "instances_train2017.json"

        with open(ann_file, encoding="utf-8") as f:
            coco = json.load(f)

        self.cat_id_to_name = {c['id']: c['name'] for c in coco['categories']}

        # image_id -> set of category ids present in that image
        img_cats = defaultdict(set)
        for ann in coco['annotations']:
            img_cats[ann['image_id']].add(ann['category_id'])

        # Private generator: reproducible and independent of the global
        # ``random`` state
        rng = random.Random(seed)

        for img in coco['images']:
            # Checked before appending so that max_samples=0 yields no samples
            if len(self.samples) >= max_samples:
                break

            if img['id'] not in img_cats:
                continue

            img_path = self.data_dir / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Sorted so the pick does not depend on set iteration order
            cats = sorted(img_cats[img['id']])
            if not cats:
                continue

            cat_name = self.cat_id_to_name[rng.choice(cats)]
            self.samples.append({
                'path': str(img_path),
                'question': f"Is there a {cat_name} in this image?",
                'answer': "yes",
            })

        print(f" Loaded {len(self.samples)} VQA samples")

    @staticmethod
    def _answer_in_response(answer: str, response: str) -> bool:
        """Return True when ``answer`` occurs in ``response`` as a whole word.

        Case-insensitive. Whole-word matching prevents false positives such
        as ``"yes"`` being found inside ``"eyes"``.
        """
        import re

        pattern = r"(?<!\w)" + re.escape(answer.lower()) + r"(?!\w)"
        return re.search(pattern, response.lower()) is not None

    def evaluate(self, model: "OculusForConditionalGeneration") -> Dict:
        """Ask every generated question and score the answers.

        Args:
            model: Object exposing ``generate(image, mode="text", prompt=...)``
                whose result has a ``text`` attribute.

        Returns:
            Dict with ``accuracy``, ``correct``, ``total`` (answered samples)
            and ``num_failed`` (samples that raised during loading or
            inference and were left out of ``total``).
        """
        print("\n❓ VQA Benchmark")
        print("-" * 40)

        correct = 0
        total = 0
        num_failed = 0

        for i, sample in enumerate(self.samples):
            if i % 25 == 0:
                print(f" Progress: {i}/{len(self.samples)}")

            try:
                # Context manager closes the underlying file handle promptly
                with Image.open(sample['path']) as raw:
                    image = raw.convert('RGB')
                output = model.generate(image, mode="text", prompt=sample['question'])
                is_correct = self._answer_in_response(sample['answer'], output.text)
            except Exception as e:
                # Best-effort: keep going, but make the failure visible
                num_failed += 1
                print(f" ⚠️ Sample {i} failed ({sample['path']}): {e}")
                continue

            total += 1
            if is_correct:
                correct += 1

        accuracy = correct / total if total > 0 else 0.0

        results = {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'num_failed': num_failed,
        }

        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        if num_failed:
            print(f" ⚠️ {num_failed} sample(s) failed and were excluded")

        return results
519
+
520
+
521
+ # ============================================================================
522
+ # Main Evaluation
523
+ # ============================================================================
524
+
525
def run_benchmarks(model_path: str, benchmarks: Optional[List[str]] = None):
    """Load the model, run the selected benchmarks and save their results.

    Args:
        model_path: Path passed to
            ``OculusForConditionalGeneration.from_pretrained``. A
            ``heads.pth`` file in this directory, if present, is loaded into
            the detection and point heads. Results are written to
            ``benchmark_results.json`` in the same directory.
        benchmarks: Names of the benchmarks to run, any of ``'coco'``,
            ``'car_damage'``, ``'counting'`` and ``'vqa'``. ``None`` runs all
            of them. Benchmarks always run in that fixed order; unknown names
            are reported and skipped.

    Returns:
        Dict mapping result names (``'coco_detection'``, ``'car_damage'``,
        ``'counting'``, ``'vqa'``) to the result dict of each benchmark. A
        benchmark whose data files are missing contributes
        ``{'error': <message>}`` instead of aborting the whole run.
    """
    print("=" * 70)
    print("🔮 OCULUS BENCHMARK EVALUATION SUITE")
    print("=" * 70)
    print(f"Model: {model_path}")

    # Load model
    print("\n[Loading Model]")
    model = OculusForConditionalGeneration.from_pretrained(model_path)

    # Load detection heads if available
    heads_path = Path(model_path) / "heads.pth"
    if heads_path.exists():
        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code -- only point this at checkpoints from a trusted source.
        # map_location="cpu" lets heads saved on a GPU load on CPU-only hosts;
        # load_state_dict then copies the values into the model's own tensors.
        heads = torch.load(heads_path, map_location="cpu")
        model.detection_head.load_state_dict(heads['detection'])
        model.point_head.load_state_dict(heads['point'])
        print(" ✓ Loaded trained detection heads")

    model.vision_encoder.load_encoders()
    model.load_language_model()

    # (CLI name, key in the results dict, factory), in execution order.
    # Factories are lazy so datasets are only loaded for selected benchmarks.
    registry = [
        ('coco', 'coco_detection', lambda: COCODetectionBenchmark(max_samples=100)),
        ('car_damage', 'car_damage', lambda: CarDamageBenchmark(max_samples=50)),
        ('counting', 'counting', lambda: CountingBenchmark(max_samples=100)),
        ('vqa', 'vqa', lambda: VQABenchmark(max_samples=100)),
    ]
    known_names = [name for name, _, _ in registry]

    if benchmarks is None:
        benchmarks = known_names

    unknown = sorted(set(benchmarks) - set(known_names))
    if unknown:
        print(f" ⚠️ Ignoring unknown benchmark(s): {', '.join(unknown)} "
              f"(available: {', '.join(known_names)})")

    all_results = {}

    # Run benchmarks
    for name, result_key, make_benchmark in registry:
        if name not in benchmarks:
            continue
        try:
            bench = make_benchmark()
        except FileNotFoundError as e:
            # Missing data for one benchmark should not discard the others
            print(f" ⚠️ Skipping {name}: {e}")
            all_results[result_key] = {'error': str(e)}
            continue
        all_results[result_key] = bench.evaluate(model)

    # Summary
    print("\n" + "=" * 70)
    print("📊 BENCHMARK SUMMARY")
    print("=" * 70)

    for name, results in all_results.items():
        print(f"\n{name}:")
        for k, v in results.items():
            if isinstance(v, float):
                print(f" {k}: {v:.4f}")
            else:
                print(f" {k}: {v}")

    # Save results; create the directory first so a finished run is never
    # lost just because the output location does not exist yet.
    results_path = Path(model_path) / "benchmark_results.json"
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)
    print(f"\n💾 Results saved to: {results_path}")

    return all_results
591
+
592
+
593
if __name__ == "__main__":
    import argparse

    # Command-line entry point: pick the checkpoint and, optionally, a subset
    # of benchmarks; everything else is handled by run_benchmarks().
    parser = argparse.ArgumentParser(
        description="Run the Oculus benchmark evaluation suite."
    )
    parser.add_argument(
        "--model",
        default="checkpoints/oculus_detection/final",
        help="Model checkpoint directory (default: %(default)s).",
    )
    parser.add_argument(
        "--benchmarks",
        nargs="+",
        default=None,
        # Restricting the choices turns a typo into an immediate usage error
        # instead of a run that quietly evaluates nothing.
        choices=["coco", "car_damage", "counting", "vqa"],
        help="Benchmarks to run (default: all).",
    )
    args = parser.parse_args()

    run_benchmarks(args.model, args.benchmarks)