#!/usr/bin/env python3
"""
Oculus VLM Benchmark Suite

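Evaluates Oculus on benchmarks modelled after the industry-standard VLM
benchmarks reported for models such as Isaac/Moondream.
Benchmarks:
1. VQA v2-style (templated questions synthesised from COCO annotations)
2. RefCOCO-style Grounding (expressions synthesised from COCO categories)
3. Counting (CountBench-style)
4. COCO Detection (reported as mAP@0.5)
5. Captioning (BLEU) -- listed for completeness; not run by run_all_benchmarks()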
"""

import os
import sys
import json
import random
import time
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional
from collections import defaultdict

import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

# Directory containing this script; benchmark_results.json is written here too.
OCULUS_ROOT = Path(__file__).parent
# Put that directory first on the import path *before* importing
# `oculus_inference` below (presumably a module living next to this script --
# TODO confirm), so the import works regardless of the current directory.
sys.path.insert(0, str(OCULUS_ROOT))

from oculus_inference import OculusPredictor


# ============================================================================
# Benchmark Utilities
# ============================================================================

def compute_iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is an ``[x1, y1, x2, y2]`` sequence. A small epsilon in the
    denominator keeps the division defined when the union has zero area.
    """
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])

    # Clamp each side at zero so boxes that do not overlap contribute nothing.
    overlap = max(0, right - left) * max(0, bottom - top)

    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = first_area + second_area - overlap

    return overlap / (union + 1e-8)


# ============================================================================
# Benchmark 1: VQA v2 Style
# ============================================================================

class VQABenchmark:
    """Visual Question Answering benchmark using COCO-derived questions.

    No VQA v2 data is read. Every sample is one templated question about a
    category that the COCO instance annotations list for the image, so the
    expected answer is either ``"yes"`` or the category name.
    """

    def __init__(self, data_dir="data/coco", max_samples=200):
        """Build the question/answer samples.

        Args:
            data_dir: Dataset root. ``annotations/instances_train2017.json``
                and ``images/<file_name>`` are looked up beneath it.
            max_samples: Collection stops as soon as this many samples exist.

        If the annotation file is missing, a warning is printed and
        ``self.samples`` stays empty. Category and template selection draw
        from the global ``random`` state; seed it beforehand for repeatable
        sample sets.
        """
        self.samples = []

        data_root = Path(data_dir)
        ann_file = data_root / "annotations" / "instances_train2017.json"

        if not ann_file.exists():
            print("⚠️ COCO annotations not found")
            return

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}

        # image id -> set of category names annotated in that image
        img_cats = defaultdict(set)
        for ann in instances['annotations']:
            img_cats[ann['image_id']].add(cat_map.get(ann['category_id'], 'object'))

        for img in instances['images']:
            img_path = data_root / "images" / img['file_name']
            if not img_path.exists():
                continue

            cats = list(img_cats.get(img['id'], []))
            if not cats:
                continue

            cat = random.choice(cats)

            # One question per image, drawn from three templates. The
            # expected answer is matched as a substring in evaluate().
            templates = [
                (f"Is there a {cat} in this image?", "yes"),
                ("What type of object is visible?", cat),
                (f"Does this image contain a {cat}?", "yes"),
            ]

            q, a = random.choice(templates)
            self.samples.append({
                'path': str(img_path),
                'question': q,
                'answer': a.lower(),
                'category': cat
            })

            if len(self.samples) >= max_samples:
                break

        print(f"  VQA: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Ask every sample question and score by substring match.

        Args:
            model: Object exposing ``ask(image_path, question)``; the result
                is lower-cased and searched for the expected answer.

        Returns:
            Dict with ``accuracy``, ``correct``, ``total`` and ``failed``.
            Samples whose inference raised are counted in ``failed`` and, as
            before, excluded from ``total`` (and therefore from ``accuracy``).
        """
        print("\n📊 VQA v2 Style Benchmark")
        print("-" * 50)

        correct = 0
        total = 0
        failed = 0
        first_error = None

        for sample in tqdm(self.samples, desc="VQA"):
            try:
                answer = model.ask(sample['path'], sample['question'])
                is_match = sample['answer'] in answer.lower()
            except Exception as exc:
                # Best effort: keep going, but remember that something broke
                # instead of silently shrinking the denominator.
                failed += 1
                if first_error is None:
                    first_error = exc
                continue

            if is_match:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"  Accuracy: {accuracy:.2%} ({correct}/{total})")
        if failed:
            print(f"  ⚠️ {failed} sample(s) raised and were skipped "
                  f"(first error: {first_error!r})")

        return {
            'accuracy': float(accuracy),
            'correct': correct,
            'total': total,
            'failed': failed
        }


# ============================================================================
# Benchmark 2: RefCOCO Grounding
# ============================================================================

class RefCOCOBenchmark:
    """Referring Expression Grounding using COCO boxes.

    Expressions are synthesised as ``"the <category>"`` from COCO instance
    annotations; no RefCOCO annotation file is read.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build the grounding samples.

        Args:
            data_dir: Dataset root. ``annotations/instances_train2017.json``
                and ``images/<file_name>`` are looked up beneath it.
            max_samples: Collection stops as soon as this many samples exist.

        If the annotation file is missing, ``self.samples`` stays empty and
        nothing is printed. The target object per image is picked with the
        global ``random`` state.
        """
        self.samples = []

        data_root = Path(data_dir)
        ann_file = data_root / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # Group annotations by image. Every key has at least one annotation,
        # so no emptiness check is needed below.
        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = data_root / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Pick a random object
            ann = random.choice(anns)
            cat = cat_map.get(ann['category_id'], 'object')

            # COCO stores [x, y, width, height] in pixels; convert to
            # corner format normalised by the image size.
            x, y, w, h = ann['bbox']
            box = [
                x / img['width'],
                y / img['height'],
                (x + w) / img['width'],
                (y + h) / img['height']
            ]

            self.samples.append({
                'path': str(img_path),
                'expression': f"the {cat}",
                'gt_box': box
            })

            if len(self.samples) >= max_samples:
                break

        print(f"  RefCOCO: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Ground every expression and score the first returned box by IoU.

        Args:
            model: Object exposing ``detect(image_path, prompt=...)`` that
                returns a mapping with a ``'boxes'`` sequence.

        Returns:
            Dict with ``mean_iou``, ``accuracy_50``, ``num_samples`` and
            ``failed``. A sample with no predicted box, or whose inference
            raised, scores an IoU of 0; the latter is also counted in
            ``failed``.
        """
        print("\n📊 RefCOCO Grounding Benchmark")
        print("-" * 50)

        ious = []
        acc_50 = 0
        failed = 0
        first_error = None

        for sample in tqdm(self.samples, desc="RefCOCO"):
            try:
                results = model.detect(sample['path'], prompt=f"Find {sample['expression']}")

                # len() rather than truthiness: boxes may be an array type.
                if len(results['boxes']) > 0:
                    # NOTE(review): assumes boxes are ordered by confidence
                    # and normalised like gt_box -- confirm against detect().
                    iou = compute_iou(sample['gt_box'], results['boxes'][0])
                else:
                    iou = 0
            except Exception as exc:
                # `except Exception` (not a bare except) so Ctrl-C still
                # stops the run; the sample is scored as a miss.
                failed += 1
                if first_error is None:
                    first_error = exc
                iou = 0

            ious.append(iou)
            if iou >= 0.5:
                acc_50 += 1

        mean_iou = np.mean(ious) if ious else 0
        accuracy = acc_50 / len(self.samples) if self.samples else 0

        print(f"  Mean IoU: {mean_iou:.4f}")
        print(f"  Acc@0.5: {accuracy:.2%}")
        if failed:
            print(f"  ⚠️ {failed} sample(s) raised and were scored as 0 "
                  f"(first error: {first_error!r})")

        return {
            'mean_iou': float(mean_iou),
            'accuracy_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed
        }


# ============================================================================
# Benchmark 3: Counting (CountBench Style)
# ============================================================================

class CountBenchmark:
    """Object counting benchmark.

    Ground-truth counts come from COCO instance annotations (crowd
    annotations excluded); one category with 2-10 instances is used per image.
    """

    # Number words accepted in a model answer.
    _WORD_TO_NUM = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
    # Punctuation stripped from both ends of a token before parsing it.
    _TOKEN_PUNCT = ".,;:!?()[]{}\"'"

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build the counting samples.

        Args:
            data_dir: Dataset root. ``annotations/instances_train2017.json``
                and ``images/<file_name>`` are looked up beneath it.
            max_samples: Collection stops as soon as this many samples exist.

        If the annotation file is missing, ``self.samples`` stays empty and
        nothing is printed.
        """
        self.samples = []

        data_root = Path(data_dir)
        ann_file = data_root / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        cat_map = {c['id']: c['name'] for c in instances['categories']}
        img_map = {img['id']: img for img in instances['images']}

        # image id -> category name -> number of non-crowd instances
        img_counts = defaultdict(lambda: defaultdict(int))
        for ann in instances['annotations']:
            if not ann.get('iscrowd', 0):
                cat = cat_map.get(ann['category_id'], 'object')
                img_counts[ann['image_id']][cat] += 1

        for img_id, counts in img_counts.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = data_root / "images" / img['file_name']
            if not img_path.exists():
                continue

            # First category with 2-10 objects (reasonable counting range);
            # images without such a category contribute no sample.
            for cat, count in counts.items():
                if 2 <= count <= 10:
                    self.samples.append({
                        'path': str(img_path),
                        'category': cat,
                        'count': count
                    })
                    break

            if len(self.samples) >= max_samples:
                break

        print(f"  CountBench: Loaded {len(self.samples)} samples")

    @staticmethod
    def _parse_count(answer):
        """Return the first number found in ``answer``, or ``None``.

        Tokens are split on whitespace and stripped of surrounding
        punctuation, so ``"3."`` and ``"two,"`` are recognised. Digits are
        tried first, then the number words zero..ten.
        """
        for token in answer.lower().split():
            token = token.strip(CountBenchmark._TOKEN_PUNCT)
            try:
                return int(token)
            except ValueError:
                if token in CountBenchmark._WORD_TO_NUM:
                    return CountBenchmark._WORD_TO_NUM[token]
        return None

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Ask for a count on every sample and compare with the annotation.

        Args:
            model: Object exposing ``ask(image_path, question)`` returning text.

        Returns:
            Dict with ``exact_accuracy``, ``within_one_accuracy``, ``mae``,
            ``total``, ``unparsed`` and ``failed``. Both accuracies are
            divided by the number of samples, so answers that could not be
            parsed or whose inference raised count as wrong; ``mae`` averages
            only over parsed answers.
        """
        print("\n📊 CountBench Benchmark")
        print("-" * 50)

        exact = 0
        within_one = 0
        errors = []
        unparsed = 0
        failed = 0
        first_error = None

        for sample in tqdm(self.samples, desc="Counting"):
            # NOTE: the plural is formed by appending "s" (e.g. "persons");
            # the prompt is kept as-is so scores stay comparable across runs.
            question = f"How many {sample['category']}s are in this image? Answer with a number."
            try:
                pred = self._parse_count(model.ask(sample['path'], question))
            except Exception as exc:
                # `except Exception` (not a bare except) so Ctrl-C still
                # stops the run; the sample simply earns no credit.
                failed += 1
                if first_error is None:
                    first_error = exc
                continue

            if pred is None:
                unparsed += 1
                continue

            diff = abs(pred - sample['count'])
            if diff == 0:
                exact += 1
            if diff <= 1:
                within_one += 1
            errors.append(diff)

        total = len(self.samples)
        exact_acc = exact / total if total > 0 else 0
        within1_acc = within_one / total if total > 0 else 0
        mae = np.mean(errors) if errors else 0

        print(f"  Exact Accuracy: {exact_acc:.2%}")
        print(f"  Within-1 Accuracy: {within1_acc:.2%}")
        print(f"  MAE: {mae:.2f}")
        if unparsed:
            print(f"  ⚠️ {unparsed} answer(s) contained no parseable number")
        if failed:
            print(f"  ⚠️ {failed} sample(s) raised and were skipped "
                  f"(first error: {first_error!r})")

        return {
            'exact_accuracy': float(exact_acc),
            'within_one_accuracy': float(within1_acc),
            'mae': float(mae),
            'total': total,
            'unparsed': unparsed,
            'failed': failed
        }


# ============================================================================
# Benchmark 4: COCO Detection (mAP)
# ============================================================================

class DetectionBenchmark:
    """Object Detection benchmark.

    Ground truth is taken from COCO instance annotations. Labels are the
    position of the category within the annotation file's ``categories``
    list, not the raw COCO category id.
    """

    def __init__(self, data_dir="data/coco", max_samples=100):
        """Build the detection samples.

        Args:
            data_dir: Dataset root. ``annotations/instances_train2017.json``
                and ``images/<file_name>`` are looked up beneath it.
            max_samples: Collection stops as soon as this many samples exist.

        If the annotation file is missing, ``self.samples`` stays empty and
        nothing is printed.
        """
        self.samples = []

        data_root = Path(data_dir)
        ann_file = data_root / "annotations" / "instances_train2017.json"
        if not ann_file.exists():
            return

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(ann_file, encoding="utf-8") as f:
            instances = json.load(f)

        # category id -> contiguous index in the order the file lists them
        cat_idx = {c['id']: i for i, c in enumerate(instances['categories'])}
        img_map = {img['id']: img for img in instances['images']}

        img_anns = defaultdict(list)
        for ann in instances['annotations']:
            img_anns[ann['image_id']].append(ann)

        for img_id, anns in img_anns.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = data_root / "images" / img['file_name']
            if not img_path.exists():
                continue

            boxes = []
            labels = []
            for ann in anns:
                if 'bbox' not in ann:
                    continue
                # COCO stores [x, y, width, height] in pixels; convert to
                # corner format normalised by the image size.
                x, y, w, h = ann['bbox']
                boxes.append([
                    x / img['width'],
                    y / img['height'],
                    (x + w) / img['width'],
                    (y + h) / img['height']
                ])
                # Unknown category ids fall back to index 0.
                labels.append(cat_idx.get(ann['category_id'], 0))

            if boxes:
                self.samples.append({
                    'path': str(img_path),
                    'boxes': boxes,
                    'labels': labels
                })

            if len(self.samples) >= max_samples:
                break

        print(f"  Detection: Loaded {len(self.samples)} samples")

    def evaluate(self, model: OculusPredictor) -> Dict:
        """Match every ground-truth box to its best-overlapping prediction.

        For each ground-truth box the prediction with the highest IoU is
        selected; the box counts as correct when that IoU is at least 0.5 and
        the predicted label equals the ground-truth label.

        Args:
            model: Object exposing ``detect(image_path)`` returning a mapping
                with ``'boxes'`` and ``'labels'`` sequences.
                NOTE(review): predicted boxes are assumed to be normalised
                corner boxes and labels to use the same index space as the
                ground truth -- confirm against ``detect()``.

        Returns:
            Dict with ``mean_iou``, ``map_50``, ``num_samples`` and
            ``failed``. ``map_50`` is the fraction of ground-truth boxes
            matched as described above; it is not a precision/recall-curve
            mAP, and the key name is kept for compatibility. Images whose
            inference raised are counted in ``failed``; boxes not yet scored
            for such an image are left out of both averages.
        """
        print("\n📊 COCO Detection Benchmark")
        print("-" * 50)

        all_ious = []
        all_correct = []
        failed = 0
        first_error = None

        for sample in tqdm(self.samples, desc="Detection"):
            try:
                results = model.detect(sample['path'])

                pred_boxes = results['boxes']
                pred_labels = [int(label) for label in results['labels']]

                for gt_box, gt_label in zip(sample['boxes'], sample['labels']):
                    best_iou = 0
                    correct = False

                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        iou = compute_iou(gt_box, list(pred_box))
                        # Strictly greater: the first prediction wins ties,
                        # and zero-overlap predictions are never selected.
                        if iou > best_iou:
                            best_iou = iou
                            correct = (iou >= 0.5) and (pred_label == gt_label)

                    all_ious.append(best_iou)
                    all_correct.append(correct)
            except Exception as exc:
                # `except Exception` (not a bare except) so Ctrl-C still
                # stops the run; remember the failure instead of hiding it.
                failed += 1
                if first_error is None:
                    first_error = exc

        mean_iou = np.mean(all_ious) if all_ious else 0
        accuracy = np.mean(all_correct) if all_correct else 0

        print(f"  Mean IoU: {mean_iou:.4f}")
        print(f"  mAP@0.5: {accuracy:.4f}")
        if failed:
            print(f"  ⚠️ {failed} image(s) raised during detection "
                  f"(first error: {first_error!r})")

        return {
            'mean_iou': float(mean_iou),
            'map_50': float(accuracy),
            'num_samples': len(self.samples),
            'failed': failed
        }


# ============================================================================
# Main Runner
# ============================================================================

def run_all_benchmarks():
    """Load the model, run the four benchmarks in order, and report.

    Prints a summary table, writes the collected metrics to
    ``benchmark_results.json`` inside ``OCULUS_ROOT``, and returns them as a
    dict keyed by benchmark name.
    """
    rule = "=" * 60
    print(rule)
    print("🔮 OCULUS VLM BENCHMARK SUITE")
    print(rule)

    print("\n[Loading Oculus Model]")
    model = OculusPredictor()

    # (result key, benchmark class, sample cap), executed top to bottom.
    suite = (
        ('vqa_v2', VQABenchmark, 200),
        ('refcoco', RefCOCOBenchmark, 100),
        ('countbench', CountBenchmark, 100),
        ('coco_detection', DetectionBenchmark, 100),
    )

    results = {}

    print("\n[Running Benchmarks]")
    for key, benchmark_cls, limit in suite:
        # Each benchmark is built right before it runs, so its "Loaded ..."
        # line appears directly above its own results.
        results[key] = benchmark_cls(max_samples=limit).evaluate(model)

    print("\n" + rule)
    print("📊 BENCHMARK RESULTS SUMMARY")
    print(rule)

    print(f"""
╔═══════════════════════════════════════════════════════════╗
║                    OCULUS BENCHMARKS                      ║
╠═══════════════════════════════════════════════════════════╣
║  VQA v2 (Style)                                           ║
║    Accuracy:        {results['vqa_v2']['accuracy']:.2%}                              ║
╠═══════════════════════════════════════════════════════════╣
║  RefCOCO Grounding                                        ║
║    Mean IoU:        {results['refcoco']['mean_iou']:.4f}                             ║
║    Acc@0.5:         {results['refcoco']['accuracy_50']:.2%}                              ║
╠═══════════════════════════════════════════════════════════╣
║  CountBench                                               ║
║    Exact Accuracy:  {results['countbench']['exact_accuracy']:.2%}                              ║
║    Within-1 Acc:    {results['countbench']['within_one_accuracy']:.2%}                              ║
╠═══════════════════════════════════════════════════════════╣
║  COCO Detection                                           ║
║    Mean IoU:        {results['coco_detection']['mean_iou']:.4f}                             ║
║    mAP@0.5:         {results['coco_detection']['map_50']:.4f}                             ║
╚═══════════════════════════════════════════════════════════╝
""")

    # Persist the raw metrics next to this script.
    output_path = OCULUS_ROOT / "benchmark_results.json"
    with output_path.open("w") as handle:
        json.dump(results, handle, indent=2)
    print(f"💾 Results saved to: {output_path}")

    return results


# Run the full suite only when this file is executed as a script, so that
# importing the module (e.g. to reuse a single benchmark class) has no effect.
if __name__ == "__main__":
    run_all_benchmarks()