# benchmark_eval.py — EVALUATION SUITE
"""
Runs the full FakeShield Video Forensic Lab on real and AI samples.
Generates an accuracy report and forensic signal breakdown.
"""
import os
import json
import time
import sys

# Ensure the backend package is importable (assumes the script is run from
# the repository root, so that backend/ and the relative paths below resolve).
sys.path.append(os.path.join(os.getcwd(), "backend"))

from app.services.video.video_detector import analyze_video

EVAL_DIR = "backend/eval_samples"
REPORT_FILE = "backend/benchmark_report.json"


def run_benchmark():
    if not os.path.exists(EVAL_DIR):
        print(f"❌ Error: {EVAL_DIR} not found. Run downloads first.")
        return

    samples = [f for f in os.listdir(EVAL_DIR) if f.endswith(".mp4")]
    if not samples:
        print(f"❌ No samples found in {EVAL_DIR}.")
        return

    print(f"🎬 Starting Benchmark on {len(samples)} samples...")
    print("-" * 60)

    results = []
    for filename in samples:
        video_path = os.path.join(EVAL_DIR, filename)
        # Naming convention: files prefixed "sora" are the AI-generated samples.
        is_ai_ground_truth = filename.startswith("sora")

        print(f"🔍 Analyzing: {filename} (GT: {'AI' if is_ai_ground_truth else 'REAL'})")
        start_time = time.time()

        try:
            # Run analysis
            report = analyze_video(video_path, max_frames=32)
            elapsed = time.time() - start_time

            verdict = report.get("verdict", "UNKNOWN")
            fused_score = report.get("fused_score", 0.0)

            # Simple accuracy check
            is_correct = False
            if is_ai_ground_truth and verdict in ["AI_GENERATED", "LIKELY_AI"]:
                is_correct = True
            elif not is_ai_ground_truth and verdict in ["LIKELY_REAL", "UNCERTAIN"]:
                # Uncertain is better than a false positive for real samples
                is_correct = True

            res = {
                "filename": filename,
                "ground_truth": "AI" if is_ai_ground_truth else "REAL",
                "verdict": verdict,
                "score": fused_score,
                "is_correct": is_correct,
                "elapsed_sec": round(elapsed, 2),
                "signal_scores": report.get("signal_scores", {}),
                "robustness": report.get("robustness", {}),
                "provenance": report.get("provenance", {}),
            }
            results.append(res)

            status_icon = "✅" if is_correct else "❌"
            print(f"   {status_icon} Verdict: {verdict} ({fused_score}%) | Time: {elapsed:.1f}s")

        except Exception as e:
            print(f"   ❌ Error: {e}")
            results.append({"filename": filename, "error": str(e)})

    # Summary (errored samples are excluded from the accuracy denominator)
    total = len([r for r in results if "error" not in r])
    correct = len([r for r in results if r.get("is_correct")])
    accuracy = (correct / total * 100) if total > 0 else 0

    print("-" * 60)
    print("📊 BENCHMARK COMPLETE")
    print(f"✅ Accuracy: {accuracy:.1f}% ({correct}/{total})")

    # Save report
    summary = {
        "timestamp": time.ctime(),
        "total_samples": total,
        "correct": correct,
        "accuracy_pct": accuracy,
        "results": results,
    }
    with open(REPORT_FILE, "w") as f:
        json.dump(summary, f, indent=4)

    print(f"📝 Report saved to {REPORT_FILE}")


if __name__ == "__main__":
    run_benchmark()
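
# ---------------------------------------------------------------------------
# Reference: the report shape this harness assumes.
#
# A minimal sketch of the dict analyze_video() is expected to return, inferred
# from the .get() calls above. The example values are illustrative assumptions,
# not actual detector output:
#
#   {
#       "verdict": "LIKELY_AI",     # AI_GENERATED / LIKELY_AI / LIKELY_REAL /
#                                   #   UNCERTAIN ("UNKNOWN" used as fallback)
#       "fused_score": 87.5,        # fused confidence, printed as a percentage
#       "signal_scores": {...},     # per-signal forensic breakdown
#       "robustness": {...},        # robustness check results
#       "provenance": {...},        # provenance / metadata findings
#   }
# ---------------------------------------------------------------------------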