# benchmark_eval.py - EVALUATION SUITE
"""
Runs the full FakeShield Video Forensic Lab on real and AI samples.
Generates an accuracy report and forensic signal breakdown.
"""
import os
import json
import time
import sys

# Ensure backend directory is in path
sys.path.append(os.path.join(os.getcwd(), 'backend'))

from app.services.video.video_detector import analyze_video
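
# The report dict returned by analyze_video() is expected to expose the keys
# used below: "verdict", "fused_score", "signal_scores", "robustness", and
# "provenance" (inferred from usage in this script, not a documented contract).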

EVAL_DIR = "backend/eval_samples"
REPORT_FILE = "backend/benchmark_report.json"
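# NOTE: both paths are relative, so the script assumes it is launched from the
# repository root (the same assumption the sys.path.append call above makes).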


def run_benchmark():
    if not os.path.exists(EVAL_DIR):
        print(f"❌ Error: {EVAL_DIR} not found. Run downloads first.")
        return

    samples = [f for f in os.listdir(EVAL_DIR) if f.endswith(".mp4")]
    if not samples:
        print(f"❌ No samples found in {EVAL_DIR}.")
        return
| print(f"π¬ Starting Benchmark on {len(samples)} samples...") | |
| print("-" * 60) | |
| results = [] | |
| for filename in samples: | |
| video_path = os.path.join(EVAL_DIR, filename) | |
| is_ai_ground_truth = filename.startswith("sora") | |
| print(f"π Analyzing: {filename} (GT: {'AI' if is_ai_ground_truth else 'REAL'})") | |
        start_time = time.time()
        try:
            # Run analysis
            report = analyze_video(video_path, max_frames=32)
            elapsed = time.time() - start_time

            verdict = report.get("verdict", "UNKNOWN")
            fused_score = report.get("fused_score", 0.0)

            # Simple accuracy check
            is_correct = False
            if is_ai_ground_truth and verdict in ["AI_GENERATED", "LIKELY_AI"]:
                is_correct = True
            elif not is_ai_ground_truth and verdict in ["LIKELY_REAL", "UNCERTAIN"]:
                # Uncertain is better than a false positive for real samples
                is_correct = True

            res = {
                "filename": filename,
                "ground_truth": "AI" if is_ai_ground_truth else "REAL",
                "verdict": verdict,
                "score": fused_score,
                "is_correct": is_correct,
                "elapsed_sec": round(elapsed, 2),
                "signal_scores": report.get("signal_scores", {}),
                "robustness": report.get("robustness", {}),
                "provenance": report.get("provenance", {}),
            }
            results.append(res)

            status_icon = "✅" if is_correct else "❌"
            print(f" {status_icon} Verdict: {verdict} ({fused_score}%) | Time: {elapsed:.1f}s")
        except Exception as e:
            print(f" ❌ Error: {e}")
            results.append({
                "filename": filename,
                "error": str(e),
            })

    # Summary
    total = len([r for r in results if "error" not in r])
    correct = len([r for r in results if r.get("is_correct")])
    accuracy = (correct / total * 100) if total > 0 else 0

    print("-" * 60)
    print("🏁 BENCHMARK COMPLETE")
    print(f"✅ Accuracy: {accuracy:.1f}% ({correct}/{total})")

    # Save report
    summary = {
        "timestamp": time.ctime(),
        "total_samples": total,
        "correct": correct,
        "accuracy_pct": accuracy,
        "results": results,
    }
    with open(REPORT_FILE, "w") as f:
        json.dump(summary, f, indent=4)
    print(f"💾 Report saved to {REPORT_FILE}")


if __name__ == "__main__":
    run_benchmark()
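
# Usage (assumed from the relative paths above): run from the repository root
# with `python benchmark_eval.py`; results land in backend/benchmark_report.json.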