"""Baseline benchmark for image deepfake detector against CIFAKE dataset. Downloads a sample from CIFAKE (real CIFAR-10 vs Stable Diffusion fakes), runs the full detection pipeline, and reports standard ML metrics. Usage: source venv/bin/activate python scripts/benchmark_image.py [--sample N] """ import argparse import io import json import sys import time from pathlib import Path import numpy as np from PIL import Image from sklearn.metrics import ( accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, ) # Add project root to path so we can import app modules ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) from app.services.image_detector import ImageDetector def download_dataset(sample_per_class: int = 100) -> list[tuple[bytes, int, str]]: """Download deepfake test dataset and return (image_bytes, label, source) tuples. Dataset: itsLeen/deepfake_vs_real_image_detection (512x512+ images) Labels: 0 = real, 1 = fake. """ from datasets import load_dataset print("Downloading deepfake dataset (itsLeen/deepfake_vs_real_image_detection)...") dataset = load_dataset( "itsLeen/deepfake_vs_real_image_detection", split="test", streaming=True ).shuffle(seed=42) real_samples = [] fake_samples = [] for row in dataset: if row["label"] == 0 and len(real_samples) < sample_per_class: real_samples.append(row) elif row["label"] == 1 and len(fake_samples) < sample_per_class: fake_samples.append(row) if len(real_samples) >= sample_per_class and len(fake_samples) >= sample_per_class: break print(f" Real images: {len(real_samples)}, Fake images: {len(fake_samples)}") samples = [] for i, row in enumerate(real_samples): img = row["image"].convert("RGB") buf = io.BytesIO() img.save(buf, format="JPEG") samples.append((buf.getvalue(), 0, f"real_{i:04d}")) for i, row in enumerate(fake_samples): img = row["image"].convert("RGB") buf = io.BytesIO() img.save(buf, format="JPEG") samples.append((buf.getvalue(), 1, f"fake_{i:04d}")) return samples def run_individual_model(detector: ImageDetector, image_bytes: bytes) -> dict | None: """Run each sub-model individually to get per-model predictions.""" import torch results = {} image = Image.open(io.BytesIO(image_bytes)).convert("RGB") # CommunityForensics ViT if detector._model_loaded: try: import torchvision.transforms as T transform = T.Compose([ T.Resize((384, 384), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.2758]), ]) pixel_values = transform(image).unsqueeze(0) with torch.no_grad(): logits = detector.model(pixel_values=pixel_values).logits probs = torch.softmax(logits, dim=-1) results["communityforensics"] = { "fake_prob": probs[0][1].item(), "verdict": "manipulated" if probs[0][1].item() > 0.5 else "real", } except Exception: results["communityforensics"] = None # prithivMLmods SigLIP if detector._ensemble_loaded: try: inputs = detector.processor_ensemble(images=image, return_tensors="pt") with torch.no_grad(): logits = detector.model_ensemble(**inputs).logits probs = torch.softmax(logits, dim=-1) results["prithivmlmods"] = { "fake_prob": probs[0][0].item(), "verdict": "manipulated" if probs[0][0].item() > 0.5 else "real", } except Exception: results["prithivmlmods"] = None return results def run_benchmark(sample_per_class: int = 100): """Run the full benchmark and print results.""" print("=" * 60) print("DeepFakeGuard — Image Detector Baseline Benchmark") print("=" * 60) print() # Download dataset samples = download_dataset(sample_per_class) print() # Load detector print("Loading image detection models...") detector = ImageDetector() detector.load_model() if not detector.is_loaded: print("ERROR: No models loaded. Cannot run benchmark.") sys.exit(1) print(f" Primary (CommunityForensics): {'loaded' if detector._model_loaded else 'FAILED'}") print(f" Ensemble (prithivMLmods): {'loaded' if detector._ensemble_loaded else 'FAILED'}") print() # Run detection print(f"Running detection on {len(samples)} images...") y_true = [] y_pred = [] y_pred_cf = [] # CommunityForensics predictions y_pred_pm = [] # prithivMLmods predictions confidences = [] details = [] start = time.time() for i, (img_bytes, label, source) in enumerate(samples): result = detector.detect(img_bytes, filename=f"{source}.jpg") pred_label = 1 if result["verdict"] == "manipulated" else 0 y_true.append(label) y_pred.append(pred_label) confidences.append(result["confidence"]) # Per-model breakdown per_model = run_individual_model(detector, img_bytes) if per_model.get("communityforensics"): y_pred_cf.append(1 if per_model["communityforensics"]["verdict"] == "manipulated" else 0) if per_model.get("prithivmlmods"): y_pred_pm.append(1 if per_model["prithivmlmods"]["verdict"] == "manipulated" else 0) details.append({ "source": source, "label": "real" if label == 0 else "fake", "predicted": result["verdict"], "confidence": result["confidence"], "severity": result["severity"], "correct": pred_label == label, "communityforensics": per_model.get("communityforensics"), "prithivmlmods": per_model.get("prithivmlmods"), }) if (i + 1) % 20 == 0: print(f" Processed {i + 1}/{len(samples)}") elapsed = time.time() - start print(f" Done in {elapsed:.1f}s ({elapsed / len(samples):.2f}s/image)") print() # Compute metrics y_true = np.array(y_true) y_pred = np.array(y_pred) # --- Full Pipeline --- print("=" * 60) print("FULL PIPELINE (ML Ensemble + Rule-Based)") print("=" * 60) acc = accuracy_score(y_true, y_pred) prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0) rec = recall_score(y_true, y_pred, pos_label=1, zero_division=0) f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0) cm = confusion_matrix(y_true, y_pred) print(f" Accuracy: {acc:.3f}") print(f" Precision: {prec:.3f} (of predicted fake, how many actually fake)") print(f" Recall: {rec:.3f} (of actual fake, how many caught)") print(f" F1 Score: {f1:.3f}") print() print(f" Confusion Matrix:") print(f" Predicted") print(f" Real Fake") print(f" Actual Real {cm[0][0]:>4} {cm[0][1]:>4}") print(f" Actual Fake {cm[1][0]:>4} {cm[1][1]:>4}") print() real_mask = y_true == 0 fake_mask = y_true == 1 print(f" Real images correct: {np.sum((y_pred == 0) & real_mask)}/{np.sum(real_mask)}") print(f" Fake images correct: {np.sum((y_pred == 1) & fake_mask)}/{np.sum(fake_mask)}") print(f" Avg confidence: {np.mean(confidences):.3f}") print() # --- Per-Model Breakdown --- if y_pred_cf: print("=" * 60) print("CommunityForensics ViT (Primary)") print("=" * 60) y_pred_cf = np.array(y_pred_cf) acc_cf = accuracy_score(y_true, y_pred_cf) f1_cf = f1_score(y_true, y_pred_cf, pos_label=1, zero_division=0) print(f" Accuracy: {acc_cf:.3f} F1: {f1_cf:.3f}") cm_cf = confusion_matrix(y_true, y_pred_cf) print(f" Confusion Matrix:") print(f" Predicted") print(f" Real Fake") print(f" Actual Real {cm_cf[0][0]:>4} {cm_cf[0][1]:>4}") print(f" Actual Fake {cm_cf[1][0]:>4} {cm_cf[1][1]:>4}") print() if y_pred_pm: print("=" * 60) print("prithivMLmods SigLIP (Ensemble)") print("=" * 60) y_pred_pm = np.array(y_pred_pm) acc_pm = accuracy_score(y_true, y_pred_pm) f1_pm = f1_score(y_true, y_pred_pm, pos_label=1, zero_division=0) print(f" Accuracy: {acc_pm:.3f} F1: {f1_pm:.3f}") cm_pm = confusion_matrix(y_true, y_pred_pm) print(f" Confusion Matrix:") print(f" Predicted") print(f" Real Fake") print(f" Actual Real {cm_pm[0][0]:>4} {cm_pm[0][1]:>4}") print(f" Actual Fake {cm_pm[1][0]:>4} {cm_pm[1][1]:>4}") print() # --- Summary comparison --- print("=" * 60) print("COMPARISON") print("=" * 60) print(f" {'Model':<35} {'Accuracy':>10} {'F1':>8}") print(f" {'-'*35} {'-'*10} {'-'*8}") print(f" {'Full Pipeline (Ensemble + Rules)':<35} {acc:>10.3f} {f1:>8.3f}") if y_pred_cf is not None and len(y_pred_cf): print(f" {'CommunityForensics ViT':<35} {acc_cf:>10.3f} {f1_cf:>8.3f}") if y_pred_pm is not None and len(y_pred_pm): print(f" {'prithivMLmods SigLIP':<35} {acc_pm:>10.3f} {f1_pm:>8.3f}") print() # Save results results = { "dataset": "itsLeen/deepfake_vs_real_image_detection", "samples": {"real": int(np.sum(real_mask)), "fake": int(np.sum(fake_mask))}, "full_pipeline": { "accuracy": round(float(acc), 4), "precision": round(float(prec), 4), "recall": round(float(rec), 4), "f1": round(float(f1), 4), "confusion_matrix": cm.tolist(), "avg_confidence": round(float(np.mean(confidences)), 4), }, "communityforensics_vit": { "accuracy": round(float(acc_cf), 4), "f1": round(float(f1_cf), 4), } if y_pred_cf is not None and len(y_pred_cf) else None, "prithivmlmods_siglip": { "accuracy": round(float(acc_pm), 4), "f1": round(float(f1_pm), 4), } if y_pred_pm is not None and len(y_pred_pm) else None, "per_image_details": details, } output_path = ROOT / "scripts" / "benchmark_results.json" with open(output_path, "w") as f: json.dump(results, f, indent=2) print(f"Results saved to {output_path}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark image deepfake detector") parser.add_argument( "--sample", type=int, default=100, help="Number of images per class (default: 100)", ) args = parser.parse_args() run_benchmark(sample_per_class=args.sample)