# deepfakeguard/scripts/benchmark_audio.py
# noorullah1102's picture
# Initial commit
# a869ab1
"""Baseline benchmark for the audio deepfake detector against a labeled dataset.

Downloads a sample from UniDataPro/real-vs-fake-human-voice-deepfake-audio,
runs the full detection pipeline, and reports standard ML metrics.

Usage:
    source venv/bin/activate
    python scripts/benchmark_audio.py [--sample N]
"""
import argparse
import io
import json
import sys
import time
from pathlib import Path
import numpy as np
import soundfile as sf
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
f1_score,
precision_score,
recall_score,
)
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from app.services.audio_detector import AudioDetector
def download_dataset(sample_per_class: int = 50) -> list[tuple[bytes, int, str]]:
    """Stream the labeled audio dataset and collect a class-balanced sample.

    Dataset: UniDataPro/real-vs-fake-human-voice-deepfake-audio
    Labels: 0 = real, 1 = fake.

    The "train" split is opened in streaming mode and shuffled with a fixed
    seed (42). Rows are consumed until ``sample_per_class`` clips of each
    label have been collected, or the stream ends.

    Args:
        sample_per_class: Maximum number of clips to keep for each label.

    Returns:
        A list of ``(wav_bytes, label, source)`` tuples, all real clips first
        and then all fake clips. ``wav_bytes`` is the clip re-encoded as a WAV
        file in memory; ``source`` is a generated name such as ``real_0003``.
        Either class may hold fewer than ``sample_per_class`` entries if the
        stream runs out first.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    print("Downloading audio dataset (UniDataPro/real-vs-fake-human-voice-deepfake-audio)...")
    dataset = load_dataset(
        "UniDataPro/real-vs-fake-human-voice-deepfake-audio",
        split="train",
        streaming=True,
    ).shuffle(seed=42)

    real_samples = []
    fake_samples = []
    for row in dataset:
        label = row["label"]

        # Pick the destination first so the WAV encoding below is only done
        # for rows that are actually kept (previously every row was encoded,
        # including those for an already-full class or an unknown label).
        if label == 0 and len(real_samples) < sample_per_class:
            bucket, class_id, prefix = real_samples, 0, "real"
        elif label == 1 and len(fake_samples) < sample_per_class:
            bucket, class_id, prefix = fake_samples, 1, "fake"
        else:
            bucket = None

        if bucket is not None:
            audio = row["audio"]
            # Convert the decoded sample array to WAV bytes in memory.
            buf = io.BytesIO()
            sf.write(buf, audio["array"], audio["sampling_rate"], format="WAV")
            bucket.append((buf.getvalue(), class_id, f"{prefix}_{len(bucket):04d}"))

        if len(real_samples) >= sample_per_class and len(fake_samples) >= sample_per_class:
            break

    print(f" Real audio: {len(real_samples)}, Fake audio: {len(fake_samples)}")
    return real_samples + fake_samples
def run_benchmark(sample_per_class: int = 50) -> None:
    """Run the full benchmark, print the metrics, and save them as JSON.

    Steps:
      1. Collect samples via ``download_dataset(sample_per_class)``.
      2. Load ``AudioDetector`` and run ``detect`` on every clip.
      3. Compute accuracy, precision, recall, F1 and the confusion matrix,
         treating label 1 (fake) as the positive class.
      4. Write a summary plus per-clip details to
         ``scripts/benchmark_audio_results.json`` under ``ROOT``.

    A clip is counted as predicted-fake only when the detector's
    ``"verdict"`` is exactly ``"synthetic"``; any other verdict counts as real.

    Args:
        sample_per_class: Number of clips requested for each class.

    Exits the process with status 1 if no samples were collected or if the
    detector reports that its model is not loaded.
    """
    print("=" * 60)
    print("DeepFakeGuard — Audio Detector Baseline Benchmark")
    print("=" * 60)
    print()

    samples = download_dataset(sample_per_class)
    if not samples:
        # Without this guard the per-clip timing below divides by zero.
        print("ERROR: No audio samples collected. Cannot run benchmark.")
        sys.exit(1)
    print()

    print("Loading audio detection model...")
    detector = AudioDetector()
    detector.load_model()
    if not detector.is_loaded:
        print("ERROR: Audio model not loaded. Cannot run benchmark.")
        sys.exit(1)
    print(" Wav2Vec2 model: loaded")
    print()

    print(f"Running detection on {len(samples)} audio clips...")
    y_true = []
    y_pred = []
    confidences = []
    details = []
    start = time.time()
    for i, (audio_bytes, label, source) in enumerate(samples):
        result = detector.detect(audio_bytes, filename=f"{source}.wav")
        pred_label = 1 if result["verdict"] == "synthetic" else 0
        y_true.append(label)
        y_pred.append(pred_label)
        confidences.append(result["confidence"])
        details.append({
            "source": source,
            "label": "real" if label == 0 else "fake",
            "predicted": result["verdict"],
            "confidence": result["confidence"],
            "severity": result["severity"],
            "correct": pred_label == label,
            "duration": result["analysis"].get("duration_seconds", 0),
        })
        if (i + 1) % 10 == 0:
            print(f" Processed {i + 1}/{len(samples)}")
    elapsed = time.time() - start
    print(f" Done in {elapsed:.1f}s ({elapsed / len(samples):.2f}s/clip)")
    print()

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # --- Metrics ---
    print("=" * 60)
    print("FULL PIPELINE (Wav2Vec2 + Rule-Based)")
    print("=" * 60)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    rec = recall_score(y_true, y_pred, pos_label=1, zero_division=0)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    # Pin the label set so the matrix is always 2x2. Without it, a run where
    # only one class shows up in both y_true and y_pred yields a 1x1 matrix
    # and the cm[0][1] / cm[1][...] lookups below raise IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    avg_confidence = float(np.mean(confidences))

    print(f" Accuracy: {acc:.3f}")
    print(f" Precision: {prec:.3f} (of predicted fake, how many actually fake)")
    print(f" Recall: {rec:.3f} (of actual fake, how many caught)")
    print(f" F1 Score: {f1:.3f}")
    print()
    print(" Confusion Matrix:")
    print(" Predicted")
    print(" Real Fake")
    print(f" Actual Real {cm[0][0]:>4} {cm[0][1]:>4}")
    print(f" Actual Fake {cm[1][0]:>4} {cm[1][1]:>4}")
    print()

    real_mask = y_true == 0
    fake_mask = y_true == 1
    print(f" Real audio correct: {np.sum((y_pred == 0) & real_mask)}/{np.sum(real_mask)}")
    print(f" Fake audio correct: {np.sum((y_pred == 1) & fake_mask)}/{np.sum(fake_mask)}")
    print(f" Avg confidence: {avg_confidence:.3f}")
    print()

    # Save results
    results = {
        "dataset": "UniDataPro/real-vs-fake-human-voice-deepfake-audio",
        "samples": {"real": int(np.sum(real_mask)), "fake": int(np.sum(fake_mask))},
        "model": "garystafford/wav2vec2-deepfake-voice-detector",
        "accuracy": round(float(acc), 4),
        "precision": round(float(prec), 4),
        "recall": round(float(rec), 4),
        "f1": round(float(f1), 4),
        "confusion_matrix": cm.tolist(),
        "avg_confidence": round(avg_confidence, 4),
        "per_clip_details": details,
    }
    output_path = ROOT / "scripts" / "benchmark_audio_results.json"
    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_path}")
if __name__ == "__main__":
    # Script entry point: read the per-class sample size from the command
    # line and hand it to the benchmark runner.
    cli = argparse.ArgumentParser(description="Benchmark audio deepfake detector")
    sample_option = {
        "type": int,
        "default": 50,
        "help": "Number of audio clips per class (default: 50)",
    }
    cli.add_argument("--sample", **sample_option)
    options = cli.parse_args()
    run_benchmark(sample_per_class=options.sample)