Spaces:
Sleeping
Sleeping
| """Baseline benchmark for audio deepfake detector against labeled dataset. | |
| Downloads a sample from UniDataPro/real-vs-fake-human-voice-deepfake-audio, | |
| runs the full detection pipeline, and reports standard ML metrics. | |
| Usage: | |
| source venv/bin/activate | |
| python scripts/benchmark_audio.py [--sample N] | |
| """ | |
| import argparse | |
| import io | |
| import json | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import numpy as np | |
| import soundfile as sf | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| confusion_matrix, | |
| f1_score, | |
| precision_score, | |
| recall_score, | |
| ) | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT)) | |
| from app.services.audio_detector import AudioDetector | |
| def download_dataset(sample_per_class: int = 50) -> list[tuple[bytes, int, str]]: | |
| """Download audio deepfake dataset and return (audio_bytes, label, source) tuples. | |
| Dataset: UniDataPro/real-vs-fake-human-voice-deepfake-audio | |
| Labels: 0 = real, 1 = fake. | |
| """ | |
| from datasets import load_dataset | |
| print("Downloading audio dataset (UniDataPro/real-vs-fake-human-voice-deepfake-audio)...") | |
| dataset = load_dataset( | |
| "UniDataPro/real-vs-fake-human-voice-deepfake-audio", | |
| split="train", | |
| streaming=True, | |
| ).shuffle(seed=42) | |
| real_samples = [] | |
| fake_samples = [] | |
| for row in dataset: | |
| arr = row["audio"]["array"] | |
| sr = row["audio"]["sampling_rate"] | |
| label = row["label"] | |
| # Convert numpy array to WAV bytes | |
| buf = io.BytesIO() | |
| sf.write(buf, arr, sr, format='WAV') | |
| wav_bytes = buf.getvalue() | |
| if label == 0 and len(real_samples) < sample_per_class: | |
| real_samples.append((wav_bytes, 0, f"real_{len(real_samples):04d}")) | |
| elif label == 1 and len(fake_samples) < sample_per_class: | |
| fake_samples.append((wav_bytes, 1, f"fake_{len(fake_samples):04d}")) | |
| if len(real_samples) >= sample_per_class and len(fake_samples) >= sample_per_class: | |
| break | |
| print(f" Real audio: {len(real_samples)}, Fake audio: {len(fake_samples)}") | |
| return real_samples + fake_samples | |
| def run_benchmark(sample_per_class: int = 50): | |
| """Run the full benchmark and print results.""" | |
| print("=" * 60) | |
| print("DeepFakeGuard — Audio Detector Baseline Benchmark") | |
| print("=" * 60) | |
| print() | |
| samples = download_dataset(sample_per_class) | |
| print() | |
| print("Loading audio detection model...") | |
| detector = AudioDetector() | |
| detector.load_model() | |
| if not detector.is_loaded: | |
| print("ERROR: Audio model not loaded. Cannot run benchmark.") | |
| sys.exit(1) | |
| print(f" Wav2Vec2 model: loaded") | |
| print() | |
| print(f"Running detection on {len(samples)} audio clips...") | |
| y_true = [] | |
| y_pred = [] | |
| confidences = [] | |
| details = [] | |
| start = time.time() | |
| for i, (audio_bytes, label, source) in enumerate(samples): | |
| result = detector.detect(audio_bytes, filename=f"{source}.wav") | |
| pred_label = 1 if result["verdict"] == "synthetic" else 0 | |
| y_true.append(label) | |
| y_pred.append(pred_label) | |
| confidences.append(result["confidence"]) | |
| details.append({ | |
| "source": source, | |
| "label": "real" if label == 0 else "fake", | |
| "predicted": result["verdict"], | |
| "confidence": result["confidence"], | |
| "severity": result["severity"], | |
| "correct": pred_label == label, | |
| "duration": result["analysis"].get("duration_seconds", 0), | |
| }) | |
| if (i + 1) % 10 == 0: | |
| print(f" Processed {i + 1}/{len(samples)}") | |
| elapsed = time.time() - start | |
| print(f" Done in {elapsed:.1f}s ({elapsed / len(samples):.2f}s/clip)") | |
| print() | |
| y_true = np.array(y_true) | |
| y_pred = np.array(y_pred) | |
| # --- Metrics --- | |
| print("=" * 60) | |
| print("FULL PIPELINE (Wav2Vec2 + Rule-Based)") | |
| print("=" * 60) | |
| acc = accuracy_score(y_true, y_pred) | |
| prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0) | |
| rec = recall_score(y_true, y_pred, pos_label=1, zero_division=0) | |
| f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0) | |
| cm = confusion_matrix(y_true, y_pred) | |
| print(f" Accuracy: {acc:.3f}") | |
| print(f" Precision: {prec:.3f} (of predicted fake, how many actually fake)") | |
| print(f" Recall: {rec:.3f} (of actual fake, how many caught)") | |
| print(f" F1 Score: {f1:.3f}") | |
| print() | |
| print(f" Confusion Matrix:") | |
| print(f" Predicted") | |
| print(f" Real Fake") | |
| print(f" Actual Real {cm[0][0]:>4} {cm[0][1]:>4}") | |
| print(f" Actual Fake {cm[1][0]:>4} {cm[1][1]:>4}") | |
| print() | |
| real_mask = y_true == 0 | |
| fake_mask = y_true == 1 | |
| print(f" Real audio correct: {np.sum((y_pred == 0) & real_mask)}/{np.sum(real_mask)}") | |
| print(f" Fake audio correct: {np.sum((y_pred == 1) & fake_mask)}/{np.sum(fake_mask)}") | |
| print(f" Avg confidence: {np.mean(confidences):.3f}") | |
| print() | |
| # Save results | |
| results = { | |
| "dataset": "UniDataPro/real-vs-fake-human-voice-deepfake-audio", | |
| "samples": {"real": int(np.sum(real_mask)), "fake": int(np.sum(fake_mask))}, | |
| "model": "garystafford/wav2vec2-deepfake-voice-detector", | |
| "accuracy": round(float(acc), 4), | |
| "precision": round(float(prec), 4), | |
| "recall": round(float(rec), 4), | |
| "f1": round(float(f1), 4), | |
| "confusion_matrix": cm.tolist(), | |
| "avg_confidence": round(float(np.mean(confidences)), 4), | |
| "per_clip_details": details, | |
| } | |
| output_path = ROOT / "scripts" / "benchmark_audio_results.json" | |
| with open(output_path, "w") as f: | |
| json.dump(results, f, indent=2) | |
| print(f"Results saved to {output_path}") | |
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Benchmark audio deepfake detector") | |
| parser.add_argument( | |
| "--sample", | |
| type=int, | |
| default=50, | |
| help="Number of audio clips per class (default: 50)", | |
| ) | |
| args = parser.parse_args() | |
| run_benchmark(sample_per_class=args.sample) | |