# AuthriX / backend / audio_detector.py
# Deepfake Authenticator
# fix: audio timeout 20s, cap chunks to 3, cap extraction to 30s — fix stuck at 80%
# fa1d723
"""
Deepfake Authenticator — Audio Analysis Agent
Detects AI-generated / synthetic voices from video audio tracks.
Pipeline:
1. AudioExtractorAgent — extracts audio from video via moviepy
2. AudioAnalysisAgent — librosa heuristics (MFCC, pitch, spectral)
3. AudioDecisionAgent — Wav2Vec2 model (Vansh180/deepfake-audio-wav2vec2)
4. AudioReportAgent — builds structured result
"""
import os
import tempfile
import logging
import numpy as np
logger = logging.getLogger(__name__)
# ─────────────────────────────────────────────
# Agent 1: Audio Extractor
# Pulls audio track from video file
# ─────────────────────────────────────────────
class AudioExtractorAgent:
    """Extract a video's audio track as a mono waveform for the detectors."""

    TARGET_SR = 16000  # Wav2Vec2 expects 16kHz
    # Only the first MAX_AUDIO_SEC seconds are extracted: enough for
    # detection, and it avoids slow extraction on long videos.
    MAX_AUDIO_SEC = 30

    def extract(self, video_path: str) -> tuple[np.ndarray | None, int]:
        """
        Extract mono 16kHz audio from the video at ``video_path``.

        At most ``MAX_AUDIO_SEC`` seconds, counted from the start of the
        video, are extracted.

        Returns:
            ``(waveform, sample_rate)`` with a 1-D float32 waveform, or
            ``(None, 0)`` when moviepy is not installed, the video has no
            audio track, or extraction fails (failures are logged, not
            raised).
        """
        try:
            from moviepy import VideoFileClip  # moviepy 2.x layout
        except ImportError:
            try:
                from moviepy.editor import VideoFileClip  # moviepy 1.x layout
            except ImportError:
                logger.warning("moviepy not installed — audio analysis skipped")
                return None, 0

        clip = None
        tmp_wav = None
        try:
            clip = VideoFileClip(video_path)
            audio_clip = clip.audio
            if audio_clip is None:
                logger.info("Video has no audio track")
                return None, 0

            duration = clip.duration
            if duration is not None and duration > self.MAX_AUDIO_SEC:
                # moviepy 2.x renamed subclip() to subclipped(); since both
                # import layouts are accepted above, accept both spellings.
                trim = getattr(audio_clip, "subclipped", None) or audio_clip.subclip
                audio_clip = trim(0, self.MAX_AUDIO_SEC)

            # mkstemp creates the file atomically; mktemp only returns a name
            # and leaves a window for another process to claim the path.
            fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
            os.close(fd)  # the writer re-opens the file by path
            audio_clip.write_audiofile(
                tmp_wav,
                fps=self.TARGET_SR,
                nbytes=2,
                codec="pcm_s16le",
                logger=None,
            )

            # Load with soundfile for a clean numpy array
            import soundfile as sf

            waveform, sr = sf.read(tmp_wav, dtype="float32")

            # Convert stereo -> mono by averaging the channels
            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)

            # Resample if the file did not come back at the target rate
            if sr != self.TARGET_SR:
                import torchaudio
                import torch

                t = torch.from_numpy(waveform).unsqueeze(0)
                resampler = torchaudio.transforms.Resample(sr, self.TARGET_SR)
                waveform = resampler(t).squeeze(0).numpy()
                sr = self.TARGET_SR

            logger.info(f"Audio extracted: {len(waveform)/sr:.1f}s @ {sr}Hz")
            return waveform, sr
        except Exception as e:
            # Best-effort stage: a broken audio track must not abort the caller.
            logger.warning(f"Audio extraction failed: {e}")
            return None, 0
        finally:
            # Release the clip on every path, including failures.
            if clip is not None:
                try:
                    clip.close()
                except Exception as e:
                    logger.debug(f"Closing video clip failed: {e}")
            if tmp_wav and os.path.exists(tmp_wav):
                try:
                    os.unlink(tmp_wav)
                except OSError as e:
                    logger.debug(f"Could not remove temp file {tmp_wav}: {e}")
# ─────────────────────────────────────────────
# Agent 2: Audio Heuristic Analyzer
# Librosa-based feature analysis
# ─────────────────────────────────────────────
class AudioAnalysisAgent:
    """
    Detects AI voice artifacts using signal processing:
    - Pitch variance (AI voices are unnaturally consistent)
    - MFCC delta variance (AI lacks natural micro-variations)
    - Spectral flatness (AI voices have unusual spectral distribution)
    - Zero-crossing rate (synthetic voices differ in ZCR patterns)
    - Silence/breath ratio (AI voices often lack natural breath sounds)

    Each heuristic yields one score in [0, 1] (higher = more suspicious);
    the final probability is the plain mean of the five scores.
    """

    # Score recorded when a heuristic cannot be computed.
    _NEUTRAL = 0.50

    # (exclusive upper bound, score) ladders. The first bound the measured
    # value falls strictly below decides the score; otherwise the fallback
    # passed to _ladder() applies.
    _PITCH_STEPS = ((8, 0.80), (15, 0.65), (25, 0.45))
    _MFCC_STEPS = ((0.5, 0.75), (1.5, 0.55), (4.0, 0.35))
    _FLATNESS_STEPS = ((0.001, 0.65), (0.005, 0.45))
    _ZCR_STEPS = ((0.02, 0.65), (0.05, 0.40))

    def analyze(self, waveform: np.ndarray, sr: int) -> dict:
        """
        Run all heuristics on ``waveform`` sampled at ``sr`` Hz.

        Returns:
            A dict with ``heuristic_fake_prob`` (mean score, rounded to 4
            places), ``features`` (the raw measurements that could be
            computed) and ``available``. When librosa is not installed the
            result is a neutral 0.5 with ``available`` set to False.
        """
        try:
            import librosa
        except ImportError:
            logger.warning("librosa not installed — heuristic audio analysis skipped")
            return {"heuristic_fake_prob": 0.5, "features": {}, "available": False}

        probes = (
            ("Pitch analysis", self._pitch_score),
            ("MFCC analysis", self._mfcc_score),
            ("Spectral flatness", self._flatness_score),
            ("ZCR analysis", self._zcr_score),
            ("Silence analysis", self._silence_score),
        )
        scores = []
        features = {}
        for label, probe in probes:
            try:
                scores.append(probe(librosa, waveform, sr, features))
            except Exception as e:
                # One failing heuristic must not sink the others; it simply
                # contributes a neutral vote.
                logger.debug(f"{label} failed: {e}")
                scores.append(self._NEUTRAL)

        heuristic_prob = float(np.mean(scores))
        logger.info(f"Audio heuristics: {features} → fake_prob={heuristic_prob:.3f}")
        return {
            "heuristic_fake_prob": round(heuristic_prob, 4),
            "features": features,
            "available": True,
        }

    @staticmethod
    def _ladder(value: float, steps, fallback: float) -> float:
        """Return the score of the first step whose bound exceeds ``value``."""
        for bound, score in steps:
            if value < bound:
                return score
        return fallback

    @classmethod
    def _pitch_score(cls, librosa, waveform, sr, features) -> float:
        """Score pitch stability: a low f0 standard deviation is suspicious."""
        f0, voiced_flag, _ = librosa.pyin(waveform, fmin=50, fmax=500, sr=sr)
        voiced_f0 = f0[voiced_flag & ~np.isnan(f0)]
        if len(voiced_f0) <= 10:
            # Too few voiced frames for a meaningful spread
            return cls._NEUTRAL
        pitch_std = float(np.std(voiced_f0))
        features["pitch_std_hz"] = round(pitch_std, 2)
        # Heuristic: real speech std is typically 20-80 Hz, AI often < 10 Hz
        return cls._ladder(pitch_std, cls._PITCH_STEPS, 0.25)

    @classmethod
    def _mfcc_score(cls, librosa, waveform, sr, features) -> float:
        """Score articulation smoothness: low MFCC-delta variance is suspicious."""
        mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        delta = librosa.feature.delta(mfcc)
        delta_var = float(np.mean(np.var(delta, axis=1)))
        features["mfcc_delta_var"] = round(delta_var, 4)
        return cls._ladder(delta_var, cls._MFCC_STEPS, 0.20)

    @classmethod
    def _flatness_score(cls, librosa, waveform, sr, features) -> float:
        """Score spectral flatness: very low (tonal) values are suspicious."""
        flatness = librosa.feature.spectral_flatness(y=waveform)
        mean_flatness = float(np.mean(flatness))
        features["spectral_flatness"] = round(mean_flatness, 4)
        return cls._ladder(mean_flatness, cls._FLATNESS_STEPS, 0.30)

    @classmethod
    def _zcr_score(cls, librosa, waveform, sr, features) -> float:
        """Score zero-crossing-rate spread: overly consistent ZCR is suspicious."""
        zcr = librosa.feature.zero_crossing_rate(waveform)
        zcr_std = float(np.std(zcr))
        features["zcr_std"] = round(zcr_std, 4)
        return cls._ladder(zcr_std, cls._ZCR_STEPS, 0.25)

    @classmethod
    def _silence_score(cls, librosa, waveform, sr, features) -> float:
        """Score the share of quiet frames (RMS < 0.01) as a pause/breath proxy."""
        rms = librosa.feature.rms(y=waveform)[0]
        silence_ratio = float(np.mean(rms < 0.01))
        features["silence_ratio"] = round(silence_ratio, 3)
        if silence_ratio < 0.05:
            return 0.60  # No natural pauses
        if silence_ratio <= 0.35:
            return 0.25  # Natural speech rhythm
        return 0.45  # Mostly silent: not very informative
# ─────────────────────────────────────────────
# Agent 3: Audio Decision Agent
# Wav2Vec2 model for AI voice detection
# ─────────────────────────────────────────────
class AudioDecisionAgent:
    """Score audio with a pretrained audio-classification model."""

    # NOTE(review): the original comment describes this as an
    # ASVspoof-trained model with bonafide/spoof labels; that cannot be
    # confirmed from this file.
    MODEL_ID = "Vansh180/deepfake-audio-wav2vec2"
    CHUNK_SEC = 10
    TARGET_SR = 16000
    # Wav2Vec2 is slow on CPU; MAX_CHUNKS * CHUNK_SEC seconds is enough.
    MAX_CHUNKS = 3

    def __init__(self):
        self.model = None
        self.processor = None
        self.fake_idx = 1  # default: label 1 = spoof/fake
        self.available = False
        self._load()

    def _load(self):
        """
        Load the feature extractor and model named by ``MODEL_ID``.

        Sets ``available`` to True on success. Any failure (missing
        ``transformers``, download error, ...) is logged and leaves the
        agent unavailable instead of raising.
        """
        try:
            from transformers import (
                AutoModelForAudioClassification,
                AutoFeatureExtractor,
            )

            logger.info(f"Loading audio model: {self.MODEL_ID}")
            self.processor = AutoFeatureExtractor.from_pretrained(self.MODEL_ID)
            self.model = AutoModelForAudioClassification.from_pretrained(self.MODEL_ID)
            self.model.eval()

            # Use the first label that names the fake/spoof class;
            # otherwise keep the default index.
            for idx, lbl in self.model.config.id2label.items():
                lbl_lower = lbl.lower()
                if any(w in lbl_lower for w in ("fake", "spoof", "synthetic", "generated")):
                    self.fake_idx = idx
                    break

            self.available = True
            logger.info(
                f"Audio model loaded — labels={self.model.config.id2label} "
                f"fake_idx={self.fake_idx}"
            )
        except Exception as e:
            logger.warning(f"Audio model unavailable: {e}")
            self.available = False

    def predict(self, waveform: np.ndarray, sr: int) -> float:
        """
        Run the model on audio chunks and return the mean fake probability.

        The waveform is cut into ``CHUNK_SEC``-second chunks; chunks of half
        a second or less are skipped and at most ``MAX_CHUNKS`` are scored.

        Returns:
            Mean probability of the fake label over the scored chunks, or a
            neutral 0.5 when the model is unavailable, the input is empty or
            has a non-positive sample rate, or every chunk fails inference.
        """
        if not self.available:
            return 0.5
        # A zero sample rate would make the chunking step zero and raise.
        if waveform is None or not sr or sr <= 0:
            return 0.5

        sr = int(sr)
        chunk_size = self.CHUNK_SEC * sr
        min_len = sr // 2
        chunks = []
        for start in range(0, len(waveform), chunk_size):
            piece = waveform[start : start + chunk_size]
            if len(piece) > min_len:
                chunks.append(piece)
                if len(chunks) == self.MAX_CHUNKS:
                    break
        if not chunks:
            return 0.5

        import torch

        fake_probs = []
        for chunk in chunks:
            try:
                # Pass the real sample rate so the feature extractor can
                # reject a mismatch instead of silently mis-scoring audio.
                inputs = self.processor(
                    chunk,
                    sampling_rate=sr,
                    return_tensors="pt",
                    padding=True,
                )
                with torch.no_grad():
                    logits = self.model(**inputs).logits
                probs = torch.softmax(logits, dim=-1)[0]
                fake_probs.append(probs[self.fake_idx].item())
            except Exception as e:
                # Best effort: a failing chunk is skipped, not fatal.
                logger.warning(f"Audio chunk inference failed: {e}")

        if not fake_probs:
            return 0.5
        result = float(np.mean(fake_probs))
        logger.info(f"Audio model: {len(fake_probs)} chunks → fake_prob={result:.3f}")
        return result
# ─────────────────────────────────────────────
# Agent 4: Audio Report Agent
# Builds structured audio result
# ─────────────────────────────────────────────
class AudioReportAgent:
    """Combine model and heuristic scores into the structured audio verdict."""

    FAKE_THRESHOLD = 0.60
    # Ensemble weights used when heuristics are available
    MODEL_WEIGHT = 0.65
    HEURISTIC_WEIGHT = 0.35
    # Audio-visual mismatch rule: visual at or above AV_VISUAL_MIN while the
    # audio model is below AV_AUDIO_MAX; the floor is visual * AV_BOOST.
    AV_VISUAL_MIN = 0.45
    AV_AUDIO_MAX = 0.55
    AV_BOOST = 0.6

    def generate(
        self,
        model_prob: float,
        heuristic: dict,
        has_audio: bool,
        visual_fake_prob: float = 0.5,
    ) -> dict:
        """
        Build the audio result dict.

        Args:
            model_prob: Fake probability from the audio model.
            heuristic: Output of the heuristic analyzer; ``None`` or ``{}``
                means heuristics are unavailable.
            has_audio: False yields the fixed ``NO_AUDIO`` result.
            visual_fake_prob: Fake probability from the visual pipeline,
                used only for the audio-visual mismatch rule.

        Returns:
            Dict with ``available``, ``result`` (``NO_AUDIO``, ``AI_VOICE``,
            ``HUMAN_VOICE`` or ``AV_MISMATCH``), ``confidence``,
            ``fake_probability`` and ``details``; when audio is present also
            ``model_score``, ``heuristic_score``, ``av_mismatch`` and
            ``features``.
        """
        if not has_audio:
            return {
                "available": False,
                "result": "NO_AUDIO",
                "confidence": 0,
                "fake_probability": 0,
                "details": ["No audio track found in video"],
            }

        heuristic = heuristic or {}  # tolerate None from callers
        heur_prob = heuristic.get("heuristic_fake_prob", 0.5)
        features = heuristic.get("features", {})

        # Ensemble: 65% model + 35% heuristics
        if heuristic.get("available", False):
            combined = model_prob * self.MODEL_WEIGHT + heur_prob * self.HEURISTIC_WEIGHT
        else:
            combined = model_prob

        # Audio-visual mismatch boost: in face-swap deepfakes the face is
        # fake but the voice is real, so "visual suspicious + audio human"
        # is treated as a deepfake signal of its own.
        # NOTE(review): the default visual_fake_prob of 0.5 satisfies this
        # rule, so callers that omit it get AV_MISMATCH for any audio the
        # model scores below 0.55. Confirm that is intended.
        av_mismatch = bool(
            visual_fake_prob >= self.AV_VISUAL_MIN and model_prob < self.AV_AUDIO_MAX
        )
        if av_mismatch:
            combined = max(combined, visual_fake_prob * self.AV_BOOST)
            logger.info(
                f"Audio-visual mismatch detected: visual_fake={visual_fake_prob:.2f} "
                f"audio_fake={model_prob:.2f} → boosted to {combined:.2f}"
            )

        combined = float(np.clip(combined, 0.0, 1.0))
        is_fake = combined >= self.FAKE_THRESHOLD
        details = self._build_details(
            combined, is_fake, features, model_prob, heur_prob, av_mismatch
        )

        if av_mismatch:
            result_label = "AV_MISMATCH"  # special label for face-swap case
        else:
            result_label = "AI_VOICE" if is_fake else "HUMAN_VOICE"

        return {
            "available": True,
            "result": result_label,
            "confidence": round(combined * 100, 1),
            "fake_probability": round(combined, 4),
            "model_score": round(model_prob * 100, 1),
            "heuristic_score": round(heur_prob * 100, 1),
            "av_mismatch": av_mismatch,
            "details": details,
            "features": features,
        }

    def _build_details(
        self,
        prob: float,
        is_fake: bool,
        features: dict,
        model_prob: float,
        heur_prob: float,
        av_mismatch: bool = False,
    ) -> list[str]:
        """
        Return the human-readable explanation lines for a verdict.

        A mismatch verdict returns its own three fixed lines. Otherwise the
        list opens with a headline chosen from ``prob``, adds one line per
        feature that supports the verdict, and closes with the model score.
        ``heur_prob`` is accepted for signature compatibility but not used.
        """
        details = []

        # Audio-visual mismatch is the most important signal
        if av_mismatch:
            details.append(
                "⚠️ Audio-visual mismatch detected — face appears manipulated but voice is human. "
                "This is the hallmark of face-swap deepfakes where original audio is preserved."
            )
            details.append(
                "Voice is authentic human speech, but does NOT match the manipulated face — "
                "consistent with dubbed deepfake video (e.g. movie scene re-faced)"
            )
            details.append(
                f"Visual deepfake confidence was high while voice model scored {(1-model_prob)*100:.1f}% human — "
                "strong indicator of face-swap rather than full synthesis"
            )
            return details

        pitch_std = features.get("pitch_std_hz")
        silence = features.get("silence_ratio")

        if is_fake:
            if prob > 0.85:
                details.append("High-confidence AI-generated voice detected")
            elif prob > 0.70:
                details.append("Strong synthetic voice characteristics identified")
            else:
                details.append("AI voice patterns detected — likely TTS or voice cloning")

            if pitch_std is not None and pitch_std < 15:
                details.append(
                    f"Unnaturally stable pitch (σ={pitch_std}Hz) — "
                    "human speech typically varies 20-80Hz"
                )

            delta_var = features.get("mfcc_delta_var")
            if delta_var is not None and delta_var < 1.5:
                details.append(
                    "Insufficient micro-variation in articulation — "
                    "characteristic of TTS synthesis"
                )

            if silence is not None and silence < 0.05:
                details.append(
                    "No natural breath pauses detected — "
                    "AI voices lack organic speech rhythm"
                )

            details.append(f"ASVspoof model confidence: {model_prob*100:.1f}% synthetic")
        else:
            if prob < 0.25:
                details.append("Strong indicators of authentic human voice")
            else:
                details.append("Voice characteristics consistent with natural human speech")

            if pitch_std is not None and pitch_std >= 20:
                details.append(f"Natural pitch variation detected (σ={pitch_std}Hz)")

            if silence is not None and 0.05 <= silence <= 0.35:
                details.append(
                    "Natural speech rhythm with organic pauses and breath sounds"
                )

            details.append(f"ASVspoof model confidence: {(1-model_prob)*100:.1f}% human")

        return details
# ─────────────────────────────────────────────
# Orchestrator
# ─────────────────────────────────────────────
class AudioAuthenticator:
    """Orchestrator that chains extraction, heuristics, model and report."""

    def __init__(self):
        self.extractor = AudioExtractorAgent()
        self.analyzer = AudioAnalysisAgent()
        self.decision = AudioDecisionAgent()
        self.reporter = AudioReportAgent()

    def analyze(self, video_path: str, visual_fake_prob: float = 0.5) -> dict:
        """
        Analyze the audio track of ``video_path`` and return the report dict.

        ``visual_fake_prob`` is forwarded to the reporter, which uses it for
        audio-visual mismatch detection. When no usable audio comes back from
        the extractor, the reporter's no-audio result is returned.
        """
        # Stage 1: pull the waveform out of the video
        samples, rate = self.extractor.extract(video_path)
        has_signal = samples is not None and len(samples) != 0
        if not has_signal:
            return self.reporter.generate(0.5, {}, has_audio=False)

        # Stage 2: signal-processing heuristics
        heuristic_result = self.analyzer.analyze(samples, rate)

        # Stage 3: model prediction
        model_score = self.decision.predict(samples, rate)

        # Stage 4: final report, with the visual score for mismatch detection
        return self.reporter.generate(
            model_score,
            heuristic_result,
            has_audio=True,
            visual_fake_prob=visual_fake_prob,
        )