Spaces:

Aarav13
/

AuthriX

Sleeping

AuthriX / backend /audio_detector.py

Deepfake Authenticator

fix: audio timeout 20s, cap chunks to 3, cap extraction to 30s — fix stuck at 80%

fa1d723 23 days ago

19.8 kB

	"""
	Deepfake Authenticator — Audio Analysis Agent
	Detects AI-generated / synthetic voices from video audio tracks.

	Pipeline:
	1. AudioExtractorAgent — extracts audio from video via moviepy
	2. AudioAnalysisAgent — librosa heuristics (MFCC, pitch, spectral)
	3. AudioDecisionAgent — Wav2Vec2 model (Vansh180/deepfake-audio-wav2vec2)
	4. AudioReportAgent — builds structured result
	"""

	import os
	import tempfile
	import logging
	import numpy as np

	logger = logging.getLogger(__name__)

	# ─────────────────────────────────────────────
	# Agent 1: Audio Extractor
	# Pulls audio track from video file
	# ─────────────────────────────────────────────
	class AudioExtractorAgent:
	TARGET_SR = 16000 # Wav2Vec2 expects 16kHz

	def extract(self, video_path: str) -> tuple[np.ndarray \| None, int]:
	"""
	Extract mono 16kHz audio from video.
	Returns (waveform_array, sample_rate) or (None, 0) if no audio.
	"""
	try:
	from moviepy import VideoFileClip
	except ImportError:
	try:
	from moviepy.editor import VideoFileClip
	except ImportError:
	logger.warning("moviepy not installed — audio analysis skipped")
	return None, 0

	tmp_wav = None
	try:
	clip = VideoFileClip(video_path)
	if clip.audio is None:
	logger.info("Video has no audio track")
	clip.close()
	return None, 0

	# Cap at 30s — enough for detection, avoids slow extraction on long videos
	MAX_AUDIO_SEC = 30
	audio_clip = clip.audio
	if clip.duration > MAX_AUDIO_SEC:
	audio_clip = clip.audio.subclipped(0, MAX_AUDIO_SEC)

	# Write to temp WAV
	tmp_wav = tempfile.mktemp(suffix=".wav")
	audio_clip.write_audiofile(
	tmp_wav,
	fps=self.TARGET_SR,
	nbytes=2,
	codec="pcm_s16le",
	logger=None,
	)
	clip.close()

	# Load with soundfile for clean numpy array
	import soundfile as sf
	waveform, sr = sf.read(tmp_wav, dtype="float32")

	# Convert stereo → mono
	if waveform.ndim > 1:
	waveform = waveform.mean(axis=1)

	# Resample if needed
	if sr != self.TARGET_SR:
	import torchaudio
	import torch
	t = torch.from_numpy(waveform).unsqueeze(0)
	resampler = torchaudio.transforms.Resample(sr, self.TARGET_SR)
	waveform = resampler(t).squeeze(0).numpy()
	sr = self.TARGET_SR

	logger.info(f"Audio extracted: {len(waveform)/sr:.1f}s @ {sr}Hz")
	return waveform, sr

	except Exception as e:
	logger.warning(f"Audio extraction failed: {e}")
	return None, 0
	finally:
	if tmp_wav and os.path.exists(tmp_wav):
	os.unlink(tmp_wav)


	# ─────────────────────────────────────────────
	# Agent 2: Audio Heuristic Analyzer
	# Librosa-based feature analysis
	# ─────────────────────────────────────────────
	class AudioAnalysisAgent:
	    """
	    Detects AI voice artifacts using signal processing:
	    - Pitch variance (AI voices are unnaturally consistent)
	    - MFCC delta variance (AI lacks natural micro-variations)
	    - Spectral flatness (AI voices have unusual spectral distribution)
	    - Zero-crossing rate (synthetic voices differ in ZCR patterns)
	    - Silence/breath ratio (AI voices often lack natural breath sounds)

	    Each feature yields a fake-probability score; a feature that fails or
	    produces a non-finite value contributes the neutral score instead.
	    """

	    NEUTRAL_SCORE = 0.50          # used when a feature cannot be measured
	    MIN_VOICED_FRAMES = 10        # pitch statistics need more frames than this
	    SILENCE_RMS_THRESHOLD = 0.01  # RMS below this counts as a silent frame

	    def analyze(self, waveform: np.ndarray, sr: int) -> dict:
	        """Score ``waveform`` with the five librosa heuristics.

	        Args:
	            waveform: Audio samples passed straight to librosa.
	            sr: Sample rate of ``waveform`` in Hz.

	        Returns:
	            Dict with ``heuristic_fake_prob`` (mean of the per-feature scores,
	            rounded to 4 places), ``features`` (the measured values) and
	            ``available`` (False only when librosa cannot be imported, in
	            which case the probability is the neutral 0.5).
	        """
	        try:
	            import librosa
	        except ImportError:
	            logger.warning("librosa not installed — heuristic audio analysis skipped")
	            return {"heuristic_fake_prob": 0.5, "features": {}, "available": False}

	        # (label used in the failure log, measurement helper)
	        checks = (
	            ("Pitch analysis", self._pitch_check),
	            ("MFCC analysis", self._mfcc_check),
	            ("Spectral flatness", self._flatness_check),
	            ("ZCR analysis", self._zcr_check),
	            ("Silence analysis", self._silence_check),
	        )

	        scores = []
	        features = {}
	        for label, check in checks:
	            try:
	                score, measured = check(librosa, waveform, sr)
	            except Exception as e:
	                # Best-effort: one broken feature must not sink the others.
	                logger.debug(f"{label} failed: {e}")
	                score, measured = self.NEUTRAL_SCORE, {}
	            scores.append(score)
	            features.update(measured)

	        heuristic_prob = float(np.mean(scores)) if scores else 0.5
	        logger.info(f"Audio heuristics: {features} → fake_prob={heuristic_prob:.3f}")

	        return {
	            "heuristic_fake_prob": round(heuristic_prob, 4),
	            "features": features,
	            "available": True,
	        }

	    @staticmethod
	    def _require_finite(value: float) -> float:
	        """Return ``value``, raising ValueError if it is NaN or infinite."""
	        if not np.isfinite(value):
	            raise ValueError(f"non-finite measurement: {value}")
	        return value

	    @classmethod
	    def _bucket(cls, value: float, steps, fallback: float) -> float:
	        """Return the score of the first ``(upper, score)`` step with value < upper.

	        Falls back to ``fallback`` when no step matches. Non-finite values are
	        rejected, because NaN fails every comparison and would otherwise be
	        scored as the least suspicious bucket.
	        """
	        cls._require_finite(value)
	        for upper, score in steps:
	            if value < upper:
	                return score
	        return fallback

	    def _pitch_check(self, librosa, waveform, sr):
	        """Score pitch stability: AI voices have unnaturally stable pitch."""
	        f0, voiced_flag, _ = librosa.pyin(waveform, fmin=50, fmax=500, sr=sr)
	        voiced_f0 = f0[voiced_flag & ~np.isnan(f0)]
	        if len(voiced_f0) <= self.MIN_VOICED_FRAMES:
	            return self.NEUTRAL_SCORE, {}
	        pitch_std = float(np.std(voiced_f0))
	        # Real human speech: std typically 20-80 Hz
	        # AI voices: often < 10 Hz (too stable)
	        score = self._bucket(pitch_std, ((8, 0.80), (15, 0.65), (25, 0.45)), 0.25)
	        return score, {"pitch_std_hz": round(pitch_std, 2)}

	    def _mfcc_check(self, librosa, waveform, sr):
	        """Score MFCC delta variance: low variance means overly smooth transitions."""
	        mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
	        delta = librosa.feature.delta(mfcc)
	        delta_var = float(np.mean(np.var(delta, axis=1)))
	        score = self._bucket(delta_var, ((0.5, 0.75), (1.5, 0.55), (4.0, 0.35)), 0.20)
	        return score, {"mfcc_delta_var": round(delta_var, 4)}

	    def _flatness_check(self, librosa, waveform, sr):
	        """Score spectral flatness: very low flatness = tonal (could be AI)."""
	        flatness = librosa.feature.spectral_flatness(y=waveform)
	        mean_flatness = float(np.mean(flatness))
	        score = self._bucket(mean_flatness, ((0.001, 0.65), (0.005, 0.45)), 0.30)
	        return score, {"spectral_flatness": round(mean_flatness, 4)}

	    def _zcr_check(self, librosa, waveform, sr):
	        """Score zero-crossing-rate spread: too consistent is suspicious."""
	        zcr = librosa.feature.zero_crossing_rate(waveform)
	        zcr_std = float(np.std(zcr))
	        score = self._bucket(zcr_std, ((0.02, 0.65), (0.05, 0.40)), 0.25)
	        return score, {"zcr_std": round(zcr_std, 4)}

	    def _silence_check(self, librosa, waveform, sr):
	        """Score the share of silent frames: real speech has natural pauses."""
	        rms = librosa.feature.rms(y=waveform)[0]
	        silence_ratio = self._require_finite(
	            float(np.mean(rms < self.SILENCE_RMS_THRESHOLD))
	        )
	        # Very low silence ratio = no natural pauses (suspicious)
	        # Very high = mostly silent (not useful)
	        if silence_ratio < 0.05:
	            score = 0.60  # No natural pauses
	        elif silence_ratio <= 0.35:
	            score = 0.25  # Natural speech rhythm
	        else:
	            score = 0.45
	        return score, {"silence_ratio": round(silence_ratio, 3)}


	# ─────────────────────────────────────────────
	# Agent 3: Audio Decision Agent
	# Wav2Vec2 model for AI voice detection
	# ─────────────────────────────────────────────
	class AudioDecisionAgent:
	    """Wav2Vec2 audio classifier that scores how likely a voice is synthetic.

	    The model is loaded once at construction; if loading fails the agent
	    stays usable but ``predict`` returns the neutral probability 0.5.
	    """

	    # Primary: ASVspoof-trained model with bonafide/spoof labels
	    MODEL_ID = "Vansh180/deepfake-audio-wav2vec2"
	    CHUNK_SEC = 10
	    TARGET_SR = 16000
	    # Cap at 3 chunks max — Wav2Vec2 is slow on CPU, 30s of audio is enough
	    MAX_CHUNKS = 3
	    # Substrings that identify the synthetic / genuine class in a label name
	    FAKE_LABEL_WORDS = ("fake", "spoof", "synthetic", "generated")
	    REAL_LABEL_WORDS = ("real", "bonafide", "bona fide", "bona-fide", "genuine", "human")

	    def __init__(self):
	        self.model = None
	        self.processor = None
	        self.fake_idx = 1  # default: label 1 = spoof/fake
	        self.available = False
	        self._load()

	    def _load(self):
	        """Load processor and model; on any failure mark the agent unavailable."""
	        try:
	            from transformers import (
	                AutoModelForAudioClassification,
	                AutoFeatureExtractor,
	            )
	            logger.info(f"Loading audio model: {self.MODEL_ID}")
	            self.processor = AutoFeatureExtractor.from_pretrained(self.MODEL_ID)
	            self.model = AutoModelForAudioClassification.from_pretrained(self.MODEL_ID)
	            self.model.eval()

	            # Find fake/spoof label index
	            self.fake_idx = self._resolve_fake_index(
	                self.model.config.id2label, default=self.fake_idx
	            )

	            self.available = True
	            logger.info(
	                f"Audio model loaded — labels={self.model.config.id2label} "
	                f"fake_idx={self.fake_idx}"
	            )
	        except Exception as e:
	            logger.warning(f"Audio model unavailable: {e}")
	            self.available = False

	    @classmethod
	    def _resolve_fake_index(cls, id2label, default: int = 1) -> int:
	        """Pick the class index that represents synthetic audio.

	        Prefers a label containing one of ``FAKE_LABEL_WORDS``. If none does
	        and the model is binary, a label containing one of
	        ``REAL_LABEL_WORDS`` identifies the genuine class, so the other index
	        is returned. Otherwise ``default`` is returned.
	        """
	        names = {int(idx): str(lbl).lower() for idx, lbl in id2label.items()}

	        for idx, name in names.items():
	            if any(word in name for word in cls.FAKE_LABEL_WORDS):
	                return idx

	        if len(names) == 2:
	            for idx, name in names.items():
	                if any(word in name for word in cls.REAL_LABEL_WORDS):
	                    return next(other for other in names if other != idx)

	        return default

	    def predict(self, waveform: np.ndarray, sr: int) -> float:
	        """Run model on audio chunks, return mean fake probability.

	        Returns the neutral 0.5 when the model is unavailable, when ``sr`` is
	        not ``TARGET_SR``, when no chunk is longer than half a second, or
	        when inference fails on every chunk.
	        """
	        if not self.available:
	            return 0.5

	        if sr != self.TARGET_SR:
	            # The processor is configured for TARGET_SR; feeding audio at a
	            # different rate would yield a confident but meaningless score.
	            logger.warning(
	                f"Audio model expects {self.TARGET_SR}Hz input but got {sr}Hz — "
	                "model inference skipped"
	            )
	            return 0.5

	        chunk_size = self.CHUNK_SEC * sr
	        min_len = sr // 2  # skip fragments of half a second or less
	        chunks = []
	        for start in range(0, len(waveform), chunk_size):
	            piece = waveform[start : start + chunk_size]
	            if len(piece) > min_len:
	                chunks.append(piece)
	                if len(chunks) == self.MAX_CHUNKS:
	                    break

	        if not chunks:
	            return 0.5

	        import torch

	        fake_probs = []
	        for chunk in chunks:
	            try:
	                inputs = self.processor(
	                    chunk,
	                    sampling_rate=self.TARGET_SR,
	                    return_tensors="pt",
	                    padding=True,
	                )
	                with torch.no_grad():
	                    logits = self.model(**inputs).logits
	                probs = torch.softmax(logits, dim=-1)[0]
	                fake_probs.append(probs[self.fake_idx].item())
	            except Exception as e:
	                # Best-effort: a bad chunk is dropped, the rest still count.
	                logger.warning(f"Audio chunk inference failed: {e}")

	        if not fake_probs:
	            return 0.5

	        result = float(np.mean(fake_probs))
	        logger.info(f"Audio model: {len(fake_probs)} chunks → fake_prob={result:.3f}")
	        return result


	# ─────────────────────────────────────────────
	# Agent 4: Audio Report Agent
	# Builds structured audio result
	# ─────────────────────────────────────────────
	class AudioReportAgent:
	FAKE_THRESHOLD = 0.60

	def generate(
	self,
	model_prob: float,
	heuristic: dict,
	has_audio: bool,
	visual_fake_prob: float = 0.5,
	) -> dict:
	if not has_audio:
	return {
	"available": False,
	"result": "NO_AUDIO",
	"confidence": 0,
	"fake_probability": 0,
	"details": ["No audio track found in video"],
	}

	heur_prob = heuristic.get("heuristic_fake_prob", 0.5)
	features = heuristic.get("features", {})

	# Ensemble: 65% model + 35% heuristics
	if heuristic.get("available", False):
	combined = model_prob * 0.65 + heur_prob * 0.35
	else:
	combined = model_prob

	# ── Audio-Visual Mismatch Boost ───────────────────────────────
	# Key insight: in face-swap deepfakes, the FACE is fake but the
	# VOICE is real (dubbed from original footage). This mismatch
	# is itself a strong deepfake signal.
	# If visual says FAKE (high prob) but audio says HUMAN → mismatch
	av_mismatch = False
	av_mismatch_score = 0.0
	if visual_fake_prob >= 0.45 and model_prob < 0.55:
	# Visual shows manipulation signs, audio sounds human → face-swap
	av_mismatch = True
	av_mismatch_score = visual_fake_prob * 0.6
	combined = max(combined, av_mismatch_score)
	logger.info(
	f"Audio-visual mismatch detected: visual_fake={visual_fake_prob:.2f} "
	f"audio_fake={model_prob:.2f} → boosted to {combined:.2f}"
	)

	combined = float(np.clip(combined, 0.0, 1.0))
	is_fake = combined >= self.FAKE_THRESHOLD
	confidence = round(combined * 100, 1)

	details = self._build_details(
	combined, is_fake, features, model_prob, heur_prob, av_mismatch
	)

	result_label = "AI_VOICE" if is_fake else "HUMAN_VOICE"
	if av_mismatch:
	result_label = "AV_MISMATCH" # special label for face-swap case

	return {
	"available": True,
	"result": result_label,
	"confidence": confidence,
	"fake_probability": round(combined, 4),
	"model_score": round(model_prob * 100, 1),
	"heuristic_score": round(heur_prob * 100, 1),
	"av_mismatch": av_mismatch,
	"details": details,
	"features": features,
	}

	def _build_details(
	self,
	prob: float,
	is_fake: bool,
	features: dict,
	model_prob: float,
	heur_prob: float,
	av_mismatch: bool = False,
	) -> list[str]:
	details = []

	# Audio-visual mismatch is the most important signal
	if av_mismatch:
	details.append(
	"⚠️ Audio-visual mismatch detected — face appears manipulated but voice is human. "
	"This is the hallmark of face-swap deepfakes where original audio is preserved."
	)
	details.append(
	"Voice is authentic human speech, but does NOT match the manipulated face — "
	"consistent with dubbed deepfake video (e.g. movie scene re-faced)"
	)
	details.append(
	f"Visual deepfake confidence was high while voice model scored {(1-model_prob)*100:.1f}% human — "
	"strong indicator of face-swap rather than full synthesis"
	)
	return details

	if is_fake:
	if prob > 0.85:
	details.append("High-confidence AI-generated voice detected")
	elif prob > 0.70:
	details.append("Strong synthetic voice characteristics identified")
	else:
	details.append("AI voice patterns detected — likely TTS or voice cloning")

	pitch_std = features.get("pitch_std_hz")
	if pitch_std is not None and pitch_std < 15:
	details.append(
	f"Unnaturally stable pitch (σ={pitch_std}Hz) — "
	"human speech typically varies 20-80Hz"
	)

	delta_var = features.get("mfcc_delta_var")
	if delta_var is not None and delta_var < 1.5:
	details.append(
	"Insufficient micro-variation in articulation — "
	"characteristic of TTS synthesis"
	)

	silence = features.get("silence_ratio")
	if silence is not None and silence < 0.05:
	details.append(
	"No natural breath pauses detected — "
	"AI voices lack organic speech rhythm"
	)

	details.append(f"ASVspoof model confidence: {model_prob*100:.1f}% synthetic")
	else:
	if prob < 0.25:
	details.append("Strong indicators of authentic human voice")
	else:
	details.append("Voice characteristics consistent with natural human speech")

	pitch_std = features.get("pitch_std_hz")
	if pitch_std is not None and pitch_std >= 20:
	details.append(f"Natural pitch variation detected (σ={pitch_std}Hz)")

	silence = features.get("silence_ratio")
	if silence is not None and 0.05 <= silence <= 0.35:
	details.append(
	"Natural speech rhythm with organic pauses and breath sounds"
	)

	details.append(f"ASVspoof model confidence: {(1-model_prob)*100:.1f}% human")

	return details


	# ─────────────────────────────────────────────
	# Orchestrator
	# ─────────────────────────────────────────────
	class AudioAuthenticator:
	    """Orchestrates the audio pipeline: extract → heuristics → model → report."""

	    def __init__(self):
	        self.extractor = AudioExtractorAgent()
	        self.analyzer = AudioAnalysisAgent()
	        self.decision = AudioDecisionAgent()
	        self.reporter = AudioReportAgent()

	    def analyze(self, video_path: str, visual_fake_prob: float | None = None) -> dict:
	        """Analyze the audio track of ``video_path`` and return the report dict.

	        Args:
	            video_path: Path of the video whose audio is analyzed.
	            visual_fake_prob: Fake probability from the visual pipeline, used
	                for audio-visual mismatch detection. ``None`` means no visual
	                evidence is available.

	        Returns:
	            The dict produced by the report agent; a ``NO_AUDIO`` result when
	            no waveform could be extracted.
	        """
	        # Step 1: Extract audio
	        waveform, sr = self.extractor.extract(video_path)

	        if waveform is None or len(waveform) == 0:
	            return self.reporter.generate(0.5, {}, has_audio=False)

	        # Step 2: Heuristic analysis
	        heuristic = self.analyzer.analyze(waveform, sr)

	        # Step 3: Model prediction
	        model_prob = self.decision.predict(waveform, sr)

	        # Without visual evidence, pass 0.0 rather than a neutral 0.5: the
	        # reporter's mismatch check fires from 0.45 upward, so a placeholder
	        # 0.5 would label every human-sounding voice as a face-swap.
	        visual = 0.0 if visual_fake_prob is None else visual_fake_prob

	        # Step 4: Report (pass visual prob for mismatch detection)
	        return self.reporter.generate(
	            model_prob, heuristic, has_audio=True,
	            visual_fake_prob=visual,
	        )