Spaces:

ar07xd
/

deepshield

Runtime error

App Files Files Community

deepshield / services /audio_service.py

ar07xd

Sync from GitHub via hub-sync

36529c1 verified 25 days ago

raw

history blame contribute delete

7.52 kB

	"""Phase 17.2 — Audio Deepfake Detection.

	Extracts the audio track from a video with ffmpeg, then applies signal-processing
	heuristics (silence ratio, spectral centroid variance, RMS consistency) to produce
	an audio_authenticity_score (0–100, higher = more natural/authentic).

	AI-generated speech typically exhibits:
	- Near-zero silence between words (no natural breath pauses)
	- Very low spectral-centroid variance (monotone formant trajectory)
	- Unnaturally consistent RMS energy across voiced frames
	"""
	from __future__ import annotations

	import os
	import subprocess
	import tempfile
	from dataclasses import dataclass
	from typing import Optional

	import numpy as np
	from loguru import logger


	@dataclass
	class AudioAnalysis:
	audio_authenticity_score: float # 0–100
	has_audio: bool
	duration_s: float
	silence_ratio: float # fraction of 25ms frames below RMS threshold
	spectral_variance: float # normalised std of spectral centroid
	rms_consistency: float # 1 – normalised std of voiced-frame RMS
	notes: str = ""
	ml_analysis: dict \| None = None


	# ---------------------------------------------------------------------------
	# ffmpeg extraction
	# ---------------------------------------------------------------------------

	def _extract_audio_wav(video_path: str, out_path: str) -> bool:
	    """Run ffmpeg to write the audio of ``video_path`` to ``out_path``.

	    The output is requested as mono, 16 kHz, signed 16-bit little-endian PCM.

	    Returns:
	        True when ffmpeg exits with status 0 and ``out_path`` exists and is
	        non-empty; False otherwise, including when ffmpeg cannot be started
	        or does not finish within 60 seconds.
	    """
	    command = [
	        "ffmpeg", "-y", "-i", video_path,
	        "-vn", "-acodec", "pcm_s16le",
	        "-ar", "16000", "-ac", "1",
	        out_path,
	    ]
	    try:
	        result = subprocess.run(command, capture_output=True, timeout=60)
	        extracted = (
	            result.returncode == 0
	            and os.path.exists(out_path)
	            and os.path.getsize(out_path) != 0
	        )
	        if extracted:
	            return True

	        # Keep only the end of stderr: ffmpeg prints the actual error last.
	        stderr_tail = result.stderr.decode(errors="replace")[-400:].strip()
	        logger.warning(f"ffmpeg exited {result.returncode} — {stderr_tail or '(no stderr)'}")
	        return False
	    except (subprocess.TimeoutExpired, OSError) as exc:
	        # OSError also covers FileNotFoundError (ffmpeg binary not on PATH).
	        logger.warning(f"ffmpeg audio extraction failed: {exc}")
	        return False


	# ---------------------------------------------------------------------------
	# Signal-processing analysis
	# ---------------------------------------------------------------------------

	def _analyse_wav(wav_path: str) -> AudioAnalysis:
	    """Score the naturalness of a WAV file with signal-processing heuristics.

	    The signal is cut into 25 ms frames with a hop of half a frame, and three
	    features are derived from them:

	    - silence ratio: fraction of frames whose RMS is below a fixed threshold
	    - spectral variance: std / mean of the per-frame spectral centroid
	    - RMS consistency: 1 - (std / mean) of the RMS of non-silent frames

	    Args:
	        wav_path: Path to the WAV file. Samples are scaled as 16-bit PCM.

	    Returns:
	        An AudioAnalysis with ``has_audio=True``. When the file cannot be
	        read, is shorter than 0.1 s, or yields no frames, the score is a
	        neutral 50.0 and ``notes`` is "wav_read_failed", "too_short" or
	        "no_frames". Otherwise the score is the weighted heuristic score
	        clamped to 20–100.
	    """
	    try:
	        from scipy.io import wavfile  # scipy already in requirements
	        sr, data = wavfile.read(wav_path)
	    except Exception as exc:  # noqa: BLE001
	        logger.warning(f"WAV read failed: {exc}")
	        return AudioAnalysis(
	            audio_authenticity_score=50.0, has_audio=True,
	            duration_s=0.0, silence_ratio=0.0,
	            spectral_variance=0.0, rms_consistency=0.0,
	            notes="wav_read_failed",
	        )

	    # Multi-channel input: keep only the first channel.
	    if data.ndim > 1:
	        data = data[:, 0]

	    # Scale int16 samples into [-1.0, 1.0).
	    data = data.astype(np.float32) / (np.iinfo(np.int16).max + 1)
	    duration_s = float(len(data) / sr)

	    if duration_s < 0.1:
	        return AudioAnalysis(
	            audio_authenticity_score=50.0, has_audio=True,
	            duration_s=round(duration_s, 3), silence_ratio=1.0,
	            spectral_variance=0.0, rms_consistency=0.0,
	            notes="too_short",
	        )

	    # --- 25ms framing ---
	    frame_len = max(1, int(sr * 0.025))
	    hop_len = max(1, frame_len // 2)
	    # "+ 1" makes the last valid start (len(data) - frame_len) inclusive, so
	    # the final complete frame is not dropped.
	    frames = [
	        data[i: i + frame_len]
	        for i in range(0, len(data) - frame_len + 1, hop_len)
	    ]
	    if not frames:
	        # Defensive: only possible when the signal is shorter than one frame.
	        return AudioAnalysis(
	            audio_authenticity_score=50.0, has_audio=True,
	            duration_s=round(duration_s, 3), silence_ratio=1.0,
	            spectral_variance=0.0, rms_consistency=0.0,
	            notes="no_frames",
	        )

	    rms_vals = np.array([np.sqrt(np.mean(f ** 2)) for f in frames])

	    # Silence ratio
	    SILENCE_THRESH = 0.01
	    silence_ratio = float(np.mean(rms_vals < SILENCE_THRESH))

	    # Spectral centroid variance
	    freqs = np.fft.rfftfreq(frame_len, d=1.0 / sr)
	    centroids: list[float] = []
	    for frame in frames:
	        spec = np.abs(np.fft.rfft(frame))
	        total = float(np.sum(spec))
	        if total < 1e-9:
	            # Frame with (near-)zero spectral energy: centroid is undefined.
	            continue
	        centroids.append(float(np.dot(freqs, spec) / total))

	    # Coefficient of variation; the epsilon guards against a zero mean.
	    spec_var = (
	        float(np.std(centroids) / (np.mean(centroids) + 1e-6))
	        if centroids else 0.0
	    )

	    # RMS consistency on voiced frames
	    voiced = rms_vals[rms_vals >= SILENCE_THRESH]
	    if len(voiced) > 0:
	        rms_consistency = float(
	            1.0 - min(1.0, np.std(voiced) / (np.mean(voiced) + 1e-6))
	        )
	    else:
	        # No voiced frames at all: neutral value.
	        rms_consistency = 0.5

	    # --- Heuristic scoring ---
	    # Silence score: natural speech has moderate pauses (0.1–0.6).
	    # < 0.05 → no pauses (suspicious); > 0.85 → near-silent (unclear).
	    if silence_ratio < 0.05:
	        silence_score = 55.0
	    elif silence_ratio > 0.85:
	        silence_score = 50.0
	    else:
	        silence_score = 100.0

	    # Spectral variance score: natural formant motion gives spec_var > 0.25.
	    spec_score = min(100.0, spec_var * 250.0)

	    # RMS consistency: > 0.92 = unnaturally even (TTS/vocoder artifact).
	    rms_score = 55.0 if rms_consistency > 0.92 else 100.0

	    # Weighted blend of the three sub-scores, then clamped to 20–100.
	    audio_score = float(
	        0.30 * silence_score + 0.50 * spec_score + 0.20 * rms_score
	    )
	    audio_score = max(20.0, min(100.0, audio_score))

	    logger.info(
	        f"Audio: dur={duration_s:.1f}s silence={silence_ratio:.2f} "
	        f"spec_var={spec_var:.4f} rms_cons={rms_consistency:.4f} "
	        f"→ audio_score={audio_score:.1f}"
	    )

	    return AudioAnalysis(
	        audio_authenticity_score=round(audio_score, 2),
	        has_audio=True,
	        duration_s=round(duration_s, 2),
	        silence_ratio=round(silence_ratio, 4),
	        spectral_variance=round(spec_var, 4),
	        rms_consistency=round(rms_consistency, 4),
	    )


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------

	def analyze_audio(video_path: str) -> Optional[AudioAnalysis]:
	    """Extract and analyse the audio track from video_path.

	    The audio is extracted to a temporary WAV file, scored with the
	    signal-processing heuristics, and then blended 50/50 with the fake
	    probability reported by ``services.audio_ml_service.analyze_audio_ml``.
	    If the ML step cannot be imported, raises, or returns no usable
	    ``"fake_probability"``, the heuristic-only analysis is returned instead
	    of discarding the whole result.

	    Returns:
	        An AudioAnalysis dataclass, or None if no audio track is present,
	        ffmpeg is unavailable, or an unexpected error occurs.
	    """
	    tmp_wav: Optional[str] = None
	    try:
	        # delete=False: only the unique path is needed; ffmpeg writes the
	        # file itself and the finally block below removes it.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fh:
	            tmp_wav = fh.name

	        if not _extract_audio_wav(video_path, tmp_wav):
	            logger.info("No audio track found or ffmpeg unavailable — skipping audio analysis")
	            return None

	        analysis = _analyse_wav(tmp_wav)

	        # The ML detector is treated as optional: a failure here must not
	        # throw away the heuristic analysis that already succeeded.
	        try:
	            from services.audio_ml_service import analyze_audio_ml
	            ml_score = analyze_audio_ml(tmp_wav)
	            ml_fake_prob = float(ml_score["fake_probability"])
	            if not np.isfinite(ml_fake_prob):
	                raise ValueError(f"non-finite fake_probability: {ml_fake_prob}")
	        except Exception as exc:  # noqa: BLE001
	            logger.warning(f"Audio ML analysis unavailable, using heuristics only: {exc}")
	            return analysis

	        analysis.ml_analysis = ml_score

	        # Clamp so the blended score cannot leave the 0–100 range.
	        ml_fake_prob = min(1.0, max(0.0, ml_fake_prob))
	        heuristics_prob = 1.0 - (analysis.audio_authenticity_score / 100.0)
	        final_prob = 0.5 * heuristics_prob + 0.5 * ml_fake_prob
	        analysis.audio_authenticity_score = round((1.0 - final_prob) * 100.0, 2)

	        return analysis

	    except Exception as exc:  # noqa: BLE001
	        logger.warning(f"Audio analysis error: {exc}")
	        return None

	    finally:
	        # Best-effort cleanup of the temporary WAV; a missing file is fine.
	        if tmp_wav:
	            try:
	                os.unlink(tmp_wav)
	            except OSError:
	                pass