deepshield / services /audio_service.py
ar07xd's picture
Sync from GitHub via hub-sync
36529c1 verified
"""Phase 17.2 — Audio Deepfake Detection.
Extracts the audio track from a video with ffmpeg, then applies signal-processing
heuristics (silence ratio, spectral centroid variance, RMS consistency) to produce
an audio_authenticity_score (0–100, higher = more natural/authentic).
AI-generated speech typically exhibits:
- Near-zero silence between words (no natural breath pauses)
- Very low spectral-centroid variance (monotone formant trajectory)
- Unnaturally consistent RMS energy across voiced frames
"""
from __future__ import annotations
import os
import subprocess
import tempfile
from dataclasses import dataclass
from typing import Optional
import numpy as np
from loguru import logger
@dataclass
class AudioAnalysis:
    """Result record for the audio-track analysis of one media file.

    Instances are built by the heuristic analyser in this module; the public
    entry point may afterwards attach an ML result to ``ml_analysis`` and
    overwrite ``audio_authenticity_score`` with a blended value.
    """

    audio_authenticity_score: float  # 0–100, higher = more natural/authentic
    has_audio: bool  # every constructor call in this module passes True
    duration_s: float  # length of the decoded audio in seconds (samples / sample rate)
    silence_ratio: float  # fraction of 25ms frames below RMS threshold
    spectral_variance: float  # std of per-frame spectral centroid divided by its mean
    rms_consistency: float  # 1 – normalised std of voiced-frame RMS (kept within 0–1)
    notes: str = ""  # short reason tag on early-exit paths, e.g. "too_short"; empty otherwise
    ml_analysis: dict | None = None  # result object from the ML analyser, when one was attached
# ---------------------------------------------------------------------------
# ffmpeg extraction
# ---------------------------------------------------------------------------
def _extract_audio_wav(video_path: str, out_path: str) -> bool:
    """Extract mono 16 kHz 16-bit PCM WAV from *video_path* into *out_path* via ffmpeg.

    Args:
        video_path: Path of the input media file, passed to ``ffmpeg -i``.
        out_path: Destination WAV path; an existing file is overwritten (``-y``).

    Returns:
        True when ffmpeg exits with status 0 and a non-empty file exists at
        *out_path*. False otherwise, including when the ffmpeg binary is
        missing, the 60 s timeout expires, or the process cannot be started.
    """
    cmd = [
        # -nostdin: never let ffmpeg read interactive commands from our stdin.
        "ffmpeg", "-nostdin", "-y", "-i", video_path,
        "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1",
        out_path,
    ]
    try:
        result = subprocess.run(
            cmd,
            # Detach stdin so ffmpeg cannot block on, or consume, the parent's input.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            timeout=60,
        )
        produced_output = os.path.exists(out_path) and os.path.getsize(out_path) > 0
        if result.returncode != 0 or not produced_output:
            # Only the tail of stderr is logged: ffmpeg prints a long banner first
            # and the actual error is at the end.
            stderr_tail = result.stderr.decode(errors="replace")[-400:].strip()
            logger.warning(
                f"ffmpeg exited {result.returncode}: {stderr_tail or '(no stderr)'}"
            )
            return False
        return True
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as exc:
        logger.warning(f"ffmpeg audio extraction failed: {exc}")
        return False
# ---------------------------------------------------------------------------
# Signal-processing analysis
# ---------------------------------------------------------------------------
def _analyse_wav(wav_path: str) -> AudioAnalysis:
    """Score how natural the audio in *wav_path* sounds using signal heuristics.

    The signal is cut into 25 ms frames with 50 % overlap. Three features are
    derived (silence ratio, normalised spectral-centroid spread, voiced-frame
    RMS consistency) and combined with weights 0.30 / 0.50 / 0.20 into a score
    that is clamped to the range 20–100.

    Args:
        wav_path: Path of a WAV file readable by ``scipy.io.wavfile.read``.

    Returns:
        An AudioAnalysis with ``has_audio=True``. When the file cannot be read,
        is shorter than 0.1 s, or yields no frames, a neutral score of 50.0 is
        returned and ``notes`` carries the reason ("wav_read_failed",
        "too_short" or "no_frames").
    """
    try:
        from scipy.io import wavfile  # scipy already in requirements

        sr, data = wavfile.read(wav_path)
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"WAV read failed: {exc}")
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=0.0, silence_ratio=0.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="wav_read_failed",
        )

    # Flatten stereo → mono (only the first channel is kept)
    if data.ndim > 1:
        data = data[:, 0]

    # Scale to roughly [-1, 1) based on the dtype actually returned by the
    # reader, so the fixed silence threshold below means the same thing for
    # 8-bit, 16-bit, 32-bit and float WAV files.
    if np.issubdtype(data.dtype, np.integer):
        info = np.iinfo(data.dtype)
        if info.min == 0:
            # Unsigned PCM is offset-binary: the midpoint represents silence.
            half_range = (float(info.max) + 1.0) / 2.0
            data = (data.astype(np.float32) - half_range) / half_range
        else:
            data = data.astype(np.float32) / (float(info.max) + 1.0)
    else:
        # Floating-point WAV samples are already normalised.
        data = data.astype(np.float32)

    duration_s = float(len(data) / sr)

    if duration_s < 0.1:
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=round(duration_s, 3), silence_ratio=1.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="too_short",
        )

    # --- 25ms framing ---
    frame_len = max(1, int(sr * 0.025))
    hop_len = max(1, frame_len // 2)
    # "+ 1" keeps the last full frame when it ends exactly at the final sample.
    frames = [
        data[i: i + frame_len]
        for i in range(0, len(data) - frame_len + 1, hop_len)
    ]
    if not frames:
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=round(duration_s, 3), silence_ratio=1.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="no_frames",
        )

    rms_vals = np.array([np.sqrt(np.mean(f ** 2)) for f in frames])

    # Silence ratio
    SILENCE_THRESH = 0.01
    silence_ratio = float(np.mean(rms_vals < SILENCE_THRESH))

    # Spectral centroid variance
    freqs = np.fft.rfftfreq(frame_len, d=1.0 / sr)
    centroids: list[float] = []
    for frame in frames:
        spec = np.abs(np.fft.rfft(frame))
        total = float(np.sum(spec))
        if total < 1e-9:
            # Digitally silent frame: the centroid is undefined, skip it.
            continue
        centroids.append(float(np.dot(freqs, spec) / total))
    spec_var = (
        float(np.std(centroids) / (np.mean(centroids) + 1e-6))
        if centroids else 0.0
    )

    # RMS consistency on voiced frames
    voiced = rms_vals[rms_vals >= SILENCE_THRESH]
    if len(voiced) > 0:
        rms_consistency = float(
            1.0 - min(1.0, np.std(voiced) / (np.mean(voiced) + 1e-6))
        )
    else:
        # No voiced frames at all: report a neutral midpoint.
        rms_consistency = 0.5

    # --- Heuristic scoring ---
    # Silence score: natural speech has moderate pauses (0.1–0.6).
    # < 0.05 → no pauses (suspicious); > 0.85 → near-silent (unclear).
    if silence_ratio < 0.05:
        silence_score = 55.0
    elif silence_ratio > 0.85:
        silence_score = 50.0
    else:
        silence_score = 100.0

    # Spectral variance score: natural formant motion gives spec_var > 0.25.
    spec_score = min(100.0, spec_var * 250.0)

    # RMS consistency: > 0.92 = unnaturally even (TTS/vocoder artifact).
    rms_score = 55.0 if rms_consistency > 0.92 else 100.0

    audio_score = float(
        0.30 * silence_score + 0.50 * spec_score + 0.20 * rms_score
    )
    audio_score = max(20.0, min(100.0, audio_score))

    logger.info(
        f"Audio: dur={duration_s:.1f}s silence={silence_ratio:.2f} "
        f"spec_var={spec_var:.4f} rms_cons={rms_consistency:.4f} "
        f"→ audio_score={audio_score:.1f}"
    )
    return AudioAnalysis(
        audio_authenticity_score=round(audio_score, 2),
        has_audio=True,
        duration_s=round(duration_s, 2),
        silence_ratio=round(silence_ratio, 4),
        spectral_variance=round(spec_var, 4),
        rms_consistency=round(rms_consistency, 4),
    )
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def analyze_audio(video_path: str) -> Optional[AudioAnalysis]:
    """Extract and analyse the audio track from *video_path*.

    The audio is extracted to a temporary WAV file, scored with the
    signal-processing heuristics, and then blended 50/50 with the fake
    probability reported by ``services.audio_ml_service.analyze_audio_ml``.

    If the ML step cannot be imported, raises, or returns a result without a
    usable ``fake_probability``, the heuristic-only analysis is returned
    instead of discarding the whole result.

    Args:
        video_path: Path of the media file to analyse.

    Returns:
        An AudioAnalysis dataclass, or None if no audio track is present,
        ffmpeg is unavailable, or an unexpected error occurs.
    """
    tmp_wav: Optional[str] = None
    try:
        # Reserve a unique path only; ffmpeg overwrites the file once the
        # handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fh:
            tmp_wav = fh.name

        if not _extract_audio_wav(video_path, tmp_wav):
            logger.info("No audio track found or ffmpeg unavailable — skipping audio analysis")
            return None

        analysis = _analyse_wav(tmp_wav)

        try:
            # Imported lazily so this module loads without the ML service.
            from services.audio_ml_service import analyze_audio_ml

            ml_score = analyze_audio_ml(tmp_wav)
        except Exception as exc:  # noqa: BLE001
            logger.warning(f"Audio ML analysis failed, using heuristics only: {exc}")
            return analysis

        fake_prob: Optional[float] = None
        if isinstance(ml_score, dict):
            analysis.ml_analysis = ml_score
            try:
                fake_prob = float(ml_score["fake_probability"])
            except (KeyError, TypeError, ValueError):
                fake_prob = None

        if fake_prob is None or not np.isfinite(fake_prob):
            logger.warning("Audio ML result has no usable fake_probability, using heuristics only")
            return analysis

        # Keep the blended score inside 0–100 even if the model misreports.
        fake_prob = min(1.0, max(0.0, fake_prob))

        heuristics_prob = 1.0 - (analysis.audio_authenticity_score / 100.0)
        final_prob = 0.5 * heuristics_prob + 0.5 * fake_prob
        analysis.audio_authenticity_score = round((1.0 - final_prob) * 100.0, 2)

        return analysis
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"Audio analysis error: {exc}")
        return None
    finally:
        if tmp_wav and os.path.exists(tmp_wav):
            try:
                os.unlink(tmp_wav)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the result.
                pass