Spaces:
Runtime error
Runtime error
| """Phase 17.2 — Audio Deepfake Detection. | |
| Extracts the audio track from a video with ffmpeg, then applies signal-processing | |
| heuristics (silence ratio, spectral centroid variance, RMS consistency) to produce | |
| an audio_authenticity_score (0–100, higher = more natural/authentic). | |
| AI-generated speech typically exhibits: | |
| - Near-zero silence between words (no natural breath pauses) | |
| - Very low spectral-centroid variance (monotone formant trajectory) | |
| - Unnaturally consistent RMS energy across voiced frames | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import subprocess | |
| import tempfile | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import numpy as np | |
| from loguru import logger | |
@dataclass
class AudioAnalysis:
    """Result of the audio-authenticity analysis for one clip.

    Instances are created with keyword arguments by ``_analyse_wav`` and are
    later mutated by ``analyze_audio`` (which fills in ``ml_analysis`` and
    overwrites ``audio_authenticity_score``), so the dataclass is deliberately
    left mutable (not frozen).
    """

    # Overall score, 0–100; higher means more natural/authentic.
    audio_authenticity_score: float
    # Whether an audio track was extracted for the clip.
    has_audio: bool
    # Clip duration in seconds.
    duration_s: float
    # Fraction of 25 ms frames whose RMS is below the silence threshold.
    silence_ratio: float
    # Normalised std (std / mean) of the per-frame spectral centroid.
    spectral_variance: float
    # 1 – normalised std of voiced-frame RMS (1.0 = perfectly even energy).
    rms_consistency: float
    # Short machine-readable status tag, e.g. "wav_read_failed" or "too_short".
    notes: str = ""
    # Raw result of the optional ML detector; None when it was not run.
    ml_analysis: Optional[dict] = None
| # --------------------------------------------------------------------------- | |
| # ffmpeg extraction | |
| # --------------------------------------------------------------------------- | |
def _extract_audio_wav(video_path: str, out_path: str) -> bool:
    """Extract mono 16 kHz WAV from *video_path* into *out_path* via ffmpeg.

    Args:
        video_path: Path of the input media file handed to ``ffmpeg -i``.
        out_path: Destination WAV path; overwritten if it exists (``-y``).

    Returns:
        True only when ffmpeg exits with status 0 and *out_path* exists and
        is non-empty. False in every other case, including ffmpeg not being
        installed, the 60 s timeout expiring, or any other OSError.
    """
    command = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1",
        out_path,
    ]
    try:
        result = subprocess.run(
            command,
            # ffmpeg reads its stdin for interactive commands by default;
            # detach it so the child can never consume or block on the
            # parent process's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            timeout=60,
        )
        if result.returncode != 0 or not os.path.exists(out_path) or os.path.getsize(out_path) == 0:
            # Keep only the tail of stderr: ffmpeg prints a long banner first
            # and the actual error is at the end.
            stderr_tail = result.stderr.decode(errors="replace")[-400:].strip()
            logger.warning(f"ffmpeg exited {result.returncode} — {stderr_tail or '(no stderr)'}")
            return False
        return True
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as exc:
        logger.warning(f"ffmpeg audio extraction failed: {exc}")
        return False
| # --------------------------------------------------------------------------- | |
| # Signal-processing analysis | |
| # --------------------------------------------------------------------------- | |
def _analyse_wav(wav_path: str) -> AudioAnalysis:
    """Score the naturalness of the speech in a WAV file with signal heuristics.

    The signal is cut into 25 ms frames with 50 % overlap, and three features
    are derived from them:

    * ``silence_ratio`` – fraction of frames whose RMS is below 0.01,
    * ``spectral_variance`` – std / mean of the per-frame spectral centroid,
    * ``rms_consistency`` – 1 – (std / mean) of the RMS of non-silent frames.

    Each feature is mapped to a 0–100 sub-score and blended with weights
    0.30 / 0.50 / 0.20; the result is clamped to the range 20–100.

    Args:
        wav_path: Path of the WAV file to read with ``scipy.io.wavfile``.

    Returns:
        An ``AudioAnalysis``. If the file cannot be read, is shorter than
        0.1 s, or yields no frames, a neutral score of 50.0 is returned with
        ``notes`` set to "wav_read_failed", "too_short" or "no_frames".
    """
    try:
        from scipy.io import wavfile  # scipy already in requirements
        sr, data = wavfile.read(wav_path)
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"WAV read failed: {exc}")
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=0.0, silence_ratio=0.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="wav_read_failed",
        )

    # Flatten stereo → mono (keep the first channel only)
    if data.ndim > 1:
        data = data[:, 0]

    # Normalise to roughly [-1, 1) according to the sample dtype actually
    # read, so the fixed silence threshold below means the same thing for
    # 16-bit, 32-bit, unsigned 8-bit and float WAV files.
    if np.issubdtype(data.dtype, np.integer):
        full_scale = float(np.iinfo(data.dtype).max) + 1.0
        if np.issubdtype(data.dtype, np.unsignedinteger):
            # Unsigned PCM is offset-binary: mid-scale represents zero.
            half_scale = full_scale / 2.0
            data = (data.astype(np.float32) - half_scale) / half_scale
        else:
            data = data.astype(np.float32) / full_scale
    else:
        data = data.astype(np.float32)

    duration_s = float(len(data) / sr)
    if duration_s < 0.1:
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=round(duration_s, 3), silence_ratio=1.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="too_short",
        )

    # --- 25ms framing, 50 % overlap ---
    frame_len = max(1, int(sr * 0.025))
    hop_len = max(1, frame_len // 2)
    # "+ 1" so that a frame ending exactly on the last sample is included.
    frames = [
        data[start: start + frame_len]
        for start in range(0, len(data) - frame_len + 1, hop_len)
    ]
    if not frames:
        return AudioAnalysis(
            audio_authenticity_score=50.0, has_audio=True,
            duration_s=round(duration_s, 3), silence_ratio=1.0,
            spectral_variance=0.0, rms_consistency=0.0,
            notes="no_frames",
        )

    rms_vals = np.array([np.sqrt(np.mean(f ** 2)) for f in frames])

    # Silence ratio
    SILENCE_THRESH = 0.01
    silence_ratio = float(np.mean(rms_vals < SILENCE_THRESH))

    # Spectral centroid variance
    freqs = np.fft.rfftfreq(frame_len, d=1.0 / sr)
    centroids: list[float] = []
    for frame in frames:
        spec = np.abs(np.fft.rfft(frame))
        total = float(np.sum(spec))
        if total < 1e-9:
            # Digitally silent frame: the centroid is undefined, skip it.
            continue
        centroids.append(float(np.dot(freqs, spec) / total))
    spec_var = (
        float(np.std(centroids) / (np.mean(centroids) + 1e-6))
        if centroids else 0.0
    )

    # RMS consistency on voiced (non-silent) frames
    voiced = rms_vals[rms_vals >= SILENCE_THRESH]
    if len(voiced) > 0:
        rms_consistency = float(
            1.0 - min(1.0, np.std(voiced) / (np.mean(voiced) + 1e-6))
        )
    else:
        # No voiced frames at all: report a neutral mid value.
        rms_consistency = 0.5

    # --- Heuristic scoring ---
    # Silence score: natural speech has moderate pauses (0.1–0.6).
    # < 0.05 → no pauses (suspicious); > 0.85 → near-silent (unclear).
    if silence_ratio < 0.05:
        silence_score = 55.0
    elif silence_ratio > 0.85:
        silence_score = 50.0
    else:
        silence_score = 100.0

    # Spectral variance score: natural formant motion gives spec_var > 0.25.
    spec_score = min(100.0, spec_var * 250.0)

    # RMS consistency: > 0.92 = unnaturally even (TTS/vocoder artifact).
    rms_score = 55.0 if rms_consistency > 0.92 else 100.0

    audio_score = float(
        0.30 * silence_score + 0.50 * spec_score + 0.20 * rms_score
    )
    audio_score = max(20.0, min(100.0, audio_score))

    logger.info(
        f"Audio: dur={duration_s:.1f}s silence={silence_ratio:.2f} "
        f"spec_var={spec_var:.4f} rms_cons={rms_consistency:.4f} "
        f"→ audio_score={audio_score:.1f}"
    )
    return AudioAnalysis(
        audio_authenticity_score=round(audio_score, 2),
        has_audio=True,
        duration_s=round(duration_s, 2),
        silence_ratio=round(silence_ratio, 4),
        spectral_variance=round(spec_var, 4),
        rms_consistency=round(rms_consistency, 4),
    )
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def analyze_audio(video_path: str) -> Optional[AudioAnalysis]:
    """Extract and analyse the audio track from *video_path*.

    The audio is extracted to a temporary WAV file, scored with the
    signal-processing heuristics, and then blended 50/50 with the
    ``fake_probability`` reported by ``services.audio_ml_service``.

    If the ML stage cannot be imported, raises, or returns an unusable
    probability, the heuristic-only analysis is returned instead of being
    discarded, and "ml_unavailable" is appended to its ``notes``.

    Args:
        video_path: Path of the video whose audio track should be analysed.

    Returns:
        An ``AudioAnalysis`` dataclass, or None if no audio track is present,
        ffmpeg is unavailable, or an unexpected error occurs.
    """
    tmp_wav: Optional[str] = None
    try:
        # Only the unique name is needed; the handle is closed immediately and
        # ffmpeg overwrites the (empty) file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fh:
            tmp_wav = fh.name
        if not _extract_audio_wav(video_path, tmp_wav):
            logger.info("No audio track found or ffmpeg unavailable — skipping audio analysis")
            return None
        analysis = _analyse_wav(tmp_wav)

        # The ML detector is an optional refinement: a failure here must not
        # throw away the heuristic result computed above.
        try:
            from services.audio_ml_service import analyze_audio_ml
            ml_score = analyze_audio_ml(tmp_wav)
            ml_fake_prob = float(ml_score["fake_probability"])
            if not np.isfinite(ml_fake_prob):
                raise ValueError(f"non-finite fake_probability: {ml_fake_prob!r}")
        except Exception as exc:  # noqa: BLE001
            logger.warning(f"Audio ML analysis unavailable, using heuristics only: {exc}")
            analysis.notes = ";".join(
                part for part in (analysis.notes, "ml_unavailable") if part
            )
            return analysis

        analysis.ml_analysis = ml_score
        # Presumably fake_probability is already in [0, 1]; clamp anyway so
        # the blended score can never leave the documented 0–100 range.
        ml_fake_prob = min(1.0, max(0.0, ml_fake_prob))
        heuristics_prob = 1.0 - (analysis.audio_authenticity_score / 100.0)
        final_prob = 0.5 * heuristics_prob + 0.5 * ml_fake_prob
        analysis.audio_authenticity_score = round((1.0 - final_prob) * 100.0, 2)
        return analysis
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"Audio analysis error: {exc}")
        return None
    finally:
        # Best-effort cleanup of the temporary WAV on every exit path.
        if tmp_wav and os.path.exists(tmp_wav):
            try:
                os.unlink(tmp_wav)
            except OSError:
                pass