# Deepfake Authenticator
# Commit fa1d723 — fix: audio timeout 20s, cap chunks to 3, cap extraction to 30s — fix stuck at 80%
"""
Deepfake Authenticator — Audio Analysis Agent
Detects AI-generated / synthetic voices from video audio tracks.
Pipeline:
1. AudioExtractorAgent → extracts audio from video via moviepy
2. AudioAnalysisAgent → librosa heuristics (MFCC, pitch, spectral)
3. AudioDecisionAgent → Wav2Vec2 model (AudioDecisionAgent.MODEL_ID, currently Vansh180/deepfake-audio-wav2vec2)
4. AudioReportAgent → builds structured result
"""
import contextlib
import logging
import os
import tempfile

import numpy as np
# Module-level logger used by every agent class in this file.
logger = logging.getLogger(__name__)
# ─────────────────────────────────────────────
# Agent 1: Audio Extractor
# Pulls audio track from video file
# ─────────────────────────────────────────────
| class AudioExtractorAgent: | |
| TARGET_SR = 16000 # Wav2Vec2 expects 16kHz | |
| def extract(self, video_path: str) -> tuple[np.ndarray | None, int]: | |
| """ | |
| Extract mono 16kHz audio from video. | |
| Returns (waveform_array, sample_rate) or (None, 0) if no audio. | |
| """ | |
| try: | |
| from moviepy import VideoFileClip | |
| except ImportError: | |
| try: | |
| from moviepy.editor import VideoFileClip | |
| except ImportError: | |
| logger.warning("moviepy not installed β audio analysis skipped") | |
| return None, 0 | |
| tmp_wav = None | |
| try: | |
| clip = VideoFileClip(video_path) | |
| if clip.audio is None: | |
| logger.info("Video has no audio track") | |
| clip.close() | |
| return None, 0 | |
| # Cap at 30s β enough for detection, avoids slow extraction on long videos | |
| MAX_AUDIO_SEC = 30 | |
| audio_clip = clip.audio | |
| if clip.duration > MAX_AUDIO_SEC: | |
| audio_clip = clip.audio.subclipped(0, MAX_AUDIO_SEC) | |
| # Write to temp WAV | |
| tmp_wav = tempfile.mktemp(suffix=".wav") | |
| audio_clip.write_audiofile( | |
| tmp_wav, | |
| fps=self.TARGET_SR, | |
| nbytes=2, | |
| codec="pcm_s16le", | |
| logger=None, | |
| ) | |
| clip.close() | |
| # Load with soundfile for clean numpy array | |
| import soundfile as sf | |
| waveform, sr = sf.read(tmp_wav, dtype="float32") | |
| # Convert stereo β mono | |
| if waveform.ndim > 1: | |
| waveform = waveform.mean(axis=1) | |
| # Resample if needed | |
| if sr != self.TARGET_SR: | |
| import torchaudio | |
| import torch | |
| t = torch.from_numpy(waveform).unsqueeze(0) | |
| resampler = torchaudio.transforms.Resample(sr, self.TARGET_SR) | |
| waveform = resampler(t).squeeze(0).numpy() | |
| sr = self.TARGET_SR | |
| logger.info(f"Audio extracted: {len(waveform)/sr:.1f}s @ {sr}Hz") | |
| return waveform, sr | |
| except Exception as e: | |
| logger.warning(f"Audio extraction failed: {e}") | |
| return None, 0 | |
| finally: | |
| if tmp_wav and os.path.exists(tmp_wav): | |
| os.unlink(tmp_wav) | |
# ─────────────────────────────────────────────
# Agent 2: Audio Heuristic Analyzer
# Librosa-based feature analysis
# ─────────────────────────────────────────────
class AudioAnalysisAgent:
    """
    Detects AI voice artifacts using signal processing:
    - Pitch variance (AI voices are unnaturally consistent)
    - MFCC delta variance (AI lacks natural micro-variations)
    - Spectral flatness (AI voices have unusual spectral distribution)
    - Zero-crossing rate (synthetic voices differ in ZCR patterns)
    - Silence/breath ratio (AI voices often lack natural breath sounds)

    Each feature is mapped to a fake-probability score; the five scores are
    averaged. A feature that cannot be computed contributes NEUTRAL_SCORE.
    """

    NEUTRAL_SCORE = 0.50

    # Threshold tables: (exclusive upper bound, score). The first band whose
    # bound is above the measured value wins; otherwise the fallback applies.
    # Real human speech: pitch std typically 20-80 Hz; AI voices often < 10 Hz.
    _PITCH_BANDS = ((8, 0.80), (15, 0.65), (25, 0.45))
    _PITCH_FALLBACK = 0.25
    # Low delta variance → unnaturally smooth transitions.
    _MFCC_BANDS = ((0.5, 0.75), (1.5, 0.55), (4.0, 0.35))
    _MFCC_FALLBACK = 0.20
    # Very low flatness = tonal (could be AI), very high = noisy.
    _FLATNESS_BANDS = ((0.001, 0.65), (0.005, 0.45))
    _FLATNESS_FALLBACK = 0.30
    # Low ZCR spread = too consistent.
    _ZCR_BANDS = ((0.02, 0.65), (0.05, 0.40))
    _ZCR_FALLBACK = 0.25

    def analyze(self, waveform: np.ndarray, sr: int) -> dict:
        """Score a waveform with librosa-based heuristics.

        Args:
            waveform: Audio samples passed straight to librosa.
            sr: Sample rate of ``waveform`` in Hz.

        Returns:
            dict with ``heuristic_fake_prob`` (mean of the five feature
            scores, rounded to 4 places), ``features`` (the rounded raw
            measurements that could be computed) and ``available`` (False
            only when librosa is not installed, in which case the
            probability is 0.5 and ``features`` is empty).
        """
        try:
            import librosa
        except ImportError:
            logger.warning("librosa not installed — heuristic audio analysis skipped")
            return {"heuristic_fake_prob": 0.5, "features": {}, "available": False}

        features = {}
        scores = []
        checks = (
            ("Pitch analysis", self._pitch_score),
            ("MFCC analysis", self._mfcc_score),
            ("Spectral flatness", self._flatness_score),
            ("ZCR analysis", self._zcr_score),
            ("Silence analysis", self._silence_score),
        )
        for label, check in checks:
            try:
                scores.append(check(librosa, waveform, sr, features))
            except Exception as e:
                # One failing feature must not sink the others: log it and
                # let it vote neutral.
                logger.debug(f"{label} failed: {e}")
                scores.append(self.NEUTRAL_SCORE)

        heuristic_prob = float(np.mean(scores)) if scores else 0.5
        logger.info(f"Audio heuristics: {features} → fake_prob={heuristic_prob:.3f}")
        return {
            "heuristic_fake_prob": round(heuristic_prob, 4),
            "features": features,
            "available": True,
        }

    @staticmethod
    def _band_score(value, bands, fallback):
        """Return the score of the first band whose upper bound exceeds value."""
        for upper, score in bands:
            if value < upper:
                return score
        return fallback

    @staticmethod
    def _finite(value, name):
        """Return value as a float, raising ValueError if it is NaN or infinite.

        Without this, NaN fails every ``<`` comparison and would be scored as
        the most natural band instead of as an unknown.
        """
        value = float(value)
        if not np.isfinite(value):
            raise ValueError(f"{name} is not finite")
        return value

    def _pitch_score(self, librosa, waveform, sr, features):
        """Score pitch stability: low f0 spread over voiced frames is suspicious."""
        f0, voiced_flag, _ = librosa.pyin(waveform, fmin=50, fmax=500, sr=sr)
        voiced_f0 = f0[voiced_flag & ~np.isnan(f0)]
        if len(voiced_f0) <= 10:
            # Too few voiced frames to judge; no feature is recorded.
            return self.NEUTRAL_SCORE
        pitch_std = self._finite(np.std(voiced_f0), "pitch std")
        features["pitch_std_hz"] = round(pitch_std, 2)
        return self._band_score(pitch_std, self._PITCH_BANDS, self._PITCH_FALLBACK)

    def _mfcc_score(self, librosa, waveform, sr, features):
        """Score articulation micro-variation via mean variance of MFCC deltas."""
        mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        delta = librosa.feature.delta(mfcc)
        delta_var = self._finite(np.mean(np.var(delta, axis=1)), "mfcc delta variance")
        features["mfcc_delta_var"] = round(delta_var, 4)
        return self._band_score(delta_var, self._MFCC_BANDS, self._MFCC_FALLBACK)

    def _flatness_score(self, librosa, waveform, sr, features):
        """Score mean spectral flatness (``sr`` is unused by this feature)."""
        flatness = librosa.feature.spectral_flatness(y=waveform)
        mean_flatness = self._finite(np.mean(flatness), "spectral flatness")
        features["spectral_flatness"] = round(mean_flatness, 4)
        return self._band_score(
            mean_flatness, self._FLATNESS_BANDS, self._FLATNESS_FALLBACK
        )

    def _zcr_score(self, librosa, waveform, sr, features):
        """Score zero-crossing-rate spread (``sr`` is unused by this feature)."""
        zcr = librosa.feature.zero_crossing_rate(waveform)
        zcr_std = self._finite(np.std(zcr), "zcr std")
        features["zcr_std"] = round(zcr_std, 4)
        return self._band_score(zcr_std, self._ZCR_BANDS, self._ZCR_FALLBACK)

    def _silence_score(self, librosa, waveform, sr, features):
        """Score the share of frames whose RMS energy is below 0.01."""
        rms = librosa.feature.rms(y=waveform)[0]
        silence_ratio = self._finite(np.mean(rms < 0.01), "silence ratio")
        features["silence_ratio"] = round(silence_ratio, 3)
        # Very low silence ratio = no natural pauses (suspicious)
        # Very high = mostly silent (not useful)
        if silence_ratio < 0.05:
            return 0.60  # No natural pauses
        if silence_ratio <= 0.35:
            return 0.25  # Natural speech rhythm
        return 0.45
# ─────────────────────────────────────────────
# Agent 3: Audio Decision Agent
# Wav2Vec2 model for AI voice detection
# ─────────────────────────────────────────────
class AudioDecisionAgent:
    """Run a Hugging Face audio-classification model over chunks of a waveform.

    The model and its feature extractor are loaded eagerly in ``__init__``.
    If loading fails for any reason the agent stays usable but ``predict``
    always answers with the neutral probability 0.5.
    """

    # Primary: ASVspoof-trained model with bonafide/spoof labels
    # NOTE(review): the training data and label names are the original
    # author's claim — confirm against the model card.
    MODEL_ID = "Vansh180/deepfake-audio-wav2vec2"
    CHUNK_SEC = 10
    TARGET_SR = 16000
    # Cap at 3 chunks max — Wav2Vec2 is slow on CPU, 30s of audio is enough
    MAX_CHUNKS = 3
    # Substrings that mark a class label as the "fake" class.
    _FAKE_LABEL_HINTS = ("fake", "spoof", "synthetic", "generated")

    def __init__(self):
        self.model = None
        self.processor = None
        self.fake_idx = 1  # default: label 1 = spoof/fake
        self.available = False
        self._load()

    def _load(self):
        """Load the model and feature extractor, and locate the fake-class index.

        Never raises: on any failure a warning is logged and ``available``
        stays False.
        """
        try:
            from transformers import (
                AutoModelForAudioClassification,
                AutoFeatureExtractor,
            )

            logger.info(f"Loading audio model: {self.MODEL_ID}")
            self.processor = AutoFeatureExtractor.from_pretrained(self.MODEL_ID)
            self.model = AutoModelForAudioClassification.from_pretrained(self.MODEL_ID)
            self.model.eval()

            # Find fake/spoof label index
            for idx, lbl in self.model.config.id2label.items():
                # str() so a non-string label cannot abort the whole load.
                lbl_lower = str(lbl).lower()
                if any(w in lbl_lower for w in self._FAKE_LABEL_HINTS):
                    self.fake_idx = idx
                    break

            self.available = True
            logger.info(
                f"Audio model loaded — labels={self.model.config.id2label} "
                f"fake_idx={self.fake_idx}"
            )
        except Exception as e:
            logger.warning(f"Audio model unavailable: {e}")
            self.available = False

    def predict(self, waveform: np.ndarray, sr: int) -> float:
        """Run model on audio chunks, return mean fake probability.

        The waveform is cut into CHUNK_SEC-second pieces; pieces of half a
        second or less are dropped and at most MAX_CHUNKS pieces are scored.

        Args:
            waveform: Mono audio samples.
            sr: Sample rate of ``waveform``; must equal TARGET_SR.

        Returns:
            Mean softmax probability of the fake class over the scored
            chunks, or the neutral 0.5 when the model is unavailable, ``sr``
            differs from TARGET_SR, no chunk is long enough, or every chunk
            fails inference.
        """
        if not self.available:
            return 0.5

        if sr != self.TARGET_SR:
            # Scoring off-rate audio as if it were TARGET_SR would return a
            # confident-looking but meaningless number; stay neutral instead.
            logger.warning(
                f"Audio model expects {self.TARGET_SR}Hz audio, got {sr}Hz; "
                "skipping model inference"
            )
            return 0.5

        chunk_size = self.CHUNK_SEC * sr
        if chunk_size <= 0:
            return 0.5

        min_len = sr // 2
        chunks = []
        for start in range(0, len(waveform), chunk_size):
            piece = waveform[start : start + chunk_size]
            if len(piece) > min_len:
                chunks.append(piece)
                if len(chunks) == self.MAX_CHUNKS:
                    break
        if not chunks:
            return 0.5

        # Imported late so the early exits above do not pay for torch.
        import torch

        fake_probs = []
        for chunk in chunks:
            try:
                inputs = self.processor(
                    chunk,
                    sampling_rate=self.TARGET_SR,
                    return_tensors="pt",
                    padding=True,
                )
                with torch.no_grad():
                    logits = self.model(**inputs).logits
                probs = torch.softmax(logits, dim=-1)[0]
                fake_probs.append(probs[self.fake_idx].item())
            except Exception as e:
                # A failing chunk is skipped; the remaining chunks still vote.
                logger.warning(f"Audio chunk inference failed: {e}")

        if not fake_probs:
            return 0.5

        result = float(np.mean(fake_probs))
        logger.info(f"Audio model: {len(fake_probs)} chunks → fake_prob={result:.3f}")
        return result
# ─────────────────────────────────────────────
# Agent 4: Audio Report Agent
# Builds structured audio result
# ─────────────────────────────────────────────
class AudioReportAgent:
    """Combine the model and heuristic scores into the structured audio result."""

    FAKE_THRESHOLD = 0.60
    # Ensemble: 65% model + 35% heuristics
    MODEL_WEIGHT = 0.65
    HEURISTIC_WEIGHT = 0.35
    # Audio-visual mismatch: visual at or above VISUAL_SUSPICIOUS_MIN while the
    # audio model stays below AUDIO_HUMAN_MAX.
    VISUAL_SUSPICIOUS_MIN = 0.45
    AUDIO_HUMAN_MAX = 0.55
    # On a mismatch the combined score is raised to at least this share of
    # the visual probability.
    MISMATCH_VISUAL_FACTOR = 0.6

    def generate(
        self,
        model_prob: float,
        heuristic: dict,
        has_audio: bool,
        visual_fake_prob: float = 0.5,
    ) -> dict:
        """Build the audio verdict.

        Args:
            model_prob: Fake probability from the audio model (0..1).
            heuristic: dict with optional ``heuristic_fake_prob``,
                ``features`` and ``available`` keys; missing keys fall back
                to 0.5, ``{}`` and False.
            has_audio: False short-circuits to the ``NO_AUDIO`` result.
            visual_fake_prob: Fake probability from the visual pipeline,
                used only for mismatch detection.

        Returns:
            A dict whose ``result`` is ``NO_AUDIO``, ``AI_VOICE``,
            ``HUMAN_VOICE`` or ``AV_MISMATCH``. ``confidence`` is the
            combined fake probability expressed as a percentage.
        """
        if not has_audio:
            return {
                "available": False,
                "result": "NO_AUDIO",
                "confidence": 0,
                "fake_probability": 0,
                "details": ["No audio track found in video"],
            }

        heur_prob = heuristic.get("heuristic_fake_prob", 0.5)
        features = heuristic.get("features", {})

        if heuristic.get("available", False):
            combined = model_prob * self.MODEL_WEIGHT + heur_prob * self.HEURISTIC_WEIGHT
        else:
            combined = model_prob

        # ── Audio-Visual Mismatch Boost ──
        # Key insight: in face-swap deepfakes, the FACE is fake but the
        # VOICE is real (dubbed from original footage). This mismatch
        # is itself a strong deepfake signal.
        # If visual says FAKE (high prob) but audio says HUMAN → mismatch
        # NOTE(review): the default visual_fake_prob of 0.5 and a neutral
        # model_prob of 0.5 both satisfy this test, so a caller that omits
        # the visual score gets AV_MISMATCH for any human-sounding audio —
        # confirm that this is intended.
        av_mismatch = (
            visual_fake_prob >= self.VISUAL_SUSPICIOUS_MIN
            and model_prob < self.AUDIO_HUMAN_MAX
        )
        if av_mismatch:
            # Visual shows manipulation signs, audio sounds human → face-swap
            av_mismatch_score = visual_fake_prob * self.MISMATCH_VISUAL_FACTOR
            combined = max(combined, av_mismatch_score)
            logger.info(
                f"Audio-visual mismatch detected: visual_fake={visual_fake_prob:.2f} "
                f"audio_fake={model_prob:.2f} → boosted to {combined:.2f}"
            )

        combined = float(np.clip(combined, 0.0, 1.0))
        is_fake = combined >= self.FAKE_THRESHOLD
        confidence = round(combined * 100, 1)

        details = self._build_details(
            combined, is_fake, features, model_prob, heur_prob, av_mismatch
        )

        if av_mismatch:
            result_label = "AV_MISMATCH"  # special label for face-swap case
        elif is_fake:
            result_label = "AI_VOICE"
        else:
            result_label = "HUMAN_VOICE"

        return {
            "available": True,
            "result": result_label,
            "confidence": confidence,
            "fake_probability": round(combined, 4),
            "model_score": round(model_prob * 100, 1),
            "heuristic_score": round(heur_prob * 100, 1),
            "av_mismatch": av_mismatch,
            "details": details,
            "features": features,
        }

    def _build_details(
        self,
        prob: float,
        is_fake: bool,
        features: dict,
        model_prob: float,
        heur_prob: float,
        av_mismatch: bool = False,
    ) -> list[str]:
        """Return the human-readable explanation lines for a verdict.

        A mismatch takes precedence over the fake/human split. ``heur_prob``
        is accepted for interface compatibility but not used in any message.
        """
        # Audio-visual mismatch is the most important signal
        if av_mismatch:
            return self._mismatch_details(model_prob)
        if is_fake:
            return self._fake_details(prob, features, model_prob)
        return self._human_details(prob, features, model_prob)

    @staticmethod
    def _mismatch_details(model_prob: float) -> list[str]:
        """Explain a face-swap style audio-visual mismatch."""
        return [
            "⚠️ Audio-visual mismatch detected — face appears manipulated but voice is human. "
            "This is the hallmark of face-swap deepfakes where original audio is preserved.",
            "Voice is authentic human speech, but does NOT match the manipulated face — "
            "consistent with dubbed deepfake video (e.g. movie scene re-faced)",
            f"Visual deepfake confidence was high while voice model scored {(1-model_prob)*100:.1f}% human — "
            "strong indicator of face-swap rather than full synthesis",
        ]

    @staticmethod
    def _fake_details(prob: float, features: dict, model_prob: float) -> list[str]:
        """Explain an AI-voice verdict, citing the features that support it."""
        details = []
        if prob > 0.85:
            details.append("High-confidence AI-generated voice detected")
        elif prob > 0.70:
            details.append("Strong synthetic voice characteristics identified")
        else:
            details.append("AI voice patterns detected — likely TTS or voice cloning")

        pitch_std = features.get("pitch_std_hz")
        if pitch_std is not None and pitch_std < 15:
            details.append(
                f"Unnaturally stable pitch (σ={pitch_std}Hz) — "
                "human speech typically varies 20-80Hz"
            )

        delta_var = features.get("mfcc_delta_var")
        if delta_var is not None and delta_var < 1.5:
            details.append(
                "Insufficient micro-variation in articulation — "
                "characteristic of TTS synthesis"
            )

        silence = features.get("silence_ratio")
        if silence is not None and silence < 0.05:
            details.append(
                "No natural breath pauses detected — "
                "AI voices lack organic speech rhythm"
            )

        details.append(f"ASVspoof model confidence: {model_prob*100:.1f}% synthetic")
        return details

    @staticmethod
    def _human_details(prob: float, features: dict, model_prob: float) -> list[str]:
        """Explain a human-voice verdict, citing the features that support it."""
        details = []
        if prob < 0.25:
            details.append("Strong indicators of authentic human voice")
        else:
            details.append("Voice characteristics consistent with natural human speech")

        pitch_std = features.get("pitch_std_hz")
        if pitch_std is not None and pitch_std >= 20:
            details.append(f"Natural pitch variation detected (σ={pitch_std}Hz)")

        silence = features.get("silence_ratio")
        if silence is not None and 0.05 <= silence <= 0.35:
            details.append(
                "Natural speech rhythm with organic pauses and breath sounds"
            )

        details.append(f"ASVspoof model confidence: {(1-model_prob)*100:.1f}% human")
        return details
# ─────────────────────────────────────────────
# Orchestrator
# ─────────────────────────────────────────────
class AudioAuthenticator:
    """Orchestrator that chains the four audio agents into one call."""

    def __init__(self):
        # Agents are created once and reused for every analyze() call.
        self.extractor = AudioExtractorAgent()
        self.analyzer = AudioAnalysisAgent()
        self.decision = AudioDecisionAgent()
        self.reporter = AudioReportAgent()

    def analyze(self, video_path: str, visual_fake_prob: float = 0.5) -> dict:
        """Run extraction, heuristics, model inference and reporting on a video.

        Args:
            video_path: Path of the video whose audio track is analysed.
            visual_fake_prob: Fake probability from the visual pipeline,
                forwarded to the reporter for mismatch detection.

        Returns:
            The dict produced by ``AudioReportAgent.generate``.
        """
        # Step 1: extraction — a missing or empty waveform ends the run early.
        samples, sample_rate = self.extractor.extract(video_path)
        audio_present = samples is not None and len(samples) != 0
        if not audio_present:
            return self.reporter.generate(0.5, {}, has_audio=False)

        # Step 2: signal-processing heuristics.
        heuristic_result = self.analyzer.analyze(samples, sample_rate)

        # Step 3: model prediction.
        model_score = self.decision.predict(samples, sample_rate)

        # Step 4: report (visual probability passed for mismatch detection).
        return self.reporter.generate(
            model_score,
            heuristic_result,
            has_audio=True,
            visual_fake_prob=visual_fake_prob,
        )