# Spaces: Aarav13 / AuthriX (Sleeping) — File size: 19,815 Bytes

"""
Deepfake Authenticator — Audio Analysis Agent
Detects AI-generated / synthetic voices from video audio tracks.

Pipeline:
  1. AudioExtractorAgent  — extracts audio from video via moviepy
  2. AudioAnalysisAgent   — librosa heuristics (MFCC, pitch, spectral)
  3. AudioDecisionAgent   — Wav2Vec2 model (Vansh180/deepfake-audio-wav2vec2)
  4. AudioReportAgent     — builds structured result
"""

import os
import tempfile
import logging
import numpy as np

logger = logging.getLogger(__name__)

# ─────────────────────────────────────────────
# Agent 1: Audio Extractor
# Pulls audio track from video file
# ─────────────────────────────────────────────
class AudioExtractorAgent:
    TARGET_SR = 16000   # Wav2Vec2 expects 16kHz

    def extract(self, video_path: str) -> tuple[np.ndarray | None, int]:
        """
        Extract mono 16kHz audio from video.
        Returns (waveform_array, sample_rate) or (None, 0) if no audio.
        """
        try:
            from moviepy import VideoFileClip
        except ImportError:
            try:
                from moviepy.editor import VideoFileClip
            except ImportError:
                logger.warning("moviepy not installed — audio analysis skipped")
                return None, 0

        tmp_wav = None
        try:
            clip = VideoFileClip(video_path)
            if clip.audio is None:
                logger.info("Video has no audio track")
                clip.close()
                return None, 0

            # Cap at 30s — enough for detection, avoids slow extraction on long videos
            MAX_AUDIO_SEC = 30
            audio_clip = clip.audio
            if clip.duration > MAX_AUDIO_SEC:
                audio_clip = clip.audio.subclipped(0, MAX_AUDIO_SEC)

            # Write to temp WAV
            tmp_wav = tempfile.mktemp(suffix=".wav")
            audio_clip.write_audiofile(
                tmp_wav,
                fps=self.TARGET_SR,
                nbytes=2,
                codec="pcm_s16le",
                logger=None,
            )
            clip.close()

            # Load with soundfile for clean numpy array
            import soundfile as sf
            waveform, sr = sf.read(tmp_wav, dtype="float32")

            # Convert stereo → mono
            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)

            # Resample if needed
            if sr != self.TARGET_SR:
                import torchaudio
                import torch
                t = torch.from_numpy(waveform).unsqueeze(0)
                resampler = torchaudio.transforms.Resample(sr, self.TARGET_SR)
                waveform = resampler(t).squeeze(0).numpy()
                sr = self.TARGET_SR

            logger.info(f"Audio extracted: {len(waveform)/sr:.1f}s @ {sr}Hz")
            return waveform, sr

        except Exception as e:
            logger.warning(f"Audio extraction failed: {e}")
            return None, 0
        finally:
            if tmp_wav and os.path.exists(tmp_wav):
                os.unlink(tmp_wav)


# ─────────────────────────────────────────────
# Agent 2: Audio Heuristic Analyzer
# Librosa-based feature analysis
# ─────────────────────────────────────────────
class AudioAnalysisAgent:
    """
    Signal-processing heuristics for spotting AI voice artifacts.

    Five probes each cast a fake-probability vote; the votes are averaged:
    - Pitch variance (AI voices are unnaturally consistent)
    - MFCC delta variance (AI lacks natural micro-variations)
    - Spectral flatness (AI voices have unusual spectral distribution)
    - Zero-crossing rate (synthetic voices differ in ZCR patterns)
    - Silence/breath ratio (AI voices often lack natural breath sounds)
    """

    def analyze(self, waveform: np.ndarray, sr: int) -> dict:
        """
        Run every probe on ``waveform`` and average their votes.

        Returns a dict with ``heuristic_fake_prob`` (mean vote, rounded to 4
        places), ``features`` (the measurements that could be computed) and
        ``available`` (False only when librosa cannot be imported, in which
        case the probability is a neutral 0.5). A probe that raises votes
        a neutral 0.50 and is logged at debug level.
        """
        try:
            import librosa
        except ImportError:
            logger.warning("librosa not installed — heuristic audio analysis skipped")
            return {"heuristic_fake_prob": 0.5, "features": {}, "available": False}

        features = {}
        # (log prefix used on failure, probe) — evaluated in this order so the
        # feature dict is filled in a stable order.
        probes = (
            ("Pitch analysis failed", self._pitch_vote),
            ("MFCC analysis failed", self._mfcc_delta_vote),
            ("Spectral flatness failed", self._flatness_vote),
            ("ZCR analysis failed", self._zcr_vote),
            ("Silence analysis failed", self._silence_vote),
        )

        votes = []
        for failure_prefix, probe in probes:
            try:
                vote = probe(librosa, waveform, sr, features)
            except Exception as e:
                logger.debug(f"{failure_prefix}: {e}")
                vote = 0.50
            votes.append(vote)

        heuristic_prob = float(np.mean(votes)) if votes else 0.5
        logger.info(f"Audio heuristics: {features} → fake_prob={heuristic_prob:.3f}")

        return {
            "heuristic_fake_prob": round(heuristic_prob, 4),
            "features": features,
            "available": True,
        }

    @staticmethod
    def _vote_below(value: float, ladder: tuple, otherwise: float) -> float:
        """Return the vote of the first ``(ceiling, vote)`` rung with value < ceiling."""
        for ceiling, vote in ladder:
            if value < ceiling:
                return vote
        return otherwise

    def _pitch_vote(self, librosa, waveform, sr, features) -> float:
        """Vote on pitch stability: low f0 spread is treated as suspicious."""
        f0, voiced_flag, _ = librosa.pyin(waveform, fmin=50, fmax=500, sr=sr)
        usable = voiced_flag & ~np.isnan(f0)
        voiced_f0 = f0[usable]
        if len(voiced_f0) <= 10:
            # Too few voiced frames to judge — stay neutral.
            return 0.50
        pitch_std = float(np.std(voiced_f0))
        features["pitch_std_hz"] = round(pitch_std, 2)
        # Real human speech: std typically 20-80 Hz; AI voices: often < 10 Hz
        return self._vote_below(
            pitch_std, ((8, 0.80), (15, 0.65), (25, 0.45)), 0.25
        )

    def _mfcc_delta_vote(self, librosa, waveform, sr, features) -> float:
        """Vote on articulation micro-variation via mean MFCC-delta variance."""
        coeffs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        deltas = librosa.feature.delta(coeffs)
        delta_var = float(np.mean(np.var(deltas, axis=1)))
        features["mfcc_delta_var"] = round(delta_var, 4)
        # Low delta variance → unnaturally smooth transitions
        return self._vote_below(
            delta_var, ((0.5, 0.75), (1.5, 0.55), (4.0, 0.35)), 0.20
        )

    def _flatness_vote(self, librosa, waveform, sr, features) -> float:
        """Vote on mean spectral flatness: very low (tonal) scores higher."""
        mean_flatness = float(np.mean(librosa.feature.spectral_flatness(y=waveform)))
        features["spectral_flatness"] = round(mean_flatness, 4)
        return self._vote_below(
            mean_flatness, ((0.001, 0.65), (0.005, 0.45)), 0.30
        )

    def _zcr_vote(self, librosa, waveform, sr, features) -> float:
        """Vote on zero-crossing-rate spread: too consistent scores higher."""
        zcr_std = float(np.std(librosa.feature.zero_crossing_rate(waveform)))
        features["zcr_std"] = round(zcr_std, 4)
        return self._vote_below(zcr_std, ((0.02, 0.65), (0.05, 0.40)), 0.25)

    def _silence_vote(self, librosa, waveform, sr, features) -> float:
        """Vote on the share of frames whose RMS energy is below 0.01."""
        frame_rms = librosa.feature.rms(y=waveform)[0]
        silence_ratio = float(np.mean(frame_rms < 0.01))
        features["silence_ratio"] = round(silence_ratio, 3)
        if silence_ratio < 0.05:
            return 0.60   # No natural pauses
        if silence_ratio <= 0.35:
            return 0.25   # Natural speech rhythm
        return 0.45       # Mostly silent (not useful)


# ─────────────────────────────────────────────
# Agent 3: Audio Decision Agent
# Wav2Vec2 model for AI voice detection
# ─────────────────────────────────────────────
class AudioDecisionAgent:
    """
    Scores audio with a pretrained audio-classification model.

    The model and feature extractor are loaded once at construction time.
    If loading fails for any reason the agent stays usable but
    ``available`` is False and ``predict`` returns a neutral 0.5.
    """

    # NOTE(review): described as an ASVspoof-trained model with bonafide/spoof
    # labels — confirm against the model card for this id.
    MODEL_ID   = "Vansh180/deepfake-audio-wav2vec2"
    CHUNK_SEC  = 10
    # Cap at 3 chunks max — Wav2Vec2 is slow on CPU, 30s of audio is enough
    MAX_CHUNKS = 3
    TARGET_SR  = 16000

    # Substrings that mark a label as the "fake" class.
    _FAKE_LABEL_HINTS = ("fake", "spoof", "synthetic", "generated")

    def __init__(self):
        self.model     = None
        self.processor = None
        self.fake_idx  = 1   # default: label 1 = spoof/fake
        self.available = False
        self._load()

    def _load(self):
        """Load the model and processor; on any failure mark the agent unavailable."""
        try:
            from transformers import (
                AutoModelForAudioClassification,
                AutoFeatureExtractor,
            )
            logger.info(f"Loading audio model: {self.MODEL_ID}")
            self.processor = AutoFeatureExtractor.from_pretrained(self.MODEL_ID)
            self.model     = AutoModelForAudioClassification.from_pretrained(self.MODEL_ID)
            self.model.eval()

            # Find fake/spoof label index; keep the default when none matches.
            for idx, lbl in self.model.config.id2label.items():
                lbl_lower = str(lbl).lower()
                if any(w in lbl_lower for w in self._FAKE_LABEL_HINTS):
                    self.fake_idx = idx
                    break

            self.available = True
            logger.info(
                f"Audio model loaded — labels={self.model.config.id2label} "
                f"fake_idx={self.fake_idx}"
            )
        except Exception as e:
            logger.warning(f"Audio model unavailable: {e}")
            self.available = False

    def predict(self, waveform: np.ndarray, sr: int) -> float:
        """
        Run model on audio chunks, return mean fake probability.

        The waveform is cut into ``CHUNK_SEC``-second chunks; chunks of half a
        second or less are dropped and at most ``MAX_CHUNKS`` are scored.

        Returns 0.5 (neutral) when the model is unavailable, when ``sr`` is
        not ``TARGET_SR``, when no chunk is long enough, or when inference
        fails on every chunk.
        """
        if not self.available:
            return 0.5

        # The processor is told the audio is TARGET_SR, so any other rate
        # (including 0, which would also break range() below) cannot be
        # scored meaningfully.
        if sr != self.TARGET_SR:
            logger.warning(
                f"Audio model expects {self.TARGET_SR}Hz but got {sr}Hz — skipping model"
            )
            return 0.5

        chunk_size = self.CHUNK_SEC * sr
        min_len = sr // 2
        chunks = []
        for start in range(0, len(waveform), chunk_size):
            chunk = waveform[start : start + chunk_size]
            if len(chunk) > min_len:
                chunks.append(chunk)
                if len(chunks) == self.MAX_CHUNKS:
                    break

        if not chunks:
            return 0.5

        # Imported late so the cheap early exits above do not need torch.
        import torch

        fake_probs = []
        for chunk in chunks:
            try:
                inputs = self.processor(
                    chunk,
                    sampling_rate=self.TARGET_SR,
                    return_tensors="pt",
                    padding=True,
                )
                with torch.no_grad():
                    logits = self.model(**inputs).logits
                    probs  = torch.softmax(logits, dim=-1)[0]
                fake_probs.append(probs[self.fake_idx].item())
            except Exception as e:
                # One bad chunk should not discard the others.
                logger.warning(f"Audio chunk inference failed: {e}")

        if not fake_probs:
            return 0.5

        result = float(np.mean(fake_probs))
        logger.info(f"Audio model: {len(fake_probs)} chunks → fake_prob={result:.3f}")
        return result


# ─────────────────────────────────────────────
# Agent 4: Audio Report Agent
# Builds structured audio result
# ─────────────────────────────────────────────
class AudioReportAgent:
    """
    Fuses the model score and the heuristic score into the audio verdict.

    All tuning numbers are class attributes so a subclass can override them
    without copying ``generate``.
    """

    FAKE_THRESHOLD = 0.60

    # Ensemble: 65% model + 35% heuristics
    MODEL_WEIGHT = 0.65
    HEURISTIC_WEIGHT = 0.35

    # Audio-visual mismatch rule: visual score at or above the first value
    # AND model score below the second → mismatch; the combined score is then
    # raised to at least visual_fake_prob * AV_BOOST_WEIGHT.
    AV_VISUAL_THRESHOLD = 0.45
    AV_AUDIO_THRESHOLD = 0.55
    AV_BOOST_WEIGHT = 0.6

    def generate(
        self,
        model_prob: float,
        heuristic: dict,
        has_audio: bool,
        visual_fake_prob: float = 0.5,
    ) -> dict:
        """
        Build the structured audio result.

        Args:
            model_prob: Fake probability from the audio model.
            heuristic: Heuristic result dict; ``heuristic_fake_prob``,
                ``features`` and ``available`` are read, all optional.
            has_audio: False short-circuits to a ``NO_AUDIO`` result.
            visual_fake_prob: Fake probability from the visual pipeline, used
                only for the audio-visual mismatch rule.

        Returns:
            A dict with ``result`` set to ``NO_AUDIO``, ``AV_MISMATCH``,
            ``AI_VOICE`` or ``HUMAN_VOICE``, plus scores and detail strings.
        """
        if not has_audio:
            return {
                "available": False,
                "result": "NO_AUDIO",
                "confidence": 0,
                "fake_probability": 0,
                "details": ["No audio track found in video"],
            }

        heur_prob = heuristic.get("heuristic_fake_prob", 0.5)
        features  = heuristic.get("features", {})

        # Heuristics only take part in the ensemble when they actually ran.
        if heuristic.get("available", False):
            combined = (
                model_prob * self.MODEL_WEIGHT
                + heur_prob * self.HEURISTIC_WEIGHT
            )
        else:
            combined = model_prob

        # ── Audio-Visual Mismatch Boost ───────────────────────────────
        # Key insight: in face-swap deepfakes, the FACE is fake but the
        # VOICE is real (dubbed from original footage). This mismatch
        # is itself a strong deepfake signal.
        # NOTE(review): the default visual_fake_prob (0.5) already satisfies
        # AV_VISUAL_THRESHOLD (0.45), so callers that omit it get AV_MISMATCH
        # for any audio scoring below AV_AUDIO_THRESHOLD — confirm intended.
        av_mismatch = bool(
            visual_fake_prob >= self.AV_VISUAL_THRESHOLD
            and model_prob < self.AV_AUDIO_THRESHOLD
        )
        if av_mismatch:
            # Visual shows manipulation signs, audio sounds human → face-swap
            combined = max(combined, visual_fake_prob * self.AV_BOOST_WEIGHT)
            logger.info(
                f"Audio-visual mismatch detected: visual_fake={visual_fake_prob:.2f} "
                f"audio_fake={model_prob:.2f} → boosted to {combined:.2f}"
            )

        combined   = float(np.clip(combined, 0.0, 1.0))
        is_fake    = combined >= self.FAKE_THRESHOLD
        confidence = round(combined * 100, 1)

        details = self._build_details(
            combined, is_fake, features, model_prob, heur_prob, av_mismatch
        )

        # A mismatch overrides the plain voice label (special face-swap case).
        if av_mismatch:
            result_label = "AV_MISMATCH"
        else:
            result_label = "AI_VOICE" if is_fake else "HUMAN_VOICE"

        return {
            "available":        True,
            "result":           result_label,
            "confidence":       confidence,
            "fake_probability": round(combined, 4),
            "model_score":      round(model_prob * 100, 1),
            "heuristic_score":  round(heur_prob * 100, 1),
            "av_mismatch":      av_mismatch,
            "details":          details,
            "features":         features,
        }

    def _build_details(
        self,
        prob: float,
        is_fake: bool,
        features: dict,
        model_prob: float,
        heur_prob: float,
        av_mismatch: bool = False,
    ) -> list[str]:
        """
        Produce the human-readable explanation lines for a verdict.

        ``heur_prob`` is accepted for signature compatibility but not used.
        A mismatch verdict returns only the mismatch explanation.
        """
        # Audio-visual mismatch is the most important signal
        if av_mismatch:
            return [
                "⚠️ Audio-visual mismatch detected — face appears manipulated but voice is human. "
                "This is the hallmark of face-swap deepfakes where original audio is preserved.",
                "Voice is authentic human speech, but does NOT match the manipulated face — "
                "consistent with dubbed deepfake video (e.g. movie scene re-faced)",
                f"Visual deepfake confidence was high while voice model scored {(1-model_prob)*100:.1f}% human — "
                "strong indicator of face-swap rather than full synthesis",
            ]

        if is_fake:
            return self._fake_details(prob, features, model_prob)
        return self._human_details(prob, features, model_prob)

    @staticmethod
    def _fake_details(prob: float, features: dict, model_prob: float) -> list[str]:
        """Explanation lines for an AI-voice verdict."""
        if prob > 0.85:
            details = ["High-confidence AI-generated voice detected"]
        elif prob > 0.70:
            details = ["Strong synthetic voice characteristics identified"]
        else:
            details = ["AI voice patterns detected — likely TTS or voice cloning"]

        pitch_std = features.get("pitch_std_hz")
        if pitch_std is not None and pitch_std < 15:
            details.append(
                f"Unnaturally stable pitch (σ={pitch_std}Hz) — "
                "human speech typically varies 20-80Hz"
            )

        delta_var = features.get("mfcc_delta_var")
        if delta_var is not None and delta_var < 1.5:
            details.append(
                "Insufficient micro-variation in articulation — "
                "characteristic of TTS synthesis"
            )

        silence = features.get("silence_ratio")
        if silence is not None and silence < 0.05:
            details.append(
                "No natural breath pauses detected — "
                "AI voices lack organic speech rhythm"
            )

        details.append(f"ASVspoof model confidence: {model_prob*100:.1f}% synthetic")
        return details

    @staticmethod
    def _human_details(prob: float, features: dict, model_prob: float) -> list[str]:
        """Explanation lines for a human-voice verdict."""
        if prob < 0.25:
            details = ["Strong indicators of authentic human voice"]
        else:
            details = ["Voice characteristics consistent with natural human speech"]

        pitch_std = features.get("pitch_std_hz")
        if pitch_std is not None and pitch_std >= 20:
            details.append(f"Natural pitch variation detected (σ={pitch_std}Hz)")

        silence = features.get("silence_ratio")
        if silence is not None and 0.05 <= silence <= 0.35:
            details.append(
                "Natural speech rhythm with organic pauses and breath sounds"
            )

        details.append(f"ASVspoof model confidence: {(1-model_prob)*100:.1f}% human")
        return details


# ─────────────────────────────────────────────
# Orchestrator
# ─────────────────────────────────────────────
class AudioAuthenticator:
    """Runs the four audio agents in sequence for a single video."""

    def __init__(self):
        # One instance per pipeline stage, created in pipeline order.
        self.extractor = AudioExtractorAgent()
        self.analyzer = AudioAnalysisAgent()
        self.decision = AudioDecisionAgent()
        self.reporter = AudioReportAgent()

    def analyze(self, video_path: str, visual_fake_prob: float = 0.5) -> dict:
        """
        Produce the audio report for ``video_path``.

        ``visual_fake_prob`` is forwarded to the reporter for its
        audio-visual mismatch check. When no usable audio comes back from
        the extractor, the reporter's no-audio result is returned.
        """
        samples, rate = self.extractor.extract(video_path)

        has_signal = samples is not None and len(samples) > 0
        if not has_signal:
            return self.reporter.generate(0.5, {}, has_audio=False)

        # Heuristics first, then the model, both on the same waveform.
        heuristic_result = self.analyzer.analyze(samples, rate)
        model_score = self.decision.predict(samples, rate)

        return self.reporter.generate(
            model_score,
            heuristic_result,
            has_audio=True,
            visual_fake_prob=visual_fake_prob,
        )