Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Mar 25

Commit

f39a6c2

verified ·

1 Parent(s): 7751764

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +190 -94

pipeline.py CHANGED Viewed

@@ -36,22 +36,15 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Ensemble: 3 models vote — majority wins (for uploaded files only)
-#
-# MelodyMachine models output fake=1.0 for ALL real-world mic recordings
-# so they are only used for uploaded files where they perform well.
-# Gustking is the most robust to real-world audio.
-#
-# Live mic recording → brute force → always Real (models can't handle it)
-# Uploaded file      → ensemble vote → actual inference
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODELS = [
-    "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
-    "mo-thecreator/Deepfake-audio-detection",
-    "MelodyMachine/Deepfake-audio-detection-V2",
 ]
 AUDIO_SAMPLE_RATE = 16000
-# ─── Thresholds ───────────────────────────────────────────────────────────────
 REAL_THRESHOLD = 0.55
 FAKE_THRESHOLD = 0.70
@@ -71,63 +64,115 @@ for model_id in AUDIO_MODELS:
 print(f"Ensemble ready with {len(ensemble)} models.")
def is_live_mic_recording(sr: int, x: np.ndarray) -> bool:
    """
    Guess whether a clip came from a live microphone capture.

    Heuristics:
      - Sample rate of 48000 Hz → treated as a mic recording.
      - Sample rate of 44100 Hz, duration under 15 s and RMS level
        below 0.15 → treated as a mic recording.

    NOTE(review): sample rate, duration and loudness do not identify the
    capture source — any uploaded 48 kHz file satisfies the first rule.
    ``deepfakes_audio_predict`` returns "Real" without running a model
    when this returns True, so a false positive here is a missed detection.

    Args:
        sr: Sample rate in Hz.
        x: Audio samples, 1-D or 2-D. 2-D input is averaged over axis 1
           (assumes a (samples, channels) layout — TODO confirm).

    Returns:
        True when the heuristics match, False otherwise. Empty input and
        non-matching sample rates (including 0) return False instead of
        raising.
    """
    if sr == 48000:
        print("[Audio] Detected: 48000 Hz → Live mic recording")
        return True
    if sr != 44100:
        # Also covers sr == 0, which previously raised ZeroDivisionError.
        return False

    samples = np.asarray(x)
    if samples.size == 0:
        # .max() on an empty array raises ValueError; nothing to measure.
        return False

    duration = len(samples) / sr
    if duration >= 15.0:
        return False

    x_float = samples.astype(np.float32)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the dtype's own full-scale value (32768 for int16)
        # rather than assuming every integer format is 16-bit.
        x_float = x_float / (float(np.iinfo(samples.dtype).max) + 1.0)
    elif np.abs(x_float).max() > 1.0:
        # Float array holding int16-range values.
        x_float = x_float / 32768.0
    if x_float.ndim == 2:
        x_float = x_float.mean(axis=1)

    rms = np.sqrt(np.mean(x_float ** 2))
    print(f"[Audio] SR=44100, duration={duration:.2f}s, RMS={rms:.4f}")
    if rms < 0.15:
        print("[Audio] Detected: Low RMS + short duration → Live mic recording")
        return True
    return False
def fake_processing_steps(x: np.ndarray, sr: int):
    """
    Log that model inference is skipped for a live-mic clip.

    The previous body slept for about 2.9 s and printed hard-coded
    per-model scores and a vote tally although no model was executed.
    Logs that report inference which never happened mislead anyone
    debugging or auditing a prediction, so this version states what
    actually occurs. ``deepfakes_audio_predict`` calls this right before
    returning its fixed "Real" answer for live-mic input.

    Args:
        x: Audio samples; only the length is reported.
        sr: Sample rate in Hz, reported in the log line.

    Returns:
        None.
    """
    n_samples = 0 if x is None else len(x)
    print(
        f"[Audio] Live-mic input (SR={sr} Hz, samples={n_samples}) — "
        "ensemble inference skipped; no model was run."
    )
    print("[Audio] Final decision: real (live-mic default, not a model prediction)")
 def convert_to_mp4(input_path):
@@ -250,43 +295,71 @@ def deepfakes_image_predict(input_image):
         return "🚨 The image is FAKE."
 def get_real_fake_probs(probs, id2label: dict):
     """
     Pick the "real" and "fake" probabilities out of a classifier output.

     Args:
         probs: Indexable sequence of per-class probabilities; entry ``i``
             is described by ``id2label[i]``.
         id2label: Maps a class index (int, or its string form) to a label.

     Returns:
         ``(real_prob, fake_prob)`` as Python floats.

     Raises:
         ValueError: if no label is recognised and fewer than two
             probabilities exist for the positional fallback.
     """
     real_names = ("real", "label_1", "genuine", "bonafide", "1")
     fake_names = ("fake", "label_0", "spoof", "synthetic", "0")

     real_idx, fake_idx = None, None
     for idx, _ in enumerate(probs):
         # Accept int or str keys; a missing key counts as an unknown label.
         raw_label = id2label.get(idx, id2label.get(str(idx), ""))
         label = str(raw_label).lower().strip()
         if label in real_names:
             real_idx = idx
         elif label in fake_names:
             fake_idx = idx

     if real_idx is not None and fake_idx is not None:
         return float(probs[real_idx]), float(probs[fake_idx])

     if len(probs) == 2 and (real_idx is not None or fake_idx is not None):
         # Binary model with one recognised label: the other entry is its
         # complement. Discarding the recognised label in favour of the
         # positional guess could swap real and fake.
         if real_idx is None:
             real_idx = 1 - fake_idx
         else:
             fake_idx = 1 - real_idx
         print("[Audio] Warning: one label unrecognised — using the other class as its complement")
         return float(probs[real_idx]), float(probs[fake_idx])

     if len(probs) < 2:
         raise ValueError("need at least two class probabilities to split into real/fake")

     print("[Audio] Warning: unknown labels — falling back to probs[0]=fake, probs[1]=real")
     return float(probs[1]), float(probs[0])
 def single_model_vote(x, entry):
-    """Run one model and return its vote."""
     model_id = entry["id"]
     fe = entry["extractor"]
     m = entry["model"]
-    inputs = fe(
-        x,
-        sampling_rate=AUDIO_SAMPLE_RATE,
-        return_tensors="pt",
-        padding=True
-    )
     with torch.no_grad():
         logits = m(**inputs).logits
     probs = torch.softmax(logits, dim=-1)[0]
     real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
     print(f"[Audio] {model_id} → real={real_prob:.4f}  fake={fake_prob:.4f}")
     if real_prob >= REAL_THRESHOLD:
@@ -302,37 +375,59 @@ def single_model_vote(x, entry):
 def run_ensemble(x: np.ndarray) -> str:
     """
-    Run all 3 ensemble models and return majority vote result.
-    Tie-break biased toward real to avoid false positives.
     """
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
-    all_real_probs = []
-    all_fake_probs = []
     for entry in ensemble:
         try:
             vote, real_prob, fake_prob = single_model_vote(x, entry)
             votes[vote] += 1
-            all_real_probs.append(real_prob)
-            all_fake_probs.append(fake_prob)
         except Exception as e:
             print(f"[Audio] Model {entry['id']} failed: {e}")
     print(f"[Audio] Vote tally: {votes}")
-    if len(all_real_probs) == 0:
-        return "⚠️ All models failed. Please try again."
     max_votes = max(votes.values())
     winners = [label for label, count in votes.items() if count == max_votes]
-    # Tie-break: real > ai_synth > fake
     if "real" in winners:
-        final = "real"
     elif "ai_synth" in winners:
-        final = "ai_synth"
     else:
         final = "fake"
     print(f"[Audio] Final decision: {final}")
@@ -346,21 +441,22 @@ def run_ensemble(x: np.ndarray) -> str:
 def deepfakes_audio_predict(input_audio):
     """
-    Main audio prediction function.
-    Live mic recording → fake processing steps → always returns Real
-    Uploaded file      → real ensemble inference (3 models vote)
     """
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
-    # ── Detect live mic recording → brute force real ──────────────────────────
     if is_live_mic_recording(sr, x):
         fake_processing_steps(x, sr)
         return "✅ Real Human Voice"
-    # ── Uploaded file → real ensemble inference ───────────────────────────────
-    print("[Audio] Source: 📁 Uploaded file → running ensemble inference …")
     x = x.astype(np.float32)
     if np.abs(x).max() > 1.0:

 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Ensemble: 3 models vote — majority wins (for uploaded files only)
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODELS = [
+    "MelodyMachine/Deepfake-audio-detection-V2",
+    "MelodyMachine/Deepfake-audio-detection",
+    "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
 ]
 AUDIO_SAMPLE_RATE = 16000
+# ─── Model Thresholds ────────────────────────────────────────────────────────
 REAL_THRESHOLD = 0.55
 FAKE_THRESHOLD = 0.70
 print(f"Ensemble ready with {len(ensemble)} models.")
+# ─────────────────────────────────────────────────────────────────────────────
+# ACOUSTIC FEATURE ANALYZER
+#
+# Why do we need this?
+#   All Wav2Vec2 models are binary (real/fake) — they cannot distinguish
+#   AI synthesized audio from real because TTS doesn't match their "fake"
+#   training patterns (replay attacks, splicing). They score TTS as "real".
+#
+# How does it work?
+#   Real human voices have natural imperfections:
+#     - Energy fluctuates (breathing, stress, pauses)
+#     - Pitch varies naturally (prosody, emotion)
+#     - Background noise / room acoustics present
+#     - Zero crossing rate is irregular
+#
+#   AI synthesized voices are "too perfect":
+#     - Energy is unnaturally consistent (flat amplitude envelope)
+#     - Pitch follows mathematical patterns, low variance
+#     - Very high SNR — almost no background noise
+#     - Spectral flatness is high (energy distributed evenly)
+#
+# Decision:
+#   ai_synth_score = weighted combination of the 4 feature scores below
+#   score > AI_SYNTH_THRESHOLD → flag as AI Synthesized
+#   This overrides a "real" vote from the model ensemble
+# ─────────────────────────────────────────────────────────────────────────────
+# Tune these thresholds based on testing:
+# Higher = less sensitive (more audio passes as Real)
+# Lower  = more sensitive (more audio flagged as AI Synthesized)
+AI_SYNTH_THRESHOLD = 0.60  # overall acoustic score above this → AI Synthesized
def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
    """
    Score how "synthetic" a waveform looks using four hand-crafted features.

    Each feature is mapped to [0, 1] (0 = natural, 1 = synthetic) and the
    four scores are combined with fixed weights into ``ai_synth_score``.

    Args:
        x: Audio samples handed directly to librosa (assumed mono float —
           TODO confirm against the caller's preprocessing).
        sr: Sample rate in Hz; used for pitch estimation.

    Returns:
        dict with keys ``energy_synth_score``, ``flatness_synth_score``,
        ``pitch_synth_score``, ``zcr_synth_score``, ``ai_synth_score``
        (all floats) and ``is_ai_synthesized`` (bool, True when
        ``ai_synth_score`` exceeds ``AI_SYNTH_THRESHOLD``). Empty input
        yields the neutral score 0.5 for every feature.
    """
    neutral = 0.5

    def _clamp01(value) -> float:
        """Clamp to [0, 1]; a non-finite value becomes the neutral score."""
        value = float(value)
        if not np.isfinite(value):
            # min(1.0, nan) evaluates to 1.0, which would silently turn a
            # NaN into "fully synthetic".
            return neutral
        return max(0.0, min(1.0, value))

    def _inverse_cv_score(series, natural_cv) -> tuple:
        """Return (cv, score); a low coefficient of variation scores high."""
        cv = np.sqrt(np.var(series)) / (np.mean(series) + 1e-8)
        return cv, _clamp01(1.0 - (cv / natural_cv))

    if np.size(x) == 0:
        # Nothing to analyse: report "no evidence" instead of failing
        # inside librosa.
        print("[Acoustic] Empty audio — returning neutral scores")
        return {
            "energy_synth_score":   neutral,
            "flatness_synth_score": neutral,
            "pitch_synth_score":    neutral,
            "zcr_synth_score":      neutral,
            "ai_synth_score":       neutral,
            "is_ai_synthesized":    neutral > AI_SYNTH_THRESHOLD,
        }

    frame_length = 1024
    hop_length = 256

    # ── Feature 1: Energy variance ────────────────────────────────────────────
    # Flat loudness (low coefficient of variation) is treated as synthetic.
    rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
    rms_cv, energy_synth_score = _inverse_cv_score(rms, 0.5)
    print(f"[Acoustic] Energy CoV={rms_cv:.4f} → synth_score={energy_synth_score:.4f}")

    # ── Feature 2: Spectral flatness ─────────────────────────────────────────
    # Higher mean flatness is treated as synthetic; 0.1 saturates the score.
    spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
    mean_flatness = np.mean(spec_flatness)
    flatness_synth_score = _clamp01(mean_flatness / 0.1)
    print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} → synth_score={flatness_synth_score:.4f}")

    # ── Feature 3: Pitch variance ─────────────────────────────────────────────
    # Low relative pitch spread is treated as synthetic.
    pitch_synth_score = neutral
    try:
        f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
        # NOTE(review): YIN appears to report a pitch for every frame, so
        # "f0 > 0" likely keeps unvoiced frames too — confirm against the
        # librosa docs; pyin's voiced flag may be what is wanted here.
        voiced = f0[np.isfinite(f0) & (f0 > 0)]
        if len(voiced) > 10:
            pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
            pitch_synth_score = _clamp01(1.0 - (pitch_variance / 0.15))
    except Exception as exc:
        # Best effort: keep the neutral score, but do not hide the failure.
        print(f"[Acoustic] Pitch estimation failed: {exc}")
    print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")

    # ── Feature 4: Zero Crossing Rate variance ────────────────────────────────
    # A regular ZCR (low coefficient of variation) is treated as synthetic.
    zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
    zcr_cv, zcr_synth_score = _inverse_cv_score(zcr, 0.5)
    print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} → synth_score={zcr_synth_score:.4f}")

    # ── Weighted overall score ────────────────────────────────────────────────
    # Energy and pitch carry the largest weights; the weights sum to 1.0.
    ai_synth_score = (
        energy_synth_score * 0.35
        + flatness_synth_score * 0.20
        + pitch_synth_score * 0.30
        + zcr_synth_score * 0.15
    )
    print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")

    return {
        "energy_synth_score":   energy_synth_score,
        "flatness_synth_score": flatness_synth_score,
        "pitch_synth_score":    pitch_synth_score,
        "zcr_synth_score":      zcr_synth_score,
        "ai_synth_score":       ai_synth_score,
        "is_ai_synthesized":    ai_synth_score > AI_SYNTH_THRESHOLD,
    }
 def convert_to_mp4(input_path):
         return "🚨 The image is FAKE."
def is_live_mic_recording(sr: int, x: np.ndarray) -> bool:
    """
    Guess whether a clip came from a live microphone capture.

    Heuristics:
      - Sample rate of 48000 Hz → treated as a mic recording.
      - Sample rate of 44100 Hz, duration under 15 s and RMS level
        below 0.15 → treated as a mic recording.

    NOTE(review): sample rate, duration and loudness do not identify the
    capture source — any uploaded 48 kHz file satisfies the first rule.
    ``deepfakes_audio_predict`` returns "Real" without running a model
    when this returns True, so a false positive here is a missed detection.

    Args:
        sr: Sample rate in Hz.
        x: Audio samples, 1-D or 2-D. 2-D input is averaged over axis 1
           (assumes a (samples, channels) layout — TODO confirm).

    Returns:
        True when the heuristics match, False otherwise. Empty input and
        non-matching sample rates (including 0) return False instead of
        raising.
    """
    if sr == 48000:
        print("[Audio] Detected: 48000 Hz → Live mic recording")
        return True
    if sr != 44100:
        # Also covers sr == 0, which previously raised ZeroDivisionError.
        return False

    samples = np.asarray(x)
    if samples.size == 0:
        # .max() on an empty array raises ValueError; nothing to measure.
        return False

    duration = len(samples) / sr
    if duration >= 15.0:
        return False

    x_float = samples.astype(np.float32)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the dtype's own full-scale value (32768 for int16)
        # rather than assuming every integer format is 16-bit.
        x_float = x_float / (float(np.iinfo(samples.dtype).max) + 1.0)
    elif np.abs(x_float).max() > 1.0:
        # Float array holding int16-range values.
        x_float = x_float / 32768.0
    if x_float.ndim == 2:
        x_float = x_float.mean(axis=1)

    rms = np.sqrt(np.mean(x_float ** 2))
    print(f"[Audio] SR=44100, duration={duration:.2f}s, RMS={rms:.4f}")
    if rms < 0.15:
        print("[Audio] Detected: Low RMS + short duration → Live mic recording")
        return True
    return False
def fake_processing_steps(x: np.ndarray, sr: int):
    """
    Log that model inference is skipped for a live-mic clip.

    The previous body slept for about 2.9 s and printed hard-coded
    per-model scores and a vote tally although no model was executed.
    Logs that report inference which never happened mislead anyone
    debugging or auditing a prediction, so this version states what
    actually occurs. ``deepfakes_audio_predict`` calls this right before
    returning its fixed "Real" answer for live-mic input.

    Args:
        x: Audio samples; only the length is reported.
        sr: Sample rate in Hz, reported in the log line.

    Returns:
        None.
    """
    n_samples = 0 if x is None else len(x)
    print(
        f"[Audio] Live-mic input (SR={sr} Hz, samples={n_samples}) — "
        "ensemble inference skipped; no model was run."
    )
    print("[Audio] Final decision: real (live-mic default, not a model prediction)")
 def get_real_fake_probs(probs, id2label: dict):
     """
     Pick the "real" and "fake" probabilities out of a classifier output.

     Args:
         probs: Indexable sequence of per-class probabilities; entry ``i``
             is described by ``id2label[i]``.
         id2label: Maps a class index (int, or its string form) to a label.

     Returns:
         ``(real_prob, fake_prob)`` as Python floats.

     Raises:
         ValueError: if no label is recognised and fewer than two
             probabilities exist for the positional fallback.
     """
     real_names = ("real", "label_1", "genuine", "bonafide", "1")
     fake_names = ("fake", "label_0", "spoof", "synthetic", "0")

     real_idx, fake_idx = None, None
     for idx, _ in enumerate(probs):
         # Accept int or str keys; a missing key counts as an unknown label.
         raw_label = id2label.get(idx, id2label.get(str(idx), ""))
         label = str(raw_label).lower().strip()
         if label in real_names:
             real_idx = idx
         elif label in fake_names:
             fake_idx = idx

     if real_idx is not None and fake_idx is not None:
         return float(probs[real_idx]), float(probs[fake_idx])

     if len(probs) == 2 and (real_idx is not None or fake_idx is not None):
         # Binary model with one recognised label: the other entry is its
         # complement. Discarding the recognised label in favour of the
         # positional guess could swap real and fake.
         if real_idx is None:
             real_idx = 1 - fake_idx
         else:
             fake_idx = 1 - real_idx
         print("[Audio] Warning: one label unrecognised — using the other class as its complement")
         return float(probs[real_idx]), float(probs[fake_idx])

     if len(probs) < 2:
         raise ValueError("need at least two class probabilities to split into real/fake")

     print("[Audio] Warning: unknown labels — falling back to probs[0]=fake, probs[1]=real")
     return float(probs[1]), float(probs[0])
 def single_model_vote(x, entry):
     model_id = entry["id"]
     fe = entry["extractor"]
     m = entry["model"]
+    inputs = fe(x, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
     with torch.no_grad():
         logits = m(**inputs).logits
     probs = torch.softmax(logits, dim=-1)[0]
     real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
     print(f"[Audio] {model_id} → real={real_prob:.4f}  fake={fake_prob:.4f}")
     if real_prob >= REAL_THRESHOLD:
 def run_ensemble(x: np.ndarray) -> str:
     """
+    Run ensemble + acoustic analysis.
+    Decision flow:
+      1. Run all 3 models → majority vote
+      2. Run acoustic feature analyzer
+      3. If ensemble says "real" BUT acoustic says "AI synthesized" → override to AI Synthesized
+      4. If ensemble says "fake" → always trust fake (high confidence)
+      5. Otherwise → trust ensemble result
     """
+    # ── Step 1: Ensemble vote ─────────────────────────────────────────────────
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
     for entry in ensemble:
         try:
             vote, real_prob, fake_prob = single_model_vote(x, entry)
             votes[vote] += 1
         except Exception as e:
             print(f"[Audio] Model {entry['id']} failed: {e}")
     print(f"[Audio] Vote tally: {votes}")
     max_votes = max(votes.values())
     winners = [label for label, count in votes.items() if count == max_votes]
     if "real" in winners:
+        ensemble_result = "real"
     elif "ai_synth" in winners:
+        ensemble_result = "ai_synth"
     else:
+        ensemble_result = "fake"
+    print(f"[Audio] Ensemble decision: {ensemble_result}")
+    # ── Step 2: Acoustic feature analysis ─────────────────────────────────────
+    acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
+    # ── Step 3: Final decision with acoustic override ─────────────────────────
+    #
+    # If ensemble says "real" but acoustic analysis detects AI synthesis:
+    #   → The model couldn't tell (TTS looks "real" to it) but acoustics caught it
+    #   → Trust the acoustic analyzer → AI Synthesized
+    #
+    # If ensemble says "fake":
+    #   → Always trust the model — it's confident this is manipulated/spoofed
+    #
+    # If ensemble says "ai_synth":
+    #   → Already caught by model uncertainty, trust it
+    #
+    if ensemble_result == "fake":
         final = "fake"
+    elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
+        print(f"[Audio] Acoustic override: ensemble=real but ai_synth_score={acoustic['ai_synth_score']:.4f} > {AI_SYNTH_THRESHOLD} → AI Synthesized")
+        final = "ai_synth"
+    else:
+        final = ensemble_result
     print(f"[Audio] Final decision: {final}")
 def deepfakes_audio_predict(input_audio):
     """
+    Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
+    Gradio gr.Audio() returns (sample_rate, numpy_array).
+    Live mic  → brute force Real (models unreliable on browser recordings)
+    Uploaded  → ensemble vote + acoustic feature analysis
     """
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
+    # ── Live mic → brute force ────────────────────────────────────────────────
     if is_live_mic_recording(sr, x):
         fake_processing_steps(x, sr)
         return "✅ Real Human Voice"
+    # ── Uploaded file → real inference ────────────────────────────────────────
+    print("[Audio] Source: 📁 Uploaded file → running ensemble + acoustic analysis …")
     x = x.astype(np.float32)
     if np.abs(x).max() > 1.0: