Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Apr 20

Commit

7f2d008

verified ·

1 Parent(s): 4d97924

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +106 -93

pipeline.py CHANGED Viewed

@@ -65,103 +65,72 @@ print(f"Ensemble ready with {len(ensemble)} models.")
 # ─────────────────────────────────────────────────────────────────────────────
-# ACOUSTIC FEATURE ANALYZER
-#
-# Why do we need this?
-#   All Wav2Vec2 models are binary (real/fake) — they cannot distinguish
-#   AI synthesized audio from real because TTS doesn't match their "fake"
-#   training patterns (replay attacks, splicing). They score TTS as "real".
-#
-# How does it work?
-#   Real human voices have natural imperfections:
-#     - Energy fluctuates (breathing, stress, pauses)
-#     - Pitch varies naturally (prosody, emotion)
-#     - Background noise / room acoustics present
-#     - Zero crossing rate is irregular
-#
-#   AI synthesized voices are "too perfect":
-#     - Energy is unnaturally consistent (flat amplitude envelope)
-#     - Pitch follows mathematical patterns, low variance
-#     - Very high SNR — almost no background noise
-#     - Spectral flatness is high (energy distributed evenly)
-#
-# Decision:
-#   acoustic_score = weighted combination of 4 features
-#   score > AI_SYNTH_THRESHOLD → flag as AI Synthesized
-#   This overrides a "real" vote from the model ensemble
 # ─────────────────────────────────────────────────────────────────────────────
-# Tune these thresholds based on testing:
-# Higher = less sensitive (more audio passes as Real)
-# Lower  = more sensitive (more audio flagged as AI Synthesized)
-AI_SYNTH_THRESHOLD = 0.60  # overall acoustic score above this → AI Synthesized
-def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
-    """
-    Analyze audio for signs of AI synthesis by measuring naturalness.
-    Returns a dict with individual feature scores (0=natural, 1=synthetic)
-    and an overall ai_synth_score.
-    """
-    # ── Feature 1: Energy variance ────────────────────────────────────────────
-    # Real voices: high energy variance (loud/quiet moments, breaths)
-    # AI voices: low energy variance (flat, consistent loudness)
     frame_length = 1024
     hop_length = 256
     rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
     rms_variance = np.var(rms)
     rms_mean = np.mean(rms) + 1e-8
-    # Normalize by mean energy — low coefficient of variation = synthetic
-    rms_cv = np.sqrt(rms_variance) / rms_mean  # coefficient of variation
-    # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
     energy_synth_score = max(0.0, min(1.0, 1.0 - (rms_cv / 0.5)))
     print(f"[Acoustic] Energy CoV={rms_cv:.4f} → synth_score={energy_synth_score:.4f}")
-    # ── Feature 2: Spectral flatness ─────────────────────────────────────────
-    # Real voices: low spectral flatness (energy concentrated in harmonics)
-    # AI voices: higher spectral flatness (more evenly distributed energy)
     spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
     mean_flatness = np.mean(spec_flatness)
-    # Typical real voice: < 0.05 | AI voice: > 0.08
     flatness_synth_score = max(0.0, min(1.0, mean_flatness / 0.1))
     print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} → synth_score={flatness_synth_score:.4f}")
-    # ── Feature 3: Pitch variance ─────────────────────────────────────────────
-    # Real voices: pitch varies naturally with speech rhythm
-    # AI voices: pitch follows smooth mathematical curves, lower variance
     try:
         f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
         voiced = f0[f0 > 0]
         if len(voiced) > 10:
             pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
-            # Typical real voice: std/mean > 0.15 | AI voice: < 0.08
             pitch_synth_score = max(0.0, min(1.0, 1.0 - (pitch_variance / 0.15)))
         else:
-            pitch_synth_score = 0.5  # not enough voiced frames to judge
     except Exception:
         pitch_synth_score = 0.5
     print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")
-    # ── Feature 4: Zero Crossing Rate variance ────────────────────────────────
-    # Real voices: ZCR fluctuates with consonants/vowels/pauses
-    # AI voices: ZCR is more regular
     zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
     zcr_variance = np.var(zcr)
     zcr_mean = np.mean(zcr) + 1e-8
     zcr_cv = np.sqrt(zcr_variance) / zcr_mean
-    # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
     zcr_synth_score = max(0.0, min(1.0, 1.0 - (zcr_cv / 0.5)))
     print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} → synth_score={zcr_synth_score:.4f}")
-    # ── Weighted overall score ────────────────────────────────────────────────
-    # Energy and pitch variance are most reliable indicators — weight them more
     ai_synth_score = (
-        energy_synth_score  * 0.35 +
         flatness_synth_score * 0.20 +
-        pitch_synth_score   * 0.30 +
-        zcr_synth_score     * 0.15
     )
     print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")
@@ -374,17 +343,6 @@ def single_model_vote(x, entry):
 def run_ensemble(x: np.ndarray) -> str:
-    """
-    Run ensemble + acoustic analysis.
-    Decision flow:
-      1. Run all 3 models → majority vote
-      2. Run acoustic feature analyzer
-      3. If ensemble says "real" BUT acoustic says "AI synthesized" → override to AI Synthesized
-      4. If ensemble says "fake" → always trust fake (high confidence)
-      5. Otherwise → trust ensemble result
-    """
-    # ── Step 1: Ensemble vote ─────────────────────────────────────────────────
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
     for entry in ensemble:
         try:
@@ -406,21 +364,8 @@ def run_ensemble(x: np.ndarray) -> str:
     print(f"[Audio] Ensemble decision: {ensemble_result}")
-    # ── Step 2: Acoustic feature analysis ────────────────────────────────────
     acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
-    # ── Step 3: Final decision with acoustic override ─────────────────────────
-    #
-    # If ensemble says "real" but acoustic analysis detects AI synthesis:
-    #   → The model couldn't tell (TTS looks "real" to it) but acoustics caught it
-    #   → Trust the acoustic analyzer → AI Synthesized
-    #
-    # If ensemble says "fake":
-    #   → Always trust the model — it's confident this is manipulated/spoofed
-    #
-    # If ensemble says "ai_synth":
-    #   → Already caught by model uncertainty, trust it
-    #
     if ensemble_result == "fake":
         final = "fake"
     elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
@@ -440,22 +385,13 @@ def run_ensemble(x: np.ndarray) -> str:
 def deepfakes_audio_predict(input_audio):
-    """
-    Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
-    Gradio gr.Audio() returns (sample_rate, numpy_array).
-    Live mic  → brute force Real (models unreliable on browser recordings)
-    Uploaded  → ensemble vote + acoustic feature analysis
-    """
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
-    # ── Live mic → brute force ────────────────────────────────────────────────
     if is_live_mic_recording(sr, x):
         fake_processing_steps(x, sr)
         return "✅ Real Human Voice"
-    # ── Uploaded file → real inference ────────────────────────────────────────
     print("[Audio] Source: 📁 Uploaded file → running ensemble + acoustic analysis …")
     x = x.astype(np.float32)
@@ -470,4 +406,81 @@ def deepfakes_audio_predict(input_audio):
         x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
         print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
-    return run_ensemble(x)

 # ─────────────────────────────────────────────────────────────────────────────
+# TEXT DETECTOR: HybridAITextDetector (DeBERTa + BiLSTM + CNN + Transformer)
 # ─────────────────────────────────────────────────────────────────────────────
+# Checkpoint file handed to TextDetectorInference when the detector is built.
+TEXT_CHECKPOINT = "best_text_detector.pt"
+# Decision threshold; update with the optimal F1 threshold from your training run.
+TEXT_THRESHOLD  = 0.5
+# Module-level cache for the detector instance; stays None until first use.
+_text_detector = None
+def _get_text_detector():
+    """
+    Return the shared text detector, constructing it on the first call.
+    Loading is deferred so that startup is not delayed when the text tab is
+    never used. Later calls return the cached instance unchanged.
+    """
+    global _text_detector
+    # Fast path: the detector was already built by an earlier call.
+    if _text_detector is not None:
+        return _text_detector
+    # Import inside the function so the module is only pulled in on demand.
+    from text_detector_inference import TextDetectorInference
+    print("[Text] Loading HybridAITextDetector ...")
+    _text_detector = TextDetectorInference(checkpoint=TEXT_CHECKPOINT, threshold=TEXT_THRESHOLD)
+    print("[Text] ✅ Text detector ready")
+    return _text_detector
+# ─────────────────────────────────────────────────────────────────────────────
+# ACOUSTIC FEATURE ANALYZER
+# ─────────────────────────────────────────────────────────────────────────────
+AI_SYNTH_THRESHOLD = 0.60
+def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
     frame_length = 1024
     hop_length = 256
     rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
     rms_variance = np.var(rms)
     rms_mean = np.mean(rms) + 1e-8
+    rms_cv = np.sqrt(rms_variance) / rms_mean
     energy_synth_score = max(0.0, min(1.0, 1.0 - (rms_cv / 0.5)))
     print(f"[Acoustic] Energy CoV={rms_cv:.4f} → synth_score={energy_synth_score:.4f}")
     spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
     mean_flatness = np.mean(spec_flatness)
     flatness_synth_score = max(0.0, min(1.0, mean_flatness / 0.1))
     print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} → synth_score={flatness_synth_score:.4f}")
     try:
         f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
         voiced = f0[f0 > 0]
         if len(voiced) > 10:
             pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
             pitch_synth_score = max(0.0, min(1.0, 1.0 - (pitch_variance / 0.15)))
         else:
+            pitch_synth_score = 0.5
     except Exception:
         pitch_synth_score = 0.5
     print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")
     zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
     zcr_variance = np.var(zcr)
     zcr_mean = np.mean(zcr) + 1e-8
     zcr_cv = np.sqrt(zcr_variance) / zcr_mean
     zcr_synth_score = max(0.0, min(1.0, 1.0 - (zcr_cv / 0.5)))
     print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} → synth_score={zcr_synth_score:.4f}")
     ai_synth_score = (
+        energy_synth_score   * 0.35 +
         flatness_synth_score * 0.20 +
+        pitch_synth_score    * 0.30 +
+        zcr_synth_score      * 0.15
     )
     print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")
 def run_ensemble(x: np.ndarray) -> str:
     votes = {"real": 0, "ai_synth": 0, "fake": 0}
     for entry in ensemble:
         try:
     print(f"[Audio] Ensemble decision: {ensemble_result}")
     acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
     if ensemble_result == "fake":
         final = "fake"
     elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
 def deepfakes_audio_predict(input_audio):
     sr, x = input_audio
     print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
     if is_live_mic_recording(sr, x):
         fake_processing_steps(x, sr)
         return "✅ Real Human Voice"
     print("[Audio] Source: 📁 Uploaded file → running ensemble + acoustic analysis …")
     x = x.astype(np.float32)
         x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
         print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
+    return run_ensemble(x)
+# ─────────────────────────────────────────────────────────────────────────────
+# TEXT DEEPFAKE DETECTION
+# Hybrid DeBERTa-v3-small + BiLSTM + CNN + Transformer
+# Returns a formatted multi-line report: verdict ("✅  Human-Written Text" /
+# "🤖  AI-Generated Text"), confidence bar, and both class probabilities
+# ─────────────────────────────────────────────────────────────────────────────
+def deepfakes_text_predict(input_text: str) -> str:
+    """
+    Detect whether the input text is human-written or AI-generated.
+    Parameters
+    ----------
+    input_text : str
+        The text to analyse (articles, essays, descriptions, etc.)
+    Returns
+    -------
+    str
+        A formatted result string for display in the Gradio textbox:
+        a warning for empty input or input shorter than 10 words, an error
+        message when the detector reports or raises a failure, otherwise the
+        verdict with a 20-character confidence bar and both probabilities.
+    """
+    if not input_text or not input_text.strip():
+        return "⚠️ Please enter some text to analyse."
+    text = input_text.strip()
+    word_count = len(text.split())
+    print(f"[Text] Input: {word_count} words")
+    # Reject very short inputs before loading or invoking the detector.
+    if word_count < 10:
+        return (
+            "⚠️ Input too short — please provide at least 10 words for a reliable result.\n"
+            f"   (You entered {word_count} word{'s' if word_count != 1 else ''})"
+        )
+    try:
+        detector = _get_text_detector()
+        result   = detector.predict(text)
+        # The detector signals its own failures through an "error" key.
+        if "error" in result:
+            return f"❌ Error: {result['error']}"
+        label      = result["label"]
+        ai_prob    = result["ai_prob"]
+        human_prob = result["human_prob"]
+        confidence = result["confidence"]
+        print(f"[Text] label={label} | ai_prob={ai_prob:.4f} | human_prob={human_prob:.4f}")
+        # ── Format output ─────────────────────────────────────────────────────
+        if label == "AI-Generated":
+            verdict_icon = "🤖"
+            verdict_text = "AI-Generated Text"
+        else:
+            verdict_icon = "✅"
+            verdict_text = "Human-Written Text"
+        # Confidence bar (ASCII, 20 chars). Clamp the filled count so a
+        # confidence outside [0, 1] cannot make the bar longer or shorter
+        # than its fixed width.
+        bar_width = 20
+        bar_filled = max(0, min(bar_width, round(confidence * bar_width)))
+        bar = "█" * bar_filled + "░" * (bar_width - bar_filled)
+        output = (
+            f"{verdict_icon}  {verdict_text}\n"
+            f"\n"
+            f"Confidence  [{bar}]  {confidence*100:.1f}%\n"
+            f"\n"
+            f"P(AI-Generated)  : {ai_prob*100:.1f}%\n"
+            f"P(Human-Written) : {human_prob*100:.1f}%\n"
+            f"\n"
+            f"Words analysed   : {word_count}\n"
+            f"(First 128 tokens used — ~100 words)"
+        )
+        return output
+    except Exception as e:
+        # UI boundary: report the failure as text instead of raising, so the
+        # caller always receives a displayable string.
+        print(f"[Text] ❌ Prediction failed: {e}")
+        # Name the configured checkpoint so the hint stays correct if
+        # TEXT_CHECKPOINT is changed.
+        return f"❌ Text detection failed: {e}\nMake sure {TEXT_CHECKPOINT} is present in the Space."