Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Mar 18

Commit

be37324

verified ·

1 Parent(s): 0caada5

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +30 -41

pipeline.py CHANGED Viewed

@@ -29,22 +29,11 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Model: Wav2Vec2 fine-tuned for deepfake detection
-#
-# Why replace RawNet2?
-#   RawNet2 was trained on ASVspoof 2019 — a dataset that predates modern TTS
-#   systems (ElevenLabs, Vall-E, XTTS, Bark, etc.). It has never seen this
-#   class of audio and consistently misclassifies it as "Real".
-#
-# Why Wav2Vec2?
-#   "mo-thecreator/deepfake-audio-detection" is a Wav2Vec2-base model
-#   fine-tuned on FakeAVCeleb + ASVspoof 2021 LA, covering:
-#     - Genuine human speech
-#     - Neural TTS (modern AI voices)
-#     - Voice conversion / cloning
-#     - Replay / splicing attacks
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
-AUDIO_SAMPLE_RATE = 16000  # Wav2Vec2 expects 16kHz
 print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
 audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
@@ -61,13 +50,18 @@ LABEL_MAP = {
 }
 # ─── Confidence thresholds ────────────────────────────────────────────────────
-# High confidence real  → Genuine Human Voice
-# High confidence fake  → Fake / Manipulated Audio
-# Low confidence both   → AI Synthesized / Voice Cloned
-#   Modern TTS confuses the model — it sits in the uncertain middle zone.
-#   That low-confidence signature IS the AI synthesis detection signal.
-REAL_THRESHOLD = 0.75
-FAKE_THRESHOLD = 0.70
 def convert_to_mp4(input_path):
@@ -173,12 +167,12 @@ def deepfakes_video_predict(input_video):
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
-    print(f"Real Faces: {real_mean:.4f} | Fake Faces: {fake_mean:.4f}")
     if real_mean >= 0.5:
-        return "The video is REAL.\nDeepfakes Confidence: " + str(round(100 - real_mean * 100, 3)) + "%"
     else:
-        return "The video is FAKE.\nDeepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
 def deepfakes_image_predict(input_image):
@@ -187,36 +181,31 @@ def deepfakes_image_predict(input_image):
     pred = efficientnet_model(np.expand_dims(face2, axis=0))
     pred = list(pred.values())[0].numpy()[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
-        return "The image is REAL.\nDeepfakes Confidence: " + str(round(100 - real * 100, 3)) + "%"
     else:
-        return "The image is FAKE.\nDeepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
 def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
     """
-    Map 2-class probabilities → 3-class human-readable result.
-      real_prob >= REAL_THRESHOLD  →  Genuine Human Voice
-      fake_prob >= FAKE_THRESHOLD  →  Fake / Manipulated Audio
-      both below threshold         →  AI Synthesized / Voice Cloned
     """
     print(f"[Audio] real_prob={real_prob:.4f}  fake_prob={fake_prob:.4f}")
     if real_prob >= REAL_THRESHOLD:
-        return f"✅ Real Human Voice\nConfidence: {round(real_prob * 100, 2)}%"
     elif fake_prob >= FAKE_THRESHOLD:
-        return f"🚨 Fake / Manipulated Audio\nConfidence: {round(fake_prob * 100, 2)}%"
     else:
-        # Neither class wins confidently → hallmark of modern TTS / voice cloning
-        ai_conf = round(max(fake_prob, 1 - real_prob) * 100, 2)
-        return (
-            f"🤖 AI Synthesized / Voice Cloned\n"
-            f"Confidence: {ai_conf}%\n"
-            f"(Model uncertainty indicates modern neural TTS or voice cloning)"
-        )
 def deepfakes_audio_predict(input_audio):

 # ─────────────────────────────────────────────────────────────────────────────
 # Audio Model: Wav2Vec2 fine-tuned for deepfake detection
+# "mo-thecreator/deepfake-audio-detection"
+# Fine-tuned on FakeAVCeleb + ASVspoof 2021 LA
 # ─────────────────────────────────────────────────────────────────────────────
 AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
+AUDIO_SAMPLE_RATE = 16000
 print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
 audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
 }
 # ─── Confidence thresholds ────────────────────────────────────────────────────
+# REAL_THRESHOLD = 0.55 (loose)
+#   Lowered so genuine human voices are not incorrectly rejected.
+#   The model only needs to be 55% confident to call it real.
+#
+# FAKE_THRESHOLD = 0.90 (strict)
+#   Raised so real voices are much less likely to be falsely flagged as fake.
+#   The model must be 90% confident before labelling audio as manipulated.
+#
+# Neither threshold met (real_prob < 0.55 and fake_prob < 0.90) → AI Synthesized / Voice Cloned
+# ─────────────────────────────────────────────────────────────────────────────
+REAL_THRESHOLD = 0.55
+FAKE_THRESHOLD = 0.90
 def convert_to_mp4(input_path):
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
+    print(f"[Video] Real={real_mean:.4f} | Fake={fake_mean:.4f}")
     if real_mean >= 0.5:
+        return "✅ The video is REAL."
     else:
+        return "🚨 The video is FAKE."
 def deepfakes_image_predict(input_image):
     pred = efficientnet_model(np.expand_dims(face2, axis=0))
     pred = list(pred.values())[0].numpy()[0]
     real, fake = pred[0], pred[1]
+    print(f"[Image] Real={real:.4f} | Fake={fake:.4f}")
     if real > 0.5:
+        return "✅ The image is REAL."
     else:
+        return "🚨 The image is FAKE."
 def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
     """
+    Map 2-class probabilities → 3-class result.
+    Threshold logic:
+      real_prob >= 0.55  →  Real Human Voice       (loose — avoids false negatives)
+      fake_prob >= 0.90  →  Fake / Manipulated      (strict — avoids false positives)
+      otherwise          →  AI Synthesized / Cloned (neither threshold met)
     """
     print(f"[Audio] real_prob={real_prob:.4f}  fake_prob={fake_prob:.4f}")
     if real_prob >= REAL_THRESHOLD:
+        return "✅ Real Human Voice"
     elif fake_prob >= FAKE_THRESHOLD:
+        return "🚨 Fake / Manipulated Audio"
     else:
+        return "🤖 AI Synthesized / Voice Cloned"
 def deepfakes_audio_predict(input_audio):