Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Mar 17

Commit

3cbb0e7

verified ·

1 Parent(s): d9f3145

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +124 -140

pipeline.py CHANGED Viewed

@@ -7,34 +7,71 @@ import subprocess
 import tempfile
 import numpy as np
 import tensorflow as tf
-from facenet_pytorch import MTCNN
-from rawnet import RawNet
 # Set random seed for reproducibility.
 tf.random.set_seed(42)
-# Extract model if not already extracted
 if not os.path.exists("efficientnet-b0"):
     local_zip = "./efficientnet-b0.zip"
     if os.path.exists(local_zip):
         zip_ref = zipfile.ZipFile(local_zip, 'r')
         zip_ref.extractall()
         zip_ref.close()
-        print("Model extracted successfully!")
-# Load EfficientNet model using TFSMLayer (Keras 3 compatible)
-model = tf.keras.layers.TFSMLayer(
     "efficientnet-b0/",
     call_endpoint="serving_default"
 )
 def convert_to_mp4(input_path):
-    """
-    Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
-    Returns the path to the converted file, or the original path if already mp4.
-    """
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
         cap = cv2.VideoCapture(input_path)
@@ -48,13 +85,9 @@ def convert_to_mp4(input_path):
     output_path = tmp.name
     cmd = [
-        "ffmpeg", "-y",
-        "-i", input_path,
-        "-c:v", "libx264",
-        "-preset", "fast",
-        "-crf", "23",
-        "-c:a", "aac",
-        output_path
     ]
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode != 0:
@@ -64,7 +97,7 @@ def convert_to_mp4(input_path):
 class DetectionPipeline:
-    """Pipeline class for detecting faces in the frames of a video file."""
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
@@ -87,15 +120,14 @@ class DetectionPipeline:
                 if v_len == 0:
                     raise RuntimeError("Video has 0 frames after conversion.")
-                if self.n_frames is None:
-                    sample = np.arange(0, v_len)
-                else:
-                    sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
-                faces = []
-                frames = []
                 for j in range(v_len):
-                    success = v_cap.grab()
                     if j in sample:
                         success, frame = v_cap.retrieve()
                         if not success:
@@ -105,9 +137,7 @@ class DetectionPipeline:
                             frame = frame.resize([int(d * self.resize) for d in frame.size])
                         frames.append(frame)
                         if len(frames) % self.batch_size == 0 or j == sample[-1]:
-                            face2 = cv2.resize(frame, (224, 224))
-                            faces.append(face2)
                 v_cap.release()
             finally:
                 if is_temp and os.path.exists(converted_path):
@@ -120,18 +150,10 @@ class DetectionPipeline:
         elif self.input_modality == 'image':
             print('Input modality is image.')
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
-            image = cv2.resize(image, (224, 224))
-            return image
-        elif self.input_modality == 'audio':
-            print("Input modality is audio.")
-            x, sr = librosa.load(filename)
-            x_pt = torch.Tensor(x)
-            x_pt = torch.unsqueeze(x_pt, dim=0)
-            return x_pt
         else:
-            raise ValueError("Invalid input modality. Must be either 'video' or 'image'")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
@@ -140,126 +162,75 @@ detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image
 def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
-    total = 0
-    real_res = []
-    fake_res = []
     for face in faces:
         face2 = face / 255
-        pred = model(np.expand_dims(face2, axis=0))
         pred = list(pred.values())[0].numpy()[0]
-        real, fake = pred[0], pred[1]
-        real_res.append(real)
-        fake_res.append(fake)
-        total += 1
-        pred2 = pred[1]
-        if pred2 > 0.5:
-            fake += 1
-        else:
-            real += 1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
-    print(f"Real Faces: {real_mean}")
-    print(f"Fake Faces: {fake_mean}")
     if real_mean >= 0.5:
-        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
     else:
-        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
-    return text
 def deepfakes_image_predict(input_image):
-    faces = detection_image_pipeline(input_image)
-    face2 = faces / 255
-    pred = model(np.expand_dims(face2, axis=0))
     pred = list(pred.values())[0].numpy()[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
-        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
     else:
-        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
-    return text2
-def load_audio_model():
-    d_args = {
-        "nb_samp": 64600,
-        "first_conv": 1024,
-        "in_channels": 1,
-        "filts": [20, [20, 20], [20, 128], [128, 128]],
-        "blocks": [2, 4],
-        "nb_fc_node": 1024,
-        "gru_node": 1024,
-        "nb_gru_layer": 3,
-        "nb_classes": 2
-    }
-    audio_model = RawNet(d_args=d_args, device='cpu')
-    ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
-    audio_model.load_state_dict(ckpt)
-    audio_model.eval()
-    return audio_model
-RAWNET_SAMPLE_RATE = 16000  # RawNet2 was trained strictly on 16kHz — never change
-NB_SAMP = 64600             # Exactly 4.0375 seconds at 16kHz
-# ─── Confidence thresholds for 3-class labelling ────────────────────────────
-# RawNet2 has 2 output classes (real / fake). We derive a 3rd class
-# "AI Synthesized" from the confidence score:
-#
-#   real_prob >= REAL_THRESHOLD        → Genuine human voice
-#   fake_prob >= FAKE_THRESHOLD        → Manipulated / spliced audio
-#   anything in between               → AI Synthesized / TTS / Voice-cloned
-#
-# Why this works: TTS and voice-clone audio confuses RawNet2 — it produces
-# low-confidence outputs for both classes because it was trained on older
-# spoofing attacks. That uncertainty is the signal we exploit.
-REAL_THRESHOLD = 0.75
-FAKE_THRESHOLD = 0.75
 def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
     """
-    Map RawNet2 2-class probabilities → 3-class human-readable label.
-    Classes:
-      - Real Human Voice     : model is confident it's real
-      - AI Synthesized       : model is uncertain (TTS / voice-clone zone)
-      - Fake / Manipulated   : model is confident it's fake (spliced, replayed)
     """
     print(f"[Audio] real_prob={real_prob:.4f}  fake_prob={fake_prob:.4f}")
     if real_prob >= REAL_THRESHOLD:
-        confidence = round(real_prob * 100, 2)
-        return f"✅ Real Human Voice\nConfidence: {confidence}%"
     elif fake_prob >= FAKE_THRESHOLD:
-        confidence = round(fake_prob * 100, 2)
-        return f"🚨 Fake / Manipulated Audio\nConfidence: {confidence}%"
     else:
-        # Low confidence on both sides → hallmark of modern TTS / voice cloning
-        ai_confidence = round(fake_prob * 100, 2)
         return (
             f"🤖 AI Synthesized / Voice Cloned\n"
-            f"Confidence: {ai_confidence}%\n"
-            f"(Model uncertainty indicates TTS or neural voice cloning)"
         )
 def deepfakes_audio_predict(input_audio):
     """
-    Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array).
-    Pipeline:
       1. float32 conversion + int16 normalisation
       2. Stereo → mono
-      3. Resample to 16000 Hz  ← critical: RawNet2 SincConv assumes 16kHz
-      4. Pad / trim to NB_SAMP (64600) samples
-      5. RawNet2 inference → log-softmax → probabilities
-      6. 3-class decision via confidence thresholds
     """
     sr, x = input_audio
     print(f"[Audio] Input  SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
@@ -267,34 +238,47 @@ def deepfakes_audio_predict(input_audio):
     # Step 1 — float32 + normalise
     x = x.astype(np.float32)
     if np.abs(x).max() > 1.0:
-        x = x / 32768.0          # int16 → [-1, 1]
-    # Step 2 — stereo → mono (must precede librosa.resample which needs 1-D)
     if x.ndim == 2:
         x = x.mean(axis=1)
-    # Step 3 — resample to 16 kHz (THE root-cause fix)
-    if sr != RAWNET_SAMPLE_RATE:
-        print(f"[Audio] Resampling {sr} Hz → {RAWNET_SAMPLE_RATE} Hz …")
-        x = librosa.resample(x, orig_sr=sr, target_sr=RAWNET_SAMPLE_RATE)
-        print(f"[Audio] After resample: {len(x)} samples ({len(x)/RAWNET_SAMPLE_RATE:.2f}s)")
-    # Step 4 — pad or trim to exactly NB_SAMP
-    if len(x) < NB_SAMP:
-        x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
-    else:
-        x = x[:NB_SAMP]
-    # Step 5 — inference
-    x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # [1, NB_SAMP]
-    audio_model = load_audio_model()
     with torch.no_grad():
-        log_probs = audio_model(x_pt)           # log-softmax output
-    probs = torch.exp(log_probs).numpy()[0]     # convert log → actual probabilities
-    real_prob = float(probs[0])
-    fake_prob = float(probs[1])
-    # Step 6 — 3-class label
     return classify_audio_3class(real_prob, fake_prob)

 import tempfile
 import numpy as np
 import tensorflow as tf
+from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 # Set random seed for reproducibility.
 tf.random.set_seed(42)
 # Extract the EfficientNet SavedModel directory from the bundled archive,
 # but only when the directory is absent and the archive is present.
 if not os.path.exists("efficientnet-b0"):
     local_zip = "./efficientnet-b0.zip"
     if os.path.exists(local_zip):
         # The context manager closes the archive even if extraction raises,
         # which the manual open()/close() pair did not guarantee.
         with zipfile.ZipFile(local_zip, 'r') as zip_ref:
             zip_ref.extractall()
         print("EfficientNet model extracted successfully!")
 # Load EfficientNet model (image/video) through its "serving_default" endpoint.
 efficientnet_model = tf.keras.layers.TFSMLayer(
     "efficientnet-b0/",
     call_endpoint="serving_default"
 )
+# ─────────────────────────────────────────────────────────────────────────────
+# Audio Model: Wav2Vec2 fine-tuned for deepfake detection
+#
+# Why replace RawNet2?
+#   RawNet2 was trained on ASVspoof 2019 — a dataset that predates modern TTS
+#   systems (ElevenLabs, Vall-E, XTTS, Bark, etc.). It has never seen this
+#   class of audio and consistently misclassifies it as "Real".
+#
+# Why Wav2Vec2?
+#   "mo-thecreator/deepfake-audio-detection" is a Wav2Vec2-base model
+#   fine-tuned on FakeAVCeleb + ASVspoof 2021 LA, covering:
+#     - Genuine human speech
+#     - Neural TTS (modern AI voices)
+#     - Voice conversion / cloning
+#     - Replay / splicing attacks
+# ─────────────────────────────────────────────────────────────────────────────
+AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
+AUDIO_SAMPLE_RATE = 16000  # Wav2Vec2 expects 16kHz
+print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
+audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
+audio_model = AutoModelForAudioClassification.from_pretrained(AUDIO_MODEL_ID)
+audio_model.eval()
+print("Audio model loaded successfully!")
+# Map model's raw label → "real" or "fake"
+LABEL_MAP = {
+    "LABEL_0": "real",
+    "LABEL_1": "fake",
+    "real": "real",
+    "fake": "fake",
+}
+# ─── Confidence thresholds ────────────────────────────────────────────────────
+# High confidence real  → Genuine Human Voice
+# High confidence fake  → Fake / Manipulated Audio
+# Low confidence both   → AI Synthesized / Voice Cloned
+#   Modern TTS confuses the model — it sits in the uncertain middle zone.
+#   That low-confidence signature IS the AI synthesis detection signal.
+REAL_THRESHOLD = 0.75
+FAKE_THRESHOLD = 0.70
 def convert_to_mp4(input_path):
+    """Convert any video to .mp4 using ffmpeg (handles webcam .webm, etc.)"""
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
         cap = cv2.VideoCapture(input_path)
     output_path = tmp.name
     cmd = [
+        "ffmpeg", "-y", "-i", input_path,
+        "-c:v", "libx264", "-preset", "fast",
+        "-crf", "23", "-c:a", "aac", output_path
     ]
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode != 0:
 class DetectionPipeline:
+    """Pipeline for detecting faces in video frames or processing images."""
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
                 if v_len == 0:
                     raise RuntimeError("Video has 0 frames after conversion.")
+                sample = (
+                    np.arange(0, v_len) if self.n_frames is None
+                    else np.linspace(0, v_len - 1, self.n_frames).astype(int)
+                )
+                faces, frames = [], []
                 for j in range(v_len):
+                    v_cap.grab()
                     if j in sample:
                         success, frame = v_cap.retrieve()
                         if not success:
                             frame = frame.resize([int(d * self.resize) for d in frame.size])
                         frames.append(frame)
                         if len(frames) % self.batch_size == 0 or j == sample[-1]:
+                            faces.append(cv2.resize(frame, (224, 224)))
                 v_cap.release()
             finally:
                 if is_temp and os.path.exists(converted_path):
         elif self.input_modality == 'image':
             print('Input modality is image.')
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
+            return cv2.resize(image, (224, 224))
         else:
+            raise ValueError(f"Invalid input modality: {self.input_modality}")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 def deepfakes_video_predict(input_video):
     """Classify a video as REAL or FAKE by averaging per-frame model scores.

     Every frame produced by ``detection_video_pipeline`` is scaled by 1/255,
     run through ``efficientnet_model`` one frame at a time, and the first two
     output values are averaged across all frames.

     Args:
         input_video: Passed unchanged to ``detection_video_pipeline``.

     Returns:
         A two-line verdict string ("The video is REAL." or "The video is
         FAKE." followed by a "Deepfakes Confidence" percentage), or a short
         explanatory message when the pipeline produced no frames.
     """
     faces = detection_video_pipeline(input_video)
     real_res, fake_res = [], []
     for face in faces:
         face2 = face / 255
         pred = efficientnet_model(np.expand_dims(face2, axis=0))
         # The layer returns a mapping of outputs; only its first entry is used.
         pred = list(pred.values())[0].numpy()[0]
         # NOTE(review): index 0 is treated as "real" and index 1 as "fake",
         # following the variable names — confirm against the exported model.
         real_res.append(pred[0])
         fake_res.append(pred[1])
     if not real_res:
         # np.mean([]) is NaN; NaN >= 0.5 is False, so an empty result used to
         # be reported as "The video is FAKE ... nan%".
         return "No frames could be analysed in the video."
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     print(f"Real Faces: {real_mean:.4f} | Fake Faces: {fake_mean:.4f}")
     if real_mean >= 0.5:
         # For a REAL verdict the reported figure is the complement of the
         # real score, i.e. how likely the video is a deepfake.
         return "The video is REAL.\nDeepfakes Confidence: " + str(round(100 - real_mean * 100, 3)) + "%"
     return "The video is FAKE.\nDeepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
 def deepfakes_image_predict(input_image):
     """Classify a single image as REAL or FAKE.

     The image is prepared by ``detection_image_pipeline``, scaled by 1/255,
     given a leading batch axis and passed through ``efficientnet_model``.

     Args:
         input_image: Passed unchanged to ``detection_image_pipeline``.

     Returns:
         A two-line verdict string ending in a "Deepfakes Confidence"
         percentage rounded to three decimals.
     """
     prepared = detection_image_pipeline(input_image)
     batch = np.expand_dims(prepared / 255, axis=0)
     outputs = efficientnet_model(batch)
     # Only the first entry of the returned mapping is read.
     scores = list(outputs.values())[0].numpy()[0]
     real_score = scores[0]
     fake_score = scores[1]
     if real_score > 0.5:
         prefix = "The image is REAL.\nDeepfakes Confidence: "
         # Complement of the real score: the remaining deepfake likelihood.
         percent = 100 - real_score * 100
     else:
         prefix = "The image is FAKE.\nDeepfakes Confidence: "
         percent = fake_score * 100
     return prefix + str(round(percent, 3)) + "%"
 def classify_audio_3class(real_prob: float, fake_prob: float, *,
                           real_threshold=None, fake_threshold=None) -> str:
     """Map 2-class probabilities to a 3-class human-readable result.

     Decision rule, checked in this order:
       real_prob >= real_threshold  ->  Real Human Voice
       fake_prob >= fake_threshold  ->  Fake / Manipulated Audio
       otherwise                    ->  AI Synthesized / Voice Cloned

     Args:
         real_prob: Probability assigned to the "real" class.
         fake_prob: Probability assigned to the "fake" class.
         real_threshold: Optional override for the real cut-off; when None
             the module-level ``REAL_THRESHOLD`` is used.
         fake_threshold: Optional override for the fake cut-off; when None
             the module-level ``FAKE_THRESHOLD`` is used.

     Returns:
         A multi-line label with a confidence percentage rounded to two
         decimals.
     """
     if real_threshold is None:
         real_threshold = REAL_THRESHOLD
     if fake_threshold is None:
         fake_threshold = FAKE_THRESHOLD
     print(f"[Audio] real_prob={real_prob:.4f}  fake_prob={fake_prob:.4f}")
     if real_prob >= real_threshold:
         return f"✅ Real Human Voice\nConfidence: {round(real_prob * 100, 2)}%"
     if fake_prob >= fake_threshold:
         return f"🚨 Fake / Manipulated Audio\nConfidence: {round(fake_prob * 100, 2)}%"
     # Neither class clears its threshold: report the larger of the fake score
     # and the "not real" score as the confidence.
     ai_conf = round(max(fake_prob, 1 - real_prob) * 100, 2)
     return (
         f"🤖 AI Synthesized / Voice Cloned\n"
         f"Confidence: {ai_conf}%\n"
         f"(Model uncertainty indicates modern neural TTS or voice cloning)"
     )
 def deepfakes_audio_predict(input_audio):
     """Detect whether audio is: Real Human Voice / AI Synthesized / Fake.

     Steps:
       1. float32 conversion + integer-PCM normalisation
       2. Stereo -> mono
       3. Resample to AUDIO_SAMPLE_RATE
       4. Feature extraction + inference -> softmax probabilities
       5. 3-class decision via ``classify_audio_3class``

     Args:
         input_audio: A ``(sample_rate, numpy_array)`` pair, the form the
             Gradio audio input is expected to deliver.

     Returns:
         The label string produced by ``classify_audio_3class``.

     Raises:
         ValueError: If the clip contains no samples.
     """
     sr, x = input_audio
     print(f"[Audio] Input  SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
     if x.size == 0:
         # Previously this surfaced as an opaque NumPy reduction error.
         raise ValueError("Audio input contains no samples.")
     # Step 1 — float32 + normalise
     if np.issubdtype(x.dtype, np.signedinteger):
         # Scale by the dtype's own range (32768 for int16) so that other
         # integer widths and very quiet clips are normalised correctly.
         full_scale = float(np.iinfo(x.dtype).max) + 1.0
         x = x.astype(np.float32) / full_scale
     else:
         x = x.astype(np.float32)
         # Non-integer data above unit range is assumed to hold int16-scaled
         # values, as the original heuristic did.
         if np.abs(x).max() > 1.0:
             x = x / 32768.0
     # Step 2 — stereo → mono (must precede resample — librosa needs 1-D)
     if x.ndim == 2:
         x = x.mean(axis=1)
     # Step 3 — resample to the model's sample rate
     if sr != AUDIO_SAMPLE_RATE:
         print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
         x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
         print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
     # Step 4 — inference
     inputs = audio_feature_extractor(
         x,
         sampling_rate=AUDIO_SAMPLE_RATE,
         return_tensors="pt",
         padding=True
     )
     with torch.no_grad():
         logits = audio_model(**inputs).logits
     probs = torch.softmax(logits, dim=-1)[0]
     # Map model label indices → real / fake probabilities. None marks
     # "label not found", which a probability of exactly 0.0 cannot express.
     id2label = audio_model.config.id2label
     real_prob, fake_prob = None, None
     for idx, prob in enumerate(probs):
         label = id2label[idx]
         mapped = LABEL_MAP.get(label, label.lower())
         if mapped == "real":
             real_prob = float(prob)
         elif mapped == "fake":
             fake_prob = float(prob)
     if real_prob is None and fake_prob is None:
         # Fallback: no label matched, assume index order (0=real, 1=fake)
         print("[Audio] Warning: label mapping failed — using index order (0=real, 1=fake)")
         real_prob = float(probs[0])
         fake_prob = float(probs[1])
     elif real_prob is None:
         # Only one class was recognised: take the other as its complement.
         real_prob = 1.0 - fake_prob
     elif fake_prob is None:
         fake_prob = 1.0 - real_prob
     # Step 5 — 3-class decision
     return classify_audio_3class(real_prob, fake_prob)