Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Community

pavankumarvk committed on Mar 17

Commit

d9f3145

verified ·

1 Parent(s): e07119e

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +90 -78

pipeline.py CHANGED Viewed

@@ -11,8 +11,7 @@ from facenet_pytorch import MTCNN
 from rawnet import RawNet
-#Set random seed for reproducibility.
 tf.random.set_seed(42)
 # Extract model if not already extracted
@@ -24,9 +23,7 @@ if not os.path.exists("efficientnet-b0"):
         zip_ref.close()
         print("Model extracted successfully!")
-# Load models.
-# Load model without compiling to avoid optimizer dependency issues
-# Load model using TFSMLayer (Keras 3 compatible)
 model = tf.keras.layers.TFSMLayer(
     "efficientnet-b0/",
     call_endpoint="serving_default"
@@ -37,18 +34,15 @@ def convert_to_mp4(input_path):
     """
     Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
     Returns the path to the converted file, or the original path if already mp4.
-    The caller is responsible for deleting the temp file when done.
     """
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
-        # Already mp4 — verify OpenCV can actually open it
         cap = cv2.VideoCapture(input_path)
         ok = cap.isOpened()
         cap.release()
         if ok:
-            return input_path, False  # (path, is_temp)
-    # Write to a named temp file so OpenCV can open it by path
     tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     tmp.close()
     output_path = tmp.name
@@ -65,16 +59,14 @@ def convert_to_mp4(input_path):
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode != 0:
         os.unlink(output_path)
-        raise RuntimeError(
-            f"ffmpeg conversion failed:\n{result.stderr.decode()}"
-        )
-    return output_path, True  # (path, is_temp)
 class DetectionPipeline:
     """Pipeline class for detecting faces in the frames of a video file."""
-    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
@@ -83,9 +75,6 @@ class DetectionPipeline:
     def __call__(self, filename):
         if self.input_modality == 'video':
             print('Input modality is video.')
-            # BUG FIX: Webcam recordings from Gradio arrive as .webm (VP8/VP9).
-            # OpenCV has no WebM support in headless builds — convert to .mp4 first.
             converted_path, is_temp = convert_to_mp4(filename)
             print(f"Processing video: {converted_path} (converted={is_temp})")
@@ -112,98 +101,88 @@ class DetectionPipeline:
                         if not success:
                             continue
                         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                         if self.resize is not None:
                             frame = frame.resize([int(d * self.resize) for d in frame.size])
                         frames.append(frame)
                         if len(frames) % self.batch_size == 0 or j == sample[-1]:
                             face2 = cv2.resize(frame, (224, 224))
                             faces.append(face2)
                 v_cap.release()
             finally:
-                # Clean up the temp converted file
                 if is_temp and os.path.exists(converted_path):
                     os.unlink(converted_path)
             if len(faces) == 0:
                 raise RuntimeError("No frames could be extracted from the video.")
             return faces
         elif self.input_modality == 'image':
             print('Input modality is image.')
-            print('Reading image')
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
             return image
         elif self.input_modality == 'audio':
             print("Input modality is audio.")
             x, sr = librosa.load(filename)
             x_pt = torch.Tensor(x)
             x_pt = torch.unsqueeze(x_pt, dim=0)
             return x_pt
         else:
             raise ValueError("Invalid input modality. Must be either 'video' or 'image'")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
-def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
     total = 0
     real_res = []
     fake_res = []
     for face in faces:
-        face2 = face/255
         pred = model(np.expand_dims(face2, axis=0))
         pred = list(pred.values())[0].numpy()[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
-        total+=1
         pred2 = pred[1]
         if pred2 > 0.5:
-          fake+=1
         else:
-          real+=1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     print(f"Real Faces: {real_mean}")
     print(f"Fake Faces: {fake_mean}")
-    text = ""
     if real_mean >= 0.5:
-        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
     else:
-        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
     return text
 def deepfakes_image_predict(input_image):
     faces = detection_image_pipeline(input_image)
-    face2 = faces/255
     pred = model(np.expand_dims(face2, axis=0))
     pred = list(pred.values())[0].numpy()[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
-        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real*100), 3)) + "%"
     else:
-        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake*100, 3)) + "%"
     return text2
 def load_audio_model():
     d_args = {
         "nb_samp": 64600,
@@ -216,73 +195,106 @@ def load_audio_model():
         "nb_gru_layer": 3,
         "nb_classes": 2
     }
     audio_model = RawNet(d_args=d_args, device='cpu')
     ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
     audio_model.load_state_dict(ckpt)
     audio_model.eval()
     return audio_model
-audio_label_map = {
-    0: "Real audio",
-    1: "Fake audio"
-}
-RAWNET_SAMPLE_RATE = 16000  # RawNet2 was trained on 16kHz audio — never change this
 NB_SAMP = 64600             # Exactly 4.0375 seconds at 16kHz
 def deepfakes_audio_predict(input_audio):
     """
     Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array).
-    Critical fixes applied:
-      1. Resample to RAWNET_SAMPLE_RATE (16000 Hz) — the model was trained at 16kHz.
-         Without this, a 44100Hz input has its first ~1.46s fed to a model expecting ~4s,
-         completely breaking the SincConv filterbank frequency assumptions.
-      2. Stereo → mono before resampling (librosa.resample requires 1D input).
-      3. Normalize AFTER resampling to avoid float64 precision issues from librosa.
     """
     sr, x = input_audio
-    print(f"[Audio] Input sample rate: {sr} Hz, samples: {len(x)}, dtype: {x.dtype}")
-    # Step 1: Convert to float32
     x = x.astype(np.float32)
-    # Step 2: Normalize int16 → [-1.0, 1.0] range
     if np.abs(x).max() > 1.0:
-        x = x / 32768.0
-    # Step 3: Stereo → mono (must be done before librosa.resample)
     if x.ndim == 2:
         x = x.mean(axis=1)
-    # Step 4: Resample to 16000 Hz — THIS WAS THE ROOT CAUSE BUG
-    # RawNet2's SincConv filterbank is hard-coded to 16kHz frequencies.
-    # Feeding audio at any other sample rate produces completely wrong filter responses.
     if sr != RAWNET_SAMPLE_RATE:
-        print(f"[Audio] Resampling from {sr} Hz → {RAWNET_SAMPLE_RATE} Hz")
         x = librosa.resample(x, orig_sr=sr, target_sr=RAWNET_SAMPLE_RATE)
         print(f"[Audio] After resample: {len(x)} samples ({len(x)/RAWNET_SAMPLE_RATE:.2f}s)")
-    # Step 5: Pad or trim to exactly NB_SAMP (64600) samples
     if len(x) < NB_SAMP:
         x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
     else:
         x = x[:NB_SAMP]
-    # Step 6: Build tensor [1, NB_SAMP] and run inference
-    x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
     audio_model = load_audio_model()
     with torch.no_grad():
-        logits = audio_model(x_pt)
-    logits_np = logits.detach().numpy()
-    result = np.argmax(logits_np)
-    print(f"[Audio] Logits: {logits_np}, Predicted class: {result} ({audio_label_map[int(result)]})")
-    return audio_label_map[int(result)]

 from rawnet import RawNet
+# Set random seed for reproducibility.
 tf.random.set_seed(42)
 # Extract model if not already extracted
         zip_ref.close()
         print("Model extracted successfully!")
+# Load EfficientNet model using TFSMLayer (Keras 3 compatible)
 model = tf.keras.layers.TFSMLayer(
     "efficientnet-b0/",
     call_endpoint="serving_default"
     """
     Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
     Returns the path to the converted file, or the original path if already mp4.
     """
     ext = os.path.splitext(input_path)[-1].lower()
     if ext == ".mp4":
         cap = cv2.VideoCapture(input_path)
         ok = cap.isOpened()
         cap.release()
         if ok:
+            return input_path, False
     tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     tmp.close()
     output_path = tmp.name
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode != 0:
         os.unlink(output_path)
+        raise RuntimeError(f"ffmpeg conversion failed:\n{result.stderr.decode()}")
+    return output_path, True
 class DetectionPipeline:
     """Pipeline class for detecting faces in the frames of a video file."""
+    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
     def __call__(self, filename):
         if self.input_modality == 'video':
             print('Input modality is video.')
             converted_path, is_temp = convert_to_mp4(filename)
             print(f"Processing video: {converted_path} (converted={is_temp})")
                         if not success:
                             continue
                         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                         if self.resize is not None:
                             frame = frame.resize([int(d * self.resize) for d in frame.size])
                         frames.append(frame)
                         if len(frames) % self.batch_size == 0 or j == sample[-1]:
                             face2 = cv2.resize(frame, (224, 224))
                             faces.append(face2)
                 v_cap.release()
             finally:
                 if is_temp and os.path.exists(converted_path):
                     os.unlink(converted_path)
             if len(faces) == 0:
                 raise RuntimeError("No frames could be extracted from the video.")
             return faces
         elif self.input_modality == 'image':
             print('Input modality is image.')
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
             return image
         elif self.input_modality == 'audio':
             print("Input modality is audio.")
             x, sr = librosa.load(filename)
             x_pt = torch.Tensor(x)
             x_pt = torch.unsqueeze(x_pt, dim=0)
             return x_pt
         else:
             raise ValueError("Invalid input modality. Must be either 'video' or 'image'")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
+def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
     total = 0
     real_res = []
     fake_res = []
     for face in faces:
+        face2 = face / 255
         pred = model(np.expand_dims(face2, axis=0))
         pred = list(pred.values())[0].numpy()[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
+        total += 1
         pred2 = pred[1]
         if pred2 > 0.5:
+            fake += 1
         else:
+            real += 1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     print(f"Real Faces: {real_mean}")
     print(f"Fake Faces: {fake_mean}")
     if real_mean >= 0.5:
+        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
     else:
+        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
     return text
 def deepfakes_image_predict(input_image):
     faces = detection_image_pipeline(input_image)
+    face2 = faces / 255
     pred = model(np.expand_dims(face2, axis=0))
     pred = list(pred.values())[0].numpy()[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
+        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
     else:
+        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
     return text2
 def load_audio_model():
     d_args = {
         "nb_samp": 64600,
         "nb_gru_layer": 3,
         "nb_classes": 2
     }
     audio_model = RawNet(d_args=d_args, device='cpu')
     ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
     audio_model.load_state_dict(ckpt)
     audio_model.eval()
     return audio_model
+RAWNET_SAMPLE_RATE = 16000  # RawNet2 was trained strictly on 16kHz — never change
 NB_SAMP = 64600             # Exactly 4.0375 seconds at 16kHz
+# ─── Confidence thresholds for 3-class labelling ────────────────────────────
+# RawNet2 has 2 output classes (real / fake). We derive a 3rd class
+# "AI Synthesized" from the confidence score:
+#
+#   real_prob >= REAL_THRESHOLD        → Genuine human voice
+#   fake_prob >= FAKE_THRESHOLD        → Manipulated / spliced audio
+#   anything in between               → AI Synthesized / TTS / Voice-cloned
+#
+# Why this works: TTS and voice-clone audio confuses RawNet2 — it produces
+# low-confidence outputs for both classes because it was trained on older
+# spoofing attacks. That uncertainty is the signal we exploit.
+REAL_THRESHOLD = 0.75
+FAKE_THRESHOLD = 0.75
+def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
+    """
+    Map RawNet2 2-class probabilities → 3-class human-readable label.
+    Classes:
+      - Real Human Voice     : model is confident it's real
+      - AI Synthesized       : model is uncertain (TTS / voice-clone zone)
+      - Fake / Manipulated   : model is confident it's fake (spliced, replayed)
+    """
+    print(f"[Audio] real_prob={real_prob:.4f}  fake_prob={fake_prob:.4f}")
+    if real_prob >= REAL_THRESHOLD:
+        confidence = round(real_prob * 100, 2)
+        return f"✅ Real Human Voice\nConfidence: {confidence}%"
+    elif fake_prob >= FAKE_THRESHOLD:
+        confidence = round(fake_prob * 100, 2)
+        return f"🚨 Fake / Manipulated Audio\nConfidence: {confidence}%"
+    else:
+        # Low confidence on both sides → hallmark of modern TTS / voice cloning
+        ai_confidence = round(fake_prob * 100, 2)
+        return (
+            f"🤖 AI Synthesized / Voice Cloned\n"
+            f"Confidence: {ai_confidence}%\n"
+            f"(Model uncertainty indicates TTS or neural voice cloning)"
+        )
 def deepfakes_audio_predict(input_audio):
     """
     Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array).
+    Pipeline:
+      1. float32 conversion + int16 normalisation
+      2. Stereo → mono
+      3. Resample to 16000 Hz  ← critical: RawNet2 SincConv assumes 16kHz
+      4. Pad / trim to NB_SAMP (64600) samples
+      5. RawNet2 inference → log-softmax → probabilities
+      6. 3-class decision via confidence thresholds
     """
     sr, x = input_audio
+    print(f"[Audio] Input  SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
+    # Step 1 — float32 + normalise
     x = x.astype(np.float32)
     if np.abs(x).max() > 1.0:
+        x = x / 32768.0          # int16 → [-1, 1]
+    # Step 2 — stereo → mono (must precede librosa.resample which needs 1-D)
     if x.ndim == 2:
         x = x.mean(axis=1)
+    # Step 3 — resample to 16 kHz (THE root-cause fix)
     if sr != RAWNET_SAMPLE_RATE:
+        print(f"[Audio] Resampling {sr} Hz → {RAWNET_SAMPLE_RATE} Hz …")
         x = librosa.resample(x, orig_sr=sr, target_sr=RAWNET_SAMPLE_RATE)
         print(f"[Audio] After resample: {len(x)} samples ({len(x)/RAWNET_SAMPLE_RATE:.2f}s)")
+    # Step 4 — pad or trim to exactly NB_SAMP
     if len(x) < NB_SAMP:
         x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
     else:
         x = x[:NB_SAMP]
+    # Step 5 — inference
+    x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # [1, NB_SAMP]
     audio_model = load_audio_model()
     with torch.no_grad():
+        log_probs = audio_model(x_pt)           # log-softmax output
+    probs = torch.exp(log_probs).numpy()[0]     # convert log → actual probabilities
+    real_prob = float(probs[0])
+    fake_prob = float(probs[1])
+    # Step 6 — 3-class label
+    return classify_audio_3class(real_prob, fake_prob)