Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Mar 15

Commit

5808494

verified ·

1 Parent(s): b23d2b0

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +88 -37

pipeline.py CHANGED Viewed

@@ -3,6 +3,8 @@ import cv2
 import torch
 import zipfile
 import librosa
 import numpy as np
 import tensorflow as tf
 from facenet_pytorch import MTCNN
@@ -31,6 +33,42 @@ model = tf.keras.layers.TFSMLayer(
 )
 class DetectionPipeline:
@@ -45,33 +83,53 @@ class DetectionPipeline:
     def __call__(self, filename):
         if self.input_modality == 'video':
             print('Input modality is video.')
-            v_cap = cv2.VideoCapture(filename)
-            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            if self.n_frames is None:
-                sample = np.arange(0, v_len)
-            else:
-                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
-            faces = []
-            frames = []
-            for j in range(v_len):
-                success = v_cap.grab()
-                if j in sample:
-                    success, frame = v_cap.retrieve()
-                    if not success:
-                        continue
-                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    if self.resize is not None:
-                        frame = frame.resize([int(d * self.resize) for d in frame.size])
-                    frames.append(frame)
-                    if len(frames) % self.batch_size == 0 or j == sample[-1]:
-                        face2 = cv2.resize(frame, (224, 224))
-                        faces.append(face2)
-            v_cap.release()
             return faces
         elif self.input_modality == 'image':
@@ -161,9 +219,8 @@ def load_audio_model():
     audio_model = RawNet(d_args=d_args, device='cpu')
-    # BUG FIX 2: Correct load_state_dict usage — second arg was wrongly a state_dict
     ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
-    audio_model.load_state_dict(ckpt)  # Fixed: removed incorrect model_dict argument
     audio_model.eval()
     return audio_model
@@ -179,10 +236,8 @@ def deepfakes_audio_predict(input_audio):
     Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
     numpy_array is int16 by default and needs float32 normalization.
     """
-    # BUG FIX 1: Gradio returns (sample_rate, data) — original code had reversed order
-    sr, x = input_audio  # was: x, sr = input_audio  ← WRONG
-    # BUG FIX 3: Convert int16 audio from Gradio to float32 and normalize to [-1, 1]
     x = x.astype(np.float32)
     if x.max() > 1.0:
         x = x / 32768.0  # Normalize int16 range to float32
@@ -191,24 +246,20 @@ def deepfakes_audio_predict(input_audio):
     if x.ndim == 2:
         x = x.mean(axis=1)
-    # BUG FIX 4: RawNet2 expects exactly nb_samp=64600 samples — pad or trim
     if len(x) < NB_SAMP:
-        # Pad with zeros if audio is too short
         x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
     else:
-        # Trim to expected length if audio is too long
         x = x[:NB_SAMP]
     # Convert to tensor with batch dimension: [1, nb_samp]
     x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
-    # Load model and run inference
     audio_model = load_audio_model()
     with torch.no_grad():
         grads = audio_model(x_pt)
-    # Get the predicted class index
     grads_np = grads.detach().numpy()
     result = np.argmax(grads_np)

 import torch
 import zipfile
 import librosa
+import subprocess
+import tempfile
 import numpy as np
 import tensorflow as tf
 from facenet_pytorch import MTCNN
 )
+def convert_to_mp4(input_path):
+    """
+    Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
+    Returns the path to the converted file, or the original path if already mp4.
+    The caller is responsible for deleting the temp file when done.
+    """
+    ext = os.path.splitext(input_path)[-1].lower()
+    if ext == ".mp4":
+        # Already mp4 — verify OpenCV can actually open it
+        cap = cv2.VideoCapture(input_path)
+        ok = cap.isOpened()
+        cap.release()
+        if ok:
+            return input_path, False  # (path, is_temp)
+    # Write to a named temp file so OpenCV can open it by path
+    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+    tmp.close()
+    output_path = tmp.name
+    cmd = [
+        "ffmpeg", "-y",
+        "-i", input_path,
+        "-c:v", "libx264",
+        "-preset", "fast",
+        "-crf", "23",
+        "-c:a", "aac",
+        output_path
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        os.unlink(output_path)
+        raise RuntimeError(
+            f"ffmpeg conversion failed:\n{result.stderr.decode()}"
+        )
+    return output_path, True  # (path, is_temp)
 class DetectionPipeline:
     def __call__(self, filename):
         if self.input_modality == 'video':
             print('Input modality is video.')
+            # BUG FIX: Webcam recordings from Gradio arrive as .webm (VP8/VP9).
+            # OpenCV has no WebM support in headless builds — convert to .mp4 first.
+            converted_path, is_temp = convert_to_mp4(filename)
+            print(f"Processing video: {converted_path} (converted={is_temp})")
+            try:
+                v_cap = cv2.VideoCapture(converted_path)
+                if not v_cap.isOpened():
+                    raise RuntimeError(f"OpenCV could not open video: {converted_path}")
+                v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                if v_len == 0:
+                    raise RuntimeError("Video has 0 frames after conversion.")
+                if self.n_frames is None:
+                    sample = np.arange(0, v_len)
+                else:
+                    sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
+                faces = []
+                frames = []
+                for j in range(v_len):
+                    success = v_cap.grab()
+                    if j in sample:
+                        success, frame = v_cap.retrieve()
+                        if not success:
+                            continue
+                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                        if self.resize is not None:
+                            frame = frame.resize([int(d * self.resize) for d in frame.size])
+                        frames.append(frame)
+                        if len(frames) % self.batch_size == 0 or j == sample[-1]:
+                            face2 = cv2.resize(frame, (224, 224))
+                            faces.append(face2)
+                v_cap.release()
+            finally:
+                # Clean up the temp converted file
+                if is_temp and os.path.exists(converted_path):
+                    os.unlink(converted_path)
+            if len(faces) == 0:
+                raise RuntimeError("No frames could be extracted from the video.")
             return faces
         elif self.input_modality == 'image':
     audio_model = RawNet(d_args=d_args, device='cpu')
     ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
+    audio_model.load_state_dict(ckpt)
     audio_model.eval()
     return audio_model
     Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
     numpy_array is int16 by default and needs float32 normalization.
     """
+    sr, x = input_audio
     x = x.astype(np.float32)
     if x.max() > 1.0:
         x = x / 32768.0  # Normalize int16 range to float32
     if x.ndim == 2:
         x = x.mean(axis=1)
+    # RawNet2 expects exactly nb_samp=64600 samples — pad or trim
     if len(x) < NB_SAMP:
         x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
     else:
         x = x[:NB_SAMP]
     # Convert to tensor with batch dimension: [1, nb_samp]
     x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
     audio_model = load_audio_model()
     with torch.no_grad():
         grads = audio_model(x_pt)
     grads_np = grads.detach().numpy()
     result = np.argmax(grads_np)