Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Community

pavankumarvk committed on Feb 17

Commit

d9a982f

verified ·

1 Parent(s): 2697855

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +34 -174

pipeline.py CHANGED Viewed

@@ -1,17 +1,15 @@
-import os
-import cv2
-import torch
-import zipfile
 import librosa
-import numpy as np
 import tensorflow as tf
-from facenet_pytorch import MTCNN
 from rawnet import RawNet
-# Set random seed for reproducibility.
 tf.random.set_seed(42)
-# Extract model if not already extracted
 if not os.path.exists("efficientnet-b0"):
     local_zip = "./efficientnet-b0.zip"
     if os.path.exists(local_zip):
@@ -20,57 +18,44 @@ if not os.path.exists("efficientnet-b0"):
         zip_ref.close()
         print("Model extracted successfully!")
-# Load Video/Image models.
-# Load model without compiling to avoid optimizer dependency issues
 model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
 class DetectionPipeline:
-    """Pipeline class for detecting faces in the frames of a video file."""
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
-        """Constructor for DetectionPipeline class."""
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
         self.input_modality = input_modality
     def __call__(self, filename):
-        """Load frames from an MP4 video and detect faces."""
         if self.input_modality == 'video':
-            print('Input modality is video.')
             v_cap = cv2.VideoCapture(filename)
             v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            # Pick 'n_frames' evenly spaced frames to sample
-            if self.n_frames is None:
-                sample = np.arange(0, v_len)
-            else:
-                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
-            # Loop through frames
             faces = []
             frames = []
             for j in range(v_len):
                 success = v_cap.grab()
                 if j in sample:
-                    # Load frame
                     success, frame = v_cap.retrieve()
                     if not success:
                         continue
                     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    # Resize frame to desired size
                     if self.resize is not None:
-                        frame = cv2.resize(frame, None, fx=self.resize, fy=self.resize)
                     frames.append(frame)
-                    # When batch is full, detect faces and reset frame list
                     if len(frames) % self.batch_size == 0 or j == sample[-1]:
-                        # Simple resizing for the EfficientNet model (assuming face is centered or whole frame is analyzed)
-                        # For a more robust solution, MTCNN should be used here to extract faces first.
-                        # Based on your provided logic, we resize the frame directly.
                         face2 = cv2.resize(frame, (224, 224))
                         faces.append(face2)
@@ -78,175 +63,50 @@ class DetectionPipeline:
             return faces
         elif self.input_modality == 'image':
-            print('Input modality is image.')
-            # Perform inference for image modality.
-            # Note: 'filename' here is actually the numpy array from Gradio Image component
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
             return image
         elif self.input_modality == 'audio':
-            # Audio is handled by deepfakes_audio_predict directly,
-            # but if you use this class, return placeholder or raw audio.
-            return None
-# Instantiate pipelines
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
-# ---------------------------------------------------------
-# Video & Image Prediction Functions
-# ---------------------------------------------------------
 def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
-    total = 0
-    real_res = []
-    fake_res = []
-    # Initialize counters for the simple voting logic
-    real_count = 0
-    fake_count = 0
     for face in faces:
-        face2 = face / 255.0
         pred = model.predict(np.expand_dims(face2, axis=0))[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
-        total += 1
-        pred2 = pred[1] # Probability of Fake
-        if pred2 > 0.5:
-            fake_count += 1
-        else:
-            real_count += 1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
-    print(f"Real Faces: {real_mean}")
-    print(f"Fake Faces: {fake_mean}")
-    text = ""
     if real_mean >= 0.5:
-        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
     else:
-        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
-    return text
 def deepfakes_image_predict(input_image):
-    faces = detection_image_pipeline(input_image)
-    face2 = faces / 255.0
     pred = model.predict(np.expand_dims(face2, axis=0))[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
-        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
-    else:
-        # Fixed the parenthesis placement here
-        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
-    return text2
-# ---------------------------------------------------------
-# Audio Prediction Functions
-# ---------------------------------------------------------
-def load_audio_model():
-    d_args = {
-        "nb_samp": 64600,
-        "first_conv": 1024,
-        "in_channels": 1,
-        "filts": [20, [20, 20], [20, 128], [128, 128]],
-        "blocks": [2, 4],
-        "nb_fc_node": 1024,
-        "gru_node": 1024,
-        "nb_gru_layer": 3,
-        "nb_classes": 2
-    }
-    device = torch.device('cpu')
-    model = RawNet(d_args=d_args, device=device)
-    model.eval()
-    # Load weights
-    # Ensure 'RawNet2.pth' is in your repository root
-    if os.path.exists('RawNet2.pth'):
-        try:
-            checkpoint = torch.load('RawNet2.pth', map_location=device)
-            # Handle different checkpoint formats (strict or not)
-            if isinstance(checkpoint, dict):
-                if 'model' in checkpoint:
-                    model.load_state_dict(checkpoint['model'])
-                elif 'state_dict' in checkpoint:
-                    model.load_state_dict(checkpoint['state_dict'])
-                else:
-                    model.load_state_dict(checkpoint, strict=False)
-            else:
-                model.load_state_dict(checkpoint, strict=False)
-            print("Audio model loaded successfully.")
-        except Exception as e:
-            print(f"Error loading audio model weights: {e}")
     else:
-        print("Warning: 'RawNet2.pth' not found. Audio detection will not work.")
-    return model
-# Load the audio model globally to avoid reloading it on every request
-audio_model = load_audio_model()
-audio_label_map = {
-    0: "Real",
-    1: "Fake"
-}
-def deepfakes_audio_predict(input_audio):
-    """
-    input_audio: tuple (sample_rate, audio_data) provided by Gradio
-    """
-    if audio_model is None:
-        return "Error: Audio model not loaded."
-    try:
-        sr, x = input_audio
-    except ValueError:
-        # Fallback if input format is different (e.g. just file path)
-        return "Error: Invalid audio input format."
-    # Target sampling rate and length for RawNet
-    target_sr = 16000
-    target_len = 64600
-    # Resample if necessary
-    if sr != target_sr:
-        x = librosa.resample(x, orig_sr=sr, target_sr=target_sr)
-    # Pad or crop to target length
-    len_x = x.shape[0]
-    if len_x < target_len:
-        # Pad with zeros
-        x = np.pad(x, (0, target_len - len_x), mode='constant')
-    elif len_x > target_len:
-        # Center crop
-        start = (len_x - target_len) // 2
-        x = x[start:start + target_len]
-    # Convert to Tensor and add dimensions (Batch, Channel, Length)
-    x_pt = torch.from_numpy(x).float().unsqueeze(0).unsqueeze(0)
-    # Perform inference
-    with torch.no_grad():
-        output = audio_model(x_pt)
-    # Output is LogSoftmax, convert to probabilities
-    probs = torch.exp(output)
-    confidence, prediction = torch.max(probs, 1)
-    label = audio_label_map[prediction.item()]
-    confidence_score = confidence.item() * 100
-    return f"The audio is {label}.\nConfidence: {confidence_score:.2f}%"

+import os
+import cv2
+import torch
+import zipfile
 import librosa
+import numpy as np
 import tensorflow as tf
+from facenet_pytorch import MTCNN
 from rawnet import RawNet
 tf.random.set_seed(42)
 if not os.path.exists("efficientnet-b0"):
     local_zip = "./efficientnet-b0.zip"
     if os.path.exists(local_zip):
         zip_ref.close()
         print("Model extracted successfully!")
 model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
 class DetectionPipeline:
     def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
         self.input_modality = input_modality
     def __call__(self, filename):
         if self.input_modality == 'video':
             v_cap = cv2.VideoCapture(filename)
             v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            sample = np.arange(0, v_len) if self.n_frames is None \
+                     else np.linspace(0, v_len-1, self.n_frames).astype(int)
             faces = []
             frames = []
             for j in range(v_len):
                 success = v_cap.grab()
                 if j in sample:
                     success, frame = v_cap.retrieve()
                     if not success:
                         continue
                     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     if self.resize is not None:
+                        frame = frame.resize(
+                            [int(d * self.resize) for d in frame.size]
+                        )
                     frames.append(frame)
                     if len(frames) % self.batch_size == 0 or j == sample[-1]:
                         face2 = cv2.resize(frame, (224, 224))
                         faces.append(face2)
             return faces
         elif self.input_modality == 'image':
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
             return image
         elif self.input_modality == 'audio':
+            x, sr = librosa.load(filename)
+            x_pt = torch.Tensor(x)
+            x_pt = torch.unsqueeze(x_pt, dim=0)
+            return x_pt
+        else:
+            raise ValueError("Invalid modality")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
 def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
+    real_res, fake_res = [], []
     for face in faces:
+        face2 = face / 255
         pred = model.predict(np.expand_dims(face2, axis=0))[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     if real_mean >= 0.5:
+        return "The video is REAL. Confidence: " + str(round(100 - real_mean*100, 3)) + "%"
     else:
+        return "The video is FAKE. Confidence: " + str(round(fake_mean*100, 3)) + "%"
 def deepfakes_image_predict(input_image):
+    face = detection_image_pipeline(input_image)
+    face2 = face / 255
     pred = model.predict(np.expand_dims(face2, axis=0))[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
+        return "The image is REAL."
     else:
+        return "The image is FAKE."