Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Nov 7, 2025

Commit

73dc161

verified ·

1 Parent(s): 7dc7c6a

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +169 -211

pipeline.py CHANGED Viewed

@@ -1,211 +1,169 @@
-import os
-import cv2
-import torch
-import zipfile
-import librosa
-import numpy as np
-import tensorflow as tf
-from facenet_pytorch import MTCNN
-from rawnet import RawNet
-#Set random seed for reproducibility.
-tf.random.set_seed(42)
-# Extract model if not already extracted
-if not os.path.exists("efficientnet-b0"):
-    local_zip = "./efficientnet-b0.zip"
-    if os.path.exists(local_zip):
-        zip_ref = zipfile.ZipFile(local_zip, 'r')
-        zip_ref.extractall()
-        zip_ref.close()
-        print("Model extracted successfully!")
-# Load models.
-# Load model without compiling to avoid optimizer dependency issues
-model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
-class DetectionPipeline:
-    """Pipeline class for detecting faces in the frames of a video file."""
-    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):
-        """Constructor for DetectionPipeline class.
-        Keyword Arguments:
-            n_frames {int} -- Total number of frames to load. These will be evenly spaced
-                throughout the video. If not specified (i.e., None), all frames will be loaded.
-                (default: {None})
-            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
-            resize {float} -- Fraction by which to resize frames from original prior to face
-                detection. A value less than 1 results in downsampling and a value greater than
-                1 result in upsampling. (default: {None})
-        """
-        self.n_frames = n_frames
-        self.batch_size = batch_size
-        self.resize = resize
-        self.input_modality = input_modality
-    def __call__(self, filename):
-        """Load frames from an MP4 video and detect faces.
-        Arguments:
-            filename {str} -- Path to video.
-        """
-        # Create video reader and find length
-        if self.input_modality == 'video':
-            print('Input modality is video.')
-            v_cap = cv2.VideoCapture(filename)
-            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            # Pick 'n_frames' evenly spaced frames to sample
-            if self.n_frames is None:
-                sample = np.arange(0, v_len)
-            else:
-                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
-            # Loop through frames
-            faces = []
-            frames = []
-            for j in range(v_len):
-                success = v_cap.grab()
-                if j in sample:
-                    # Load frame
-                    success, frame = v_cap.retrieve()
-                    if not success:
-                        continue
-                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    # Resize frame to desired size
-                    if self.resize is not None:
-                        frame = frame.resize([int(d * self.resize) for d in frame.size])
-                    frames.append(frame)
-                    # When batch is full, detect faces and reset frame list
-                    if len(frames) % self.batch_size == 0 or j == sample[-1]:
-                        face2 = cv2.resize(frame, (224, 224))
-                        faces.append(face2)
-            v_cap.release()
-            return faces
-        elif self.input_modality == 'image':
-            print('Input modality is image.')
-            #Perform inference for image modality.
-            print('Reading image')
-            # print(f"Image path is: {filename}")
-            image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
-            image = cv2.resize(image, (224, 224))
-            # if not face.any():
-            #     print("No faces found...")
-            return image
-        elif self.input_modality == 'audio':
-            print("INput modality is audio.")
-            #Load audio.
-            x, sr = librosa.load(filename)
-            x_pt = torch.Tensor(x)
-            x_pt = torch.unsqueeze(x_pt, dim = 0)
-            return x_pt
-        else:
-            raise ValueError("Invalid input modality. Must be either 'video' or image")
-detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
-detection_image_pipeline = DetectionPipeline(batch_size = 1, input_modality = 'image')
-def deepfakes_video_predict(input_video):
-    faces = detection_video_pipeline(input_video)
-    total = 0
-    real_res = []
-    fake_res = []
-    for face in faces:
-        face2 = face/255
-        pred = model.predict(np.expand_dims(face2, axis=0))[0]
-        real, fake = pred[0], pred[1]
-        real_res.append(real)
-        fake_res.append(fake)
-        total+=1
-        pred2 = pred[1]
-        if pred2 > 0.5:
-          fake+=1
-        else:
-          real+=1
-    real_mean = np.mean(real_res)
-    fake_mean = np.mean(fake_res)
-    print(f"Real Faces: {real_mean}")
-    print(f"Fake Faces: {fake_mean}")
-    text = ""
-    if real_mean >= 0.5:
-        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
-    else:
-        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
-    return text
-def deepfakes_image_predict(input_image):
-    faces = detection_image_pipeline(input_image)
-    face2 = faces/255
-    pred = model.predict(np.expand_dims(face2, axis = 0))[0]
-    real, fake = pred[0], pred[1]
-    if real > 0.5:
-        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real*100), 3)) + "%"
-    else:
-        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake*100, 3)) + "%"
-    return text2
-def load_audio_model():
-    d_args = {
-  "nb_samp": 64600,
-  "first_conv": 1024,
-  "in_channels": 1,
-  "filts": [20, [20, 20], [20, 128], [128, 128]],
-  "blocks": [2, 4],
-  "nb_fc_node": 1024,
-  "gru_node": 1024,
-  "nb_gru_layer": 3,
-  "nb_classes": 2}
-    model = RawNet(d_args = d_args, device='cpu')
-    #Load ckpt.
-    model_dict = model.state_dict()
-    ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
-    model.load_state_dict(ckpt, model_dict)
-    return model
-audio_label_map = {
-    0: "Real audio",
-    1: "Fake audio"
-}
-def deepfakes_audio_predict(input_audio):
-    #Perform inference on audio.
-    x, sr = input_audio
-    x_pt = torch.Tensor(x)
-    x_pt = torch.unsqueeze(x_pt, dim = 0)
-    #Load model.
-    model = load_audio_model()
-    #Perform inference.
-    grads = model(x_pt)
-    #Get the argmax.
-    grads_np = grads.detach().numpy()
-    result = np.argmax(grads_np)
-    return audio_label_map[result]

+import os
+import cv2
+import torch
+import zipfile
+import librosa
+import numpy as np
+import tensorflow as tf
+from facenet_pytorch import MTCNN
+# Set random seed for reproducibility.
+tf.random.set_seed(42)
+# Extract model if not already extracted
+if not os.path.exists("efficientnet-b0"):
+    local_zip = "./efficientnet-b0.zip"
+    if os.path.exists(local_zip):
+        # The context manager closes the archive even if extraction raises.
+        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
+            zip_ref.extractall()
+        print("Model extracted successfully!")
+# Load models.
+# Load model without compiling to avoid optimizer dependency issues
+model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
+class DetectionPipeline:
+    """Pipeline class that loads and preprocesses one input for the classifier.
+    Depending on ``input_modality`` a call returns resized video frames, one
+    resized image, or an audio waveform tensor. No face detector is run here;
+    the "faces" returned for video are whole frames resized to 224x224.
+    """
+    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
+        """Constructor for DetectionPipeline class.
+        Keyword Arguments:
+            n_frames {int} -- Total number of frames to load. These will be evenly spaced
+                throughout the video. If not specified (i.e., None), all frames will be loaded.
+                (default: {None})
+            batch_size {int} -- A sampled frame is kept only when the running count of
+                loaded frames is a multiple of this value, or when it is the last sampled
+                index. With 1, every sampled frame is kept. (default: {60})
+            resize {float} -- Fraction by which to resize frames from original before the
+                final 224x224 resize. A value less than 1 results in downsampling and a
+                value greater than 1 results in upsampling. (default: {None})
+            input_modality {str} -- One of 'video', 'image' or 'audio'. (default: {'video'})
+        """
+        self.n_frames = n_frames
+        self.batch_size = batch_size
+        self.resize = resize
+        self.input_modality = input_modality
+    def __call__(self, filename):
+        """Load and preprocess the input for the configured modality.
+        Arguments:
+            filename -- Path to a video ('video') or audio ('audio') file. For 'image' the
+                value is handed directly to cv2.cvtColor, so it must already be an image
+                array rather than a path.
+        Returns:
+            'video': list of 224x224 frames; 'image': one 224x224 array;
+            'audio': torch tensor of the waveform with a leading dimension of 1.
+        Raises:
+            ValueError -- If input_modality is not one of the supported values.
+        """
+        if self.input_modality == 'video':
+            print('Input modality is video.')
+            return self._load_video_frames(filename)
+        elif self.input_modality == 'image':
+            print('Input modality is image.')
+            # Perform preprocessing for image modality.
+            print('Reading image')
+            # NOTE(review): this swaps the first and last channels, which assumes the
+            # caller supplies BGR data -- confirm against the UI that feeds this.
+            image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
+            image = cv2.resize(image, (224, 224))
+            return image
+        elif self.input_modality == 'audio':
+            print("INput modality is audio.")
+            # Load audio; the sample rate returned by librosa is not needed here.
+            x, _ = librosa.load(filename)
+            x_pt = torch.Tensor(x)
+            x_pt = torch.unsqueeze(x_pt, dim=0)
+            return x_pt
+        else:
+            raise ValueError("Invalid input modality. Must be 'video', 'image' or 'audio'.")
+    def _load_video_frames(self, filename):
+        """Sample frames from the video at ``filename`` and resize them to 224x224."""
+        v_cap = cv2.VideoCapture(filename)
+        try:
+            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            # Pick 'n_frames' evenly spaced frames to sample
+            if self.n_frames is None:
+                sample = np.arange(0, v_len)
+            else:
+                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
+            # A set gives constant-time membership tests inside the frame loop.
+            wanted = set(sample.tolist())
+            last_sampled = int(sample[-1]) if len(sample) else -1
+            faces = []
+            # Only the number of loaded frames matters, so count instead of storing them.
+            n_loaded = 0
+            for j in range(v_len):
+                # grab() advances the stream cheaply; decoding happens in retrieve().
+                v_cap.grab()
+                if j not in wanted:
+                    continue
+                success, frame = v_cap.retrieve()
+                if not success:
+                    continue
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                # Resize frame to desired size
+                if self.resize is not None:
+                    # Frames are NumPy arrays, so scale via cv2; dsize is (width, height).
+                    height, width = frame.shape[:2]
+                    frame = cv2.resize(
+                        frame, (int(width * self.resize), int(height * self.resize))
+                    )
+                n_loaded += 1
+                if n_loaded % self.batch_size == 0 or j == last_sampled:
+                    faces.append(cv2.resize(frame, (224, 224)))
+            return faces
+        finally:
+            # Always free the capture handle, even if decoding raises.
+            v_cap.release()
+# Shared pipeline instances: five evenly spaced frames per video, one image at a time.
+detection_video_pipeline = DetectionPipeline(
+    n_frames=5,
+    batch_size=1,
+    input_modality='video',
+)
+detection_image_pipeline = DetectionPipeline(
+    batch_size=1,
+    input_modality='image',
+)
+def deepfakes_video_predict(input_video):
+    """Classify a video as real or fake from its sampled frames.
+    Arguments:
+        input_video {str} -- Path to the video, passed to detection_video_pipeline.
+    Returns:
+        str -- Verdict text with a "Deepfakes Confidence" percentage, or a short
+            message when no frame could be read from the video.
+    """
+    faces = detection_video_pipeline(input_video)
+    if not faces:
+        # Without frames the means below would be NaN and yield a bogus "FAKE" verdict.
+        return "No frames could be read from the video."
+    real_res = []
+    fake_res = []
+    for face in faces:
+        # Scale pixel values to [0, 1] and add a batch axis before predicting.
+        face2 = face/255
+        pred = model.predict(np.expand_dims(face2, axis=0))[0]
+        # The code treats index 0 as the "real" score and index 1 as the "fake" score.
+        real_res.append(pred[0])
+        fake_res.append(pred[1])
+    real_mean = np.mean(real_res)
+    fake_mean = np.mean(fake_res)
+    print(f"Real Faces: {real_mean}")
+    print(f"Fake Faces: {fake_mean}")
+    if real_mean >= 0.5:
+        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
+    else:
+        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
+    return text
+def deepfakes_image_predict(input_image):
+    """Classify a single image as real or fake.
+    Arguments:
+        input_image -- Image handed to detection_image_pipeline for resizing.
+    Returns:
+        str -- Verdict text with a "Deepfakes Confidence" percentage.
+    """
+    resized = detection_image_pipeline(input_image)
+    # Scale pixel values to [0, 1] and add a batch axis before predicting.
+    batch = np.expand_dims(resized/255, axis = 0)
+    scores = model.predict(batch)[0]
+    # The code treats index 0 as the "real" score and index 1 as the "fake" score.
+    real = scores[0]
+    fake = scores[1]
+    if real > 0.5:
+        return "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real*100), 3)) + "%"
+    return "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake*100, 3)) + "%"