Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed on Feb 9

Commit

7f307a0

verified ·

1 Parent(s): 156fc9b

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +127 -91

pipeline.py CHANGED Viewed

@@ -8,9 +8,7 @@ import tensorflow as tf
 from facenet_pytorch import MTCNN
 from rawnet import RawNet
-#Set random seed for reproducibility.
 tf.random.set_seed(42)
 # Extract model if not already extracted
@@ -22,39 +20,23 @@ if not os.path.exists("efficientnet-b0"):
         zip_ref.close()
         print("Model extracted successfully!")
-# Load models.
 # Load model without compiling to avoid optimizer dependency issues
 model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
 class DetectionPipeline:
     """Pipeline class for detecting faces in the frames of a video file."""
-    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):
-        """Constructor for DetectionPipeline class.
-        Keyword Arguments:
-            n_frames {int} -- Total number of frames to load. These will be evenly spaced
-                throughout the video. If not specified (i.e., None), all frames will be loaded.
-                (default: {None})
-            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
-            resize {float} -- Fraction by which to resize frames from original prior to face
-                detection. A value less than 1 results in downsampling and a value greater than
-                1 result in upsampling. (default: {None})
-        """
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
         self.input_modality = input_modality
     def __call__(self, filename):
-        """Load frames from an MP4 video and detect faces.
-        Arguments:
-            filename {str} -- Path to video.
-        """
-        # Create video reader and find length
         if self.input_modality == 'video':
             print('Input modality is video.')
             v_cap = cv2.VideoCapture(filename)
@@ -80,11 +62,15 @@ class DetectionPipeline:
                     # Resize frame to desired size
                     if self.resize is not None:
-                        frame = frame.resize([int(d * self.resize) for d in frame.size])
                     frames.append(frame)
                     # When batch is full, detect faces and reset frame list
                     if len(frames) % self.batch_size == 0 or j == sample[-1]:
                         face2 = cv2.resize(frame, (224, 224))
                         faces.append(face2)
@@ -93,55 +79,51 @@ class DetectionPipeline:
         elif self.input_modality == 'image':
             print('Input modality is image.')
-            #Perform inference for image modality.
-            print('Reading image')
-            # print(f"Image path is: {filename}")
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
-            # if not face.any():
-            #     print("No faces found...")
             return image
         elif self.input_modality == 'audio':
-            print("INput modality is audio.")
-            #Load audio.
-            x, sr = librosa.load(filename)
-            x_pt = torch.Tensor(x)
-            x_pt = torch.unsqueeze(x_pt, dim = 0)
-            return x_pt
-        else:
-            raise ValueError("Invalid input modality. Must be either 'video' or image")
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
-detection_image_pipeline = DetectionPipeline(batch_size = 1, input_modality = 'image')
-def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
     total = 0
     real_res = []
     fake_res = []
     for face in faces:
-        face2 = face/255
         pred = model.predict(np.expand_dims(face2, axis=0))[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
-        total+=1
-        pred2 = pred[1]
         if pred2 > 0.5:
-          fake+=1
         else:
-          real+=1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     print(f"Real Faces: {real_mean}")
@@ -149,65 +131,119 @@ def deepfakes_video_predict(input_video):
     text = ""
     if real_mean >= 0.5:
-        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
     else:
-        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
     return text
 def deepfakes_image_predict(input_image):
     faces = detection_image_pipeline(input_image)
-    face2 = faces/255
-    pred = model.predict(np.expand_dims(face2, axis = 0))[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
-        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real*100), 3)) + "%"
     else:
-        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake*100, 3)) + "%"
     return text2
 def load_audio_model():
     d_args = {
-  "nb_samp": 64600,
-  "first_conv": 1024,
-  "in_channels": 1,
-  "filts": [20, [20, 20], [20, 128], [128, 128]],
-  "blocks": [2, 4],
-  "nb_fc_node": 1024,
-  "gru_node": 1024,
-  "nb_gru_layer": 3,
-  "nb_classes": 2}
-    model = RawNet(d_args = d_args, device='cpu')
-    #Load ckpt.
-    model_dict = model.state_dict()
-    ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
-    model.load_state_dict(ckpt, model_dict)
     return model
 audio_label_map = {
-    0: "Real audio",
-    1: "Fake audio"
 }
 def deepfakes_audio_predict(input_audio):
-    #Perform inference on audio.
-    x, sr = input_audio
-    x_pt = torch.Tensor(x)
-    x_pt = torch.unsqueeze(x_pt, dim = 0)
-    #Load model.
-    model = load_audio_model()
-    #Perform inference.
-    grads = model(x_pt)
-    #Get the argmax.
-    grads_np = grads.detach().numpy()
-    result = np.argmax(grads_np)
-    return audio_label_map[result]

 from facenet_pytorch import MTCNN
 from rawnet import RawNet
+# Set random seed for reproducibility.
 tf.random.set_seed(42)
 # Extract model if not already extracted
         zip_ref.close()
         print("Model extracted successfully!")
+# Load Video/Image models.
 # Load model without compiling to avoid optimizer dependency issues
 model = tf.keras.models.load_model("efficientnet-b0/", compile=False)
 class DetectionPipeline:
     """Pipeline class for detecting faces in the frames of a video file."""
+    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
+        """Constructor for DetectionPipeline class."""
         self.n_frames = n_frames
         self.batch_size = batch_size
         self.resize = resize
         self.input_modality = input_modality
     def __call__(self, filename):
+        """Load frames from an MP4 video and detect faces."""
         if self.input_modality == 'video':
             print('Input modality is video.')
             v_cap = cv2.VideoCapture(filename)
                     # Resize frame to desired size
                     if self.resize is not None:
+                        frame = cv2.resize(frame, None, fx=self.resize, fy=self.resize)
                     frames.append(frame)
                     # When batch is full, detect faces and reset frame list
                     if len(frames) % self.batch_size == 0 or j == sample[-1]:
+                        # Simple resizing for the EfficientNet model (assuming face is centered or whole frame is analyzed)
+                        # For a more robust solution, MTCNN should be used here to extract faces first.
+                        # Based on your provided logic, we resize the frame directly.
                         face2 = cv2.resize(frame, (224, 224))
                         faces.append(face2)
         elif self.input_modality == 'image':
             print('Input modality is image.')
+            # Perform inference for image modality.
+            # Note: 'filename' here is actually the numpy array from Gradio Image component
             image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
             image = cv2.resize(image, (224, 224))
             return image
         elif self.input_modality == 'audio':
+            # Audio is handled by deepfakes_audio_predict directly,
+            # but if you use this class, return placeholder or raw audio.
+            return None
+# Instantiate pipelines
 detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
+detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
+# ---------------------------------------------------------
+# Video & Image Prediction Functions
+# ---------------------------------------------------------
+def deepfakes_video_predict(input_video):
     faces = detection_video_pipeline(input_video)
     total = 0
     real_res = []
     fake_res = []
+    # Initialize counters for the simple voting logic
+    real_count = 0
+    fake_count = 0
     for face in faces:
+        face2 = face / 255.0
         pred = model.predict(np.expand_dims(face2, axis=0))[0]
         real, fake = pred[0], pred[1]
         real_res.append(real)
         fake_res.append(fake)
+        total += 1
+        pred2 = pred[1] # Probability of Fake
         if pred2 > 0.5:
+            fake_count += 1
         else:
+            real_count += 1
     real_mean = np.mean(real_res)
     fake_mean = np.mean(fake_res)
     print(f"Real Faces: {real_mean}")
     text = ""
     if real_mean >= 0.5:
+        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
     else:
+        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
     return text
 def deepfakes_image_predict(input_image):
     """Classify a single image as real or deepfake.
     The image is preprocessed by ``detection_image_pipeline`` (colour
     conversion and resize to 224x224), scaled to [0, 1] and passed to the
     module-level Keras ``model``. Index 0 of the prediction is read as the
     "real" score and index 1 as the "fake" score.
     Args:
         input_image: Image array accepted by ``detection_image_pipeline``.
     Returns:
         A verdict string containing the "Deepfakes Confidence" percentage.
     """
     faces = detection_image_pipeline(input_image)
     face2 = faces / 255.0
     pred = model.predict(np.expand_dims(face2, axis=0))[0]
     real, fake = pred[0], pred[1]
     if real > 0.5:
         # For a REAL verdict the reported deepfake confidence is the complement of the real score.
         text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
     else:
         # round(value, 3): the precision belongs to round(), not to str().
         text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
     return text2
+# ---------------------------------------------------------
+# Audio Prediction Functions
+# ---------------------------------------------------------
 def load_audio_model():
     """Build the RawNet audio classifier on CPU and load weights from 'RawNet2.pth'.
     Returns:
         The RawNet model in eval mode, or ``None`` when the checkpoint file is
         missing or cannot be loaded. Returning ``None`` lets callers detect an
         unusable model instead of running inference on randomly initialised
         weights.
     """
     d_args = {
         "nb_samp": 64600,
         "first_conv": 1024,
         "in_channels": 1,
         "filts": [20, [20, 20], [20, 128], [128, 128]],
         "blocks": [2, 4],
         "nb_fc_node": 1024,
         "gru_node": 1024,
         "nb_gru_layer": 3,
         "nb_classes": 2,
     }
     device = torch.device('cpu')
     weights_path = 'RawNet2.pth'
     # The checkpoint is expected in the working directory (repository root).
     if not os.path.exists(weights_path):
         print("Warning: 'RawNet2.pth' not found. Audio detection will not work.")
         return None
     model = RawNet(d_args=d_args, device=device)
     try:
         # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
         checkpoint = torch.load(weights_path, map_location=device)
         # Wrapped checkpoints ({'model': ...} / {'state_dict': ...}) are loaded
         # strictly; anything else is treated as a bare state dict and loaded leniently.
         state_dict, strict = checkpoint, False
         if isinstance(checkpoint, dict):
             if 'model' in checkpoint:
                 state_dict, strict = checkpoint['model'], True
             elif 'state_dict' in checkpoint:
                 state_dict, strict = checkpoint['state_dict'], True
         load_result = model.load_state_dict(state_dict, strict=strict)
     except Exception as e:
         # Top-level boundary: report the failure and signal "no model" to the caller.
         print(f"Error loading audio model weights: {e}")
         return None
     # With strict=False mismatched keys are skipped silently, so surface them here.
     if load_result.missing_keys or load_result.unexpected_keys:
         print(
             "Warning: checkpoint keys do not fully match the model "
             f"(missing: {len(load_result.missing_keys)}, "
             f"unexpected: {len(load_result.unexpected_keys)})."
         )
     model.eval()
     print("Audio model loaded successfully.")
     return model
+# Load the audio model globally to avoid reloading it on every request
+audio_model = load_audio_model()
 audio_label_map = {
+    0: "Real",
+    1: "Fake"
 }
 def deepfakes_audio_predict(input_audio):
     """Classify an audio clip as real or fake with the module-level ``audio_model``.
     Args:
         input_audio: ``(sample_rate, audio_data)`` tuple as provided by Gradio.
             ``audio_data`` may be integer PCM or floating point, shaped
             ``(samples,)`` or ``(samples, channels)``.
     Returns:
         A verdict string with the confidence percentage, or an ``"Error: ..."``
         string when the model is unavailable or the input cannot be used.
     """
     if audio_model is None:
         return "Error: Audio model not loaded."
     try:
         sr, x = input_audio
     except (TypeError, ValueError):
         # TypeError: None (no clip supplied) or another non-iterable.
         # ValueError: an iterable that is not a 2-tuple.
         return "Error: Invalid audio input format."
     x = np.asarray(x)
     if np.issubdtype(x.dtype, np.integer):
         # Integer PCM: scale to roughly [-1, 1]; librosa.resample rejects integer input.
         pcm_peak = float(np.iinfo(x.dtype).max)
         x = x.astype(np.float32) / pcm_peak
     elif np.issubdtype(x.dtype, np.floating):
         x = x.astype(np.float32)
     else:
         return "Error: Invalid audio input format."
     if x.ndim > 1:
         # Down-mix (samples, channels) to mono so padding/cropping act on one axis.
         x = x.reshape(x.shape[0], -1).mean(axis=1)
     if x.shape[0] == 0:
         return "Error: Invalid audio input format."
     # Target sampling rate and length for RawNet (length matches d_args["nb_samp"]).
     target_sr = 16000
     target_len = 64600
     if sr != target_sr:
         x = librosa.resample(x, orig_sr=sr, target_sr=target_sr)
     len_x = x.shape[0]
     if len_x < target_len:
         # Pad the tail with zeros.
         x = np.pad(x, (0, target_len - len_x), mode='constant')
     elif len_x > target_len:
         # Center crop.
         start = (len_x - target_len) // 2
         x = x[start:start + target_len]
     # NOTE(review): the tensor is shaped (1, 1, target_len); confirm this is the
     # layout rawnet.RawNet expects.
     x_pt = torch.from_numpy(x).float().unsqueeze(0).unsqueeze(0)
     with torch.no_grad():
         output = audio_model(x_pt)
     # Assumes the model emits log-probabilities (LogSoftmax) - TODO confirm.
     probs = torch.exp(output)
     confidence, prediction = torch.max(probs, 1)
     label = audio_label_map[prediction.item()]
     confidence_score = confidence.item() * 100
     return f"The audio is {label}.\nConfidence: {confidence_score:.2f}%"