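"""Deepfake video prediction with Grad-CAM visualization.

Loads a TorchScript classifier, crops faces from video frames with MediaPipe,
and returns a class prediction together with a Grad-CAM overlay highlighting
the frame region the model attended to.
"""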
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import transforms
from pathlib import Path
from common import read_yaml
import os
os.environ['MPLCONFIGDIR'] = '/tmp'


PARAMS_FILE_PATH = Path("params.yaml")

class Prediction:
    def __init__(self):
        """
        Initialize the Prediction class with a pre-trained model and necessary parameters.
        """
        self.device = torch.device("cpu")
        self.model = torch.jit.load("model.pt", map_location=self.device)
        self.model.eval()

        params = read_yaml(PARAMS_FILE_PATH)
        self.expansion_factor = params.expansion_factor
        self.resolution = params.resolution
        self.default_frame_count = params.sequence_length

        # Populated by the backward hook when computing Grad-CAM
        self.gradients = None

        # Initialize MediaPipe face detector
        self.face_detection = mp.solutions.face_detection.FaceDetection(
            model_selection=0, min_detection_confidence=0.6
        )

        # Define the classes for prediction
        self.classes = [
            "original",
            "Deepfake (Face2Face)",
            "Deepfake (FaceShifter)",
            "Deepfake (FaceSwap)",
            "Deepfake (NeuralTextures)",
        ]

    def get_frames(self, video):
        """
        Yield frames from the given video file.
        """
        vidobj = cv2.VideoCapture(video)
        try:
            success, image = vidobj.read()
            while success:
                yield image
                success, image = vidobj.read()
        finally:
            vidobj.release()  # free the capture even if iteration stops early

    def get_face(self, frame):
        """
        Detect faces in a frame using MediaPipe.

        Args:
            frame (np.ndarray): Input frame

        Returns:
            tuple: (top, right, bottom, left) coordinates of the face or None if no face detected
        """
        try:
            # Convert frame from BGR (OpenCV) to RGB
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Detect faces
            results = self.face_detection.process(rgb_frame)

            if results.detections:
                detection = results.detections[0]  # Use the first detected face
                h, w, _ = frame.shape
                bboxC = detection.location_data.relative_bounding_box

                # Calculate absolute coordinates
                xmin = int(bboxC.xmin * w)
                ymin = int(bboxC.ymin * h)
                box_width = int(bboxC.width * w)
                box_height = int(bboxC.height * h)

                # Return in top, right, bottom, left format
                top = max(ymin, 0)
                right = min(xmin + box_width, w)
                bottom = min(ymin + box_height, h)
                left = max(xmin, 0)

                return (top, right, bottom, left)

            return None  # No face detected

        except Exception as e:
            print(f"Error in get_face: {e}")
            print(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")
            raise

    def color_jitter(self, image):
        """
        Apply color jitter to the given image, mirroring the training-time
        augmentation. The fixed seed makes the jitter deterministic at
        inference time.

        Args:
            image (np.ndarray): The input BGR image

        Returns:
            np.ndarray: The color-jittered image
        """
        rng = np.random.default_rng(seed=42)

        # Convert to HSV for easier manipulation
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        h, s, v = cv2.split(hsv)

        # Adjust brightness (saturating uint8 multiply)
        value = rng.uniform(0.8, 1.2)
        v = cv2.multiply(v, value)

        # Adjust contrast by pulling pixel values toward/away from the channel
        # mean; plain float arithmetic is used here because cv2.addWeighted
        # expects two arrays, not an array and a scalar mean
        mean = float(np.mean(v))
        value = rng.uniform(0.8, 1.2)
        v = np.clip(value * v.astype(np.float32) + (1 - value) * mean, 0, 255).astype(
            np.uint8
        )

        # Adjust saturation
        value = rng.uniform(0.8, 1.2)
        s = cv2.multiply(s, value)

        final_hsv = cv2.merge((h, s, v))
        return cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)

    def preprocess(self, video, seq_length=None):
        """
        Preprocess the video by extracting frames, detecting faces, and resizing.
        Applies same preprocessing as training pipeline.

        Args:
            video (str): Path to the video file
            seq_length (int, optional): Number of frames to extract

        Returns:
            tuple: (model-ready frame tensors, raw cropped frames for visualization)
        """
        frames = []
        raw_frames = []  # Store original cropped frames for visualization

        # Use provided sequence length or default from params
        target_seq_length = (
            seq_length if seq_length is not None else self.default_frame_count
        )

        transform = transforms.Compose(
            [
                transforms.ToPILImage(),
                transforms.Resize(
                    tuple(self.resolution),
                    interpolation=transforms.InterpolationMode.BILINEAR,
                ),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

        buffer = []  # For processing in batches of 4 like training pipeline

        for frame in self.get_frames(video):
            if len(frames) < target_seq_length:
                buffer.append(frame)

                if len(buffer) == 4:  # Process in batches of 4
                    faces = [self.get_face(f) for f in buffer]

                    for i, face in enumerate(faces):
                        if face is not None:
                            top, right, bottom, left = face
                            face_height = bottom - top
                            face_width = right - left

                            # Expand face region using expansion factor
                            expanded_top = max(
                                0, top - int(self.expansion_factor / 2 * face_height)
                            )
                            expanded_bottom = min(
                                buffer[i].shape[0],
                                bottom + int(self.expansion_factor / 2 * face_height),
                            )
                            expanded_left = max(
                                0, left - int(self.expansion_factor / 2 * face_width)
                            )
                            expanded_right = min(
                                buffer[i].shape[1],
                                right + int(self.expansion_factor / 2 * face_width),
                            )

                            # Crop and resize
                            cropped_face = cv2.resize(
                                buffer[i][
                                    expanded_top:expanded_bottom,
                                    expanded_left:expanded_right,
                                    :,
                                ],
                                tuple(self.resolution),
                            )

                            # Store original cropped face for visualization
                            raw_frames.append(cropped_face.copy())

                            # Apply color jitter like in training
                            cropped_face = self.color_jitter(cropped_face)

                            # Transform for model input
                            transformed = transform(cropped_face)
                            frames.append(transformed)

                    buffer = []  # Reset buffer
            else:
                break

        # Handle padding if we have fewer frames than required
        if len(frames) < target_seq_length:
            # If we have some frames, duplicate the last one
            if frames:
                while len(frames) < target_seq_length:
                    frames.append(frames[-1])
                    raw_frames.append(raw_frames[-1])
            else:
                return [], []  # No faces detected

        return frames[:target_seq_length], raw_frames[:target_seq_length]

    def save_gradients(self, grad):
        """
        Hook function to capture gradients.
        """
        self.gradients = grad

    def grad_cam(self, fmap, grads):
        """
        Compute a Grad-CAM heatmap from feature maps and their gradients.
        """
        # Weight each feature-map channel by its gradient, averaged over the
        # batch/time dimension
        pooled_grads = torch.mean(grads, dim=0)
        fmap = fmap.detach().clone()  # avoid mutating the hooked tensor in place
        for i in range(fmap.shape[1]):
            fmap[:, i, :, :] *= pooled_grads[i]

        cam = torch.mean(fmap, dim=1).squeeze().cpu().numpy()

        # Average away any remaining temporal dimension so cam is (H, W);
        # doing this before the resize keeps the spatial axes intact
        if cam.ndim == 3:
            cam = cam.mean(axis=0)

        # Apply ReLU to retain only positive activations
        cam = np.maximum(cam, 0)

        # Normalize Grad-CAM to [0, 1], guarding against division by zero
        cam = cam - np.min(cam)
        if np.max(cam) > 0:
            cam = cam / np.max(cam)

        # Resize the cam to match the resolution of the original image
        cam = cv2.resize(cam, tuple(self.resolution))
        return cam

    def generate_gradcam(self, fmap, video_frame, grads):
        """
        Generate the Grad-CAM heatmap and overlay it on the frame.
        """
        cam = self.grad_cam(fmap, grads)

        # Scale the normalized cam to 0-255 and apply a colormap
        cam = np.uint8(255 * cam)
        heatmap = cv2.applyColorMap(cam, cv2.COLORMAP_JET)

        # raw_frames are stored as uint8 BGR crops, so no color conversion
        # or rescaling is needed before blending
        video_frame = np.uint8(video_frame)

        # Blend the heatmap with the original frame; a moderate heatmap weight
        # keeps the face visible underneath the overlay
        alpha = 0.4  # weight for the heatmap
        beta = 1 - alpha  # weight for the original frame
        overlayed_img = cv2.addWeighted(heatmap, alpha, video_frame, beta, 0)

        return overlayed_img

    def predict(self, video, seq_length=None):
        """
        Predict whether a video is real or fake.
    
        Args:
            video (str): Path to the video file
            seq_length (int, optional): Number of frames to use
    
        Returns:
            tuple: (prediction_result, gradcam_image, classification_details)
        """
        frames, raw_frames = self.preprocess(video, seq_length)
    
        if not frames:
            return "No faces detected in the video", None, None
    
        # Prepare input tensor for the model
        target_seq_length = (
            seq_length if seq_length is not None else self.default_frame_count
        )
        input_tensor = torch.stack(frames).unsqueeze(0)
        input_tensor = input_tensor.view(1, target_seq_length, 3, *self.resolution)
        input_tensor = input_tensor.to(self.device)
        
        # Grad-CAM needs gradients, so keep grad tracking enabled
        # (no torch.no_grad() context here)
        input_tensor.requires_grad_(True)

        # Forward pass: the model returns feature maps, attention weights,
        # and class logits
        fmap, attn_wts, logits = self.model(input_tensor)

        # Register a hook to capture gradients flowing back into the feature maps
        fmap.register_hook(self.save_gradients)
    
        # Get predictions for all classes
        class_probs = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    
        # Get the predicted class
        predicted_class_idx = np.argmax(class_probs)
        predicted_class = (
            self.classes[predicted_class_idx]
            if predicted_class_idx < len(self.classes)
            else "Unknown"
        )
        prediction = "Deepfake" if predicted_class_idx > 0 else "Real"
    
        # Confidence for the Real/Deepfake verdict, as a percentage
        confidence_deepfake_real = (
            round(class_probs[1:].max() * 100, 2)
            if prediction == "Deepfake"
            else round(class_probs[0] * 100, 2)
        )
        prediction_string = f"{prediction} {confidence_deepfake_real:.2f}% Confidence"
    
        # Create detailed classification results as a dictionary
        if prediction == "Deepfake":
            # For deepfakes, show probabilities for each deepfake type
            classification_details = {
                self.classes[i]: float(class_probs[i]) for i in range(1, len(self.classes))
            }
        else:
            # For real videos, just show real confidence
            classification_details = {
                "Real": float(class_probs[0])
            }
    
        # Backpropagate for Grad-CAM
        self.model.zero_grad()
        logits[0, predicted_class_idx].backward()
        grads = self.gradients
    
        # Generate the Grad-CAM visualization for the middle frame of the sequence
        if raw_frames:
            middle_idx = len(raw_frames) // 2
            gradcam_image = self.generate_gradcam(fmap, raw_frames[middle_idx], grads)
        else:
            gradcam_image = None
    
        return prediction_string, gradcam_image, classification_details
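

if __name__ == "__main__":
    # Minimal usage sketch. It assumes "model.pt" and "params.yaml" sit in the
    # working directory and takes the video path from the command line; this
    # CLI is illustrative, not part of the original module.
    import sys

    if len(sys.argv) < 2:
        print("usage: python prediction.py <video_path> [seq_length]")
        sys.exit(1)

    video_path = sys.argv[1]
    seq_len = int(sys.argv[2]) if len(sys.argv) > 2 else None

    predictor = Prediction()
    result, gradcam_img, details = predictor.predict(video_path, seq_len)

    print(result)
    if details:
        for label, prob in details.items():
            print(f"  {label}: {prob * 100:.2f}%")
    if gradcam_img is not None:
        cv2.imwrite("gradcam_overlay.png", gradcam_img)
        print("Grad-CAM overlay saved to gradcam_overlay.png")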