File size: 4,774 Bytes
b5fabfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import torch
import torch.nn as nn
from torchvision import models

class FeatureExtractor(nn.Module):
    """
    Extracts spatial features from a single frame using a pre-trained ResNeXt.

    The torchvision ``resnext50_32x4d`` backbone is loaded with ImageNet
    weights and its final classification layer (``fc``) is replaced with
    ``nn.Identity`` so the module returns the features that used to feed
    the classifier.

    Attributes:
        model: The ResNeXt50 backbone with ``fc`` replaced by ``nn.Identity``.
        feature_dim: ``in_features`` of the removed classifier, i.e. the width
            of the feature vector returned by ``forward``.
        freeze: Whether the backbone is frozen (see ``__init__``).
    """

    def __init__(self, freeze: bool = True):
        """
        Args:
            freeze (bool): If True, every backbone parameter gets
                ``requires_grad = False`` and the backbone is kept in
                evaluation mode for the lifetime of the module.
        """
        super().__init__()
        self.freeze = freeze

        # Load a pretrained ResNeXt50
        # weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2 is the new syntax
        self.model = models.resnext50_32x4d(weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2)

        # Record the classifier's input width before discarding the classifier.
        self.feature_dim = self.model.fc.in_features

        # Remove the final classification layer (we don't need 1000 ImageNet classes).
        # nn.Identity() is a placeholder that just passes the input through.
        self.model.fc = nn.Identity()

        if freeze:
            for param in self.model.parameters():
                param.requires_grad = False
            # Clearing requires_grad does not stop BatchNorm layers from
            # updating their running statistics; only eval mode does.
            self.model.eval()

    def train(self, mode: bool = True):
        """Set the training mode, keeping a frozen backbone in eval mode.

        A parent module's ``.train()`` call recurses into this method, so
        without the override a frozen backbone would be switched back to
        training mode and its BatchNorm running statistics would drift away
        from the pretrained values on every forward pass.

        Args:
            mode (bool): True for training mode, False for evaluation mode.

        Returns:
            FeatureExtractor: ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        if self.freeze:
            self.model.eval()
        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the backbone on a batch of frames.

        Args:
            x (torch.Tensor): Frames of shape ``[B*T, C, H, W]``.

        Returns:
            torch.Tensor: Features of shape ``[B*T, feature_dim]``.
        """
        return self.model(x)

class DeepfakeDetector(nn.Module):
    """
    Combines the CNN extractor and LSTM sequencer to classify a video.

    Pipeline: every frame goes through a frozen ``FeatureExtractor``; the
    per-frame features are fed as a sequence to a bidirectional LSTM; a
    summary of the LSTM output is classified by a two-layer head
    (Linear -> ReLU -> Dropout -> Linear).
    """

    # Supported ways of collapsing the LSTM output over time.
    _POOLING_MODES = ("last_step", "final_states")

    def __init__(self, cnn_feature_dim, lstm_hidden_size=512, lstm_layers=2, num_classes=2, dropout=0.5,
                 *, temporal_pooling="last_step"):
        """
        Args:
            cnn_feature_dim (int): The output dimension from our FeatureExtractor (e.g., 2048 for ResNeXt50)
            lstm_hidden_size (int): The number of features in the LSTM's hidden state.
            lstm_layers (int): The number of stacked LSTM layers.
            num_classes (int): The number of output classes (2: Real/Fake).
            dropout (float): Dropout probability for regularization.
            temporal_pooling (str): How the LSTM output is summarised.
                ``"last_step"`` (default, the historical behaviour) takes
                ``lstm_out[:, -1, :]``. At that step the backward direction
                has processed only the final frame, so half of the summary
                ignores the rest of the clip. ``"final_states"`` instead
                concatenates the last layer's final forward and final
                backward hidden states from ``h_n``, so both halves have
                seen the whole sequence. Both modes give a
                ``[B, 2 * lstm_hidden_size]`` summary, but a head trained
                with one mode should not be evaluated with the other.

        Raises:
            ValueError: If ``temporal_pooling`` is not a supported mode.
        """
        super().__init__()

        if temporal_pooling not in self._POOLING_MODES:
            raise ValueError(
                f"temporal_pooling must be one of {self._POOLING_MODES}, got {temporal_pooling!r}"
            )

        self.feature_extractor = FeatureExtractor(freeze=True)
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers
        self.temporal_pooling = temporal_pooling

        # --- Sequence Modeling (LSTM) ---
        # The LSTM will take the CNN features for each frame as input
        self.lstm = nn.LSTM(
            input_size=cnn_feature_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,       # Input shape is [BatchSize, SeqLength, Features]
            bidirectional=True,     # It will look at the sequence forwards and backwards
            # Inter-layer dropout is only passed when the LSTM is stacked.
            dropout=dropout if lstm_layers > 1 else 0
        )

        # --- Classification Head ---
        # We'll build a small classifier on top of the LSTM's output
        self.fc1 = nn.Linear(
            lstm_hidden_size * 2,  # * 2 because the LSTM is bidirectional
            lstm_hidden_size // 2
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(lstm_hidden_size // 2, num_classes)  # Final output: 2 classes

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x (torch.Tensor): Clips of shape ``[B, T, C, H, W]``
                (batch, sequence length, channels, height, width). The
                tensor does not need to be contiguous.

        Returns:
            torch.Tensor: Unnormalised class scores of shape
            ``[B, num_classes]``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected input of shape [B, T, C, H, W], got {tuple(x.shape)}"
            )
        batch_size, seq_len, c, h, w = x.shape

        # --- 1. Feature Extraction (CNN) ---
        # Fold time into the batch axis so all frames go through the CNN at once.
        # reshape (not view) so permuted / sliced, non-contiguous clips work too.
        x_flat = x.reshape(batch_size * seq_len, c, h, w)
        features = self.feature_extractor(x_flat)
        # 'features' now has shape [B * T, cnn_feature_dim]

        # --- 2. Sequence Modeling (LSTM) ---
        # Unfold back into sequences: [B, T, cnn_feature_dim]
        features_seq = features.reshape(batch_size, seq_len, -1)

        # lstm_out: [B, T, 2 * lstm_hidden_size] (because bidirectional)
        # h_n: [2 * lstm_layers, B, lstm_hidden_size]; the cell state is unused.
        lstm_out, (h_n, _) = self.lstm(features_seq)

        if self.temporal_pooling == "final_states":
            # Last layer: h_n[-2] is the forward direction after the last
            # frame, h_n[-1] is the backward direction after the first frame.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            # Historical behaviour: output at the last time step.
            summary = lstm_out[:, -1, :]
        # 'summary' shape: [B, 2 * lstm_hidden_size]

        # --- 3. Classification ---
        hidden = self.dropout(self.relu(self.fc1(summary)))
        # Result shape: [B, num_classes] (e.g., [8, 2])
        return self.fc2(hidden)