# Deepfake video detector: per-frame ResNeXt50 feature extraction followed by
# a bidirectional LSTM over the frame sequence and a small classification head.
import torch
import torch.nn as nn
from torchvision import models
class FeatureExtractor(nn.Module):
    """
    Extracts spatial features from a single frame using a pre-trained ResNeXt.

    Args:
        freeze (bool): If True, the backbone's parameters are frozen and the
            backbone is pinned to eval mode so its BatchNorm statistics stay
            at their pretrained ImageNet values.
    """

    def __init__(self, freeze=True):
        super(FeatureExtractor, self).__init__()
        # Load a pretrained ResNeXt50.
        # weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2 is the new syntax.
        self.model = models.resnext50_32x4d(
            weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2
        )
        # Remember the freeze choice so train() can honor it later.
        self.frozen = freeze
        if freeze:
            for param in self.model.parameters():
                param.requires_grad = False
            # BUG FIX: requires_grad=False freezes the *weights* only.
            # BatchNorm running_mean/running_var are buffers, not parameters,
            # and keep updating in train mode — silently drifting away from
            # the pretrained statistics. Pin the backbone to eval mode.
            self.model.eval()
        # Number of output features from the layer before the classifier
        # (self.model.fc in ResNeXt).
        self.feature_dim = self.model.fc.in_features
        # Remove the final classification layer (we don't need 1000 ImageNet
        # classes); nn.Identity() just passes its input through.
        self.model.fc = nn.Identity()

    def train(self, mode=True):
        """Switch train/eval mode, but keep a frozen backbone in eval mode.

        Without this override, calling .train() on a parent module (the usual
        training loop pattern) would flip the frozen backbone's BatchNorm
        layers back into train mode.
        """
        super().train(mode)
        if self.frozen:
            self.model.eval()
        return self

    def forward(self, x):
        # Input x has shape [B*T, C, H, W];
        # output has shape [B*T, feature_dim].
        return self.model(x)
class DeepfakeDetector(nn.Module):
    """
    Combines the CNN extractor and LSTM sequencer to classify a video.
    """

    def __init__(self, cnn_feature_dim, lstm_hidden_size=512, lstm_layers=2,
                 num_classes=2, dropout=0.5):
        """
        Args:
            cnn_feature_dim (int): Output dimension of the FeatureExtractor
                (e.g., 2048 for ResNeXt50).
            lstm_hidden_size (int): Number of features in the LSTM's hidden state.
            lstm_layers (int): Number of stacked LSTM layers.
            num_classes (int): Number of output classes (2: Real/Fake).
            dropout (float): Dropout probability for regularization.
        """
        super(DeepfakeDetector, self).__init__()
        self.feature_extractor = FeatureExtractor(freeze=True)
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers

        # --- Sequence modeling (LSTM) ---
        # The LSTM takes the CNN features for each frame as input.
        self.lstm = nn.LSTM(
            input_size=cnn_feature_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,       # input shape is [batch, seq, features]
            bidirectional=True,     # read the sequence forwards and backwards
            # nn.LSTM applies dropout only *between* stacked layers, so it is
            # meaningless (and warns) when num_layers == 1.
            dropout=dropout if lstm_layers > 1 else 0,
        )

        # --- Classification head ---
        self.fc1 = nn.Linear(
            lstm_hidden_size * 2,   # * 2 because the LSTM is bidirectional
            lstm_hidden_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(lstm_hidden_size // 2, num_classes)

    def forward(self, x):
        """Classify a clip.

        Args:
            x: Tensor of shape [B, T, C, H, W] — batch of T-frame clips
               (e.g., T=20 frames of 3x224x224).

        Returns:
            Logits of shape [B, num_classes].
        """
        batch_size, seq_len, c, h, w = x.shape

        # --- 1. Feature extraction (CNN) ---
        # Fold time into the batch dimension so every frame goes through the
        # CNN in a single pass: [B, T, C, H, W] -> [B*T, C, H, W].
        x_flat = x.view(batch_size * seq_len, c, h, w)
        features = self.feature_extractor(x_flat)   # [B*T, cnn_feature_dim]

        # --- 2. Sequence modeling (LSTM) ---
        # Restore the sequence axis: [B, T, cnn_feature_dim].
        features_seq = features.view(batch_size, seq_len, -1)
        # lstm_out: [B, T, 2 * lstm_hidden_size] (forward ++ backward);
        # final hidden/cell states are not needed here.
        lstm_out, _ = self.lstm(features_seq)

        # BUG FIX: for a bidirectional LSTM, lstm_out[:, -1, :] pairs the
        # forward direction's full-sequence summary with the *backward*
        # direction's output after it has seen only the last frame. Instead,
        # take the forward output at the last step and the backward output at
        # the first step — each has processed the entire sequence.
        hid = self.lstm_hidden_size
        forward_summary = lstm_out[:, -1, :hid]     # [B, H]
        backward_summary = lstm_out[:, 0, hid:]     # [B, H]
        summary = torch.cat((forward_summary, backward_summary), dim=1)

        # --- 3. Classification ---
        z = self.dropout(self.relu(self.fc1(summary)))
        out = self.fc2(z)
        return out  # [B, num_classes]