# Deepfake / model.py
# Author: Shreesha1 — initial commit (b5fabfd, verified)
import torch
import torch.nn as nn
from torchvision import models
class FeatureExtractor(nn.Module):
    """
    Per-frame spatial feature extractor built on a pre-trained ResNeXt50.

    The ImageNet classification head is replaced with an identity mapping,
    so the module maps each input frame to a flat feature vector of size
    ``feature_dim``.
    """

    def __init__(self, freeze=True):
        """
        Args:
            freeze (bool): If True, all backbone parameters are frozen
                (requires_grad=False) so only downstream layers train.
        """
        super().__init__()
        # ResNeXt50 backbone with ImageNet-pretrained weights.
        # `weights=` is the modern torchvision API (replaces pretrained=True).
        self.model = models.resnext50_32x4d(
            weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2
        )
        if freeze:
            # Keep the pretrained backbone fixed during training.
            for p in self.model.parameters():
                p.requires_grad = False
        # Width of the penultimate layer, i.e. the input size of the
        # original fc classifier head.
        self.feature_dim = self.model.fc.in_features
        # Swap the 1000-way ImageNet classifier for a pass-through so the
        # forward pass yields the pooled features directly.
        self.model.fc = nn.Identity()

    def forward(self, x):
        """Map a frame batch [B*T, C, H, W] to features [B*T, feature_dim]."""
        return self.model(x)
class DeepfakeDetector(nn.Module):
    """
    CNN + bidirectional LSTM video classifier (Real vs. Fake).

    Each frame is encoded by the frozen ResNeXt feature extractor; the
    resulting per-frame feature sequence is modeled by a bidirectional LSTM,
    and a small MLP head maps the sequence summary to class logits.
    """

    def __init__(self, cnn_feature_dim, lstm_hidden_size=512, lstm_layers=2,
                 num_classes=2, dropout=0.5):
        """
        Args:
            cnn_feature_dim (int): Output dimension of the FeatureExtractor
                (e.g. 2048 for ResNeXt50).
            lstm_hidden_size (int): Number of features in the LSTM hidden state.
            lstm_layers (int): Number of stacked LSTM layers.
            num_classes (int): Number of output classes (2: Real/Fake).
            dropout (float): Dropout probability for regularization.
        """
        super().__init__()
        self.feature_extractor = FeatureExtractor(freeze=True)
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers

        # --- Sequence modeling (LSTM) ---
        # Consumes one CNN feature vector per frame.
        self.lstm = nn.LSTM(
            input_size=cnn_feature_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,        # input is [B, T, features]
            bidirectional=True,      # read the sequence forwards and backwards
            # nn.LSTM applies dropout only *between* stacked layers, so it
            # must be 0 for a single layer (PyTorch warns otherwise).
            dropout=dropout if lstm_layers > 1 else 0,
        )

        # --- Classification head ---
        self.fc1 = nn.Linear(
            lstm_hidden_size * 2,    # * 2: forward + backward directions
            lstm_hidden_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(lstm_hidden_size // 2, num_classes)

    def forward(self, x):
        """
        Args:
            x: Video batch of shape [B, T, C, H, W]
               (B = batch, T = frames per clip, C/H/W = frame dims).

        Returns:
            Logits of shape [B, num_classes].
        """
        batch_size, seq_len, c, h, w = x.shape

        # --- 1. Per-frame feature extraction (CNN) ---
        # Fold time into the batch dimension so all frames go through the CNN
        # in one pass; reshape (not view) also handles non-contiguous input.
        x_flat = x.reshape(batch_size * seq_len, c, h, w)
        features = self.feature_extractor(x_flat)      # [B*T, cnn_feature_dim]

        # --- 2. Sequence modeling (LSTM) ---
        features_seq = features.view(batch_size, seq_len, -1)  # [B, T, F]
        lstm_out, _ = self.lstm(features_seq)          # [B, T, 2*hidden]

        # BUGFIX: with a bidirectional LSTM, lstm_out[:, -1, :] would pair the
        # forward direction's output after the *whole* sequence with the
        # backward direction's output after seeing only *one* frame (the
        # backward pass runs T-1 -> 0). Take each direction's output after it
        # has processed the full clip instead: forward at t = T-1, backward
        # at t = 0. The concatenated size is still [B, 2*hidden].
        hidden = self.lstm_hidden_size
        fwd_last = lstm_out[:, -1, :hidden]            # forward @ last frame
        bwd_last = lstm_out[:, 0, hidden:]             # backward @ first frame
        summary = torch.cat((fwd_last, bwd_last), dim=1)

        # --- 3. Classification ---
        out = self.dropout(self.relu(self.fc1(summary)))
        return self.fc2(out)                           # [B, num_classes]