# Deepfake video detector: per-frame ResNeXt50 feature extraction followed by
# a bidirectional LSTM over the frame sequence and a small classification head.
import torch
import torch.nn as nn
from torchvision import models
class FeatureExtractor(nn.Module):
    """
    Extracts spatial features from a single frame using a pre-trained ResNeXt.

    Args:
        freeze (bool): If True, the backbone's parameters are frozen and the
            backbone is pinned to eval mode so its BatchNorm statistics stay
            at their pretrained ImageNet values.
    """

    def __init__(self, freeze=True):
        super(FeatureExtractor, self).__init__()
        # Load a pretrained ResNeXt50.
        # weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2 is the new syntax.
        self.model = models.resnext50_32x4d(
            weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2
        )
        # Remember the freeze choice so train() can honor it later.
        self.frozen = freeze
        if freeze:
            for param in self.model.parameters():
                param.requires_grad = False
            # BUG FIX: requires_grad=False freezes the *weights* only.
            # BatchNorm running_mean/running_var are buffers, not parameters,
            # and keep updating in train mode — silently drifting away from
            # the pretrained statistics. Pin the backbone to eval mode.
            self.model.eval()
        # Number of output features from the layer before the classifier
        # (self.model.fc in ResNeXt).
        self.feature_dim = self.model.fc.in_features
        # Remove the final classification layer (we don't need 1000 ImageNet
        # classes); nn.Identity() just passes its input through.
        self.model.fc = nn.Identity()

    def train(self, mode=True):
        """Switch train/eval mode, but keep a frozen backbone in eval mode.

        Without this override, calling .train() on a parent module (the usual
        training loop pattern) would flip the frozen backbone's BatchNorm
        layers back into train mode.
        """
        super().train(mode)
        if self.frozen:
            self.model.eval()
        return self

    def forward(self, x):
        # Input x has shape [B*T, C, H, W];
        # output has shape [B*T, feature_dim].
        return self.model(x)
class DeepfakeDetector(nn.Module):
    """
    Combines the CNN extractor and LSTM sequencer to classify a video.
    """

    def __init__(self, cnn_feature_dim, lstm_hidden_size=512, lstm_layers=2,
                 num_classes=2, dropout=0.5):
        """
        Args:
            cnn_feature_dim (int): Output dimension of the FeatureExtractor
                (e.g., 2048 for ResNeXt50).
            lstm_hidden_size (int): Number of features in the LSTM's hidden state.
            lstm_layers (int): Number of stacked LSTM layers.
            num_classes (int): Number of output classes (2: Real/Fake).
            dropout (float): Dropout probability for regularization.
        """
        super(DeepfakeDetector, self).__init__()
        self.feature_extractor = FeatureExtractor(freeze=True)
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers

        # --- Sequence modeling (LSTM) ---
        # The LSTM takes the CNN features for each frame as input.
        self.lstm = nn.LSTM(
            input_size=cnn_feature_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,       # input shape is [batch, seq, features]
            bidirectional=True,     # read the sequence forwards and backwards
            # nn.LSTM applies dropout only *between* stacked layers, so it is
            # meaningless (and warns) when num_layers == 1.
            dropout=dropout if lstm_layers > 1 else 0,
        )

        # --- Classification head ---
        self.fc1 = nn.Linear(
            lstm_hidden_size * 2,   # * 2 because the LSTM is bidirectional
            lstm_hidden_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(lstm_hidden_size // 2, num_classes)

    def forward(self, x):
        """Classify a clip.

        Args:
            x: Tensor of shape [B, T, C, H, W] — batch of T-frame clips
               (e.g., T=20 frames of 3x224x224).

        Returns:
            Logits of shape [B, num_classes].
        """
        batch_size, seq_len, c, h, w = x.shape

        # --- 1. Feature extraction (CNN) ---
        # Fold time into the batch dimension so every frame goes through the
        # CNN in a single pass: [B, T, C, H, W] -> [B*T, C, H, W].
        x_flat = x.view(batch_size * seq_len, c, h, w)
        features = self.feature_extractor(x_flat)   # [B*T, cnn_feature_dim]

        # --- 2. Sequence modeling (LSTM) ---
        # Restore the sequence axis: [B, T, cnn_feature_dim].
        features_seq = features.view(batch_size, seq_len, -1)
        # lstm_out: [B, T, 2 * lstm_hidden_size] (forward ++ backward);
        # final hidden/cell states are not needed here.
        lstm_out, _ = self.lstm(features_seq)

        # BUG FIX: for a bidirectional LSTM, lstm_out[:, -1, :] pairs the
        # forward direction's full-sequence summary with the *backward*
        # direction's output after it has seen only the last frame. Instead,
        # take the forward output at the last step and the backward output at
        # the first step — each has processed the entire sequence.
        hid = self.lstm_hidden_size
        forward_summary = lstm_out[:, -1, :hid]     # [B, H]
        backward_summary = lstm_out[:, 0, hid:]     # [B, H]
        summary = torch.cat((forward_summary, backward_summary), dim=1)

        # --- 3. Classification ---
        z = self.dropout(self.relu(self.fc1(summary)))
        out = self.fc2(z)
        return out  # [B, num_classes]