import torch import torch.nn as nn class SpatialAttention(nn.Module): """ Spatial Attention mechanism mapping specific potential local dominant regions (lips, eyes, mouth) """ def __init__(self, in_channels): super(SpatialAttention, self).__init__() # Using 1x1 convolution to identify spatial feature importance self.conv = nn.Conv2d(in_channels, 1, kernel_size=1) def forward(self, x): attn_weights = torch.sigmoid(self.conv(x)) return x * attn_weights class ChannelAttention(nn.Module): """ Channel Attention capturing global information constraints (overall face structure, lighting, poses) """ def __init__(self, in_channels, reduction=16): super(ChannelAttention, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.max_pool = nn.AdaptiveMaxPool2d(1) # Shared Multilayer Perceptron self.mlp = nn.Sequential( nn.Conv2d(in_channels, in_channels // reduction, 1, bias=False), nn.ReLU(inplace=True), nn.Conv2d(in_channels // reduction, in_channels, 1, bias=False) ) self.sigmoid = nn.Sigmoid() def forward(self, x): avg_out = self.mlp(self.avg_pool(x)) max_out = self.mlp(self.max_pool(x)) attn_weights = self.sigmoid(avg_out + max_out) return x * attn_weights class DualAttentionCrossFusion(nn.Module): """ Combines both Spatial and Channel attentions in parallel and uses cross-fusion to merge them avoiding information destruction. """ def __init__(self, in_channels): super(DualAttentionCrossFusion, self).__init__() self.spatial = SpatialAttention(in_channels) self.channel = ChannelAttention(in_channels) # DCNN cross-conv to seamlessly fuse sizes natively self.cross_conv = nn.Conv2d(in_channels * 2, in_channels, kernel_size=1) def forward(self, x): s_attn = self.spatial(x) c_attn = self.channel(x) # Cross-fusion: concatenate maps fused = torch.cat([s_attn, c_attn], dim=1) return self.cross_conv(fused) class DCNN(nn.Module): """ Deep Convolutional Neural Network module matching Requirement Form: - 3 Conv layers - 2 MaxPool layers """ def __init__(self): super(DCNN, self).__init__() # Expected input: 1 Channel (HOG preprocessed grayscale), 64x64 self.c1 = nn.Conv2d(1, 32, kernel_size=5, padding=2) self.s1 = nn.MaxPool2d(2, 2) self.c2 = nn.Conv2d(32, 64, kernel_size=5, padding=2) self.s2 = nn.MaxPool2d(2, 2) self.c3 = nn.Conv2d(64, 128, kernel_size=5, padding=2) self.relu = nn.ReLU(inplace=True) def forward(self, x): x = self.s1(self.relu(self.c1(x))) # Output: 32x32 x = self.s2(self.relu(self.c2(x))) # Output: 16x16 x = self.relu(self.c3(x)) # Output: 16x16 return x class BiLSTM(nn.Module): """ Bidirectional LSTM module processing both Forward/Backward time series for continuous facial representations. """ def __init__(self, input_dim, hidden_dim): super(BiLSTM, self).__init__() self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True, bidirectional=True) def forward(self, x): # Output shape: (Batch, Sequence Length, Hidden_Dim * 2) out, _ = self.lstm(x) return out class DCNN_BiLSTM_DAM(nn.Module): """ Full Architecture integration exactly matching the college requirement constraints. """ def __init__(self, num_classes=7): super(DCNN_BiLSTM_DAM, self).__init__() self.dcnn = DCNN() self.dam = DualAttentionCrossFusion(in_channels=128) # After 16x16 pooling from DCNN -> spatial flattened length = 256 self.seq_len = 16 * 16 self.feature_channels = 128 # Bi-LSTM for sequence temporal features self.bilstm = BiLSTM(input_dim=self.feature_channels, hidden_dim=64) # Fully connected block bilstm_output_features = 64 * 2 # Bidirectional self.fc1 = nn.Linear(bilstm_output_features * self.seq_len, 300) self.dropout = nn.Dropout(0.4) # Softmax classifier implemented automatically by PyTorch CrossEntropyLoss on the final un-activated layer self.fc2 = nn.Linear(300, num_classes) def forward(self, x): # 1. Feature Extraction (DCNN) features = self.dcnn(x) # 2. Attention Focus (Dual Attention Mechanism) attention_maps = self.dam(features) # Reshaping Spatial Dims into sequences for BiLSTM B, C, H, W = attention_maps.size() seq_input = attention_maps.view(B, C, H * W).permute(0, 2, 1) # 3. Sequential Processing (Bi-LSTM) bilstm_out = self.bilstm(seq_input) # Flatten for classification flat_out = bilstm_out.reshape(B, -1) # 4. Classification fc1_out = self.fc1(flat_out) dropped_out = self.dropout(fc1_out) # Raw logits for external Softmax output = self.fc2(dropped_out) return output # --- Example Usage --- if __name__ == "__main__": model = DCNN_BiLSTM_DAM(num_classes=7) dummy_input = torch.randn(1, 1, 64, 64) predictions = model(dummy_input) print(f"Model Output Shape (Batches, Classes): {predictions.shape}")