Spaces:
Sleeping
Sleeping
| import torch | |
| import torch.nn as nn | |
class SpatialAttention(nn.Module):
    """Spatial attention gate that re-weights every spatial location.

    A 1x1 convolution collapses the channel axis into a single-channel score
    map, a sigmoid squashes the scores into (0, 1), and the resulting map is
    broadcast across all channels of the input. The stated intent is to
    emphasise locally dominant facial regions (lips, eyes, mouth).

    Args:
        in_channels: Number of channels of the feature map given to
            ``forward``.
    """

    def __init__(self, in_channels):
        super().__init__()
        # 1x1 convolution: one importance score per spatial position.
        self.conv = nn.Conv2d(in_channels, 1, kernel_size=1)

    def forward(self, x):
        """Scale ``x`` by its per-location attention map.

        Args:
            x: Feature map of shape ``(N, in_channels, H, W)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Shape (N, 1, H, W); broadcasting applies it to every channel.
        attn_weights = torch.sigmoid(self.conv(x))
        return x * attn_weights
class ChannelAttention(nn.Module):
    """Channel attention gate that re-weights whole feature channels.

    The input is summarised per channel by global average pooling and global
    max pooling. Both summaries go through the same bottleneck MLP (built
    from 1x1 convolutions), are summed, and passed through a sigmoid to give
    one weight per channel. The stated intent is to capture global cues
    (overall face structure, lighting, pose).

    Args:
        in_channels: Number of channels of the feature map given to
            ``forward``.
        reduction: Bottleneck ratio; the hidden width of the MLP is
            ``in_channels // reduction``, but never less than 1.
    """

    def __init__(self, in_channels, reduction=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Integer division gives 0 when in_channels < reduction, which would
        # create a zero-width bottleneck. Clamp so the MLP always has at
        # least one hidden channel.
        hidden_channels = max(in_channels // reduction, 1)

        # Shared multilayer perceptron applied to both pooled descriptors.
        self.mlp = nn.Sequential(
            nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_channels, in_channels, 1, bias=False),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Scale ``x`` by its per-channel attention weights.

        Args:
            x: Feature map of shape ``(N, in_channels, H, W)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        avg_out = self.mlp(self.avg_pool(x))
        max_out = self.mlp(self.max_pool(x))
        # Shape (N, in_channels, 1, 1); broadcasting applies it spatially.
        attn_weights = self.sigmoid(avg_out + max_out)
        return x * attn_weights
class DualAttentionCrossFusion(nn.Module):
    """Run spatial and channel attention in parallel and fuse the results.

    Both attention branches receive the same input. Their outputs are
    concatenated along the channel axis and projected back to
    ``in_channels`` with a 1x1 convolution, so neither branch overwrites the
    other before fusion.

    Args:
        in_channels: Number of channels of the feature map given to
            ``forward``; also the number of output channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.spatial = SpatialAttention(in_channels)
        self.channel = ChannelAttention(in_channels)
        # 1x1 convolution mapping the 2 * in_channels concatenation back to
        # in_channels.
        self.cross_conv = nn.Conv2d(in_channels * 2, in_channels, kernel_size=1)

    def forward(self, x):
        """Return the fused attention features for ``x``.

        Args:
            x: Feature map of shape ``(N, in_channels, H, W)``.

        Returns:
            Tensor of shape ``(N, in_channels, H, W)``.
        """
        s_attn = self.spatial(x)
        c_attn = self.channel(x)
        # Cross-fusion: stack both attended maps on the channel axis.
        fused = torch.cat([s_attn, c_attn], dim=1)
        return self.cross_conv(fused)
class DCNN(nn.Module):
    """Convolutional feature extractor: three conv layers, two max-pools.

    Every convolution uses a 5x5 kernel with padding 2, so it preserves the
    spatial size; each 2x2 max-pool with stride 2 halves it. A ReLU follows
    every convolution. The layer is written for single-channel input; the
    original comment names HOG-preprocessed 64x64 grayscale images as the
    expected input.
    """

    def __init__(self):
        super().__init__()
        # Expected input: 1 channel (HOG preprocessed grayscale), 64x64.
        self.c1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        self.s1 = nn.MaxPool2d(2, 2)
        self.c2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        self.s2 = nn.MaxPool2d(2, 2)
        self.c3 = nn.Conv2d(64, 128, kernel_size=5, padding=2)
        # Stateless, so one instance is reused after every convolution.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Extract a 128-channel feature map from ``x``.

        Args:
            x: Image batch of shape ``(N, 1, H, W)``.

        Returns:
            Tensor of shape ``(N, 128, H // 4, W // 4)``; for 64x64 input
            that is ``(N, 128, 16, 16)``.
        """
        x = self.s1(self.relu(self.c1(x)))  # 64x64 input -> 32x32
        x = self.s2(self.relu(self.c2(x)))  # -> 16x16
        x = self.relu(self.c3(x))  # no pooling: stays 16x16
        return x
class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM over a batch-first sequence.

    The sequence is processed in both the forward and the backward
    direction, and the two hidden states are concatenated at every step.

    Args:
        input_dim: Feature size of each sequence element.
        hidden_dim: Hidden size per direction; the output feature size is
            ``2 * hidden_dim``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        """Return the per-step outputs of the LSTM.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, 2 * hidden_dim)``. The final
            hidden and cell states are discarded.
        """
        out, _ = self.lstm(x)
        return out
class DCNN_BiLSTM_DAM(nn.Module):
    """Full classifier: DCNN -> dual attention -> Bi-LSTM -> two FC layers.

    The convolutional feature map is refined by the dual attention module,
    its spatial positions are unrolled into a sequence for the Bi-LSTM, and
    the flattened Bi-LSTM outputs are classified by two linear layers. The
    network returns raw logits.

    ``fc1`` is sized for a 16x16 feature map (256 sequence steps), which is
    what the DCNN produces for 64x64 input. Inputs that yield a different
    number of spatial positions will not match ``fc1``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        self.dcnn = DCNN()
        self.dam = DualAttentionCrossFusion(in_channels=128)

        # The DCNN leaves a 16x16 map for 64x64 input; flattened, that is a
        # sequence of 256 positions.
        self.seq_len = 16 * 16
        self.feature_channels = 128

        # Bi-LSTM consumes one 128-dim feature vector per spatial position.
        self.bilstm = BiLSTM(input_dim=self.feature_channels, hidden_dim=64)

        # Fully connected block.
        bilstm_output_features = 64 * 2  # forward + backward hidden states
        self.fc1 = nn.Linear(bilstm_output_features * self.seq_len, 300)
        self.dropout = nn.Dropout(0.4)
        # No softmax here: the final layer emits logits, as expected by
        # torch.nn.CrossEntropyLoss.
        self.fc2 = nn.Linear(300, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Image batch of shape ``(N, 1, 64, 64)``.

        Returns:
            Tensor of shape ``(N, num_classes)`` holding un-normalised
            logits.
        """
        # 1. Feature extraction (DCNN).
        features = self.dcnn(x)

        # 2. Attention focus (dual attention mechanism).
        attention_maps = self.dam(features)

        # Unroll the spatial grid into a sequence:
        # (B, C, H, W) -> (B, C, H*W) -> (B, H*W, C).
        B, C, H, W = attention_maps.size()
        seq_input = attention_maps.view(B, C, H * W).permute(0, 2, 1)

        # 3. Sequential processing (Bi-LSTM).
        bilstm_out = self.bilstm(seq_input)

        # Flatten every step's output into one vector per sample.
        flat_out = bilstm_out.reshape(B, -1)

        # 4. Classification.
        # NOTE(review): no non-linearity sits between fc1 and fc2, so with
        # dropout inactive the two layers compose into one affine map. Kept
        # as-is so outputs of existing checkpoints do not change; confirm
        # whether an activation was intended.
        fc1_out = self.fc1(flat_out)
        dropped_out = self.dropout(fc1_out)

        # Raw logits; softmax is applied externally.
        output = self.fc2(dropped_out)
        return output
# --- Example Usage ---
if __name__ == "__main__":
    # Smoke test: push one random 64x64 single-channel image through the
    # model and report the shape of the logits.
    model = DCNN_BiLSTM_DAM(num_classes=7)
    # Inference-style demo: disable dropout and skip gradient tracking.
    model.eval()
    dummy_input = torch.randn(1, 1, 64, 64)
    with torch.no_grad():
        predictions = model(dummy_input)
    print(f"Model Output Shape (Batches, Classes): {predictions.shape}")