Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
| import torch.nn as nn | |
| import torchvision.transforms as transforms | |
| # First Model | |
| # In[ ]: | |
class PoseNetV1(nn.Module):
    """Baseline keypoint-regression CNN.

    Three conv/ReLU/max-pool stages followed by a two-layer MLP head that
    emits 32 values per sample (presumably 16 (x, y) keypoints — confirm
    against the training labels). Spatial-size comments assume a 224x224
    RGB input.
    """

    def __init__(self):
        super(PoseNetV1, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 224 -> 112
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 112 -> 56
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 56 -> 28
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            # FIX: the original wrote 512 * 14 * 14, which only worked by
            # numeric coincidence (512*14*14 == 128*28*28 == 100352). The
            # conv stack actually ends at 128 channels x 28 x 28, so spell
            # the true flattened shape. Same value, so weights/state_dict
            # are unchanged.
            nn.Linear(128 * 28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 32)
        )

    def forward(self, x):
        """Run the feature extractor then the regression head; returns (N, 32)."""
        x = self.conv(x)
        x = self.fc(x)
        return x
| # Here, we added one more layer and we added Dropout to the fully connected layer. We also added a Flatten layer to flatten the output of the convolutional layers before passing it to the fully connected layers. | |
| # In[ ]: | |
class PoseNetV2(nn.Module):
    """Four-stage conv feature extractor with an MLP regression head.

    Emits 32 values per sample; stage comments assume a 224x224 input.
    """

    def __init__(self):
        super(PoseNetV2, self).__init__()

        def stage(c_in, c_out):
            # One downsampling stage: 3x3 conv, ReLU, 2x2 max-pool
            # (halves height and width).
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]

        layers = []
        # 224 -> 112 -> 56 -> 28 -> 14 spatially; 3 -> 256 channels.
        for c_in, c_out in ((3, 32), (32, 64), (64, 128), (128, 256)):
            layers.extend(stage(c_in, c_out))
        self.conv = nn.Sequential(*layers)

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 14 * 14, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 32)
        )

    def forward(self, x):
        """Extract features, then regress the 32 keypoint values."""
        return self.fc(self.conv(x))
| # In[ ]: | |
class PoseNetV3(nn.Module):
    """Conv backbone with four downsampling stages and a dropout-regularized head.

    NOTE(review): architecturally identical to PoseNetV2 in this file —
    likely kept as a separate notebook cell for experiment bookkeeping.
    """

    def __init__(self):
        super(PoseNetV3, self).__init__()
        channels = (3, 32, 64, 128, 256)
        blocks = []
        for i in range(len(channels) - 1):
            blocks.append(nn.Conv2d(channels[i], channels[i + 1],
                                    kernel_size=3, stride=1, padding=1))
            blocks.append(nn.ReLU())
            blocks.append(nn.MaxPool2d(2, 2))  # halves H and W each stage
        # With a 224x224 input the final feature map is 256 x 14 x 14.
        self.conv = nn.Sequential(*blocks)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 14 * 14, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 32)
        )

    def forward(self, x):
        features = self.conv(x)
        return self.fc(features)
| # We added batch normalization in each layer, adaptive pooling, and a Tanh function at the end of the fully connected layers. | |
| # In[ ]: | |
class PoseNetV4(nn.Module):
    """Batch-normalized conv backbone with adaptive pooling and a Tanh head.

    The final Tanh squashes the 32 outputs into [-1, 1] (keypoints are
    presumably normalized to that range — confirm against the labels).
    """

    def __init__(self):
        super(PoseNetV4, self).__init__()
        feature_layers = []
        # Three conv/BN/ReLU/pool stages, each halving the spatial size.
        for c_in, c_out in ((3, 32), (32, 64), (64, 128)):
            feature_layers += [
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]
        feature_layers += [
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            # Adaptive pooling fixes the output at 7x7 regardless of input size.
            nn.AdaptiveAvgPool2d((7, 7))
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 7 * 7, 512),
            nn.ReLU(),
            nn.Dropout(0.4),  # heavier dropout than earlier versions
            nn.Linear(512, 32),
            nn.Tanh()
        )

    def forward(self, x):
        return self.fc(self.conv(x))
| # 4 Layers -> 5 Layers | |
| # | |
| # Tanh() -> Sigmoid() | |
| # In[ ]: | |
class PoseNetV5(nn.Module):
    """Five-stage batch-normalized conv backbone with a Sigmoid head.

    The Sigmoid maps the 32 outputs into [0, 1] (keypoints presumably
    normalized by image size — confirm against the training labels).
    """

    def __init__(self):
        super(PoseNetV5, self).__init__()
        # Four conv/BN/ReLU/pool stages (224 -> 112 -> 56 -> 28 -> 14 for
        # a 224x224 input; the original comment claiming 28x28 after the
        # fourth pool was wrong), then a fifth conv stage and adaptive
        # pooling down to a fixed 7x7.
        feature_layers = []
        for c_in, c_out in ((3, 32), (32, 64), (64, 128), (128, 256)):
            feature_layers += [
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]
        feature_layers += [
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((7, 7))  # input-size-independent 7x7 map
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 512),
            nn.ReLU(),
            nn.Dropout(0.50),  # strongest regularization of the series
            nn.Linear(512, 32),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.fc(self.conv(x))
| # In[ ]: | |
class ResidualBlock(nn.Module):
    """Two 3x3 convs with batch norm and an additive skip connection.

    When the input and output channel counts differ, the identity path is
    replaced by a 1x1 conv + batch norm so the two tensors can be summed.
    """

    def __init__(self, in_channels, out_channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        if in_channels == out_channels:
            # Plain identity mapping — nothing to project.
            self.shortcut = nn.Sequential()
        else:
            # Channel-matching projection for the skip path.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + residual)
class ResPoseNet(nn.Module):
    """Keypoint regressor built from residual blocks.

    Five ResidualBlock stages — the first four each followed by a 2x2
    max-pool — feed an adaptive 7x7 average pool (the original comment
    saying 14x14 was wrong), then an MLP head ending in Sigmoid so the
    32 outputs (16 keypoints x (x, y)) land in [0, 1].
    """

    def __init__(self):
        super(ResPoseNet, self).__init__()
        # For a 224x224 input: 224 -> 112 -> 56 -> 28 -> 14 spatially,
        # then a fixed 7x7 via adaptive pooling.
        self.conv = nn.Sequential(
            ResidualBlock(3, 32),
            nn.MaxPool2d(2, 2),
            ResidualBlock(32, 64),
            nn.MaxPool2d(2, 2),
            ResidualBlock(64, 128),
            nn.MaxPool2d(2, 2),
            ResidualBlock(128, 256),
            nn.MaxPool2d(2, 2),
            ResidualBlock(256, 512),
            nn.AdaptiveAvgPool2d((7, 7))
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 1024),
            nn.ReLU(),
            nn.Dropout(0.40),
            nn.Linear(1024, 32),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.fc(self.conv(x))
# Shared preprocessing pipeline: convert a PIL image / ndarray to a CHW
# float tensor in [0, 1], then shift each RGB channel to [-1, 1] via
# (x - 0.5) / 0.5. NOTE(review): there is no Resize here — the 224x224
# inputs the models assume must be produced upstream.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)