"""
ISL Sign Language Translation - TechMatrix Solvers Initiative

Model definitions for body pose and hand pose estimation

Developed by: TechMatrix Solvers Team
"""
|
|
|
|
|
|
import torch
|
|
|
from collections import OrderedDict
|
|
|
import torch.nn as nn
|
|
|
|
|
|
|
|
|
def construct_layers(layer_config, no_relu_layers, prelu_layers=None):
    """
    Constructs neural network layers based on configuration

    Entries are processed in the iteration order of ``layer_config`` and
    collected into one ``nn.Sequential`` whose sub-module names are the
    configuration keys (plus the generated activation names).

    Args:
        layer_config: Mapping of layer name -> parameter list. A name that
            contains ``'pool'`` is read as ``[kernel_size, stride, padding]``
            for ``nn.MaxPool2d``; any other name is read as
            ``[in_channels, out_channels, kernel_size, stride, padding]``
            for ``nn.Conv2d``.
        no_relu_layers: Names of conv layers that get no activation at all.
        prelu_layers: Names of conv layers that get ``nn.PReLU`` instead of
            ``nn.ReLU``. ``None`` (the default) means no layer uses PReLU.

    Returns:
        ``nn.Sequential`` built from an ``OrderedDict`` of the named layers.
    """
    # A None sentinel replaces the former shared mutable ``[]`` default.
    if prelu_layers is None:
        prelu_layers = ()

    layers = []
    for layer_name, params in layer_config.items():
        if 'pool' in layer_name:
            pool = nn.MaxPool2d(
                kernel_size=params[0], stride=params[1], padding=params[2]
            )
            layers.append((layer_name, pool))
            # Pooling entries never receive an activation.
            continue

        conv2d = nn.Conv2d(
            in_channels=params[0],
            out_channels=params[1],
            kernel_size=params[2],
            stride=params[3],
            padding=params[4],
        )
        layers.append((layer_name, conv2d))

        if layer_name in no_relu_layers:
            continue

        if layer_name in prelu_layers:
            # Drops the first four characters of the layer name (the 'conv'
            # prefix in the names used in this file): 'conv4_2' -> 'prelu4_2'.
            # One learnable slope per output channel.
            layers.append(('prelu' + layer_name[4:], nn.PReLU(params[1])))
        else:
            layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))
|
|
|
|
|
|
|
|
|
def construct_multi_conv_layers(layer_config, no_relu_layers):
    """
    Constructs multiple convolution layers for complex architectures

    Unlike a single sequential stack, every configuration entry becomes its
    own small ``nn.Sequential`` so that callers can run the entries one by
    one and keep each intermediate output.

    Args:
        layer_config: Mapping of layer name -> parameter list. Names that
            contain ``'pool'`` use ``[kernel_size, stride, padding]``; all
            other names use
            ``[in_channels, out_channels, kernel_size, stride, padding]``.
        no_relu_layers: Names of conv layers that are left without the
            trailing ``nn.PReLU``.

    Returns:
        ``nn.ModuleList`` with one ``nn.Sequential`` per configuration entry,
        in configuration order.
    """
    sequences = []
    for name, spec in layer_config.items():
        if 'pool' in name:
            named = [
                (name, nn.MaxPool2d(kernel_size=spec[0], stride=spec[1], padding=spec[2])),
            ]
        else:
            out_channels = spec[1]
            named = [
                (name, nn.Conv2d(
                    in_channels=spec[0],
                    out_channels=out_channels,
                    kernel_size=spec[2],
                    stride=spec[3],
                    padding=spec[4],
                )),
            ]
            if name not in no_relu_layers:
                # Drops the first five characters of the layer name (the
                # 'Mconv' prefix in this file's names), so 'Mconv1_x' pairs
                # with 'Mprelu1_x'.
                named.append(('Mprelu' + name[5:], nn.PReLU(out_channels)))
        sequences.append(nn.Sequential(OrderedDict(named)))

    return nn.ModuleList(sequences)
|
|
|
|
|
|
|
|
|
class BodyPose25Model(nn.Module):
    """
    Body pose estimation model using 25-point skeleton

    Developed by TechMatrix Solvers for ISL translation

    Layout: a convolutional base network (three 2x2 max-pools, 128 output
    channels) feeds four chained "L2" stages with 52 output channels each,
    followed by two "L1" stages with 26 output channels each. All parameters
    are frozen (``requires_grad = False``) at construction time.
    """

    def __init__(self):
        super().__init__()

        # Final conv of each stage: no activation is appended after it.
        linear_outputs = [
            'Mconv7_stage0_L1', 'Mconv7_stage0_L2',
            'Mconv7_stage1_L1', 'Mconv7_stage1_L2',
            'Mconv7_stage2_L2', 'Mconv7_stage3_L2',
        ]
        prelu_names = ['conv4_2', 'conv4_3_CPM', 'conv4_4_CPM']

        def conv3x3(in_channels, out_channels):
            # [in, out, kernel, stride, padding] for a size-preserving 3x3 conv.
            return [in_channels, out_channels, 3, 1, 1]

        def pool2x2():
            # [kernel, stride, padding] for a 2x2 max-pool that halves H and W.
            return [2, 2, 0]

        backbone_spec = OrderedDict([
            ('conv1_1', conv3x3(3, 64)),
            ('conv1_2', conv3x3(64, 64)),
            ('pool1_stage1', pool2x2()),
            ('conv2_1', conv3x3(64, 128)),
            ('conv2_2', conv3x3(128, 128)),
            ('pool2_stage1', pool2x2()),
            ('conv3_1', conv3x3(128, 256)),
            ('conv3_2', conv3x3(256, 256)),
            ('conv3_3', conv3x3(256, 256)),
            ('conv3_4', conv3x3(256, 256)),
            ('pool3_stage1', pool2x2()),
            ('conv4_1', conv3x3(256, 512)),
            ('conv4_2', conv3x3(512, 512)),
            ('conv4_3_CPM', conv3x3(512, 256)),
            ('conv4_4_CPM', conv3x3(256, 128)),
        ])
        self.base_model = construct_layers(backbone_spec, linear_outputs, prelu_names)

        # Plain dict on purpose: it is handed to nn.ModuleDict below exactly
        # like the per-stage dictionary has always been.
        stage_specs = {}

        def add_triple(block_name, in_channels, width):
            # Three chained 3x3 convs; only the first one adapts the input width.
            stage_specs[block_name] = OrderedDict(
                (f'{block_name}_{pos}',
                 conv3x3(in_channels if pos == 0 else width, width))
                for pos in range(3)
            )

        def add_stage(suffix, first_in, width, hidden, out_channels):
            # Mconv1 sees the stage input; Mconv2..5 see the concatenation of
            # the three conv outputs of the previous block (3 * width channels).
            add_triple(f'Mconv1_{suffix}', first_in, width)
            for idx in range(2, 6):
                add_triple(f'Mconv{idx}_{suffix}', 3 * width, width)
            # Two 1x1 convs that reduce to the stage's output channel count.
            stage_specs[f'Mconv6_7_{suffix}'] = OrderedDict([
                (f'Mconv6_{suffix}', [3 * width, hidden, 1, 1, 0]),
                (f'Mconv7_{suffix}', [hidden, out_channels, 1, 1, 0]),
            ])

        # L2 branch: stage 0 consumes the 128 base channels, later stages
        # consume base (128) + previous L2 output (52) = 180 channels.
        add_stage('stage0_L2', 128, 96, 256, 52)
        for stage in range(1, 4):
            add_stage(f'stage{stage}_L2', 180, 128, 512, 52)

        # L1 branch: stage 0 consumes base + L2 (180); stage 1 consumes
        # base + L1 stage 0 (26) + L2 (52) = 206 channels.
        add_stage('stage0_L1', 180, 96, 256, 26)
        add_stage('stage1_L1', 206, 128, 512, 26)

        built = {
            block_name: construct_multi_conv_layers(spec, linear_outputs)
            for block_name, spec in stage_specs.items()
        }
        self.stage_models = nn.ModuleDict(built)

        # Freeze every parameter of the freshly built network.
        for weight in self.parameters():
            weight.requires_grad = False

    def _multi_conv_forward(self, x, models):
        """Forward pass through multi-convolution blocks"""
        # Run the sub-models one after another and keep every intermediate
        # result; the block output is their channel-wise concatenation.
        collected = []
        for sub_model in models:
            x = sub_model(x)
            collected.append(x)
        return torch.cat(collected, 1)

    def _run_stage(self, features, suffix):
        """Run Mconv1..Mconv5 and the two-layer Mconv6_7 head of one stage."""
        for idx in range(1, 6):
            features = self._multi_conv_forward(
                features, self.stage_models[f'Mconv{idx}_{suffix}']
            )
        head = self.stage_models[f'Mconv6_7_{suffix}']
        return head[1](head[0](features))

    def forward(self, x):
        """
        Forward pass through the body pose model

        Returns:
            Tuple ``(l2_output, stage1_l1_output)``: the output of the last
            of the four L2 stages (52 channels) and the output of the second
            L1 stage (26 channels).
        """
        base_features = self.base_model(x)

        # L2 branch: each stage is fed the base features concatenated with
        # the previous stage's prediction.
        stage_input = base_features
        for stage in range(4):
            l2_output = self._run_stage(stage_input, f'stage{stage}_L2')
            stage_input = torch.cat([base_features, l2_output], 1)

        # L1 branch, stage 0: consumes base features + final L2 output.
        stage0_l1_output = self._run_stage(stage_input, 'stage0_L1')

        # L1 branch, stage 1: additionally sees the stage-0 L1 prediction.
        stage1_input = torch.cat([base_features, stage0_l1_output, l2_output], 1)
        stage1_l1_output = self._run_stage(stage1_input, 'stage1_L1')

        return l2_output, stage1_l1_output
|
|
|
|
|
|
|
|
|
class HandPoseModel(nn.Module):
    """
    Hand pose estimation model using 21-point hand landmarks

    Developed by TechMatrix Solvers for ISL translation

    Layout: a convolutional base network (128 output channels) and a
    stage-1 prediction head (22 output channels), followed by five refinement
    stages (2..6). Each refinement stage takes the previous 22-channel
    prediction concatenated with the base features (150 channels). All
    parameters are frozen (``requires_grad = False``) at construction time.
    """

    def __init__(self):
        super().__init__()

        # Output convs of every stage: no ReLU is appended after them.
        linear_outputs = [
            'conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',
            'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6',
        ]

        def conv3x3(in_channels, out_channels):
            # [in, out, kernel, stride, padding] for a size-preserving 3x3 conv.
            return [in_channels, out_channels, 3, 1, 1]

        def pool2x2():
            # [kernel, stride, padding] for a 2x2 max-pool that halves H and W.
            return [2, 2, 0]

        backbone_spec = OrderedDict([
            ('conv1_1', conv3x3(3, 64)),
            ('conv1_2', conv3x3(64, 64)),
            ('pool1_stage1', pool2x2()),
            ('conv2_1', conv3x3(64, 128)),
            ('conv2_2', conv3x3(128, 128)),
            ('pool2_stage1', pool2x2()),
            ('conv3_1', conv3x3(128, 256)),
            ('conv3_2', conv3x3(256, 256)),
            ('conv3_3', conv3x3(256, 256)),
            ('conv3_4', conv3x3(256, 256)),
            ('pool3_stage1', pool2x2()),
            ('conv4_1', conv3x3(256, 512)),
            ('conv4_2', conv3x3(512, 512)),
            ('conv4_3', conv3x3(512, 512)),
            ('conv4_4', conv3x3(512, 512)),
            ('conv5_1', conv3x3(512, 512)),
            ('conv5_2', conv3x3(512, 512)),
            ('conv5_3_CPM', conv3x3(512, 128)),
        ])

        prediction_spec = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0]),
        ])

        def refinement_spec(stage):
            # Five 7x7 convs followed by two 1x1 convs; the first conv takes
            # the 22 prediction channels + 128 base channels = 150.
            spec = OrderedDict()
            spec[f'Mconv1_stage{stage}'] = [150, 128, 7, 1, 3]
            for idx in range(2, 6):
                spec[f'Mconv{idx}_stage{stage}'] = [128, 128, 7, 1, 3]
            spec[f'Mconv6_stage{stage}'] = [128, 128, 1, 1, 0]
            spec[f'Mconv7_stage{stage}'] = [128, 22, 1, 1, 0]
            return spec

        refinement_specs = [(stage, refinement_spec(stage)) for stage in range(2, 7)]

        # Sub-networks are built and registered in the fixed order
        # base -> prediction -> stage2 .. stage6.
        self.stage1_base_model = construct_layers(backbone_spec, linear_outputs)
        self.stage1_prediction_model = construct_layers(prediction_spec, linear_outputs)
        for stage, spec in refinement_specs:
            setattr(self, f'stage{stage}_model', construct_layers(spec, linear_outputs))

        # Freeze every parameter of the freshly built network.
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, x):
        """
        Forward pass through the hand pose model

        Returns:
            The 22-channel output of the sixth (last) stage.
        """
        base_features = self.stage1_base_model(x)
        prediction = self.stage1_prediction_model(base_features)

        refinement_stages = (
            self.stage2_model,
            self.stage3_model,
            self.stage4_model,
            self.stage5_model,
            self.stage6_model,
        )
        # Every refinement stage sees the previous prediction stacked on top
        # of the shared base features.
        for refine in refinement_stages:
            prediction = refine(torch.cat([prediction, base_features], 1))

        return prediction
|
|
|
|
|
|
|
|
|
|
|
|
def create_bodypose_model():
    """
    Create and return body pose detection model

    Returns:
        A newly constructed ``BodyPose25Model``. No checkpoint is loaded
        here; the weights are whatever the constructor produces.
    """
    body_model = BodyPose25Model()
    return body_model
|
|
|
|
|
|
|
|
|
def create_handpose_model():
    """
    Create and return hand pose detection model

    Returns:
        A newly constructed ``HandPoseModel``. No checkpoint is loaded
        here; the weights are whatever the constructor produces.
    """
    hand_model = HandPoseModel()
    return hand_model