""" ISL Sign Language Translation - TechMatrix Solvers Initiative Model definitions for body pose and hand pose estimation Developed by: TechMatrix Solvers Team """ import torch from collections import OrderedDict import torch.nn as nn def construct_layers(layer_config, no_relu_layers, prelu_layers=[]): """ Constructs neural network layers based on configuration Args: layer_config: Dictionary defining layer parameters no_relu_layers: List of layers that shouldn't have ReLU activation prelu_layers: List of layers that should use PReLU instead of ReLU """ layers = [] for layer_name, params in layer_config.items(): if 'pool' in layer_name: layer = nn.MaxPool2d(kernel_size=params[0], stride=params[1], padding=params[2]) layers.append((layer_name, layer)) else: conv2d = nn.Conv2d( in_channels=params[0], out_channels=params[1], kernel_size=params[2], stride=params[3], padding=params[4] ) layers.append((layer_name, conv2d)) if layer_name not in no_relu_layers: if layer_name not in prelu_layers: layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) else: layers.append(('prelu' + layer_name[4:], nn.PReLU(params[1]))) return nn.Sequential(OrderedDict(layers)) def construct_multi_conv_layers(layer_config, no_relu_layers): """ Constructs multiple convolution layers for complex architectures """ modules = [] for layer_name, params in layer_config.items(): layers = [] if 'pool' in layer_name: layer = nn.MaxPool2d(kernel_size=params[0], stride=params[1], padding=params[2]) layers.append((layer_name, layer)) else: conv2d = nn.Conv2d( in_channels=params[0], out_channels=params[1], kernel_size=params[2], stride=params[3], padding=params[4] ) layers.append((layer_name, conv2d)) if layer_name not in no_relu_layers: layers.append(('Mprelu' + layer_name[5:], nn.PReLU(params[1]))) modules.append(nn.Sequential(OrderedDict(layers))) return nn.ModuleList(modules) class BodyPose25Model(nn.Module): """ Body pose estimation model using 25-point skeleton Developed by TechMatrix Solvers for ISL translation """ def __init__(self): super(BodyPose25Model, self).__init__() # Define layers without ReLU activation no_relu_layers = [ 'Mconv7_stage0_L1', 'Mconv7_stage0_L2', 'Mconv7_stage1_L1', 'Mconv7_stage1_L2', 'Mconv7_stage2_L2', 'Mconv7_stage3_L2' ] prelu_layers = ['conv4_2', 'conv4_3_CPM', 'conv4_4_CPM'] # Initial feature extraction layers base_layers = OrderedDict([ ('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]), ('pool1_stage1', [2, 2, 0]), ('conv2_1', [64, 128, 3, 1, 1]), ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]), ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]), ('conv3_3', [256, 256, 3, 1, 1]), ('conv3_4', [256, 256, 3, 1, 1]), ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]), ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3_CPM', [512, 256, 3, 1, 1]), ('conv4_4_CPM', [256, 128, 3, 1, 1]) ]) self.base_model = construct_layers(base_layers, no_relu_layers, prelu_layers) # Multi-stage refinement blocks stage_blocks = {} # L2 branch - Stage 0 stage_blocks['Mconv1_stage0_L2'] = OrderedDict([ ('Mconv1_stage0_L2_0', [128, 96, 3, 1, 1]), ('Mconv1_stage0_L2_1', [96, 96, 3, 1, 1]), ('Mconv1_stage0_L2_2', [96, 96, 3, 1, 1]) ]) for i in range(2, 6): stage_blocks[f'Mconv{i}_stage0_L2'] = OrderedDict([ (f'Mconv{i}_stage0_L2_0', [288, 96, 3, 1, 1]), (f'Mconv{i}_stage0_L2_1', [96, 96, 3, 1, 1]), (f'Mconv{i}_stage0_L2_2', [96, 96, 3, 1, 1]) ]) stage_blocks['Mconv6_7_stage0_L2'] = OrderedDict([ ('Mconv6_stage0_L2', [288, 256, 1, 1, 0]), ('Mconv7_stage0_L2', [256, 52, 1, 1, 0]) ]) # L2 branch - Stages 1-3 for stage in range(1, 4): stage_blocks[f'Mconv1_stage{stage}_L2'] = OrderedDict([ (f'Mconv1_stage{stage}_L2_0', [180, 128, 3, 1, 1]), (f'Mconv1_stage{stage}_L2_1', [128, 128, 3, 1, 1]), (f'Mconv1_stage{stage}_L2_2', [128, 128, 3, 1, 1]) ]) for i in range(2, 6): stage_blocks[f'Mconv{i}_stage{stage}_L2'] = OrderedDict([ (f'Mconv{i}_stage{stage}_L2_0', [384, 128, 3, 1, 1]), (f'Mconv{i}_stage{stage}_L2_1', [128, 128, 3, 1, 1]), (f'Mconv{i}_stage{stage}_L2_2', [128, 128, 3, 1, 1]) ]) stage_blocks[f'Mconv6_7_stage{stage}_L2'] = OrderedDict([ (f'Mconv6_stage{stage}_L2', [384, 512, 1, 1, 0]), (f'Mconv7_stage{stage}_L2', [512, 52, 1, 1, 0]) ]) # L1 branch configurations stage_blocks['Mconv1_stage0_L1'] = OrderedDict([ ('Mconv1_stage0_L1_0', [180, 96, 3, 1, 1]), ('Mconv1_stage0_L1_1', [96, 96, 3, 1, 1]), ('Mconv1_stage0_L1_2', [96, 96, 3, 1, 1]) ]) for i in range(2, 6): stage_blocks[f'Mconv{i}_stage0_L1'] = OrderedDict([ (f'Mconv{i}_stage0_L1_0', [288, 96, 3, 1, 1]), (f'Mconv{i}_stage0_L1_1', [96, 96, 3, 1, 1]), (f'Mconv{i}_stage0_L1_2', [96, 96, 3, 1, 1]) ]) stage_blocks['Mconv6_7_stage0_L1'] = OrderedDict([ ('Mconv6_stage0_L1', [288, 256, 1, 1, 0]), ('Mconv7_stage0_L1', [256, 26, 1, 1, 0]) ]) stage_blocks['Mconv1_stage1_L1'] = OrderedDict([ ('Mconv1_stage1_L1_0', [206, 128, 3, 1, 1]), ('Mconv1_stage1_L1_1', [128, 128, 3, 1, 1]), ('Mconv1_stage1_L1_2', [128, 128, 3, 1, 1]) ]) for i in range(2, 6): stage_blocks[f'Mconv{i}_stage1_L1'] = OrderedDict([ (f'Mconv{i}_stage1_L1_0', [384, 128, 3, 1, 1]), (f'Mconv{i}_stage1_L1_1', [128, 128, 3, 1, 1]), (f'Mconv{i}_stage1_L1_2', [128, 128, 3, 1, 1]) ]) stage_blocks['Mconv6_7_stage1_L1'] = OrderedDict([ ('Mconv6_stage1_L1', [384, 512, 1, 1, 0]), ('Mconv7_stage1_L1', [512, 26, 1, 1, 0]) ]) # Build multi-conv modules for block_name in stage_blocks.keys(): stage_blocks[block_name] = construct_multi_conv_layers(stage_blocks[block_name], no_relu_layers) self.stage_models = nn.ModuleDict(stage_blocks) # Freeze parameters for efficiency for param in self.parameters(): param.requires_grad = False def _multi_conv_forward(self, x, models): """Forward pass through multi-convolution blocks""" outputs = [] current_output = x for model in models: current_output = model(current_output) outputs.append(current_output) return torch.cat(outputs, 1) def forward(self, x): """Forward pass through the body pose model""" base_features = self.base_model(x) # L2 branch processing current_features = base_features for stage in range(4): current_features = self._multi_conv_forward( current_features, self.stage_models[f'Mconv1_stage{stage}_L2'] ) for layer in range(2, 6): current_features = self._multi_conv_forward( current_features, self.stage_models[f'Mconv{layer}_stage{stage}_L2'] ) current_features = self.stage_models[f'Mconv6_7_stage{stage}_L2'][0](current_features) current_features = self.stage_models[f'Mconv6_7_stage{stage}_L2'][1](current_features) l2_output = current_features current_features = torch.cat([base_features, current_features], 1) # L1 branch - Stage 0 current_features = self._multi_conv_forward( current_features, self.stage_models['Mconv1_stage0_L1'] ) for layer in range(2, 6): current_features = self._multi_conv_forward( current_features, self.stage_models[f'Mconv{layer}_stage0_L1'] ) current_features = self.stage_models['Mconv6_7_stage0_L1'][0](current_features) current_features = self.stage_models['Mconv6_7_stage0_L1'][1](current_features) stage0_l1_output = current_features current_features = torch.cat([base_features, stage0_l1_output, l2_output], 1) # L1 branch - Stage 1 current_features = self._multi_conv_forward( current_features, self.stage_models['Mconv1_stage1_L1'] ) for layer in range(2, 6): current_features = self._multi_conv_forward( current_features, self.stage_models[f'Mconv{layer}_stage1_L1'] ) current_features = self.stage_models['Mconv6_7_stage1_L1'][0](current_features) stage1_l1_output = self.stage_models['Mconv6_7_stage1_L1'][1](current_features) return l2_output, stage1_l1_output class HandPoseModel(nn.Module): """ Hand pose estimation model using 21-point hand landmarks Developed by TechMatrix Solvers for ISL translation """ def __init__(self): super(HandPoseModel, self).__init__() # Layers without ReLU activation no_relu_layers = [ 'conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6' ] # Stage 1 - Feature extraction stage1_base = OrderedDict([ ('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]), ('pool1_stage1', [2, 2, 0]), ('conv2_1', [64, 128, 3, 1, 1]), ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]), ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]), ('conv3_3', [256, 256, 3, 1, 1]), ('conv3_4', [256, 256, 3, 1, 1]), ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]), ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3', [512, 512, 3, 1, 1]), ('conv4_4', [512, 512, 3, 1, 1]), ('conv5_1', [512, 512, 3, 1, 1]), ('conv5_2', [512, 512, 3, 1, 1]), ('conv5_3_CPM', [512, 128, 3, 1, 1]) ]) stage1_prediction = OrderedDict([ ('conv6_1_CPM', [128, 512, 1, 1, 0]), ('conv6_2_CPM', [512, 22, 1, 1, 0]) ]) stage_blocks = {} stage_blocks['stage1_base'] = stage1_base stage_blocks['stage1_prediction'] = stage1_prediction # Stages 2-6 refinement for i in range(2, 7): stage_blocks[f'stage{i}'] = OrderedDict([ (f'Mconv1_stage{i}', [150, 128, 7, 1, 3]), (f'Mconv2_stage{i}', [128, 128, 7, 1, 3]), (f'Mconv3_stage{i}', [128, 128, 7, 1, 3]), (f'Mconv4_stage{i}', [128, 128, 7, 1, 3]), (f'Mconv5_stage{i}', [128, 128, 7, 1, 3]), (f'Mconv6_stage{i}', [128, 128, 1, 1, 0]), (f'Mconv7_stage{i}', [128, 22, 1, 1, 0]) ]) # Build all stage models for block_name in stage_blocks.keys(): stage_blocks[block_name] = construct_layers(stage_blocks[block_name], no_relu_layers) self.stage1_base_model = stage_blocks['stage1_base'] self.stage1_prediction_model = stage_blocks['stage1_prediction'] self.stage2_model = stage_blocks['stage2'] self.stage3_model = stage_blocks['stage3'] self.stage4_model = stage_blocks['stage4'] self.stage5_model = stage_blocks['stage5'] self.stage6_model = stage_blocks['stage6'] # Freeze parameters for efficiency for param in self.parameters(): param.requires_grad = False def forward(self, x): """Forward pass through the hand pose model""" base_features = self.stage1_base_model(x) stage1_output = self.stage1_prediction_model(base_features) # Stage 2 stage2_input = torch.cat([stage1_output, base_features], 1) stage2_output = self.stage2_model(stage2_input) # Stage 3 stage3_input = torch.cat([stage2_output, base_features], 1) stage3_output = self.stage3_model(stage3_input) # Stage 4 stage4_input = torch.cat([stage3_output, base_features], 1) stage4_output = self.stage4_model(stage4_input) # Stage 5 stage5_input = torch.cat([stage4_output, base_features], 1) stage5_output = self.stage5_model(stage5_input) # Stage 6 stage6_input = torch.cat([stage5_output, base_features], 1) stage6_output = self.stage6_model(stage6_input) return stage6_output # Factory functions for easy model instantiation def create_bodypose_model(): """Create and return body pose detection model""" return BodyPose25Model() def create_handpose_model(): """Create and return hand pose detection model""" return HandPoseModel()