# Spaces: Abs6187 / ISL_Sign_Language_Translation (Sleeping)
# File size: 14,531 Bytes
# e2cffd9

"""

ISL Sign Language Translation - TechMatrix Solvers Initiative

Model definitions for body pose and hand pose estimation

Developed by: TechMatrix Solvers Team

"""

import torch
from collections import OrderedDict
import torch.nn as nn


def construct_layers(layer_config, no_relu_layers, prelu_layers=None):
    """
    Build an ``nn.Sequential`` from an ordered layer specification.

    Args:
        layer_config: Mapping of layer name -> parameter list, consumed in
            iteration order. A name containing ``'pool'`` describes an
            ``nn.MaxPool2d`` as ``[kernel_size, stride, padding]``; any other
            name describes an ``nn.Conv2d`` as
            ``[in_channels, out_channels, kernel_size, stride, padding]``.
        no_relu_layers: Names of convolution layers that get no activation
            appended after them.
        prelu_layers: Names of convolution layers that are followed by
            ``nn.PReLU`` (sized to the layer's output channels) instead of
            ``nn.ReLU``. ``None`` (the default) means no layer uses PReLU.

    Returns:
        An ``nn.Sequential`` whose sub-modules keep the names from
        ``layer_config``. A ReLU is registered as ``'relu_<layer name>'``; a
        PReLU is registered under the layer name with its first four
        characters replaced by ``'prelu'`` (``'conv4_2'`` -> ``'prelu4_2'``).
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if prelu_layers is None:
        prelu_layers = ()

    layers = []
    for layer_name, params in layer_config.items():
        if 'pool' in layer_name:
            pool = nn.MaxPool2d(kernel_size=params[0], stride=params[1], padding=params[2])
            layers.append((layer_name, pool))
            continue

        conv2d = nn.Conv2d(
            in_channels=params[0],
            out_channels=params[1],
            kernel_size=params[2],
            stride=params[3],
            padding=params[4],
        )
        layers.append((layer_name, conv2d))

        if layer_name in no_relu_layers:
            # Listed layers stay linear: no activation is appended.
            continue

        if layer_name in prelu_layers:
            # PReLU is sized to the convolution's output channel count.
            layers.append(('prelu' + layer_name[4:], nn.PReLU(params[1])))
        else:
            layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))


def construct_multi_conv_layers(layer_config, no_relu_layers):
    """
    Wrap every configured layer in its own ``nn.Sequential``.

    Args:
        layer_config: Mapping of layer name -> parameter list, consumed in
            iteration order. A name containing ``'pool'`` describes an
            ``nn.MaxPool2d`` as ``[kernel_size, stride, padding]``; any other
            name describes an ``nn.Conv2d`` as
            ``[in_channels, out_channels, kernel_size, stride, padding]``.
        no_relu_layers: Names of convolution layers that get no activation.

    Returns:
        An ``nn.ModuleList`` with one ``nn.Sequential`` per entry of
        ``layer_config``. A convolution not listed in ``no_relu_layers`` is
        followed by an ``nn.PReLU`` sized to its output channels and named
        after the layer with its first five characters replaced by
        ``'Mprelu'``.
    """

    def _single_unit(name, spec):
        # One named Sequential holding a pool, a conv, or a conv + PReLU.
        if 'pool' in name:
            pooling = nn.MaxPool2d(kernel_size=spec[0], stride=spec[1], padding=spec[2])
            return nn.Sequential(OrderedDict([(name, pooling)]))

        convolution = nn.Conv2d(
            in_channels=spec[0],
            out_channels=spec[1],
            kernel_size=spec[2],
            stride=spec[3],
            padding=spec[4],
        )
        members = [(name, convolution)]
        if name not in no_relu_layers:
            members.append(('Mprelu' + name[5:], nn.PReLU(spec[1])))
        return nn.Sequential(OrderedDict(members))

    units = [_single_unit(name, spec) for name, spec in layer_config.items()]
    return nn.ModuleList(units)


class BodyPose25Model(nn.Module):
    """
    Body pose estimation model using 25-point skeleton.

    A convolutional backbone (``base_model``) ends in a 128-channel feature
    map. It is followed by four "L2" stages that each emit 52 channels and
    two "L1" stages that each emit 26 channels. Every stage after the first
    receives the backbone features concatenated with earlier stage outputs.
    All parameters are created with ``requires_grad = False``.

    Developed by TechMatrix Solvers for ISL translation.
    """

    def __init__(self):
        super().__init__()

        # Stage output convolutions that are left without an activation.
        linear_layers = [
            'Mconv7_stage0_L1', 'Mconv7_stage0_L2',
            'Mconv7_stage1_L1', 'Mconv7_stage1_L2',
            'Mconv7_stage2_L2', 'Mconv7_stage3_L2',
        ]
        # Backbone convolutions followed by PReLU rather than ReLU.
        prelu_layers = ['conv4_2', 'conv4_3_CPM', 'conv4_4_CPM']

        # Backbone: (layer name, output channels); None marks a max-pool.
        # Each convolution is 3x3 / stride 1 / padding 1 and takes the
        # previous convolution's output channels as its input channels.
        backbone_layout = (
            ('conv1_1', 64), ('conv1_2', 64), ('pool1_stage1', None),
            ('conv2_1', 128), ('conv2_2', 128), ('pool2_stage1', None),
            ('conv3_1', 256), ('conv3_2', 256), ('conv3_3', 256),
            ('conv3_4', 256), ('pool3_stage1', None),
            ('conv4_1', 512), ('conv4_2', 512),
            ('conv4_3_CPM', 256), ('conv4_4_CPM', 128),
        )
        backbone_spec = OrderedDict()
        channels = 3
        for name, out_channels in backbone_layout:
            if out_channels is None:
                backbone_spec[name] = [2, 2, 0]
            else:
                backbone_spec[name] = [channels, out_channels, 3, 1, 1]
                channels = out_channels
        self.base_model = construct_layers(backbone_spec, linear_layers, prelu_layers)

        # One row per stage, in build order:
        # (branch, stage, input channels of block 1, block width,
        #  hidden channels of the 1x1 head, output channels).
        stage_table = (
            ('L2', 0, 128, 96, 256, 52),
            ('L2', 1, 180, 128, 512, 52),
            ('L2', 2, 180, 128, 512, 52),
            ('L2', 3, 180, 128, 512, 52),
            ('L1', 0, 180, 96, 256, 26),
            ('L1', 1, 206, 128, 512, 26),
        )
        stage_specs = {}
        for branch, stage, first_in, width, hidden, out_channels in stage_table:
            suffix = f'stage{stage}_{branch}'
            for block in range(1, 6):
                # Blocks 2-5 consume the concatenated outputs of the three
                # convolutions of the preceding block, hence 3 * width.
                block_in = first_in if block == 1 else 3 * width
                prefix = f'Mconv{block}_{suffix}'
                stage_specs[prefix] = OrderedDict([
                    (f'{prefix}_0', [block_in, width, 3, 1, 1]),
                    (f'{prefix}_1', [width, width, 3, 1, 1]),
                    (f'{prefix}_2', [width, width, 3, 1, 1]),
                ])
            stage_specs[f'Mconv6_7_{suffix}'] = OrderedDict([
                (f'Mconv6_{suffix}', [3 * width, hidden, 1, 1, 0]),
                (f'Mconv7_{suffix}', [hidden, out_channels, 1, 1, 0]),
            ])

        # Turn every specification into modules, keeping insertion order.
        self.stage_models = nn.ModuleDict({
            name: construct_multi_conv_layers(spec, linear_layers)
            for name, spec in stage_specs.items()
        })

        # Turn off gradient tracking for every parameter of the model.
        for weight in self.parameters():
            weight.requires_grad = False

    def _multi_conv_forward(self, x, models):
        """Chain ``models`` on ``x`` and concatenate every intermediate output on dim 1."""
        collected = []
        for unit in models:
            x = unit(x)
            collected.append(x)
        return torch.cat(collected, 1)

    def _run_stage(self, features, stage, branch):
        """Run blocks Mconv1..Mconv5 and the two-layer head of one stage."""
        for block in range(1, 6):
            features = self._multi_conv_forward(
                features, self.stage_models[f'Mconv{block}_stage{stage}_{branch}']
            )
        head = self.stage_models[f'Mconv6_7_stage{stage}_{branch}']
        return head[1](head[0](features))

    def forward(self, x):
        """
        Run the backbone, the four L2 stages and then the two L1 stages.

        Args:
            x: Input batch; the first convolution expects 3 input channels.

        Returns:
            Tuple ``(l2_output, l1_output)``: the output of the last L2
            stage (52 channels) and of the last L1 stage (26 channels).
        """
        backbone = self.base_model(x)

        # L2 branch: each stage's output is concatenated with the backbone
        # features to form the next stage's input.
        stage_input = backbone
        for stage in range(4):
            l2_output = self._run_stage(stage_input, stage, 'L2')
            stage_input = torch.cat([backbone, l2_output], 1)

        # L1 stage 0 consumes backbone + final L2 output.
        l1_first = self._run_stage(stage_input, 0, 'L1')

        # L1 stage 1 additionally sees the L1 stage 0 output.
        l1_input = torch.cat([backbone, l1_first, l2_output], 1)
        l1_final = self._run_stage(l1_input, 1, 'L1')

        return l2_output, l1_final


class HandPoseModel(nn.Module):
    """
    Hand pose estimation model using 21-point hand landmarks.

    A convolutional backbone ends in a 128-channel feature map, from which
    a two-layer head produces a 22-channel prediction. Five further stages
    (2-6) each take the previous 22-channel prediction concatenated with
    the backbone features (150 channels) and emit a new 22-channel
    prediction. All parameters are created with ``requires_grad = False``.

    Developed by TechMatrix Solvers for ISL translation.
    """

    def __init__(self):
        super().__init__()

        # Convolutions that are left without a ReLU after them.
        linear_layers = [
            'conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',
            'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6',
        ]

        # Backbone: (layer name, output channels); None marks a max-pool.
        # Each convolution is 3x3 / stride 1 / padding 1 and takes the
        # previous convolution's output channels as its input channels.
        backbone_layout = (
            ('conv1_1', 64), ('conv1_2', 64), ('pool1_stage1', None),
            ('conv2_1', 128), ('conv2_2', 128), ('pool2_stage1', None),
            ('conv3_1', 256), ('conv3_2', 256), ('conv3_3', 256),
            ('conv3_4', 256), ('pool3_stage1', None),
            ('conv4_1', 512), ('conv4_2', 512), ('conv4_3', 512),
            ('conv4_4', 512), ('conv5_1', 512), ('conv5_2', 512),
            ('conv5_3_CPM', 128),
        )
        backbone_spec = OrderedDict()
        channels = 3
        for name, out_channels in backbone_layout:
            if out_channels is None:
                backbone_spec[name] = [2, 2, 0]
            else:
                backbone_spec[name] = [channels, out_channels, 3, 1, 1]
                channels = out_channels

        head_spec = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0]),
        ])

        # Parameters of Mconv1..Mconv7, identical for every refinement stage.
        refine_layout = (
            (150, 128, 7, 1, 3),
            (128, 128, 7, 1, 3),
            (128, 128, 7, 1, 3),
            (128, 128, 7, 1, 3),
            (128, 128, 7, 1, 3),
            (128, 128, 1, 1, 0),
            (128, 22, 1, 1, 0),
        )

        # Modules are built and registered in the order: backbone, stage-1
        # head, then stages 2-6.
        self.stage1_base_model = construct_layers(backbone_spec, linear_layers)
        self.stage1_prediction_model = construct_layers(head_spec, linear_layers)

        refiners = []
        for stage in range(2, 7):
            refine_spec = OrderedDict(
                (f'Mconv{index}_stage{stage}', list(params))
                for index, params in enumerate(refine_layout, start=1)
            )
            refiners.append(construct_layers(refine_spec, linear_layers))
        (
            self.stage2_model,
            self.stage3_model,
            self.stage4_model,
            self.stage5_model,
            self.stage6_model,
        ) = refiners

        # Turn off gradient tracking for every parameter of the model.
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, x):
        """
        Run the backbone, the stage-1 head and the five refinement stages.

        Args:
            x: Input batch; the first convolution expects 3 input channels.

        Returns:
            The 22-channel output of stage 6.
        """
        features = self.stage1_base_model(x)
        prediction = self.stage1_prediction_model(features)

        refinement_stages = (
            self.stage2_model,
            self.stage3_model,
            self.stage4_model,
            self.stage5_model,
            self.stage6_model,
        )
        for refine in refinement_stages:
            # Each stage sees the previous prediction plus the backbone features.
            prediction = refine(torch.cat([prediction, features], 1))

        return prediction


# Factory functions for easy model instantiation
def create_bodypose_model():
    """
    Instantiate the body pose network.

    Returns:
        A newly constructed ``BodyPose25Model``.
    """
    body_model = BodyPose25Model()
    return body_model


def create_handpose_model():
    """
    Instantiate the hand pose network.

    Returns:
        A newly constructed ``HandPoseModel``.
    """
    hand_model = HandPoseModel()
    return hand_model