Create models/can/can.py

models/can/can.py  ADDED  (+819 -0)

@@ -0,0 +1,819 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import math


"""Custom DenseNet Backbone"""
class DenseBlock(nn.Module):
    """
    Basic DenseNet block
    """
    def __init__(self, in_channels, growth_rate, num_layers):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            self.layers.append(self._make_layer(in_channels + i * growth_rate, growth_rate))

    def _make_layer(self, in_channels, growth_rate):
        layer = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, 4 * growth_rate, kernel_size=1, bias=False),
            nn.BatchNorm2d(4 * growth_rate),
            nn.ReLU(inplace=True),
            nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
        )
        return layer

    def forward(self, x):
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, dim=1))
            features.append(new_feature)
        return torch.cat(features, dim=1)
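
# Commented sanity check (added for clarity; not part of the original file):
# a DenseBlock concatenates its input with every layer's output, so channels
# grow to in_channels + num_layers * growth_rate.
#
# block = DenseBlock(in_channels=64, growth_rate=32, num_layers=6)
# print(block(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 256, 32, 32])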


class TransitionLayer(nn.Module):
    """
    Transition layer between DenseBlocks
    """
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        return self.transition(x)


class DenseNetBackbone(nn.Module):
    """
    DenseNet backbone for CAN
    """
    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), num_init_features=64):
        super(DenseNetBackbone, self).__init__()

        # Initial layer
        self.features = nn.Sequential(
            nn.Conv2d(1, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # DenseBlocks
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_features, growth_rate, num_layers)
            self.features.add_module(f'denseblock{i+1}', block)
            num_features = num_features + growth_rate * num_layers
            if i != len(block_config) - 1:
                trans = TransitionLayer(num_features, num_features // 2)
                self.features.add_module(f'transition{i+1}', trans)
                num_features = num_features // 2

        # Final processing
        self.features.add_module('norm5', nn.BatchNorm2d(num_features))
        self.features.add_module('relu5', nn.ReLU(inplace=True))

        self.out_channels = num_features  # 1024 with the default configuration

    def forward(self, x):
        return self.features(x)
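
# Channel bookkeeping for the default config (added for clarity): starting from
# 64 features, each block adds num_layers * growth_rate channels and each
# transition halves them:
#   64 -> +6*32 = 256 -> /2 = 128 -> +12*32 = 512 -> /2 = 256
#      -> +24*32 = 1024 -> /2 = 512 -> +16*32 = 1024
# so DenseNetBackbone().out_channels == 1024, unlike the 684 that the
# feature-extractor classes below project to; CAN reads out_channels from
# whichever backbone it is given, so both values work.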


"""Pretrained DenseNet"""
class DenseNetFeatureExtractor(nn.Module):
    def __init__(self, densenet_model, out_channels=684):
        super().__init__()
        # Change input conv to 1 channel
        self.conv0 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Copy pretrained weights (average over RGB channels)
        self.conv0.weight.data = densenet_model.features.conv0.weight.data.mean(dim=1, keepdim=True)
        self.features = densenet_model.features
        self.out_channels = out_channels
        # Add a 1x1 conv to match the expected output channels if needed
        self.final_conv = nn.Conv2d(1024, out_channels, kernel_size=1)
        self.final_bn = nn.BatchNorm2d(out_channels)
        self.final_relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv0(x)
        x = self.features.norm0(x)
        x = self.features.relu0(x)
        x = self.features.pool0(x)
        x = self.features.denseblock1(x)
        x = self.features.transition1(x)
        x = self.features.denseblock2(x)
        x = self.features.transition2(x)
        x = self.features.denseblock3(x)
        x = self.features.transition3(x)
        x = self.features.denseblock4(x)
        x = self.features.norm5(x)
        x = self.final_conv(x)
        x = self.final_bn(x)
        x = self.final_relu(x)
        return x
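
# Commented construction sketch (added; not in the original file). densenet121
# ends with a 1024-channel feature map, which final_conv projects down to 684.
# Note that recent torchvision releases replace `pretrained=True` with the
# `weights=` argument:
#
# densenet = models.densenet121(weights=models.DenseNet121_Weights.DEFAULT)
# extractor = DenseNetFeatureExtractor(densenet, out_channels=684)
# print(extractor(torch.randn(1, 1, 128, 384)).shape)  # torch.Size([1, 684, 4, 12])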


"""Custom ResNet Backbone"""
class BasicBlock(nn.Module):
    """
    Basic ResNet block
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * self.expansion)
            )

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        out += self.shortcut(identity)
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    """
    Bottleneck ResNet block
    """
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * self.expansion)
            )

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += self.shortcut(identity)
        out = self.relu(out)

        return out


class ResNetBackbone(nn.Module):
    """
    ResNet backbone for CAN model, designed to output similar dimensions as DenseNet
    """
    def __init__(self, block_type='bottleneck', layers=[3, 4, 6, 3], num_init_features=64):
        super(ResNetBackbone, self).__init__()

        # Initial layer
        self.conv1 = nn.Conv2d(1, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(num_init_features)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Define block type
        if block_type == 'basic':
            block = BasicBlock
            expansion = 1
        elif block_type == 'bottleneck':
            block = Bottleneck
            expansion = 4
        else:
            raise ValueError(f"Unknown block type: {block_type}")

        # Create layers
        self.layer1 = self._make_layer(block, num_init_features, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 64 * expansion, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 128 * expansion, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256 * expansion, 512, layers[3], stride=2)

        # Final processing to match DenseNet output channels
        self.final_conv = nn.Conv2d(512 * expansion, 684, kernel_size=1)
        self.final_bn = nn.BatchNorm2d(684)
        self.final_relu = nn.ReLU(inplace=True)

        self.out_channels = 684  # Match DenseNet output channels

        # Initialize weights
        self._initialize_weights()

    def _make_layer(self, block, in_channels, out_channels, num_blocks, stride):
        layers = []
        layers.append(block(in_channels, out_channels, stride))
        for _ in range(1, num_blocks):
            layers.append(block(out_channels * block.expansion, out_channels))
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.final_conv(x)
        x = self.final_bn(x)
        x = self.final_relu(x)

        return x
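
# Commented shape check (added; not in the original file). The stem conv and
# pool each halve the resolution, and layer2-4 halve it again, for an overall
# downsampling factor of 32, the same as the DenseNet paths above:
#
# backbone = ResNetBackbone(block_type='bottleneck', layers=[3, 4, 6, 3])
# print(backbone(torch.randn(1, 1, 128, 384)).shape)  # torch.Size([1, 684, 4, 12])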


"""Pretrained ResNet"""
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, resnet_model, out_channels=684):
        super().__init__()
        # Change input conv to 1 channel
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Sum the pretrained RGB weights so a grayscale input reproduces the RGB response
        self.conv1.weight.data = resnet_model.conv1.weight.data.sum(dim=1, keepdim=True)
        self.bn1 = resnet_model.bn1
        self.relu = resnet_model.relu
        self.maxpool = resnet_model.maxpool
        self.layer1 = resnet_model.layer1
        self.layer2 = resnet_model.layer2
        self.layer3 = resnet_model.layer3
        self.layer4 = resnet_model.layer4
        # Add a 1x1 conv to match DenseNet output channels if needed
        self.final_conv = nn.Conv2d(2048, out_channels, kernel_size=1)
        self.final_bn = nn.BatchNorm2d(out_channels)
        self.final_relu = nn.ReLU(inplace=True)
        self.out_channels = out_channels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.final_conv(x)
        x = self.final_bn(x)
        x = self.final_relu(x)
        return x


"""Channel Attention"""
class ChannelAttention(nn.Module):
    """
    Channel-wise attention mechanism
    """
    def __init__(self, in_channels, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc = nn.Sequential(
            nn.Conv2d(in_channels, in_channels // ratio, kernel_size=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels // ratio, in_channels, kernel_size=1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = avg_out + max_out
        return self.sigmoid(out)
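
# This is the channel half of a CBAM-style attention: average- and max-pooled
# descriptors share one bottleneck MLP, and their sum is squashed into
# per-channel gates in (0, 1). Commented usage sketch (added for clarity):
#
# ca = ChannelAttention(256, ratio=16)
# feat = torch.randn(2, 256, 8, 24)
# gated = feat * ca(feat)  # gates broadcast from shape (2, 256, 1, 1)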


"""Multi-scale Counting Module"""
class MSCM(nn.Module):
    """
    Multi-Scale Counting Module
    """
    def __init__(self, in_channels, num_classes):
        super(MSCM, self).__init__()

        # Branch 1: 3x3 kernel
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=0.2)
        )
        self.attention1 = ChannelAttention(256)

        # Branch 2: 5x5 kernel
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, 256, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=0.2)
        )
        self.attention2 = ChannelAttention(256)

        # 1x1 conv layer to reduce channels and create the counting map
        self.conv_reduce = nn.Conv2d(512, num_classes, kernel_size=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Process branch 1
        out1 = self.branch1(x)
        out1 = out1 * self.attention1(out1)

        # Process branch 2
        out2 = self.branch2(x)
        out2 = out2 * self.attention2(out2)

        # Concatenate features from both branches
        concat_features = torch.cat([out1, out2], dim=1)  # Shape: B x 512 x H x W

        # Create counting map
        count_map = self.sigmoid(self.conv_reduce(concat_features))  # Shape: B x C x H x W

        # Apply sum-pooling to create 1D counting vector
        # Sum over the entire feature map along height and width
        count_vector = torch.sum(count_map, dim=(2, 3))  # Shape: B x C

        return count_map, count_vector
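
# The counting vector is trained to match per-class symbol counts: each class's
# sigmoid "density" map, summed over all spatial positions, approximates how
# often that symbol occurs in the image. Commented shape sketch (added):
#
# mscm = MSCM(in_channels=684, num_classes=101)
# cmap, cvec = mscm(torch.randn(2, 684, 4, 12))
# print(cmap.shape, cvec.shape)  # torch.Size([2, 101, 4, 12]) torch.Size([2, 101])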


"""Positional Encoding"""
class PositionalEncoding(nn.Module):
    """
    Positional encoding for attention decoder
    """
    def __init__(self, d_model, max_seq_len=1024):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Create positional encoding matrix
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: B x H x W x d_model
        b, h, w, _ = x.shape

        # Ensure we have enough positional encodings for the feature map size
        if h*w > self.pe.size(0):  # type: ignore
            # Dynamically extend positional encodings if needed
            device = self.pe.device
            extended_pe = torch.zeros(h*w, self.d_model, device=device)
            position = torch.arange(0, h*w, dtype=torch.float, device=device).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, self.d_model, 2, device=device).float() * (-math.log(10000.0) / self.d_model))

            extended_pe[:, 0::2] = torch.sin(position * div_term)
            extended_pe[:, 1::2] = torch.cos(position * div_term)

            pos_encoding = extended_pe.view(h, w, -1)
        else:
            # Use pre-computed positional encodings
            pos_encoding = self.pe[:h*w].view(h, w, -1)  # type: ignore

        pos_encoding = pos_encoding.unsqueeze(0).expand(b, -1, -1, -1)  # B x H x W x d_model
        return pos_encoding
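
# This is the standard sinusoidal scheme of "Attention Is All You Need",
# applied to the flattened H*W grid in row-major order:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# Note (added): because positions are flattened, vertically adjacent cells sit
# W steps apart in the encoding; a factored 2D encoding would be one variant.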


"""Counting-combined Attentional Decoder"""
class CCAD(nn.Module):
    """
    Counting-Combined Attentional Decoder
    """
    def __init__(self, input_channels, hidden_size, embedding_dim, num_classes, use_coverage=True):
        super(CCAD, self).__init__()

        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.use_coverage = use_coverage

        # Input layer to reduce the feature map
        self.feature_proj = nn.Conv2d(input_channels, hidden_size * 2, kernel_size=1)

        # Positional encoding
        self.pos_encoder = PositionalEncoding(hidden_size * 2)

        # Embedding layer for output symbols
        self.embedding = nn.Embedding(num_classes, embedding_dim)

        # GRU cell
        self.gru = nn.GRUCell(embedding_dim + hidden_size + num_classes, hidden_size)

        # Attention
        self.attention_w = nn.Linear(hidden_size * 2, hidden_size)
        self.attention_v = nn.Linear(hidden_size, 1)
        if use_coverage:
            self.coverage_proj = nn.Linear(1, hidden_size)

        # Output layer
        self.out = nn.Linear(hidden_size + hidden_size + num_classes, num_classes)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, feature_map, count_vector, target=None, teacher_forcing_ratio=0.5, max_len=200):
        batch_size = feature_map.size(0)
        device = feature_map.device

        # Transform the feature map
        projected_features = self.feature_proj(feature_map)  # B x 2*hidden_size x H x W
        H, W = projected_features.size(2), projected_features.size(3)

        # Reshape feature map to B x H x W x 2*hidden_size
        projected_features = projected_features.permute(0, 2, 3, 1).contiguous()

        # Add positional encoding
        pos_encoding = self.pos_encoder(projected_features)  # B x H x W x 2*hidden_size
        projected_features = projected_features + pos_encoding

        # Reshape for attention processing
        projected_features = projected_features.view(batch_size, H*W, -1)  # B x H*W x 2*hidden_size

        # Initialize the hidden state
        h_t = torch.zeros(batch_size, self.hidden_size, device=device)

        # Initialize coverage attention if used
        if self.use_coverage:
            coverage = torch.zeros(batch_size, H*W, 1, device=device)

        # First <SOS> token (assumes the <SOS> token has index 1)
        y_t_1 = torch.ones(batch_size, dtype=torch.long, device=device)

        # Prepare target sequence if provided
        if target is not None:
            max_len = target.size(1)

        # Tensor to store predictions
        outputs = torch.zeros(batch_size, max_len, self.embedding.num_embeddings, device=device)

        for t in range(max_len):
            # Apply embedding to the previous symbol
            embedded = self.embedding(y_t_1)  # B x embedding_dim

            # Compute attention
            attention_input = self.attention_w(projected_features)  # B x H*W x hidden_size

            # Add coverage attention if used
            if self.use_coverage:
                coverage_input = self.coverage_proj(coverage.float())  # type: ignore
                attention_input = attention_input + coverage_input

            # Add the hidden state to the attention
            h_expanded = h_t.unsqueeze(1).expand(-1, H*W, -1)  # B x H*W x hidden_size
            attention_input = torch.tanh(attention_input + h_expanded)

            # Compute attention weights
            e_t = self.attention_v(attention_input).squeeze(-1)  # B x H*W
            alpha_t = F.softmax(e_t, dim=1)  # B x H*W

            # Update coverage if used
            if self.use_coverage:
                coverage = coverage + alpha_t.unsqueeze(-1)  # type: ignore

            # Compute the context vector
            alpha_t = alpha_t.unsqueeze(1)  # B x 1 x H*W
            context = torch.bmm(alpha_t, projected_features).squeeze(1)  # B x 2*hidden_size
            context = context[:, :self.hidden_size]  # Take the first half as the context vector

            # Combine embedding, context vector, and count vector
            gru_input = torch.cat([embedded, context, count_vector], dim=1)

            # Update the hidden state
            h_t = self.gru(gru_input, h_t)

            # Predict the output symbol
            output = self.out(torch.cat([h_t, context, count_vector], dim=1))
            outputs[:, t] = output

            # Decide the next input symbol
            if target is not None and torch.rand(1).item() < teacher_forcing_ratio:
                y_t_1 = target[:, t]
            else:
                # Greedy decoding
                _, y_t_1 = output.max(1)

        return outputs
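
# At each step the decoder flips a coin: with probability teacher_forcing_ratio
# the ground-truth token is fed back, otherwise its own argmax prediction is.
# Commented sketch of both modes (added; shapes assume the defaults above):
#
# decoder = CCAD(input_channels=684, hidden_size=256, embedding_dim=256, num_classes=101)
# feats, cvec = torch.randn(2, 684, 4, 12), torch.rand(2, 101)
# target = torch.randint(0, 101, (2, 30))
# train_out = decoder(feats, cvec, target, teacher_forcing_ratio=1.0)  # (2, 30, 101)
# free_out = decoder(feats, cvec, target=None, max_len=20)             # (2, 20, 101)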


"""Full model CAN (Counting-Aware Network)"""
class CAN(nn.Module):
    """
    Counting-Aware Network for handwritten mathematical expression recognition
    """
    def __init__(self, num_classes, backbone=None, hidden_size=256, embedding_dim=256, use_coverage=True):
        super(CAN, self).__init__()

        # Backbone
        if backbone is None:
            self.backbone = DenseNetBackbone()
        else:
            self.backbone = backbone
        backbone_channels = self.backbone.out_channels

        # Multi-Scale Counting Module
        self.mscm = MSCM(backbone_channels, num_classes)

        # Counting-Combined Attentional Decoder
        self.decoder = CCAD(
            input_channels=backbone_channels,
            hidden_size=hidden_size,
            embedding_dim=embedding_dim,
            num_classes=num_classes,
            use_coverage=use_coverage
        )

        # Save parameters for later use
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.use_coverage = use_coverage

    def init_hidden_state(self, visual_features):
        """
        Initialize LSTM-style hidden and cell states (helper; note that the
        CCAD decoder uses a GRUCell and zero-initializes its own state)

        Args:
            visual_features: Visual features from the backbone

        Returns:
            h, c: Initial hidden and cell states
        """
        batch_size = visual_features.size(0)
        device = visual_features.device

        # Initialize hidden state with zeros
        h = torch.zeros(1, batch_size, self.hidden_size, device=device)
        c = torch.zeros(1, batch_size, self.hidden_size, device=device)

        return h, c

    def forward(self, x, target=None, teacher_forcing_ratio=0.5):
        # Extract features from the backbone
        features = self.backbone(x)

        # Compute count map and count vector from the MSCM
        count_map, count_vector = self.mscm(features)

        # Decode with the CCAD
        outputs = self.decoder(features, count_vector, target, teacher_forcing_ratio)

        return outputs, count_vector

    def calculate_loss(self, outputs, targets, count_vectors, count_targets, lambda_count=0.01):
        """
        Compute the combined loss function for CAN

        Args:
            outputs: Predicted output sequence from the decoder
            targets: Actual target sequence
            count_vectors: Predicted count vector
            count_targets: Actual target count vector
            lambda_count: Weight for the counting loss

        Returns:
            Total loss: L = L_cls + lambda_count * L_counting
        """
        # Loss for the decoder (cross entropy)
        L_cls = F.cross_entropy(outputs.view(-1, outputs.size(-1)), targets.view(-1))

        # Loss for counting (smooth L1)
        L_counting = F.smooth_l1_loss(count_vectors / self.num_classes, count_targets / self.num_classes)

        # Total loss
        total_loss = L_cls + lambda_count * L_counting

        return total_loss, L_cls, L_counting
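
    # count_targets must hold per-class symbol counts for each target sequence;
    # they can be built directly from the token ids. Commented helper sketch
    # (added; assumes padding/ignored ids are filtered out beforehand):
    #
    # def build_count_targets(targets, num_classes):
    #     counts = torch.zeros(targets.size(0), num_classes, device=targets.device)
    #     for b in range(targets.size(0)):
    #         counts[b] = torch.bincount(targets[b], minlength=num_classes).float()
    #     return counts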

    def recognize(self, images, max_length=150, start_token=None, end_token=None, beam_width=5):
        """
        Recognize the handwritten expression using beam search (batch_size=1 only).

        Args:
            images: Input image tensor, shape (1, channels, height, width)
            max_length: Maximum length of the output sequence
            start_token: Start token index
            end_token: End token index
            beam_width: Beam width for beam search

        Returns:
            best_sequence: List of token indices
            attention_weights: List of attention weights for visualization
        """
        if images.size(0) != 1:
            raise ValueError("Beam search is implemented only for batch_size=1")

        device = images.device

        # Encode the image
        visual_features = self.backbone(images)

        # Get the count vector
        _, count_vector = self.mscm(visual_features)

        # Prepare the feature map for the decoder
        projected_features = self.decoder.feature_proj(visual_features)  # (1, 2*hidden_size, H, W)
        H, W = projected_features.size(2), projected_features.size(3)
        projected_features = projected_features.permute(0, 2, 3, 1).contiguous()  # (1, H, W, 2*hidden_size)
        pos_encoding = self.decoder.pos_encoder(projected_features)  # (1, H, W, 2*hidden_size)
        projected_features = projected_features + pos_encoding  # (1, H, W, 2*hidden_size)
        projected_features = projected_features.view(1, H*W, -1)  # (1, H*W, 2*hidden_size)

        # Initialize beams; only the first beam starts with score 0, otherwise
        # the identical start beams would all expand to the same top token
        beam_sequences = [torch.tensor([start_token], device=device)] * beam_width  # List of (seq_len) tensors
        beam_scores = torch.full((beam_width,), float('-inf'), device=device)  # (beam_width)
        beam_scores[0] = 0.0
        h_t = torch.zeros(beam_width, self.hidden_size, device=device)  # (beam_width, hidden_size)
        if self.use_coverage:
            coverage = torch.zeros(beam_width, H*W, device=device)  # (beam_width, H*W)

        all_attention_weights = []

        for step in range(max_length):
            # Get current tokens for all beams
            current_tokens = torch.tensor([seq[-1] for seq in beam_sequences], device=device)  # (beam_width)

            # Apply embedding
            embedded = self.decoder.embedding(current_tokens)  # (beam_width, embedding_dim)

            # Compute attention for each beam
            attention_input = self.decoder.attention_w(projected_features.expand(beam_width, -1, -1))  # (beam_width, H*W, hidden_size)
            if self.use_coverage:
                coverage_input = self.decoder.coverage_proj(coverage.unsqueeze(-1))  # (beam_width, H*W, hidden_size)  # type: ignore
                attention_input = attention_input + coverage_input

            h_expanded = h_t.unsqueeze(1).expand(-1, H*W, -1)  # (beam_width, H*W, hidden_size)
            attention_input = torch.tanh(attention_input + h_expanded)

            e_t = self.decoder.attention_v(attention_input).squeeze(-1)  # (beam_width, H*W)
            alpha_t = F.softmax(e_t, dim=1)  # (beam_width, H*W)

            all_attention_weights.append(alpha_t.detach())

            if self.use_coverage:
                coverage = coverage + alpha_t  # type: ignore

            context = torch.bmm(alpha_t.unsqueeze(1), projected_features.expand(beam_width, -1, -1)).squeeze(1)  # (beam_width, 2*hidden_size)
            context = context[:, :self.hidden_size]  # (beam_width, hidden_size)

            # Expand count_vector to (beam_width, num_classes)
            count_vector_expanded = count_vector.expand(beam_width, -1)  # (beam_width, num_classes)

            gru_input = torch.cat([embedded, context, count_vector_expanded], dim=1)  # (beam_width, embedding_dim + hidden_size + num_classes)

            h_t = self.decoder.gru(gru_input, h_t)  # (beam_width, hidden_size)

            output = self.decoder.out(torch.cat([h_t, context, count_vector_expanded], dim=1))  # (beam_width, num_classes)
            scores = F.log_softmax(output, dim=1)  # (beam_width, num_classes)

            # Compute new scores for all beam-token combinations
            new_beam_scores = beam_scores.unsqueeze(1) + scores  # (beam_width, num_classes)
            new_beam_scores_flat = new_beam_scores.view(-1)  # (beam_width * num_classes)

            # Select top beam_width scores and indices
            topk_scores, topk_indices = new_beam_scores_flat.topk(beam_width)

            # Determine which beam and token each top score corresponds to
            beam_indices = topk_indices // self.num_classes  # (beam_width)
            token_indices = topk_indices % self.num_classes  # (beam_width)

            # Create new beam sequences and states
            new_beam_sequences = []
            new_h_t = []
            if self.use_coverage:
                new_coverage = []
            for i in range(beam_width):
                prev_beam_idx = beam_indices[i].item()
                token = token_indices[i].item()
                new_seq = torch.cat([beam_sequences[prev_beam_idx], torch.tensor([token], device=device)])  # type: ignore
                new_beam_sequences.append(new_seq)
                new_h_t.append(h_t[prev_beam_idx])
                if self.use_coverage:
                    new_coverage.append(coverage[prev_beam_idx])  # type: ignore

            # Update beams
            beam_sequences = new_beam_sequences
            beam_scores = topk_scores
            h_t = torch.stack(new_h_t)
            if self.use_coverage:
                coverage = torch.stack(new_coverage)  # type: ignore

        # Select the sequence with the highest score
        best_idx = beam_scores.argmax()
        best_sequence = beam_sequences[best_idx].tolist()

        # Remove <start> and stop at <end>
        if best_sequence[0] == start_token:
            best_sequence = best_sequence[1:]
        if end_token in best_sequence:
            end_idx = best_sequence.index(end_token)
            best_sequence = best_sequence[:end_idx]

        return best_sequence, all_attention_weights


def create_can_model(num_classes, hidden_size=256, embedding_dim=256, use_coverage=True, pretrained_backbone=False, backbone_type='densenet'):
    """
    Create a CAN model with either a DenseNet or a ResNet backbone

    Args:
        num_classes: Number of symbol classes
        pretrained_backbone: Whether to use a pretrained backbone
        backbone_type: Type of backbone to use ('densenet' or 'resnet')

    Returns:
        CAN model
    """
    # Create the backbone
    if backbone_type == 'densenet':
        if pretrained_backbone:
            # Note: newer torchvision versions use the `weights=` argument instead of `pretrained=True`
            densenet = models.densenet121(pretrained=True)
            backbone = DenseNetFeatureExtractor(densenet, out_channels=684)
        else:
            backbone = DenseNetBackbone()
    elif backbone_type == 'resnet':
        if pretrained_backbone:
            resnet = models.resnet50(pretrained=True)
            backbone = ResNetFeatureExtractor(resnet, out_channels=684)
        else:
            backbone = ResNetBackbone(block_type='bottleneck', layers=[3, 4, 6, 3])
    else:
        raise ValueError(f"Unknown backbone type: {backbone_type}")

    # Create the model
    model = CAN(
        num_classes=num_classes,
        backbone=backbone,
        hidden_size=hidden_size,
        embedding_dim=embedding_dim,
        use_coverage=use_coverage
    )

    return model


# # Example usage
# if __name__ == "__main__":
#     # Create CAN model with 101 symbol classes (example)
#     num_classes = 101  # Number of symbol classes + special tokens like <SOS>, <EOS>
#     model = create_can_model(num_classes)

#     # Create dummy input data
#     batch_size = 4
#     input_image = torch.randn(batch_size, 1, 128, 384)  # B x C x H x W
#     target = torch.randint(0, num_classes, (batch_size, 50))  # B x max_len

#     # Forward pass
#     outputs, count_vectors = model(input_image, target)

#     # Print output shapes
#     print(f"Outputs shape: {outputs.shape}")  # B x max_len x num_classes
#     print(f"Count vectors shape: {count_vectors.shape}")  # B x num_classes
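
# # Inference sketch (added; not part of the original file). The start/end
# # token ids below are placeholders: substitute the ids from your vocabulary.
# model = create_can_model(num_classes=101)
# model.eval()
# with torch.no_grad():
#     tokens, attn = model.recognize(
#         torch.randn(1, 1, 128, 384),
#         max_length=150, start_token=1, end_token=2, beam_width=5,
#     )
# print(tokens)  # list of token ids, to be mapped back to symbols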