from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


class CRNN(nn.Module):
    """CNN + BiLSTM + Linear recognizer producing CTC-ready log-probabilities.

    Input:  images ``[B, in_channels, H, W]``.
    Output: ``log_probs`` ``[T, B, C]`` plus ``input_lengths`` ``[B]``, where
    ``C = num_classes`` and ``T = W // 4 - 1`` (two 2x2 poolings divide the
    width by 4 and the final unpadded 2x2 convolution trims one more column).

    The height is reduced to ``H // 16 - 1`` rows by the CNN, so ``H = 32``
    yields exactly one row; taller inputs are averaged over the remaining
    rows. The final 2x2 convolution needs at least a 2x2 feature map, i.e.
    ``H >= 32`` and ``W >= 8``.

    Args:
        num_classes: Size ``C`` of the output distribution at every time step.
        in_channels: Number of image channels (default 1, grayscale).
        hidden_size: Hidden units per LSTM direction (default 256).
    """

    def __init__(
        self,
        num_classes: int,
        *,
        in_channels: int = 1,
        hidden_size: int = 256,
    ):
        super().__init__()

        # CNN feature extractor. The last two poolings only halve the height,
        # which keeps the width (the future time axis) relatively large.
        # NOTE: layer order is part of the checkpoint format (state_dict keys
        # are positional indices), so do not reorder or insert modules.
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=3, padding=1),  # [B,64,H,W]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                                    # [B,64,H/2,W/2]
            nn.Conv2d(64, 128, kernel_size=3, padding=1),          # [B,128,H/2,W/2]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                                    # [B,128,H/4,W/4]
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),       # [B,256,H/8,W/4]
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),       # [B,512,H/16,W/4]
            # Unpadded 2x2 conv: trims one row AND one column.
            nn.Conv2d(512, 512, kernel_size=2, padding=0),         # [B,512,H/16-1,W/4-1]
            nn.ReLU(inplace=True),
        )

        # BiLSTM sequence modeler; consumes time-major input [T, B, 512].
        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )

        # Forward and backward hidden states are concatenated by the LSTM.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the recognizer on a batch of images.

        Args:
            x: Image batch of shape ``[B, in_channels, H, W]``.

        Returns:
            log_probs: ``[T, B, C]`` log-softmax scores over the class axis.
            input_lengths: ``[B]`` int64 tensor on the same device as
                ``log_probs``; every entry equals ``T``. Per-sample width
                padding is not tracked, so all samples report the full length.

        Raises:
            ValueError: If ``x`` is not a 4-D tensor.
        """
        if x.dim() != 4:
            # Conv2d silently accepts unbatched 3-D input, which would only
            # fail later with an opaque error; reject it up front instead.
            raise ValueError(
                f"expected a 4-D input [B, C, H, W], got shape {tuple(x.shape)}"
            )

        feats = self.cnn(x)  # [B, 512, H', W']
        batch_size = feats.size(0)

        # Collapse the height axis. When H' == 1 the mean is the identity, so
        # this equals a squeeze; taller feature maps are averaged over rows.
        feats = feats.mean(dim=2)  # [B, 512, W']

        # Width becomes the time axis: [T, B, 512] with T = W'.
        seq = feats.permute(2, 0, 1)

        rnn_out, _ = self.rnn(seq)  # [T, B, 2 * hidden_size]
        logits = self.fc(rnn_out)   # [T, B, C]
        log_probs = F.log_softmax(logits, dim=-1)

        input_lengths = torch.full(
            size=(batch_size,),
            fill_value=log_probs.size(0),
            dtype=torch.long,
            device=log_probs.device,
        )
        return log_probs, input_lengths