|
|
from typing import Tuple
|
|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
import torch.nn.functional as F
|
|
|
|
|
|
|
|
|
class CRNN(nn.Module):
    """CNN + BiLSTM + Linear recognizer producing CTC-ready log-probabilities.

    Input:  images ``[B, in_channels, H, W]`` (``in_channels`` defaults to 1).
    Output: log_probs ``[T, B, C]`` for CTCLoss (T = time steps,
    C = ``num_classes``).

    Downsampling performed by the convolutional stack:

    * width:  two 2x2 poolings followed by a final un-padded 2x2 convolution,
      so ``T = W // 4 - 1``;
    * height: four poolings (factor 16) followed by the same un-padded
      convolution, so the feature height is ``H // 16 - 1``. It is exactly 1
      for ``32 <= H < 48``; taller feature maps are averaged over the height
      axis in :meth:`forward`.

    Args:
        num_classes: Size of the output alphabet (width of the last axis of
            the returned log-probabilities).
        in_channels: Number of channels of the input images.
        hidden_size: Hidden units per direction of the LSTM.
        num_layers: Number of stacked bidirectional LSTM layers.
    """

    def __init__(
        self,
        num_classes: int,
        in_channels: int = 1,
        hidden_size: int = 256,
        num_layers: int = 2,
    ):
        super().__init__()

        # The stack is kept as one flat Sequential so that parameter names
        # (cnn.0, cnn.3, ..., cnn.18) stay stable for saved checkpoints.
        self.cnn = nn.Sequential(
            # Stage 1: halves height and width.
            nn.Conv2d(in_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # Stage 2: halves height and width again.
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # Stage 3: two convolutions, then pool the height only so the
            # horizontal (time) resolution is preserved.
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            # Stage 4: batch-normalised convolutions, height-only pooling.
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            # Stage 5: un-padded 2x2 convolution trims one row and one column.
            nn.Conv2d(512, 512, kernel_size=2, padding=0),
            nn.ReLU(inplace=True),
        )

        # Sequence model over the width axis; expects [T, B, 512] input.
        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=False,
        )

        # Forward and backward hidden states are concatenated by the LSTM.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the recognizer on a batch of images.

        Args:
            x: Image batch of shape ``[B, in_channels, H, W]``.

        Returns:
            log_probs: ``[T, B, C]`` log-softmax scores over the classes.
            input_lengths: ``[B]`` long tensor on the same device as
                ``log_probs``; every entry equals ``T``, the number of time
                steps after the CNN (for CTCLoss).
        """
        feats = self.cnn(x)
        b, _, h, _ = feats.shape

        # Collapse the height axis: [B, C, H', W'] -> [B, C, W'].
        if h == 1:
            feats = feats.squeeze(2)
        else:
            feats = feats.mean(dim=2)

        # The LSTM is time-major: [B, C, W'] -> [W', B, C].
        seq = feats.permute(2, 0, 1)

        rnn_out, _ = self.rnn(seq)
        logits = self.fc(rnn_out)
        log_probs = F.log_softmax(logits, dim=-1)

        # No per-sample width information is available here, so every sample
        # is reported with the full sequence length T.
        input_lengths = torch.full(
            size=(b,),
            fill_value=log_probs.size(0),
            dtype=torch.long,
            device=log_probs.device,
        )
        return log_probs, input_lengths
|
|
|
|