File size: 2,839 Bytes
2411029 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
class CRNN(nn.Module):
    """CNN + BiLSTM + Linear text recognizer producing CTC log-probabilities.

    Input:  images ``[B, in_channels, H, W]``.
    Output: ``log_probs`` ``[T, B, C]`` for ``nn.CTCLoss`` (T = time steps,
    C = ``num_classes``) together with the per-sample input lengths.

    Size constraints implied by the layer stack below: the height is halved
    four times and the width twice (floor division) before an unpadded 2x2
    convolution, so ``H >= 32`` and ``W >= 8`` are required, and the number
    of time steps is ``T = W // 4 - 1``. With ``H == 32`` the feature-map
    height collapses to exactly 1; taller inputs are averaged over the
    remaining height in ``forward``.
    """

    def __init__(self, num_classes: int, in_channels: int = 1, hidden_size: int = 256):
        """Build the network.

        Args:
            num_classes: Size of the output alphabet (C), i.e. the width of
                the final linear layer.
            in_channels: Number of channels of the input images. Defaults to
                1 (grayscale), which matches the original architecture.
            hidden_size: Hidden size of each LSTM direction. Defaults to 256.
        """
        super().__init__()
        # CNN feature extractor (keeps width relatively large).
        # NOTE: layer order is kept fixed so that the ``cnn.<index>`` keys of
        # the state dict stay stable.
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=3, padding=1),  # [B,64,H,W]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                                    # [B,64,H/2,W/2]
            nn.Conv2d(64, 128, kernel_size=3, padding=1),          # [B,128,H/2,W/2]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                                    # [B,128,H/4,W/4]
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            # Pool only along the height so the width (time axis) is preserved.
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),       # [B,256,H/8,W/4]
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),       # [B,512,H/16,W/4]
            # Unpadded 2x2 conv: shrinks both height and width by one.
            nn.Conv2d(512, 512, kernel_size=2, padding=0),         # [B,512,H/16-1,W/4-1]
            nn.ReLU(inplace=True),
        )
        # BiLSTM sequence modeler (time-major input: [T, B, 512]).
        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )
        # Forward and backward hidden states are concatenated -> 2 * hidden_size.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the recognizer on a batch of images.

        Args:
            x: Image batch of shape ``[B, in_channels, H, W]``.

        Returns:
            log_probs: ``[T, B, C]`` log-softmax scores over the classes.
            input_lengths: ``[B]`` int64 tensor, every entry equal to ``T``
                (the number of time steps after the CNN), for ``CTCLoss``.

        Raises:
            ValueError: If ``x`` is not a 4-D tensor.
        """
        if x.dim() != 4:
            raise ValueError(
                f"CRNN expects a 4-D input [B, C, H, W], got a {x.dim()}-D tensor"
            )

        feats = self.cnn(x)  # [B, 512, H', W']
        batch_size, _, height, _ = feats.shape

        # CRNN wants a feature-map height of 1; average over the height when
        # the input was taller than the stack can fully collapse.
        if height != 1:
            feats = feats.mean(dim=2)  # [B, 512, W']
        else:
            feats = feats.squeeze(2)   # [B, 512, W']

        # Convert to time-major: [W', B, 512].
        seq = feats.permute(2, 0, 1)
        rnn_out, _ = self.rnn(seq)     # [T, B, 2 * hidden_size]
        logits = self.fc(rnn_out)      # [T, B, C]
        log_probs = F.log_softmax(logits, dim=-1)

        # No per-sample width information is available here, so every
        # sample is reported as using all T time steps.
        input_lengths = torch.full(
            size=(batch_size,),
            fill_value=log_probs.size(0),
            dtype=torch.long,
            device=log_probs.device,
        )
        return log_probs, input_lengths
|