File size: 2,839 Bytes
2411029
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F


class CRNN(nn.Module):
    """

    CNN + BiLSTM + Linear for CTC.

    Input:  images [B, 1, H, W]

    Output: log_probs [T, B, C] for CTCLoss (T = time steps, C = num_classes)

    """

    def __init__(self, num_classes: int):
        super().__init__()

        # CNN feature extractor (keeps width relatively large)
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),  # [B,64,H,W]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                          # [B,64,H/2,W/2]

            nn.Conv2d(64, 128, kernel_size=3, padding=1),# [B,128,H/2,W/2]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),                          # [B,128,H/4,W/4]

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),  # [B,256,H/8,W/4]

            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),  # [B,512,H/16,W/4]

            nn.Conv2d(512, 512, kernel_size=2, padding=0),
            nn.ReLU(inplace=True),  # reduces height further
        )

        # BiLSTM sequence modeler
        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )

        self.fc = nn.Linear(256 * 2, num_classes)  # bidirectional

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """

        Returns:

            log_probs: [T, B, C]

            input_lengths: [B] lengths in time steps after CNN (for CTCLoss)

        """
        feats = self.cnn(x)  # [B, 512, H', W']
        b, c, h, w = feats.shape

        # We want height to be 1 for CRNN; if not, squeeze/mean
        if h != 1:
            feats = feats.mean(dim=2)  # [B,512,W]
        else:
            feats = feats.squeeze(2)   # [B,512,W]

        # Convert to time-major: [W, B, 512]
        seq = feats.permute(2, 0, 1)

        rnn_out, _ = self.rnn(seq)      # [T,W?] actually [T,B,512]
        logits = self.fc(rnn_out)       # [T,B,C]
        log_probs = F.log_softmax(logits, dim=-1)

        input_lengths = torch.full(size=(b,), fill_value=log_probs.size(0), dtype=torch.long, device=log_probs.device)
        return log_probs, input_lengths