# NOTE(review): lines below replaced scraped page residue ("Spaces / Sleeping")
# that is not Python and would break parsing; original content carried no code.
| import torch | |
| import torch.nn as nn | |
class CRAFT_Demonstration(nn.Module):
    """Minimal stand-in for the CRAFT text detector.

    In reality CRAFT is a deep ResNet-based U-Net; here a single 3x3
    conv stem feeds a 1x1 head that emits two heatmaps per pixel:
    [region score, affinity score].
    """

    def __init__(self):
        super().__init__()
        self.feature_extractor = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.heatmap_predictor = nn.Conv2d(64, 2, kernel_size=1)

    def forward(self, image):
        """Return the 2-channel [region, affinity] score map for *image*."""
        # Stem and head composed directly; spatial size is preserved.
        return self.heatmap_predictor(self.feature_extractor(image))
class VGG_FeatureExtractor(nn.Module):
    """Compact VGG-style backbone for text recognition.

    Three max-pools shrink height by 8x while width shrinks only 4x
    (the last pool keeps width via a (2, 1) kernel/stride), so the
    horizontal axis survives as the time dimension for a sequence model.
    """

    def __init__(self, input_channel=1, output_channel=256):
        super(VGG_FeatureExtractor, self).__init__()
        # Same layer order as a literal nn.Sequential(...) call; built as
        # a list first for readability.
        stages = [
            nn.Conv2d(input_channel, 64, 3, 1, 1), nn.ReLU(True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.ReLU(True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.ReLU(True),
            nn.Conv2d(256, 256, 3, 1, 1), nn.ReLU(True),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(256, output_channel, 3, 1, 1, bias=False),
            nn.BatchNorm2d(output_channel), nn.ReLU(True),
        ]
        self.ConvNet = nn.Sequential(*stages)

    def forward(self, input):
        """Map (N, input_channel, H, W) images to (N, output_channel, H/8, W/4) features."""
        return self.ConvNet(input)
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection.

    The forward and backward hidden states (hidden_size each) are
    concatenated, so the projection maps 2*hidden_size -> output_size.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input):
        """Project contextual features of *input* (batch, seq, input_size) to output_size."""
        contextual, _state = self.rnn(input)
        # Contextual features mapped to classes.
        return self.linear(contextual)
class CRNN_Model(nn.Module):
    """CRNN recognizer: VGG features -> BiLSTM context -> per-step class logits."""

    def __init__(self, num_classes=97):
        super(CRNN_Model, self).__init__()
        # Visual stage: grayscale image -> (N, 256, H', W') feature map.
        self.FeatureExtraction = VGG_FeatureExtractor(input_channel=1, output_channel=256)
        # Pools the height axis down to 1 so width becomes the sequence axis.
        self.AdaptiveAvgPool = nn.AdaptiveAvgPool2d((None, 1))
        self.SequenceModeling = nn.Sequential(
            BidirectionalLSTM(256, 256, 256),
            BidirectionalLSTM(256, 256, 256),
        )
        self.Prediction = nn.Linear(256, num_classes)

    def forward(self, image_tensor):
        """Return per-time-step logits of shape (batch, width-steps, num_classes)."""
        visual = self.FeatureExtraction(image_tensor)
        # (N, C, H', W') -> (N, W', C, H') -> height pooled to 1 -> (N, W', C)
        pooled = self.AdaptiveAvgPool(visual.permute(0, 3, 1, 2))
        sequence = pooled.squeeze(3)
        context = self.SequenceModeling(sequence)
        return self.Prediction(context.contiguous())
def CTCDecoder(predictions):
    """Greedy (best-path) CTC decode of network output.

    Takes the argmax class at every time step, collapses consecutive
    repeats, and drops the blank class (index 0).

    Bug fixed: the original iterated ``range(len(max_probs))`` over the
    BATCH axis of the (batch, time) argmax and truth-tested a whole row
    tensor, which raises ``RuntimeError`` (ambiguous boolean) for any
    sequence longer than one step. Decoding now walks the time axis.

    Args:
        predictions: logits of shape (batch, time, classes) — only the
            first sample in the batch is decoded — or (time, classes).
            NOTE(review): assumes batch-first layout, matching
            CRNN_Model's batch_first LSTMs; confirm against the caller.

    Returns:
        str: the surviving label indices concatenated as a string.
    """
    if predictions.dim() == 3:
        # Decode only the first sample of the batch.
        predictions = predictions[0]
    best_path = torch.argmax(predictions, dim=1)
    decoded = []
    previous = None
    for label in best_path.tolist():
        # CTC rule: skip blanks (0) and collapse runs of the same label.
        if label != 0 and label != previous:
            decoded.append(str(label))
        previous = label
    return "".join(decoded)