|
|
""" |
|
|
Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 |
|
|
Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora |
|
|
GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition |
|
|
Project Website: https://abdur75648.github.io/UTRNet/ |
|
|
Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial |
|
|
4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) |
|
|
""" |
|
|
|
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import torch |
|
|
|
|
|
""" |
|
|
Source - https://github.com/sfczekalski/attention_unet |
|
|
Article - https://towardsdatascience.com/biomedical-image-segmentation-attention-u-net-29b6f0827405 |
|
|
""" |
|
|
|
|
|
class ConvBlock(nn.Module): |
|
|
|
|
|
def __init__(self, in_channels, out_channels): |
|
|
super(ConvBlock, self).__init__() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.conv = nn.Sequential( |
|
|
nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), |
|
|
nn.BatchNorm2d(out_channels), |
|
|
nn.ReLU(inplace=True), |
|
|
nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), |
|
|
nn.BatchNorm2d(out_channels), |
|
|
nn.ReLU(inplace=True) |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
x = self.conv(x) |
|
|
return x |
|
|
|
|
|
class UpConv(nn.Module): |
|
|
|
|
|
def __init__(self, in_channels, out_channels): |
|
|
super(UpConv, self).__init__() |
|
|
|
|
|
self.up = nn.Sequential( |
|
|
nn.Upsample(scale_factor=2), |
|
|
nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), |
|
|
nn.BatchNorm2d(out_channels), |
|
|
nn.ReLU(inplace=True) |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
x = self.up(x) |
|
|
return x |
|
|
|
|
|
|
|
|
class AttentionBlock(nn.Module): |
|
|
"""Attention block with learnable parameters""" |
|
|
|
|
|
def __init__(self, F_g, F_l, n_coefficients): |
|
|
""" |
|
|
:param F_g: number of feature maps (channels) in previous layer |
|
|
:param F_l: number of feature maps in corresponding encoder layer, transferred via skip connection |
|
|
:param n_coefficients: number of learnable multi-dimensional attention coefficients |
|
|
""" |
|
|
super(AttentionBlock, self).__init__() |
|
|
|
|
|
self.W_gate = nn.Sequential( |
|
|
nn.Conv2d(F_g, n_coefficients, kernel_size=1, stride=1, padding=0, bias=True), |
|
|
nn.BatchNorm2d(n_coefficients) |
|
|
) |
|
|
|
|
|
self.W_x = nn.Sequential( |
|
|
nn.Conv2d(F_l, n_coefficients, kernel_size=1, stride=1, padding=0, bias=True), |
|
|
nn.BatchNorm2d(n_coefficients) |
|
|
) |
|
|
|
|
|
self.psi = nn.Sequential( |
|
|
nn.Conv2d(n_coefficients, 1, kernel_size=1, stride=1, padding=0, bias=True), |
|
|
nn.BatchNorm2d(1), |
|
|
nn.Sigmoid() |
|
|
) |
|
|
|
|
|
self.relu = nn.ReLU(inplace=True) |
|
|
|
|
|
def forward(self, gate, skip_connection): |
|
|
""" |
|
|
:param gate: gating signal from previous layer |
|
|
:param skip_connection: activation from corresponding encoder layer |
|
|
:return: output activations |
|
|
""" |
|
|
g1 = self.W_gate(gate) |
|
|
x1 = self.W_x(skip_connection) |
|
|
psi = self.relu(g1 + x1) |
|
|
psi = self.psi(psi) |
|
|
out = skip_connection * psi |
|
|
return out |
|
|
|
|
|
class AttnUNet(nn.Module): |
|
|
|
|
|
def __init__(self, img_ch=1, output_ch=512): |
|
|
super(AttnUNet, self).__init__() |
|
|
|
|
|
self.MaxPool = nn.MaxPool2d(kernel_size=2, stride=2) |
|
|
|
|
|
self.Conv1 = ConvBlock(img_ch, 32) |
|
|
self.Conv2 = ConvBlock(32, 64) |
|
|
self.Conv3 = ConvBlock(64, 128) |
|
|
self.Conv4 = ConvBlock(128, 256) |
|
|
self.Conv5 = ConvBlock(256, 512) |
|
|
|
|
|
self.Up5 = UpConv(512, 256) |
|
|
self.Att5 = AttentionBlock(F_g=256, F_l=256, n_coefficients=128) |
|
|
self.UpConv5 = ConvBlock(512, 256) |
|
|
|
|
|
self.Up4 = UpConv(256, 128) |
|
|
self.Att4 = AttentionBlock(F_g=128, F_l=128, n_coefficients=64) |
|
|
self.UpConv4 = ConvBlock(256, 128) |
|
|
|
|
|
self.Up3 = UpConv(128, 64) |
|
|
self.Att3 = AttentionBlock(F_g=64, F_l=64, n_coefficients=32) |
|
|
self.UpConv3 = ConvBlock(128, 64) |
|
|
|
|
|
self.Up2 = UpConv(64, 32) |
|
|
self.Att2 = AttentionBlock(F_g=32, F_l=32, n_coefficients=16) |
|
|
self.UpConv2 = ConvBlock(64, 32) |
|
|
|
|
|
self.Conv = nn.Conv2d(32, output_ch, kernel_size=1, stride=1, padding=0) |
|
|
|
|
|
def forward(self, x): |
|
|
""" |
|
|
e : encoder layers |
|
|
d : decoder layers |
|
|
s : skip-connections from encoder layers to decoder layers |
|
|
""" |
|
|
|
|
|
|
|
|
e1 = self.Conv1(x) |
|
|
|
|
|
|
|
|
e2 = self.MaxPool(e1) |
|
|
e2 = self.Conv2(e2) |
|
|
|
|
|
|
|
|
e3 = self.MaxPool(e2) |
|
|
e3 = self.Conv3(e3) |
|
|
|
|
|
|
|
|
e4 = self.MaxPool(e3) |
|
|
e4 = self.Conv4(e4) |
|
|
|
|
|
|
|
|
e5 = self.MaxPool(e4) |
|
|
e5 = self.Conv5(e5) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
d5 = self.Up5(e5) |
|
|
s4 = self.Att5(gate=d5, skip_connection=e4) |
|
|
d5 = torch.cat((s4, d5), dim=1) |
|
|
d5 = self.UpConv5(d5) |
|
|
|
|
|
|
|
|
d4 = self.Up4(d5) |
|
|
s3 = self.Att4(gate=d4, skip_connection=e3) |
|
|
d4 = torch.cat((s3, d4), dim=1) |
|
|
d4 = self.UpConv4(d4) |
|
|
|
|
|
|
|
|
d3 = self.Up3(d4) |
|
|
s2 = self.Att3(gate=d3, skip_connection=e2) |
|
|
d3 = torch.cat((s2, d3), dim=1) |
|
|
d3 = self.UpConv3(d3) |
|
|
|
|
|
|
|
|
d2 = self.Up2(d3) |
|
|
s1 = self.Att2(gate=d2, skip_connection=e1) |
|
|
d2 = torch.cat((s1, d2), dim=1) |
|
|
d2 = self.UpConv2(d2) |
|
|
|
|
|
|
|
|
out = self.Conv(d2) |
|
|
|
|
|
|
|
|
return out |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|