diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..972f837b5596cda19e88e9ba28d3997525e77027 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# Ignore model weights (*.pt, *.pth) and the models directory +*.pt +*.pth +models diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..665bbb8cc00f9cff130dee6180600f3fc75721a6 --- /dev/null +++ b/app.py @@ -0,0 +1,50 @@ +import gradio as gr +from code.detection.recognize_id.detect_and_recognize_id import Recognize_ID +from code.detection.detection import detection +from code.recognization.recognization import TextRecognition +import os + +# Prediction function: runs ID recognition, text detection, then per-word text recognition +def predict_image(image): + + # Recognize ID + rec_id = Recognize_ID() + id = rec_id.give_me_id_number(image) + + # Detection + det = detection() + detection_list = det.full_pipeline(image) + + result = '' + # Loop on all detected images and recognize them + recognizer = TextRecognition() + for line in detection_list[2:6]: + for word in line: + recognized_word = recognizer.recognize_image(word) + result = result + recognized_word + ' ' + result += '\n' + + # Add Id number + result = result + id + + return result + +# List of paths to your sample images +current_dir = os.path.dirname(os.path.abspath(__file__)) +sample_images = [ + os.path.join(current_dir , "samples/id_1.png" ) +] + +# Create the Gradio interface +interface = gr.Interface( + fn=predict_image, # Function to run + inputs="image", # Input type + outputs="text", # Output type + title="Recognization", + description="Upload an image", + examples=sample_images +) + +# Launch the app +interface.launch() + diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/__pycache__/__init__.cpython-310.pyc b/code/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..209d20f5d54c51608cfb741f36ecef2026601841 Binary files /dev/null and b/code/__pycache__/__init__.cpython-310.pyc differ diff --git a/code/detection/__pycache__/detection.cpython-310.pyc b/code/detection/__pycache__/detection.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfa83b2aedff3ecd8d0aa2324149475966f20423 Binary files /dev/null and b/code/detection/__pycache__/detection.cpython-310.pyc differ diff --git a/code/detection/detection.py b/code/detection/detection.py new file mode 100644 index 0000000000000000000000000000000000000000..f6cb924b369e536b91f2858d338cd7066ce756dd --- /dev/null +++ b/code/detection/detection.py @@ -0,0 +1,77 @@ +from ultralytics import YOLO +from glob import glob +import matplotlib.pyplot as plt +import cv2 +import os +from PIL import Image +from ultralytics.engine.results import Results +import numpy as np + + +class detection: + + def __init__(self,model_path='detection.pt'): + current_dir = os.path.dirname(os.path.abspath(__file__)) + model_path = os.path.join(current_dir , model_path ) + self.model = YOLO(model_path) + + def get_distance(self,res): + boxes = res[0].boxes.xywh.numpy() # Convert to numpy array + # Sort primarily by Y (vertical), then X (horizontal) using lexsort + sorted_indices = np.lexsort((boxes[:, 0], boxes[:, 1])) + sorted_boxes = boxes[sorted_indices] + return sorted_boxes[:, 1], sorted_indices # Return sorted Y values and indices + + def handle_the_boxes(self,res, img, y_threshold=30): + distance_sorted, sorted_indices = self.get_distance(res) + PB = res[0].boxes.xyxy.numpy()[sorted_indices] # Get boxes in sorted order + same_object = [] + current_line = [PB[0]] + + # Group boxes into lines using Y threshold + for i in range(1, len(PB)): + prev_y = current_line[-1][1] # Use ymin from XYXY format + current_y = PB[i][1] + if abs(current_y - prev_y) > y_threshold: + # Sort line left-to-right before adding + current_line = 
sorted(current_line, key=lambda x: x[0] , reverse=True) + same_object.append(current_line) + current_line = [PB[i]] + else: + current_line.append(PB[i]) + + # Add the last line and sort it + if current_line: + current_line = sorted(current_line, key=lambda x: x[0]) + same_object.append(current_line) + + # Extract word images in final order + return [ + [self.words_pixels(img, box) for box in line] + for line in same_object + ] + + # Keep words_pixels as original + def words_pixels(self,img, xyxy): + xmin, ymin, xmax, ymax = xyxy.tolist() + return img[int(ymin):int(ymax)+1, int(xmin):int(xmax)+1] + + def full_pipeline(self,image,show=False): + + if isinstance(image, str): # If the input is a file path + img = cv2.imread(image) + elif isinstance(image, np.ndarray): # If the input is a NumPy array + image = image + img = image + + res = self.model(image) + + if show: + res[0].show() + + + return self.handle_the_boxes(res , img) + + + + \ No newline at end of file diff --git a/code/detection/recognize_id/__pycache__/detect_and_recognize_id.cpython-310.pyc b/code/detection/recognize_id/__pycache__/detect_and_recognize_id.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22304a52210bfc76499801bcf7fff3eae4e3b5d5 Binary files /dev/null and b/code/detection/recognize_id/__pycache__/detect_and_recognize_id.cpython-310.pyc differ diff --git a/code/detection/recognize_id/data/id_1.png b/code/detection/recognize_id/data/id_1.png new file mode 100644 index 0000000000000000000000000000000000000000..b4dc92b97fca873a8fd6934610c8e99cc5fcbf8e Binary files /dev/null and b/code/detection/recognize_id/data/id_1.png differ diff --git a/code/detection/recognize_id/detect_and_recognize_id.py b/code/detection/recognize_id/detect_and_recognize_id.py new file mode 100644 index 0000000000000000000000000000000000000000..c38ccde2824246b4248725ee372c6b805e531395 --- /dev/null +++ b/code/detection/recognize_id/detect_and_recognize_id.py @@ -0,0 +1,36 @@ +from 
ultralytics import YOLO +from ultralytics.engine.results import Results +import cv2 +import os +import numpy as np + +class Recognize_ID: + + def __init__(self,model_path='recognization_id.pt'): + + current_dir = os.path.dirname(os.path.abspath(__file__)) + model_path = os.path.join(current_dir , model_path ) + self.model = YOLO(model=model_path ) + + def give_me_id_number(self,image:str): + """ + image : path to the input image (resolved relative to this file) or a NumPy image array + returns : the detected national ID number as a string of digits, ordered left to right + """ + if isinstance(image, str): # If the input is a file path + current_dir = os.path.dirname(os.path.abspath(__file__)) + image_path = os.path.join(current_dir , image ) + img = cv2.imread(image_path) + elif isinstance(image, np.ndarray): # If the input is a NumPy array + img = image + + print(type(img)) + res = self.model(img) + boxes = res[0].boxes.xywh[::,0].tolist() + classes = res[0].boxes.cls.tolist() + boxes_labels =[(int(key) , int(value)) for key , value in zip(boxes, classes)] + boxes_labels.sort() + national_id = "".join([str(i[1]) for i in boxes_labels]) + + return national_id + \ No newline at end of file diff --git a/code/recognization/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/code/recognization/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/code/recognization/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/code/recognization/__pycache__/augmentation.cpython-310.pyc b/code/recognization/__pycache__/augmentation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5af1414f3796a4eae38c8688cd02a550383c25c9 Binary files /dev/null and b/code/recognization/__pycache__/augmentation.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/config.cpython-310.pyc b/code/recognization/__pycache__/config.cpython-310.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..51a87ec3a0ccc3ac524aba7c6986b855a89bb7a1 Binary files /dev/null and b/code/recognization/__pycache__/config.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/custom_test.cpython-310.pyc b/code/recognization/__pycache__/custom_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e020acaec0da4e1d8abc9c4b45b8006485e25fd Binary files /dev/null and b/code/recognization/__pycache__/custom_test.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/dataset.cpython-310.pyc b/code/recognization/__pycache__/dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e51df8ed6ffd49d2625e12cb960f45db08fb61f3 Binary files /dev/null and b/code/recognization/__pycache__/dataset.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/densenet.cpython-310.pyc b/code/recognization/__pycache__/densenet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b64ced255432ff308391e8278347b4ac756f1a13 Binary files /dev/null and b/code/recognization/__pycache__/densenet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/dropout_layer.cpython-310.pyc b/code/recognization/__pycache__/dropout_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06de6d5dd2521a6ee62d0b1f62942faa3cb56151 Binary files /dev/null and b/code/recognization/__pycache__/dropout_layer.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/feature_extraction.cpython-310.pyc b/code/recognization/__pycache__/feature_extraction.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bff21b39a7cd312e5500faaf2fe613f8a422b047 Binary files /dev/null and b/code/recognization/__pycache__/feature_extraction.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/hrnet.cpython-310.pyc 
b/code/recognization/__pycache__/hrnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d810caeae28a3aaf61228e63000553d30d2e60 Binary files /dev/null and b/code/recognization/__pycache__/hrnet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/inception_unet.cpython-310.pyc b/code/recognization/__pycache__/inception_unet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4260c91d50a105719d1c1a5bc77ced6d0e4ec4ed Binary files /dev/null and b/code/recognization/__pycache__/inception_unet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/model.cpython-310.pyc b/code/recognization/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2948126c22ecc33a5b5ff8a8bbfece283a84439 Binary files /dev/null and b/code/recognization/__pycache__/model.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/my_test.cpython-310.pyc b/code/recognization/__pycache__/my_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5324d9948252e22e29951d83a38d622aef6eae9c Binary files /dev/null and b/code/recognization/__pycache__/my_test.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/prediction.cpython-310.pyc b/code/recognization/__pycache__/prediction.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94d0400a3600559692f49771537aaeb74df0dcd1 Binary files /dev/null and b/code/recognization/__pycache__/prediction.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/rcnn.cpython-310.pyc b/code/recognization/__pycache__/rcnn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24329e2201dcd529e41d36acc4afe4fcbee0f1a7 Binary files /dev/null and b/code/recognization/__pycache__/rcnn.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/recognization.cpython-310.pyc 
b/code/recognization/__pycache__/recognization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..911d8be4425b964b232625833150e3fe3b924169 Binary files /dev/null and b/code/recognization/__pycache__/recognization.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/resnet.cpython-310.pyc b/code/recognization/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bad720be92ac510c5a2cb834409f9398c869c5f7 Binary files /dev/null and b/code/recognization/__pycache__/resnet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/resunet.cpython-310.pyc b/code/recognization/__pycache__/resunet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2308ff5dc0319659b2d978c8e1e19ffe98266b0b Binary files /dev/null and b/code/recognization/__pycache__/resunet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/sequence_modeling.cpython-310.pyc b/code/recognization/__pycache__/sequence_modeling.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..622e1a604b6aabede55f16a0ab808c6a9be611d6 Binary files /dev/null and b/code/recognization/__pycache__/sequence_modeling.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/unet.cpython-310.pyc b/code/recognization/__pycache__/unet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45a2fa5dec862f00a7b5fa09052449927f62dee0 Binary files /dev/null and b/code/recognization/__pycache__/unet.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/unet_attn.cpython-310.pyc b/code/recognization/__pycache__/unet_attn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..885d45ee8912671290f7655fcb6f2d22b890c33d Binary files /dev/null and b/code/recognization/__pycache__/unet_attn.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/unet_plus_plus.cpython-310.pyc 
b/code/recognization/__pycache__/unet_plus_plus.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc24f330fa39c125bac66063a5e30d801dbefa6b Binary files /dev/null and b/code/recognization/__pycache__/unet_plus_plus.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/utils.cpython-310.pyc b/code/recognization/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35ca1ebdccf26eb530b09f661968cf3394ebc890 Binary files /dev/null and b/code/recognization/__pycache__/utils.cpython-310.pyc differ diff --git a/code/recognization/__pycache__/vgg.cpython-310.pyc b/code/recognization/__pycache__/vgg.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..746508d9b03175d5be81a9ac9c4b9cc135a67d7c Binary files /dev/null and b/code/recognization/__pycache__/vgg.cpython-310.pyc differ diff --git a/code/recognization/augmentation.py b/code/recognization/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..2c75388bac3c3f27562527d5a1e6024aae19c755 --- /dev/null +++ b/code/recognization/augmentation.py @@ -0,0 +1,134 @@ +""" +Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 +Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora +GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition +Project Website: https://abdur75648.github.io/UTRNet/ +Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial +4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) +""" + +from functools import partial +import random as rnd +import imgaug.augmenters as iaa +import numpy as np +from PIL import ImageFilter, Image +from timm.data import auto_augment + +_OP_CACHE = {} + +def _get_op(key, factory): + try: + op = _OP_CACHE[key] + except KeyError: + op = factory() + _OP_CACHE[key] = op + return op + + 
+def _get_param(level, img, max_dim_factor, min_level=1): + max_level = max(min_level, max_dim_factor * max(img.size)) + return round(min(level, max_level)) + +def gaussian_blur(img, radius, **__): + radius = _get_param(radius, img, 0.02) + key = 'gaussian_blur_' + str(radius) + op = _get_op(key, lambda: ImageFilter.GaussianBlur(radius)) + return img.filter(op) + +def motion_blur(img, k, **__): + k = _get_param(k, img, 0.08, 3) | 1 # bin to odd values + key = 'motion_blur_' + str(k) + op = _get_op(key, lambda: iaa.MotionBlur(k)) + return Image.fromarray(op(image=np.asarray(img))) + +def gaussian_noise(img, scale, **_): + scale = _get_param(scale, img, 0.25) | 1 # bin to odd values + key = 'gaussian_noise_' + str(scale) + op = _get_op(key, lambda: iaa.AdditiveGaussianNoise(scale=scale)) + return Image.fromarray(op(image=np.asarray(img))) + +def poisson_noise(img, lam, **_): + lam = _get_param(lam, img, 0.2) | 1 # bin to odd values + key = 'poisson_noise_' + str(lam) + op = _get_op(key, lambda: iaa.AdditivePoissonNoise(lam)) + return Image.fromarray(op(image=np.asarray(img))) + +def salt_and_pepper_noise(image, prob=0.05): + if prob <= 0: + return image + arr = np.asarray(image) + original_dtype = arr.dtype + intensity_levels = 2 ** (arr[0, 0].nbytes * 8) + min_intensity = 0 + max_intensity = intensity_levels - 1 + random_image_arr = np.random.choice([min_intensity, 1, np.nan], p=[prob / 2, 1 - prob, prob / 2], size=arr.shape) + salt_and_peppered_arr = arr.astype(np.float) * random_image_arr + salt_and_peppered_arr = np.nan_to_num(salt_and_peppered_arr, nan=max_intensity).astype(original_dtype) + return Image.fromarray(salt_and_peppered_arr) + +def random_border_crop(image): + img_width,img_height = image.size + crop_left = int(img_width * rnd.uniform(0.0, 0.025)) + crop_top = int(img_height * rnd.uniform(0.0, 0.075)) + crop_right = int(img_width * rnd.uniform(0.975, 1.0)) + crop_bottom = int(img_height * rnd.uniform(0.925, 1.0)) + final_image = 
image.crop((crop_left, crop_top, crop_right, crop_bottom)) + return final_image + +def random_resize(image): + size = image.size + new_size = [rnd.randint(int(0.5*size[0]), int(1.5*size[0])), rnd.randint(int(0.5*size[1]), int(1.5*size[1]))] + reduce_factor = rnd.randint(1,4) + new_size = tuple([int(x/reduce_factor) for x in new_size]) + final_image = image.resize(new_size) + return final_image + +def _level_to_arg(level, _hparams, max): + level = max * level / auto_augment._LEVEL_DENOM + return level, + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + # 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', +] +#_RAND_TRANSFORMS.remove('SharpnessIncreasing') # remove, interferes with *blur ops +_RAND_TRANSFORMS.extend([ + 'GaussianBlur', + 'GaussianNoise', + 'PoissonNoise' +]) +auto_augment.LEVEL_TO_ARG.update({ + 'GaussianBlur': partial(_level_to_arg, max=4), + 'MotionBlur': partial(_level_to_arg, max=20), + 'GaussianNoise': partial(_level_to_arg, max=0.1 * 255), + 'PoissonNoise': partial(_level_to_arg, max=40) +}) +auto_augment.NAME_TO_OP.update({ + 'GaussianBlur': gaussian_blur, + 'MotionBlur': motion_blur, + 'GaussianNoise': gaussian_noise, + 'PoissonNoise': poisson_noise +}) + +def rand_augment_transform(magnitude=5, num_layers=3): + # These are tuned for magnitude=5, which means that effective magnitudes are half of these values. + hparams = { + 'img_mean':128, + # 'rotate_deg': 5, + 'shear_x_pct': 0.9, + 'shear_y_pct': 0.0, + } + ra_ops = auto_augment.rand_augment_ops(magnitude, hparams, transforms=_RAND_TRANSFORMS) + # Supply weights to disable replacement in random selection (i.e. avoid applying the same op twice) + choice_weights = [1. 
/ len(ra_ops) for _ in range(len(ra_ops))] + return auto_augment.RandAugment(ra_ops, num_layers, choice_weights) diff --git a/code/recognization/config.py b/code/recognization/config.py new file mode 100644 index 0000000000000000000000000000000000000000..6cdbbf624809a4dd07ab458554cf7691555ec32b --- /dev/null +++ b/code/recognization/config.py @@ -0,0 +1,30 @@ +class Config: + FeatureExtraction = 'HRNet' # or any other feature extraction method + SequenceModeling = 'DBiLSTM' # or any other sequential model + Prediction = 'CTC' # or 'Attn' + input_channel = 1 # e.g., RGB image has 3 channels + output_channel = 32 # Adjust based on your architecture + hidden_size = 256 # Adjust based on your architecture + num_class = 182 # Number of output classes + device = 'cpu' # or 'cuda' for GPU + batch_max_length = 8 # Maximum sequence length for prediction + # Adam optimizer + adam = False + lr = 0.1 + batch_size = 4 + beta1 = 0.9 + workers = 4 + num_epochs = 5 + rho = 0.95 + eps = 1e-8 + + imgH = 32 + imgW = 400 + train_data = 'result/train/' # path to train data + valid_data = 'result/validate/' # path to validation data + saved_model = 'model/' + + character ='' + rgb = False + grad_clip = 5 + diff --git a/code/recognization/custom_test.py b/code/recognization/custom_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fa03e4aa5b8c79b293f90e27f43240dbaa483858 --- /dev/null +++ b/code/recognization/custom_test.py @@ -0,0 +1,235 @@ +""" +Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 +Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora +GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition +Project Website: https://abdur75648.github.io/UTRNet/ +Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial +4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) +""" + +import os,shutil +import time 
+import argparse +import random +import numpy as np +import matplotlib.pyplot as plt +from datetime import datetime +import pytz + +import torch +import torch.utils.data +import torch.nn.functional as F +from tqdm import tqdm +from nltk.metrics.distance import edit_distance + +from utils import CTCLabelConverter, AttnLabelConverter, Averager, Logger +from dataset import hierarchical_dataset, AlignCollate +from model import Model + +def validation(model, criterion, evaluation_loader, converter, opt, device): + """ validation or evaluation """ + eval_arr = [] + sum_len_gt = 0 + + n_correct = 0 + + norm_ED = 0 + length_of_data = 0 + infer_time = 0 + valid_loss_avg = Averager() + + for i, (image_tensors, labels) in enumerate(tqdm(evaluation_loader)): + batch_size = image_tensors.size(0) + length_of_data = length_of_data + batch_size + image = image_tensors.to(device) + # For max length prediction + length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) + text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) + + text_for_loss, length_for_loss = converter.encode(labels, batch_max_length=opt.batch_max_length) + + start_time = time.time() + if 'CTC' in opt.Prediction: + preds = model(image) + forward_time = time.time() - start_time + preds_size = torch.IntTensor([preds.size(1)] * batch_size) + cost = criterion(preds.log_softmax(2).permute(1, 0, 2), text_for_loss, preds_size, length_for_loss) + _, preds_index = preds.max(2) + preds_str = converter.decode(preds_index.data, preds_size.data) + else: + preds = model(image, text=text_for_pred, is_train=False) + forward_time = time.time() - start_time + + preds = preds[:, :text_for_loss.shape[1] - 1, :].to(device) + target = text_for_loss[:, 1:].to(device) # without [GO] Symbol + cost = criterion(preds.contiguous().view(-1, preds.shape[-1]), target.contiguous().view(-1)) + _, preds_index = preds.max(2) + preds_str = converter.decode(preds_index, length_for_pred) + 
labels = converter.decode(text_for_loss[:, 1:], length_for_loss) + + infer_time += forward_time + valid_loss_avg.add(cost) + + # calculate accuracy & confidence score + preds_prob = F.softmax(preds, dim=2) + preds_max_prob, _ = preds_prob.max(dim=2) + confidence_score_list = [] + for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob): + if 'Attn' in opt.Prediction: + gt = gt[:gt.find('[s]')] + pred_EOS = pred.find('[s]') + pred = pred[:pred_EOS] # prune after "end of sentence" token ([s]) + pred_max_prob = pred_max_prob[:pred_EOS] + + if pred == gt: + n_correct += 1 + + # ICDAR2019 Normalized Edit Distance + if len(gt) == 0 or len(pred) == 0: + ED = 0 + elif len(gt) > len(pred): + ED = 1 - edit_distance(pred, gt) / len(gt) + else: + ED = 1 - edit_distance(pred, gt) / len(pred) + + eval_arr.append([gt,pred,ED]) + + sum_len_gt += len(gt) + norm_ED += (ED*len(gt)) + + # calculate confidence score (= multiply of pred_max_prob) + try: + confidence_score = pred_max_prob.cumprod(dim=0)[-1] + except: + confidence_score = 0 # for empty pred case, when prune after "end of sentence" token ([s]) + confidence_score_list.append(confidence_score) + # print(pred, gt, pred==gt, confidence_score) + + accuracy = n_correct / float(length_of_data) * 100 + norm_ED = norm_ED / float(sum_len_gt) + + return valid_loss_avg.val(), accuracy, norm_ED, eval_arr + + +def test(opt, device): + opt.device = device + os.makedirs("test_outputs", exist_ok=True) + datetime_now = str(datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H-%M-%S")) + logger = Logger(f'test_outputs/{datetime_now}.txt') + """ model configuration """ + if 'CTC' in opt.Prediction: + converter = CTCLabelConverter(opt.character) + else: + converter = AttnLabelConverter(opt.character) + opt.num_class = len(converter.character) + + if opt.rgb: + opt.input_channel = 3 + model = Model(opt) + logger.log('model input parameters', opt.imgH, opt.imgW, opt.input_channel, opt.output_channel, + opt.hidden_size, 
opt.num_class, opt.batch_max_length, opt.FeatureExtraction, + opt.SequenceModeling, opt.Prediction) + model = model.to(device) + + # load model + model.load_state_dict(torch.load(opt.saved_model, map_location=device)) + logger.log('Loaded pretrained model from %s' % opt.saved_model) + # logger.log(model) + + """ setup loss """ + if 'CTC' in opt.Prediction: + criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) + else: + criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 + + """ evaluation """ + model.eval() + with torch.no_grad(): + AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW)#, keep_ratio_with_pad=opt.PAD) + eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt, rand_aug=False) + logger.log(eval_data_log) + evaluation_loader = torch.utils.data.DataLoader( + eval_data, batch_size=opt.batch_size, + shuffle=False, + num_workers=int(opt.workers), + collate_fn=AlignCollate_evaluation, pin_memory=True) + _, accuracy, norm_ED, eval_arr = validation( model, criterion, evaluation_loader, converter, opt,device) + logger.log("="*20) + logger.log(f'Accuracy : {accuracy:0.4f}\n') + logger.log(f'Norm_ED : {norm_ED:0.4f}\n') + logger.log("="*20) + + if opt.visualize: + logger.log("Threshold - ", opt.threshold) + logger.log("ED","\t","gt","\t","pred") + arr = [] + for gt,pred,ED in eval_arr: + ED = ED*100.0 + arr.append(ED) + if ED<=(opt.threshold): + logger.log(ED,"\t",gt,"\t",pred) + plt.hist(arr, edgecolor="red") + plt.savefig('test_outputs/'+str(datetime_now)+".png") + plt.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--visualize', action='store_true', help='for visualization of bad samples') + parser.add_argument('--threshold', type=float, help='Save samples below this threshold in txt file', default=50.0) + parser.add_argument('--eval_data', required=True, help='path to evaluation dataset') + 
parser.add_argument('--workers', type=int, help='number of data loading workers', default=4) + parser.add_argument('--batch_size', type=int, default=32, help='input batch size') + parser.add_argument('--saved_model', required=True, help="path to saved_model to evaluation") + """ Data processing """ + parser.add_argument('--batch_max_length', type=int, default=100, help='maximum-label-length') + parser.add_argument('--imgH', type=int, default=32, help='the height of the input image') + parser.add_argument('--imgW', type=int, default=400, help='the width of the input image') + parser.add_argument('--rgb', action='store_true', help='use rgb input') + """ Model Architecture """ + parser.add_argument('--FeatureExtraction', type=str, default="HRNet", #required=True, + help='FeatureExtraction stage VGG|RCNN|ResNet|UNet|HRNet|Densenet|InceptionUnet|ResUnet|AttnUNet|UNet|VGG') + parser.add_argument('--SequenceModeling', type=str, default="DBiLSTM", #required=True, + help='SequenceModeling stage LSTM|GRU|MDLSTM|BiLSTM|DBiLSTM') + parser.add_argument('--Prediction', type=str, default="CTC", #required=True, + help='Prediction stage CTC|Attn') + parser.add_argument('--input_channel', type=int, default=1, help='the number of input channel of Feature extractor') + parser.add_argument('--output_channel', type=int, default=512, help='the number of output channel of Feature extractor') + parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state') + """ GPU Selection """ + parser.add_argument('--device_id', type=str, default=None, help='cuda device ID') + + opt = parser.parse_args() + if opt.FeatureExtraction == "HRNet": + opt.output_channel = 32 + + # Fix random seeds for both numpy and pytorch + seed = 1111 + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + """ vocab / character number configuration """ + 
file = open("UrduGlyphs.txt","r",encoding="utf-8") + content = file.readlines() + content = ''.join([str(elem).strip('\n') for elem in content]) + opt.character = content+" " + + cuda_str = 'cuda' + if opt.device_id is not None: + cuda_str = f'cuda:{opt.device_id}' + device = torch.device(cuda_str if torch.cuda.is_available() else 'cpu') + print("Device : ", device) + + # opt.eval_data = "/DATA/parseq/val/" + # test(opt, device) + + # opt.eval_data = "/DATA/parseq/IIITH/lmdb_new/" + # test(opt, device) + + # opt.eval_data = "/DATA/public_datasets/UPTI/valid/" + # test(opt, device) + + test(opt, device) \ No newline at end of file diff --git a/code/recognization/data/1.png b/code/recognization/data/1.png new file mode 100644 index 0000000000000000000000000000000000000000..f94ddbc7af79bd6d47d29193dd9bb762446ece59 Binary files /dev/null and b/code/recognization/data/1.png differ diff --git a/code/recognization/data/10.png b/code/recognization/data/10.png new file mode 100644 index 0000000000000000000000000000000000000000..a650218fbf7cdf2f519e6cdffe93abe7fc4c6367 Binary files /dev/null and b/code/recognization/data/10.png differ diff --git a/code/recognization/data/11.png b/code/recognization/data/11.png new file mode 100644 index 0000000000000000000000000000000000000000..11e1b8ba39559b94b79e9c9dbf3664c55a770f4a Binary files /dev/null and b/code/recognization/data/11.png differ diff --git a/code/recognization/data/12.png b/code/recognization/data/12.png new file mode 100644 index 0000000000000000000000000000000000000000..ef9831effbd724280bce703ae3a831a10bc33dff Binary files /dev/null and b/code/recognization/data/12.png differ diff --git a/code/recognization/data/13.png b/code/recognization/data/13.png new file mode 100644 index 0000000000000000000000000000000000000000..2f172918a33b6f2c066ef50114a8331cbc3ae7aa Binary files /dev/null and b/code/recognization/data/13.png differ diff --git a/code/recognization/data/14.png b/code/recognization/data/14.png new file mode 
100644 index 0000000000000000000000000000000000000000..de999a280c19d9e3232f18723c50264ab26b941f Binary files /dev/null and b/code/recognization/data/14.png differ diff --git a/code/recognization/data/15.png b/code/recognization/data/15.png new file mode 100644 index 0000000000000000000000000000000000000000..d017a9f04b9c16c96c6d1fff4126dc3f53decd09 Binary files /dev/null and b/code/recognization/data/15.png differ diff --git a/code/recognization/data/16.png b/code/recognization/data/16.png new file mode 100644 index 0000000000000000000000000000000000000000..dbe64cbab6ebe6062ec0df625b04ab274e0136ea Binary files /dev/null and b/code/recognization/data/16.png differ diff --git a/code/recognization/data/2.png b/code/recognization/data/2.png new file mode 100644 index 0000000000000000000000000000000000000000..0c8360bc48b14d88ac08931e8fee4571feb68df7 Binary files /dev/null and b/code/recognization/data/2.png differ diff --git a/code/recognization/data/2_1.png b/code/recognization/data/2_1.png new file mode 100644 index 0000000000000000000000000000000000000000..65934d3b52b2c9846187c0f546574cec4757b91d Binary files /dev/null and b/code/recognization/data/2_1.png differ diff --git a/code/recognization/data/2_2.png b/code/recognization/data/2_2.png new file mode 100644 index 0000000000000000000000000000000000000000..cd65ca3d9a89d365e62b0074e9277e2994fb3450 Binary files /dev/null and b/code/recognization/data/2_2.png differ diff --git a/code/recognization/data/3.png b/code/recognization/data/3.png new file mode 100644 index 0000000000000000000000000000000000000000..79937c25582d171d3f2d434df38fa0ed7e454b64 Binary files /dev/null and b/code/recognization/data/3.png differ diff --git a/code/recognization/data/4.png b/code/recognization/data/4.png new file mode 100644 index 0000000000000000000000000000000000000000..8bfa8d1e3ee9c2c2e761a1138374423d2a14de8c Binary files /dev/null and b/code/recognization/data/4.png differ diff --git a/code/recognization/data/5.png 
b/code/recognization/data/5.png new file mode 100644 index 0000000000000000000000000000000000000000..8d9c702b8df2dceab649256d1bb8d8898b04e57d Binary files /dev/null and b/code/recognization/data/5.png differ diff --git a/code/recognization/data/6.png b/code/recognization/data/6.png new file mode 100644 index 0000000000000000000000000000000000000000..4bf454ecbd45de060de29f0e1e7fda49d124dfc1 Binary files /dev/null and b/code/recognization/data/6.png differ diff --git a/code/recognization/data/7.png b/code/recognization/data/7.png new file mode 100644 index 0000000000000000000000000000000000000000..bc2067af157757b83871d257cafb7b04435c55cc Binary files /dev/null and b/code/recognization/data/7.png differ diff --git a/code/recognization/data/8.png b/code/recognization/data/8.png new file mode 100644 index 0000000000000000000000000000000000000000..baae41841253de77caee0298aa19f37e75311e37 Binary files /dev/null and b/code/recognization/data/8.png differ diff --git a/code/recognization/data/9.png b/code/recognization/data/9.png new file mode 100644 index 0000000000000000000000000000000000000000..b494b962205b7818169e8b107b5bfa68d3e6c701 Binary files /dev/null and b/code/recognization/data/9.png differ diff --git a/code/recognization/dataset.py b/code/recognization/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5a81b10c8b207fe5916c7c8b447e72cf6f5bab42 --- /dev/null +++ b/code/recognization/dataset.py @@ -0,0 +1,319 @@ +""" +Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 +Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora +GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition +Project Website: https://abdur75648.github.io/UTRNet/ +Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial +4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) +""" + +import os +import sys +import re +import 
six +import math +import lmdb +import torch +import random +import numpy as np +from PIL import Image +import torchvision.transforms as T +#from torch._utils import _accumulate +from torch.utils.data import Dataset, ConcatDataset, Subset + +def accumulate(iterable): + total = 0 + for value in iterable: + total += value + yield total + + +class Batch_Balanced_Dataset(object): + + def __init__(self, opt, rand_aug = False): + """ + Modulate the data ratio in the batch. + For example, when select_data is "MJ-ST" and batch_ratio is "0.5-0.5", + the 50% of the batch is filled with MJ and the other 50% of the batch is filled with ST. + """ + log = open(f'./saved_models/{opt.exp_name}/log_dataset.txt', 'a',encoding="utf-8") + dashed_line = '-' * 80 + print(dashed_line) + log.write(dashed_line + '\n') + print(f'dataset_root: {opt.train_data}\nopt.select_data: {opt.select_data}\nopt.batch_ratio: {opt.batch_ratio}') + log.write(f'dataset_root: {opt.train_data}\nopt.select_data: {opt.select_data}\nopt.batch_ratio: {opt.batch_ratio}\n') + assert len(opt.select_data) == len(opt.batch_ratio) + + _AlignCollate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) + self.data_loader_list = [] + self.dataloader_iter_list = [] + batch_size_list = [] + Total_batch_size = 0 + for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio): + _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1) + print(dashed_line) + log.write(dashed_line + '\n') + _dataset, _dataset_log = hierarchical_dataset(root=opt.train_data, opt=opt, select_data=[selected_d], rand_aug=rand_aug) + total_number_dataset = len(_dataset) + log.write(_dataset_log) + + """ + The total number of data can be modified with opt.total_data_usage_ratio. + ex) opt.total_data_usage_ratio = 1 indicates 100% usage, and 0.2 indicates 20% usage. + See 4.2 section in our paper. 
+ """ + number_dataset = int(total_number_dataset * float(opt.total_data_usage_ratio)) + dataset_split = [number_dataset, total_number_dataset - number_dataset] + indices = range(total_number_dataset) + _dataset, _ = [Subset(_dataset, indices[offset - length:offset]) + for offset, length in zip(_accumulate(dataset_split), dataset_split)] + selected_d_log = f'num total samples of {selected_d}: {total_number_dataset} x {opt.total_data_usage_ratio} (total_data_usage_ratio) = {len(_dataset)}\n' + selected_d_log += f'num samples of {selected_d} per batch: {opt.batch_size} x {float(batch_ratio_d)} (batch_ratio) = {_batch_size}' + print(selected_d_log) + log.write(selected_d_log + '\n') + batch_size_list.append(str(_batch_size)) + Total_batch_size += _batch_size + + _data_loader = torch.utils.data.DataLoader( + _dataset, batch_size=_batch_size, + shuffle=True, + num_workers=0, + collate_fn=_AlignCollate, pin_memory=True) + self.data_loader_list.append(_data_loader) + self.dataloader_iter_list.append(iter(_data_loader)) + + Total_batch_size_log = f'{dashed_line}\n' + batch_size_sum = '+'.join(batch_size_list) + Total_batch_size_log += f'Total_batch_size: {batch_size_sum} = {Total_batch_size}\n' + Total_batch_size_log += f'{dashed_line}' + opt.batch_size = Total_batch_size + + print(Total_batch_size_log) + log.write(Total_batch_size_log + '\n') + log.close() + + def get_batch(self): + balanced_batch_images = [] + balanced_batch_texts = [] + + for i, data_loader_iter in enumerate(self.dataloader_iter_list): + try: + image, text = data_loader_iter.next() + balanced_batch_images.append(image) + balanced_batch_texts += text + except StopIteration: + self.dataloader_iter_list[i] = iter(self.data_loader_list[i]) + image, text = self.dataloader_iter_list[i].next() + balanced_batch_images.append(image) + balanced_batch_texts += text + except ValueError: + pass + + balanced_batch_images = torch.cat(balanced_batch_images, 0) + + return balanced_batch_images, balanced_batch_texts + + 
def hierarchical_dataset(root, opt, select_data='/', rand_aug=False):
    """Build a ConcatDataset from every LMDB leaf directory under *root*.

    select_data='/' matches all sub-directories of the root directory;
    otherwise only leaf directories whose path contains one of the given
    names are used. Returns (concatenated_dataset, log string).
    """
    dataset_list = []
    dataset_log = f'dataset_root: {root}\t dataset: {select_data[0]}'
    dataset_log += '\n'
    for dirpath, dirnames, filenames in os.walk(root + '/'):
        if not dirnames:  # leaf directory -> candidate LMDB environment
            select_flag = False
            for selected_d in select_data:
                if selected_d in dirpath:
                    select_flag = True
                    break

            if select_flag:
                dataset = LmdbDataset(dirpath, opt, rand_aug=rand_aug)
                sub_dataset_log = f'sub-directory:\t/{os.path.relpath(dirpath, root)}\t num samples: {len(dataset)}'
                dataset_log += f'{sub_dataset_log}\n'
                dataset_list.append(dataset)

    print(len(dataset_list))
    concatenated_dataset = ConcatDataset(dataset_list)

    return concatenated_dataset, dataset_log


class LmdbDataset(Dataset):
    """Dataset over an LMDB of ('image-%09d', 'label-%09d') pairs.

    Samples whose label is longer than opt.batch_max_length or contains a
    character outside opt.character are filtered out at construction time.
    """

    def __init__(self, root, opt, rand_aug=False, transform=None):
        self.root = root
        self.opt = opt
        self.rand_aug = rand_aug
        self.transform = transform
        self.env = lmdb.open(root, max_readers=32, readonly=True, lock=False, readahead=False, meminit=False)
        if not self.env:
            print('cannot create lmdb from %s' % (root))
            # BUG FIX: exit with a non-zero status on failure (0 means success).
            sys.exit(1)

        with self.env.begin(write=False) as txn:
            nSamples = int(txn.get('num-samples'.encode()))
            self.nSamples = nSamples
            self.filtered_index_list = []
            for index in range(self.nSamples):
                index += 1  # lmdb keys start at 1
                label_key = 'label-%09d'.encode() % index
                label = txn.get(label_key).decode('utf-8')

                if len(label) > self.opt.batch_max_length:
                    continue

                # By default, images containing characters which are not in
                # opt.character are filtered. An [UNK] token could be added to
                # `opt.character` in utils.py instead of this filtering.
                out_of_char = f'[^{self.opt.character}]'
                if re.search(out_of_char, label):
                    # BUG FIX: message typo ("dictionnary" -> "dictionary").
                    print("This string contains a character not part of our dictionary")
                    continue

                self.filtered_index_list.append(index)

            self.nSamples = len(self.filtered_index_list)
            if self.transform is None:
                self.transform = []
                if self.rand_aug:
                    from augmentation import rand_augment_transform, salt_and_pepper_noise, random_border_crop, random_resize
                    # NOTE(review): these random.random() draws happen once per
                    # dataset, not per sample - the augmentation set is frozen
                    # at construction time. Presumably intentional; confirm.
                    self.transform.append(rand_augment_transform())
                    self.transform.append(T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.25))
                    if random.random() < 0.25:
                        self.transform.append(lambda img: salt_and_pepper_noise(img))
                    if random.random() < 0.25:
                        self.transform.append(lambda img: random_border_crop(img))
                    self.transform.append(T.RandomRotation(5))
                    if random.random() < 0.25:
                        self.transform.append(lambda img: random_resize(img))
                self.transform = T.Compose(self.transform)

    def __len__(self):
        return self.nSamples

    def __getitem__(self, index):
        # BUG FIX: the original asserted `index <= len(self)`, which accepts
        # index == len(self) and would IndexError on the lookup below.
        assert index < len(self), 'index range error'
        index = self.filtered_index_list[index]

        with self.env.begin(write=False) as txn:
            label_key = 'label-%09d'.encode() % index
            label = txn.get(label_key).decode('utf-8')
            img_key = 'image-%09d'.encode() % index
            imgbuf = txn.get(img_key)

            buf = six.BytesIO()
            buf.write(imgbuf)
            buf.seek(0)
            try:
                if self.opt.rgb:
                    img1 = Image.open(buf).convert('RGB')  # for color image
                    # Horizontal flip: Urdu reads right-to-left; presumably the
                    # recognizer is trained on mirrored images - confirm.
                    img = img1.transpose(Image.FLIP_LEFT_RIGHT)
                else:
                    img1 = Image.open(buf).convert('L')
                    img = img1.transpose(Image.FLIP_LEFT_RIGHT)

            except IOError:
                print(f'Corrupted image for {index}')
                # Make a dummy image and dummy label for the corrupted entry.
                if self.opt.rgb:
                    img = Image.new('RGB', (self.opt.imgW, self.opt.imgH))
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                else:
                    img = Image.new('L', (self.opt.imgW, self.opt.imgH))
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                label = '[dummy_label]'

            # Strip any character outside the model's character set.
            out_of_char = f'[^{self.opt.character}]'
            label = re.sub(out_of_char, '', label)

            if self.transform:
                img = self.transform(img)

        return (img, label)


class ResizeNormalize(object):
    """Resize a PIL image to a fixed size and normalize it to [-1, 1]."""

    def __init__(self, size, interpolation=Image.BICUBIC):
        self.size = size
        self.interpolation = interpolation
        self.toTensor = T.ToTensor()

    def __call__(self, img):
        img = img.resize(self.size, self.interpolation)
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)  # map [0, 1] -> [-1, 1]
        return img


class NormalizePAD(object):
    """Normalize to [-1, 1] and right-pad to max_size = (C, H, W_max).

    The padding region repeats the last image column (border replication).
    """

    def __init__(self, max_size, PAD_type='right'):
        self.toTensor = T.ToTensor()
        self.max_size = max_size
        self.max_width_half = math.floor(max_size[2] / 2)
        self.PAD_type = PAD_type

    def __call__(self, img):
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        c, h, w = img.size()
        Pad_img = torch.FloatTensor(*self.max_size).fill_(0)
        Pad_img[:, :, :w] = img  # right pad
        if self.max_size[2] != w:  # add border Pad
            Pad_img[:, :, w:] = img[:, :, w - 1].unsqueeze(2).expand(c, h, self.max_size[2] - w)

        return Pad_img


class AlignCollate(object):
    """Collate PIL images into one tensor, either padded (keep aspect) or resized."""

    def __init__(self, imgH=32, imgW=100, keep_ratio_with_pad=True):
        self.imgH = imgH
        self.imgW = imgW
        self.keep_ratio_with_pad = keep_ratio_with_pad

    def __call__(self, batch):
        batch = filter(lambda x: x is not None, batch)
        images, labels = zip(*batch)

        if self.keep_ratio_with_pad:  # same concept with 'Rosetta' paper
            resized_max_w = self.imgW
            input_channel = 3 if images[0].mode == 'RGB' else 1
            transform = NormalizePAD((input_channel, self.imgH, resized_max_w))

            resized_images = []
            for image in images:
                w, h = image.size
                ratio = w / float(h)
                # Cap the width at imgW; otherwise scale to preserve aspect.
                if math.ceil(self.imgH * ratio) > self.imgW:
                    resized_w = self.imgW
                else:
                    resized_w = math.ceil(self.imgH * ratio)

                resized_image = image.resize((resized_w, self.imgH), Image.BICUBIC)
                resized_images.append(transform(resized_image))

            image_tensors = torch.cat([t.unsqueeze(0) for t in resized_images], 0)

        else:
            transform = ResizeNormalize((self.imgW, self.imgH))
            image_tensors = [transform(image) for image in images]
            image_tensors = torch.cat([t.unsqueeze(0) for t in image_tensors], 0)

        return image_tensors, labels


def tensor2im(image_tensor, imtype=np.uint8):
    """Convert a CHW tensor in [-1, 1] to an HWC uint8 numpy image."""
    image_numpy = image_tensor.cpu().float().numpy()
    if image_numpy.shape[0] == 1:
        # Grayscale -> 3 identical channels.
        image_numpy = np.tile(image_numpy, (3, 1, 1))
    image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0
    return image_numpy.astype(imtype)


def save_image(image_numpy, image_path):
    """Save an HWC numpy image to *image_path* via PIL."""
    image_pil = Image.fromarray(image_numpy)
    image_pil.save(image_path)
class Single_level_densenet(nn.Module):
    """One dense stage: num_conv 3x3 convs with additive dense skip connections.

    Channel count and spatial size are unchanged.
    """

    def __init__(self, filters, num_conv=4):
        super(Single_level_densenet, self).__init__()
        self.num_conv = num_conv
        self.conv_list = nn.ModuleList()
        self.bn_list = nn.ModuleList()
        for i in range(self.num_conv):
            self.conv_list.append(nn.Conv2d(filters, filters, 3, padding=1))
            self.bn_list.append(nn.BatchNorm2d(filters))

    def forward(self, x):
        outs = []
        outs.append(x)
        for i in range(self.num_conv):
            temp_out = self.conv_list[i](outs[i])
            if i > 0:
                # Dense connectivity: add the activations of all earlier layers.
                for j in range(i):
                    temp_out += outs[j]
            outs.append(F.relu(self.bn_list[i](temp_out)))
        out_final = outs[-1]
        del outs
        return out_final


class Down_sample(nn.Module):
    """2x max-pool; returns (pooled, pre-pool) so the encoder keeps skip features."""

    def __init__(self, kernel_size=2, stride=2):
        super(Down_sample, self).__init__()
        self.down_sample_layer = nn.MaxPool2d(kernel_size, stride)

    def forward(self, x):
        y = self.down_sample_layer(x)
        return y, x


class Upsample_n_Concat(nn.Module):
    """Transposed-conv 2x upsample, concat with the skip tensor, then 3x3 conv."""

    def __init__(self, filters):
        super(Upsample_n_Concat, self).__init__()
        self.upsample_layer = nn.ConvTranspose2d(filters, filters, 4, padding=1, stride=2)
        self.conv = nn.Conv2d(2 * filters, filters, 3, padding=1)
        self.bn = nn.BatchNorm2d(filters)

    def forward(self, x, y):
        x = self.upsample_layer(x)
        x = torch.cat([x, y], dim=1)
        x = F.relu(self.bn(self.conv(x)))
        return x


class DenseNet(nn.Module):
    """Dense-UNet: 4-level encoder/decoder of dense blocks with skip connections.

    Input (N, in_chan, H, W) -> output (N, out_chan, H, W); H and W must be
    divisible by 16 (four 2x poolings).
    """

    def __init__(self, in_chan=1, out_chan=512, filters=256, num_conv=4):
        super(DenseNet, self).__init__()
        self.conv1 = nn.Conv2d(in_chan, filters, 1)
        self.d1 = Single_level_densenet(filters, num_conv)
        self.down1 = Down_sample()
        self.d2 = Single_level_densenet(filters, num_conv)
        self.down2 = Down_sample()
        self.d3 = Single_level_densenet(filters, num_conv)
        self.down3 = Down_sample()
        self.d4 = Single_level_densenet(filters, num_conv)
        self.down4 = Down_sample()
        self.bottom = Single_level_densenet(filters, num_conv)
        self.up4 = Upsample_n_Concat(filters)
        self.u4 = Single_level_densenet(filters, num_conv)
        self.up3 = Upsample_n_Concat(filters)
        self.u3 = Single_level_densenet(filters, num_conv)
        self.up2 = Upsample_n_Concat(filters)
        self.u2 = Single_level_densenet(filters, num_conv)
        self.up1 = Upsample_n_Concat(filters)
        self.u1 = Single_level_densenet(filters, num_conv)
        self.outconv = nn.Conv2d(filters, out_chan, 1)

    def forward(self, x):
        x = self.conv1(x)
        x, y1 = self.down1(self.d1(x))
        # BUG FIX: the original reused self.down1 for all four stages, leaving
        # down2/down3/down4 registered but unused. Behavior is unchanged here
        # because Down_sample is a stateless max-pool, but using the dedicated
        # modules keeps the graph consistent with the declared architecture.
        x, y2 = self.down2(self.d2(x))
        x, y3 = self.down3(self.d3(x))
        x, y4 = self.down4(self.d4(x))
        x = self.bottom(x)
        x = self.u4(self.up4(x, y4))
        x = self.u3(self.up3(x, y3))
        x = self.u2(self.up2(x, y2))
        x = self.u1(self.up1(x, y1))
        x1 = self.outconv(x)
        return x1
class dropout_layer(nn.Module):
    """Channel dropout: zeroes each channel with probability 0.2.

    Surviving channels are NOT rescaled by 1/keep_prob, and the mask is drawn
    with numpy, so torch.manual_seed does not control it.
    """

    def __init__(self, device):
        super(dropout_layer, self).__init__()
        self.device = device

    def forward(self, input):
        # One Bernoulli(keep=0.8) draw per channel (dim 1) of an (N, C, W) input.
        keep = (np.random.rand(input.shape[1]) > 0.2).astype(int)
        mask = torch.from_numpy(keep).to(self.device)
        # Column vector (C, 1), then tiled to the full (N, C, W) input shape.
        mask = torch.reshape(mask, (input.shape[1], 1)).to(self.device)
        mask = mask.repeat(input.shape[0], 1, input.shape[2]).to(self.device)
        # Element-wise multiply: dropped channels become all zeros.
        return input * mask


class DenseNet_FeatureExtractor(nn.Module):
    """Expose DenseNet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(DenseNet_FeatureExtractor, self).__init__()
        self.ConvNet = DenseNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class HRNet_FeatureExtractor(nn.Module):
    """Expose HRNet through the common feature-extractor interface.

    Note the smaller default output_channel (32) compared to the other backbones.
    """

    def __init__(self, input_channel=1, output_channel=32):
        super(HRNet_FeatureExtractor, self).__init__()
        self.ConvNet = HRNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class InceptionUNet_FeatureExtractor(nn.Module):
    """Expose InceptionUNet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(InceptionUNet_FeatureExtractor, self).__init__()
        self.ConvNet = InceptionUNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class RCNN_FeatureExtractor(nn.Module):
    """Expose RCNN through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(RCNN_FeatureExtractor, self).__init__()
        self.ConvNet = RCNN(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class ResNet_FeatureExtractor(nn.Module):
    """Expose ResNet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(ResNet_FeatureExtractor, self).__init__()
        self.ConvNet = ResNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class ResUnet_FeatureExtractor(nn.Module):
    """Expose ResUnet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(ResUnet_FeatureExtractor, self).__init__()
        self.ConvNet = ResUnet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class AttnUNet_FeatureExtractor(nn.Module):
    """Expose AttnUNet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(AttnUNet_FeatureExtractor, self).__init__()
        self.ConvNet = AttnUNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class UNet_FeatureExtractor(nn.Module):
    """Expose UNet through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(UNet_FeatureExtractor, self).__init__()
        self.ConvNet = UNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


class UNetPlusPlus_FeatureExtractor(nn.Module):
    """Expose NestedUNet (UNet++) through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(UNetPlusPlus_FeatureExtractor, self).__init__()
        self.ConvNet = NestedUNet(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)
class VGG_FeatureExtractor(nn.Module):
    """Expose VGG through the common feature-extractor interface."""

    def __init__(self, input_channel=1, output_channel=512):
        super(VGG_FeatureExtractor, self).__init__()
        self.ConvNet = VGG(input_channel, output_channel)

    def forward(self, input):
        return self.ConvNet(input)


# ---------------------------------------------------------------------------
# HRNet (https://arxiv.org/pdf/1908.07919.pdf), adapted from
# https://github.com/shuuchen/HRNet/blob/master/hrnet.py
# ---------------------------------------------------------------------------

BN_MOMENTUM = 0.1


class Conv(nn.Module):
    """conv -> BN (-> optional ReLU); padding keeps the spatial size at stride 1."""

    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, relued=True):
        super(Conv, self).__init__()
        padding = (kernel_size - 1) // 2
        self.conv_bn = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size, stride, padding, bias=False),
            nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM))
        self.relu = nn.ReLU()
        self.relued = relued

    def forward(self, x):
        out = self.conv_bn(x)
        return self.relu(out) if self.relued else out


class BasicBlock(nn.Module):
    """Residual block of two 3x3 convs; shape and channel count unchanged."""

    def __init__(self, in_ch, out_ch):
        super(BasicBlock, self).__init__()
        self.conv = nn.Sequential(
            Conv(in_ch, out_ch),
            Conv(in_ch, out_ch, relued=False))
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.conv(x) + x)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual block expanding channels to 4 * out_ch."""

    expansion = 4

    def __init__(self, in_ch, out_ch, downsampling=None):
        super(Bottleneck, self).__init__()
        self.conv = nn.Sequential(
            Conv(in_ch, out_ch, kernel_size=1),
            Conv(out_ch, out_ch),
            Conv(out_ch, out_ch * self.expansion, kernel_size=1, relued=False))
        self.relu = nn.ReLU()
        self.downsampling = downsampling

    def forward(self, x):
        out = self.conv(x)
        # Project the shortcut when the channel counts do not match.
        shortcut = self.downsampling(x) if self.downsampling else x
        return self.relu(out + shortcut)


class UpSampling(nn.Module):
    """Bilinear upsample by up_factor; channels divided by up_factor via 1x1 conv."""

    def __init__(self, ch, up_factor):
        super(UpSampling, self).__init__()
        self.up_sampling = nn.Sequential(
            nn.Upsample(scale_factor=up_factor, mode='bilinear', align_corners=False),
            Conv(ch, ch // up_factor, 1, relued=False))

    def forward(self, x):
        return self.up_sampling(x)


class DownSampling(nn.Module):
    """Chain of stride-2 convs: each halves the spatial size and doubles channels."""

    def __init__(self, ch, num_samplings):
        super(DownSampling, self).__init__()
        stages = []
        for i in range(num_samplings):
            # Only the last conv of the chain skips its ReLU.
            stages.append(Conv(ch, ch * 2, 3, 2, relued=i < num_samplings - 1))
            ch *= 2
        self.down_sampling = nn.Sequential(*stages)

    def forward(self, x):
        return self.down_sampling(x)


class HRBlock(nn.Module):
    """One HRNet stage over `index` parallel streams (stream i has ch * 2**i channels).

    Each stream is refined by its own stack of `block`s, then all streams are
    cross-fused (bilinear upsampling for low->high, strided convs for
    high->low). Unless this is the last stage, an additional lower-resolution
    stream is spawned from the sum of all downsampled streams.
    """

    def __init__(self, ch, index, last_stage, block, num_conv_block_per_list=4):
        super(HRBlock, self).__init__()
        self.index = index
        self.last_stage = last_stage
        self.num_conv_block_per_list = num_conv_block_per_list
        self.relu = nn.ReLU()

        # Per-stream refinement stacks.
        self.parallel_conv_lists = nn.ModuleList()
        for i in range(index):
            ch_i = ch * 2 ** i
            self.parallel_conv_lists.append(nn.Sequential(
                *[block(ch_i, ch_i) for _ in range(num_conv_block_per_list)]))

        # up_conv_lists[i][k] brings stream i+1+k up to stream i's resolution.
        self.up_conv_lists = nn.ModuleList()
        for i in range(index - 1):
            ups = nn.ModuleList()
            for j in range(i + 1, index):
                ups.append(UpSampling(ch * 2 ** j, 2 ** (j - i)))
            self.up_conv_lists.append(ups)

        # down_conv_lists[i-1][j] brings stream j down to stream i's resolution;
        # the extra trailing entry (non-last stages) feeds the new stream.
        self.down_conv_lists = nn.ModuleList()
        for i in range(1, index if last_stage else index + 1):
            downs = nn.ModuleList()
            for j in range(i):
                downs.append(DownSampling(ch * 2 ** j, i - j))
            self.down_conv_lists.append(downs)

    def forward(self, x_list):
        # Refine every resolution stream independently.
        branch_outs = [self.parallel_conv_lists[i](x_list[i]) for i in range(self.index)]

        # Fuse streams; spawn one extra low-resolution stream except at the
        # last stage (where index is already the maximum, 4).
        num_outputs = self.index if self.last_stage else self.index + 1
        fused = []
        for i in range(num_outputs):
            if i == self.index:
                # Newly spawned stream: sum of every branch downsampled to it.
                acc = 0
                for t, m in zip(branch_outs, self.down_conv_lists[-1]):
                    acc = acc + m(t)
            else:
                acc = branch_outs[i]
                if i != self.index - 1:
                    # Fold in every lower-resolution branch, upsampled to stream i.
                    up_acc = 0
                    for t, m in zip(branch_outs[i + 1:], self.up_conv_lists[i]):
                        up_acc = up_acc + m(t)
                    acc = acc + up_acc
                if i != 0:
                    # Fold in every higher-resolution branch, downsampled to stream i.
                    down_acc = 0
                    for t, m in zip(branch_outs[:i], self.down_conv_lists[i - 1]):
                        down_acc = down_acc + m(t)
                    acc = acc + down_acc
            fused.append(self.relu(acc))
        return fused


class HRNet(nn.Module):
    """High-resolution backbone: out_ch feature maps at full input resolution.

    A stem (two 1x1 convs + a bottleneck stage) feeds num_stage
    multi-resolution stages; all streams are finally upsampled to full
    resolution, concatenated and projected to out_ch channels.
    """

    def __init__(self, in_ch=1, out_ch=32, mid_ch=64, num_stage=4):
        super(HRNet, self).__init__()
        self.init_conv = nn.Sequential(
            Conv(in_ch, 64, 1),
            Conv(64, 64, 1))
        # The concatenated streams carry mid_ch * (1 + 2 + 4 + 8) channels.
        self.head = nn.Sequential(
            Conv(mid_ch * (1 + 2 + 4 + 8), mid_ch * (1 + 2 + 4 + 8), 1),
            nn.Conv2d(mid_ch * (1 + 2 + 4 + 8), out_ch, 1))
        self.first_layer = self._make_layer(64, 64, Bottleneck, 4)
        self.first_transition = self._make_transition_layer(256, mid_ch, 1)
        self.num_stage = num_stage
        self.hr_blocks = nn.ModuleList()
        for i in range(1, num_stage):
            self.hr_blocks.append(HRBlock(mid_ch, i + 1, i == num_stage - 1, BasicBlock))

        self.up_samplings = nn.ModuleList()
        for i in range(num_stage - 1):
            self.up_samplings.append(
                nn.Upsample(scale_factor=2 ** (i + 1), mode='bilinear'))

        # Reference-implementation weight init.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.001)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, in_ch, ch, block, num):
        """Stack `num` blocks; the first gets a projection shortcut if needed."""
        downsampling = None
        if in_ch != ch * block.expansion:
            downsampling = Conv(in_ch, ch * block.expansion, 1, relued=False)
        layers = [block(in_ch, ch, downsampling)]
        for _ in range(1, num):
            layers.append(block(ch * block.expansion, ch))
        return nn.Sequential(*layers)

    def _make_transition_layer(self, in_ch, out_ch, stage):
        """Split the stem output into a full-resolution and a half-resolution stream.

        `stage` is unused but kept for interface compatibility with callers.
        """
        layers = nn.ModuleList()
        layers.append(Conv(in_ch, out_ch, 1))
        layers.append(Conv(in_ch, out_ch * 2, 3, 2))
        return layers

    def forward(self, x):
        x = self.init_conv(x)
        x = self.first_layer(x)
        streams = [m(x) for m in self.first_transition]
        for i in range(self.num_stage - 1):
            streams = self.hr_blocks[i](streams)

        # Upsample every lower-resolution stream to full resolution and fuse.
        res_list = [streams[0]]
        for t, m in zip(streams[1:], self.up_samplings):
            res_list.append(m(t))
        return self.head(torch.cat(res_list, dim=1))
class InceptionConv(nn.Module):
    """Inception-style downsampling block: four parallel branches, each
    starting with a 2x2 max-pool, whose outputs are concatenated on the
    channel axis (3x3 double conv, 5x5 double conv, 1x1 single conv,
    3x3 -> 1x1 double conv)."""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels

        def _unit(cin, cout, k):
            # conv => BN => ReLU, with "same" padding for odd kernel sizes
            return [nn.Conv2d(cin, cout, kernel_size=k, padding=k // 2),
                    nn.BatchNorm2d(cout),
                    nn.ReLU(inplace=True)]

        self.double_conv1 = nn.Sequential(nn.MaxPool2d(2),
                                          *_unit(in_channels, mid_channels, 3),
                                          *_unit(mid_channels, out_channels, 3))
        self.double_conv2 = nn.Sequential(nn.MaxPool2d(2),
                                          *_unit(in_channels, mid_channels, 5),
                                          *_unit(mid_channels, out_channels, 5))
        self.double_conv3 = nn.Sequential(nn.MaxPool2d(2),
                                          *_unit(in_channels, mid_channels, 1))
        self.double_conv4 = nn.Sequential(nn.MaxPool2d(2),
                                          *_unit(in_channels, mid_channels, 3),
                                          *_unit(mid_channels, out_channels, 1))

    def forward(self, x):
        branches = (self.double_conv1, self.double_conv2,
                    self.double_conv3, self.double_conv4)
        return torch.cat([branch(x) for branch in branches], 1)


class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) applied twice at constant resolution."""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """Encoder step: 2x2 max-pool followed by a DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels),
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class UpInception(nn.Module):
    """Decoder step with a triple input: upscale the decoder state (x1) and
    the inception-branch feature (x3), pad x1 to the skip connection (x2),
    concatenate [x3, x2, x1] and run a DoubleConv."""

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()
        if bilinear:
            # Bilinear upsampling; the following conv reduces channel count.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2, x3):
        x1 = self.up(x1)
        x3 = self.up(x3)
        # Input is CHW: pad x1 so its spatial size matches the skip x2.
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]
        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        return self.conv(torch.cat([x3, x2, x1], dim=1))


class OutConv(nn.Module):
    """Final 1x1 convolution mapping decoder features to output channels."""

    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)


class InceptionUNet(nn.Module):
    """U-Net with a parallel Inception side-chain.

    A standard U-Net encoder/decoder is augmented by a chain of InceptionConv
    blocks that repeatedly downsample x1; each decoder stage concatenates the
    upsampled decoder state, the encoder skip and the matching inception
    feature (see UpInception).
    """

    def __init__(self, n_channels=1, out_channels=512, bilinear=True):
        super(InceptionUNet, self).__init__()
        self.n_channels = n_channels
        self.out_channels = out_channels
        self.bilinear = bilinear

        # Inception side-chain (each block halves the resolution).
        self.block1 = InceptionConv(64, 32)
        self.block2 = InceptionConv(128, 64)
        self.block3 = InceptionConv(256, 128)
        self.block4 = InceptionConv(512, 128)

        # U-Net encoder.
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        factor = 2 if bilinear else 1
        self.down4 = Down(512, 1024 // factor)

        # Decoder; in_channels = decoder state + skip + inception branch.
        self.up1 = UpInception(1024 + 512, 256 // factor, bilinear)
        self.up2 = UpInception(896, 128 // factor, bilinear)
        self.up3 = UpInception(448, 32 // factor, bilinear)
        self.up4 = UpInception(208, 16, bilinear)
        self.outc = OutConv(16, out_channels)

    def forward(self, x):
        # Encoder path.
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)

        # Inception side-chain.
        b1 = self.block1(x1)
        b2 = self.block2(b1)
        b3 = self.block3(b2)
        b4 = self.block4(b3)

        # Decoder with triple-input skips.
        y = self.up1(x5, x4, b4)
        y = self.up2(y, x3, b3)
        y = self.up3(y, x2, b2)
        y = self.up4(y, x1, b1)
        return self.outc(y)
class Text_recognization_model(nn.Module):
    """Three-stage text recognizer.

    Stage 1 extracts visual features with a configurable CNN backbone,
    stage 2 models the horizontal (width) axis as a sequence — with temporal
    dropout, reused as Monte Carlo dropout at inference — and stage 3
    predicts characters with either a CTC linear head or an attention
    decoder. `opt` carries the whole configuration (stage names, channel and
    hidden sizes, device, num_class, batch_max_length).
    """

    def __init__(self, opt):
        super(Text_recognization_model, self).__init__()
        self.opt = opt
        self.stages = {'Feat': opt.FeatureExtraction,
                       'Seq': opt.SequenceModeling,
                       'Pred': opt.Prediction}

        """ FeatureExtraction """
        backbones = {
            'HRNet': HRNet_FeatureExtractor,  # maintains high-resolution feature maps
            'Densenet': DenseNet_FeatureExtractor,
            'InceptionUnet': InceptionUNet_FeatureExtractor,
            'RCNN': RCNN_FeatureExtractor,
            'ResNet': ResNet_FeatureExtractor,
            'ResUnet': ResUnet_FeatureExtractor,
            'AttnUNet': AttnUNet_FeatureExtractor,
            'UNet': UNet_FeatureExtractor,
            'UnetPlusPlus': UNetPlusPlus_FeatureExtractor,
            'VGG': VGG_FeatureExtractor,
        }
        if opt.FeatureExtraction not in backbones:
            raise Exception('No FeatureExtraction module specified')
        self.FeatureExtraction = backbones[opt.FeatureExtraction](opt.input_channel, opt.output_channel)
        self.FeatureExtraction_output = opt.output_channel
        # Collapse the feature-map height to 1 so width becomes the time axis.
        self.AdaptiveAvgPool = nn.AdaptiveAvgPool2d((None, 1))

        """ Temporal Dropout (five independent layers for MC-dropout averaging) """
        self.dropout1 = dropout_layer(opt.device)
        self.dropout2 = dropout_layer(opt.device)
        self.dropout3 = dropout_layer(opt.device)
        self.dropout4 = dropout_layer(opt.device)
        self.dropout5 = dropout_layer(opt.device)

        """ Sequence modeling """
        feat = self.FeatureExtraction_output
        hid = opt.hidden_size
        if opt.SequenceModeling == 'LSTM':
            self.SequenceModeling = LSTM(feat, hid, hid)
        elif opt.SequenceModeling == 'GRU':
            self.SequenceModeling = GRU(feat, hid, hid)
        elif opt.SequenceModeling == 'MDLSTM':
            self.SequenceModeling = MDLSTM(feat, hid, hid)
        elif opt.SequenceModeling == 'BiLSTM':
            self.SequenceModeling = BidirectionalLSTM(feat, hid, hid)
        elif opt.SequenceModeling == 'DBiLSTM':
            # Two stacked bidirectional LSTMs.
            self.SequenceModeling = nn.Sequential(
                BidirectionalLSTM(feat, hid, hid),
                BidirectionalLSTM(hid, hid, hid))
        else:
            raise Exception('No Sequence Modeling module specified')
        self.SequenceModeling_output = hid

        """ Prediction """
        if opt.Prediction == 'CTC':
            self.Prediction = nn.Linear(self.SequenceModeling_output, opt.num_class)
        elif opt.Prediction == 'Attn':
            self.Prediction = Attention(self.SequenceModeling_output, hid, opt.num_class, opt.device)
        else:
            raise Exception('Prediction is neither CTC or Attn')

    def forward(self, input, text=None, is_train=True):
        """Run the full pipeline; `text` is required only for the Attn head."""
        """ Feature extraction stage """
        visual_feature = self.FeatureExtraction(input)
        # [b, c, h, w] -> [b, w, c, h], then pool the height axis down to 1.
        visual_feature = self.AdaptiveAvgPool(visual_feature.permute(0, 3, 1, 2))
        visual_feature = visual_feature.squeeze(3)  # -> [b, w, c]

        """ Temporal Dropout + Sequence modeling stage """
        if self.training:
            contextual_feature = self.SequenceModeling(self.dropout1(visual_feature))
        else:
            # Inference: Monte Carlo dropout — average five stochastic passes.
            drops = (self.dropout1, self.dropout2, self.dropout3,
                     self.dropout4, self.dropout5)
            passes = [self.SequenceModeling(d(visual_feature)) for d in drops]
            # Same right-to-left summation order as the original nested adds.
            acc = passes[-1]
            for p in reversed(passes[:-1]):
                acc = p.add(acc)
            contextual_feature = acc * (1 / 5)

        """ Prediction stage """
        if self.stages['Pred'] == 'CTC':
            prediction = self.Prediction(contextual_feature.contiguous())
        else:
            if text is None:
                raise Exception('Input text (for prediction) to model is None')
            text = text.to(self.opt.device)
            prediction = self.Prediction(contextual_feature, text, is_train,
                                         batch_max_length=self.opt.batch_max_length)

        return prediction
def validation(model, criterion, evaluation_loader, converter, opt, device):
    """Evaluate `model` over `evaluation_loader`.

    Returns a 4-tuple: (average validation loss, word-level accuracy in
    percent, length-weighted ICDAR2019 normalized edit distance, eval_arr)
    where eval_arr is a list of [gt, pred, per-sample ED] rows.
    """
    eval_arr = []
    sum_len_gt = 0          # total ground-truth characters (ED weighting)
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(tqdm(evaluation_loader)):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # Max-length tensors used by the attention decoder during prediction.
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)

        text_for_loss, length_for_loss = converter.encode(labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image)
            forward_time = time.time() - start_time
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # CTC loss expects (T, N, C) log-probabilities.
            cost = criterion(preds.log_softmax(2).permute(1, 0, 2), text_for_loss, preds_size, length_for_loss)
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text=text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :].to(device)
            target = text_for_loss[:, 1:].to(device)  # without [GO] symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]), target.contiguous().view(-1))
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # Accuracy and confidence-score bookkeeping.
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:pred_EOS]  # prune after "end of sentence" token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]

            if pred == gt:
                n_correct += 1

            # ICDAR2019 normalized edit distance (denominator = longer string).
            if len(gt) == 0 or len(pred) == 0:
                ED = 0
            elif len(gt) > len(pred):
                ED = 1 - edit_distance(pred, gt) / len(gt)
            else:
                ED = 1 - edit_distance(pred, gt) / len(pred)

            eval_arr.append([gt, pred, ED])
            sum_len_gt += len(gt)
            norm_ED += (ED * len(gt))

            # Confidence score = product of per-step max probabilities.
            # Narrowed from a bare `except:`; an empty prediction (everything
            # pruned after [s]) only raises IndexError on the [-1] access.
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except IndexError:
                confidence_score = 0  # empty pred case
            confidence_score_list.append(confidence_score)

    # Guard both divisions: an empty loader or all-empty ground truth would
    # previously raise ZeroDivisionError.
    accuracy = (n_correct / float(length_of_data) * 100) if length_of_data else 0.0
    norm_ED = (norm_ED / float(sum_len_gt)) if sum_len_gt else 0.0

    return valid_loss_avg.val(), accuracy, norm_ED, eval_arr


def test(opt, device):
    """Load a saved recognizer, evaluate it on opt.eval_data, and log the
    accuracy / normalized edit distance (plus an optional ED histogram)."""
    opt.device = device
    os.makedirs("test_outputs", exist_ok=True)
    datetime_now = str(datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H-%M-%S"))
    logger = Logger(f'test_outputs/{datetime_now}.txt')

    """ model configuration """
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    # NOTE(review): `Model` is unresolved here — the `from model import Model`
    # line at the top of this file is commented out. Restore that import (and
    # confirm the class name model.py actually exports) before running this script.
    model = Model(opt)
    logger.log('model input parameters', opt.imgH, opt.imgW, opt.input_channel, opt.output_channel,
               opt.hidden_size, opt.num_class, opt.batch_max_length, opt.FeatureExtraction,
               opt.SequenceModeling, opt.Prediction)
    model = model.to(device)

    # Load trained weights.
    model.load_state_dict(torch.load(opt.saved_model, map_location=device))
    logger.log('Loaded pretrained model from %s' % opt.saved_model)

    """ setup loss """
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).to(device)
    else:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)  # ignore [GO] token = ignore index 0

    """ evaluation """
    model.eval()
    with torch.no_grad():
        AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW)  # , keep_ratio_with_pad=opt.PAD)
        eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt, rand_aug=False)
        logger.log(eval_data_log)
        evaluation_loader = torch.utils.data.DataLoader(
            eval_data, batch_size=opt.batch_size,
            shuffle=False,
            num_workers=int(opt.workers),
            collate_fn=AlignCollate_evaluation, pin_memory=True)
        _, accuracy, norm_ED, eval_arr = validation(model, criterion, evaluation_loader, converter, opt, device)
        logger.log("=" * 20)
        logger.log(f'Accuracy : {accuracy:0.4f}\n')
        logger.log(f'Norm_ED : {norm_ED:0.4f}\n')
        logger.log("=" * 20)

    if opt.visualize:
        # Log every sample whose ED (percent) is at or below the threshold,
        # then save a histogram of the per-sample EDs.
        logger.log("Threshold - ", opt.threshold)
        logger.log("ED", "\t", "gt", "\t", "pred")
        arr = []
        for gt, pred, ED in eval_arr:
            ED = ED * 100.0
            arr.append(ED)
            if ED <= (opt.threshold):
                logger.log(ED, "\t", gt, "\t", pred)
        plt.hist(arr, edgecolor="red")
        plt.savefig('test_outputs/' + str(datetime_now) + ".png")
        plt.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', action='store_true', help='for visualization of bad samples')
    parser.add_argument('--threshold', type=float, help='Save samples below this threshold in txt file', default=50.0)
    parser.add_argument('--eval_data', required=True, help='path to evaluation dataset')
    parser.add_argument('--workers', type=int, help='number of data loading workers', default=4)
    parser.add_argument('--batch_size', type=int, default=32, help='input batch size')
    parser.add_argument('--saved_model', required=True, help="path to saved_model to evaluation")
    """ Data processing """
    parser.add_argument('--batch_max_length', type=int, default=100, help='maximum-label-length')
    parser.add_argument('--imgH', type=int, default=32, help='the height of the input image')
    parser.add_argument('--imgW', type=int, default=400, help='the width of the input image')
    parser.add_argument('--rgb', action='store_true', help='use rgb input')
    """ Model Architecture """
    parser.add_argument('--FeatureExtraction', type=str, default="HRNet",  # required=True,
                        help='FeatureExtraction stage VGG|RCNN|ResNet|UNet|HRNet|Densenet|InceptionUnet|ResUnet|AttnUNet|UNet|VGG')
    parser.add_argument('--SequenceModeling', type=str, default="DBiLSTM",  # required=True,
                        help='SequenceModeling stage LSTM|GRU|MDLSTM|BiLSTM|DBiLSTM')
    parser.add_argument('--Prediction', type=str, default="CTC",  # required=True,
                        help='Prediction stage CTC|Attn')
    parser.add_argument('--input_channel', type=int, default=1, help='the number of input channel of Feature extractor')
    parser.add_argument('--output_channel', type=int, default=512, help='the number of output channel of Feature extractor')
    parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state')
    """ GPU Selection """
    parser.add_argument('--device_id', type=str, default=None, help='cuda device ID')

    opt = parser.parse_args()
    # The HRNet backbone emits 32 channels, overriding the CLI default.
    if opt.FeatureExtraction == "HRNet":
        opt.output_channel = 32

    # Fix random seeds for both numpy and pytorch.
    seed = 1111
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    """ vocab / character number configuration """
    # Use a context manager so the glyphs file is always closed (was leaked).
    with open("UrduGlyphs.txt", "r", encoding="utf-8") as glyphs_file:
        content = ''.join(str(elem).strip('\n') for elem in glyphs_file.readlines())
    opt.character = content + " "

    cuda_str = 'cuda'
    if opt.device_id is not None:
        cuda_str = f'cuda:{opt.device_id}'
    device = torch.device(cuda_str if torch.cuda.is_available() else 'cpu')
    print("Device : ", device)

    test(opt, device)
class Attention(nn.Module):
    """Attention-based sequence decoder over encoder hidden states.

    Uses teacher forcing during training and greedy feedback decoding at
    inference. Each step runs an AttentionCell and a linear generator over
    the resulting hidden state.
    """

    def __init__(self, input_size, hidden_size, num_classes, device):
        super(Attention, self).__init__()
        self.attention_cell = AttentionCell(input_size, hidden_size, num_classes)
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.generator = nn.Linear(hidden_size, num_classes)
        self.device = device

    def _char_to_onehot(self, input_char, onehot_dim=38):
        """Encode a batch of character indices as one-hot rows of width onehot_dim."""
        indices = input_char.unsqueeze(1)
        one_hot = torch.zeros(indices.size(0), onehot_dim, device=self.device)
        return one_hot.scatter_(1, indices, 1)

    def forward(self, batch_H, text, is_train=True, batch_max_length=25):
        """
        input:
            batch_H : encoder hidden states H. [batch_size x num_steps x contextual_feature_channels]
            text : the text-index of each image. [batch_size x (max_length+1)]. +1 for [GO] token. text[:, 0] = [GO].
        output: probability distribution at each step [batch_size x num_steps x num_classes]
        """
        batch_size = batch_H.size(0)
        num_steps = batch_max_length + 1  # +1 for the trailing [s] token

        hidden = (torch.zeros(batch_size, self.hidden_size, device=self.device),
                  torch.zeros(batch_size, self.hidden_size, device=self.device))

        if is_train:
            # Teacher forcing: feed the ground-truth previous character.
            output_hiddens = torch.zeros(batch_size, num_steps, self.hidden_size, device=self.device)
            for step in range(num_steps):
                onehots = self._char_to_onehot(text[:, step], onehot_dim=self.num_classes)
                # hidden: decoder state s_{t-1}; batch_H: encoder states; onehots: one-hot(y_{t-1})
                hidden, _ = self.attention_cell(hidden, batch_H, onehots)
                output_hiddens[:, step, :] = hidden[0]  # LSTM tuple is (hidden, cell)
            probs = self.generator(output_hiddens)
        else:
            # Greedy decoding: feed back the argmax of the previous step.
            targets = torch.zeros(batch_size, dtype=torch.long, device=self.device)  # [GO] token
            probs = torch.zeros(batch_size, num_steps, self.num_classes, device=self.device)
            for step in range(num_steps):
                onehots = self._char_to_onehot(targets, onehot_dim=self.num_classes)
                hidden, _ = self.attention_cell(hidden, batch_H, onehots)
                step_logits = self.generator(hidden[0])
                probs[:, step, :] = step_logits
                targets = step_logits.max(1)[1]

        return probs  # batch_size x num_steps x num_classes


class AttentionCell(nn.Module):
    """One decoding step: additive attention over batch_H, then an LSTM cell."""

    def __init__(self, input_size, hidden_size, num_embeddings):
        super(AttentionCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)  # only one of i2h/h2h needs a bias
        self.score = nn.Linear(hidden_size, 1, bias=False)
        self.rnn = nn.LSTMCell(input_size + num_embeddings, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive attention: score each encoder step against the previous hidden state.
        proj_H = self.i2h(batch_H)                          # [b, T, hidden]
        proj_h = self.h2h(prev_hidden[0]).unsqueeze(1)      # [b, 1, hidden]
        energies = self.score(torch.tanh(proj_H + proj_h))  # [b, T, 1]
        alpha = F.softmax(energies, dim=1)

        # Context vector: attention-weighted sum of encoder states.
        context = torch.bmm(alpha.permute(0, 2, 1), batch_H).squeeze(1)  # [b, channels]
        concat_context = torch.cat([context, char_onehots], 1)  # [b, channels + embeddings]
        cur_hidden = self.rnn(concat_context, prev_hidden)
        return cur_hidden, alpha
self.score(torch.tanh(batch_H_proj + prev_hidden_proj)) # batch_size x num_encoder_step * 1 + + alpha = F.softmax(e, dim=1) + context = torch.bmm(alpha.permute(0, 2, 1), batch_H).squeeze(1) # batch_size x num_channel + concat_context = torch.cat([context, char_onehots], 1) # batch_size x (num_channel + num_embedding) + cur_hidden = self.rnn(concat_context, prev_hidden) + return cur_hidden, alpha diff --git a/code/recognization/rcnn.py b/code/recognization/rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..1432f26d91cfe864b5ccd1e8f4f3a9c1515f7eff --- /dev/null +++ b/code/recognization/rcnn.py @@ -0,0 +1,89 @@ +""" +Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 +Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora +GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition +Project Website: https://abdur75648.github.io/UTRNet/ +Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial +4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) +""" + +from torch import nn +import torch.nn.functional as F + +# For Gated RCNN +class GRCL(nn.Module): + + def __init__(self, input_channel, output_channel, num_iteration, kernel_size, pad): + super(GRCL, self).__init__() + self.wgf_u = nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=False) + self.wgr_x = nn.Conv2d(output_channel, output_channel, 1, 1, 0, bias=False) + self.wf_u = nn.Conv2d(input_channel, output_channel, kernel_size, 1, pad, bias=False) + self.wr_x = nn.Conv2d(output_channel, output_channel, kernel_size, 1, pad, bias=False) + + self.BN_x_init = nn.BatchNorm2d(output_channel) + + self.num_iteration = num_iteration + self.GRCL = [GRCL_unit(output_channel) for _ in range(num_iteration)] + self.GRCL = nn.Sequential(*self.GRCL) + + def forward(self, input): + """ The input of GRCL is consistant over time t, which is denoted by u(0) + 
thus wgf_u / wf_u is also consistant over time t. + """ + wgf_u = self.wgf_u(input) + wf_u = self.wf_u(input) + x = F.relu(self.BN_x_init(wf_u)) + + for i in range(self.num_iteration): + x = self.GRCL[i](wgf_u, self.wgr_x(x), wf_u, self.wr_x(x)) + + return x + +class GRCL_unit(nn.Module): + + def __init__(self, output_channel): + super(GRCL_unit, self).__init__() + self.BN_gfu = nn.BatchNorm2d(output_channel) + self.BN_grx = nn.BatchNorm2d(output_channel) + self.BN_fu = nn.BatchNorm2d(output_channel) + self.BN_rx = nn.BatchNorm2d(output_channel) + self.BN_Gx = nn.BatchNorm2d(output_channel) + + def forward(self, wgf_u, wgr_x, wf_u, wr_x): + G_first_term = self.BN_gfu(wgf_u) + G_second_term = self.BN_grx(wgr_x) + G = F.sigmoid(G_first_term + G_second_term) + + x_first_term = self.BN_fu(wf_u) + x_second_term = self.BN_Gx(self.BN_rx(wr_x) * G) + x = F.relu(x_first_term + x_second_term) + + return x + +class RCNN(nn.Module): + """ FeatureExtractor of GRCNN (https://papers.nips.cc/paper/6637-gated-recurrent-convolution-neural-network-for-ocr.pdf) """ + + def __init__(self, input_channel=1, output_channel=512): + super(RCNN, self).__init__() + self.output_channel = [int(output_channel / 8), int(output_channel / 4), + int(output_channel / 2), output_channel] # [64, 128, 256, 512] + self.ConvNet = nn.Sequential( + nn.Conv2d(input_channel, self.output_channel[0], 3, 1, 1), nn.ReLU(True), + nn.MaxPool2d(2, 2), # 64 x 16 x 50 + GRCL(self.output_channel[0], self.output_channel[0], num_iteration=5, kernel_size=3, pad=1), + nn.MaxPool2d(2, 2), # 64 x 8 x 25 + GRCL(self.output_channel[0], self.output_channel[1], num_iteration=5, kernel_size=3, pad=1), + nn.MaxPool2d(2, (2, 1), (0, 1)), # 128 x 4 x 26 + GRCL(self.output_channel[1], self.output_channel[2], num_iteration=5, kernel_size=3, pad=1), + nn.MaxPool2d(2, (2, 1), (0, 1)), # 256 x 2 x 27 + nn.Conv2d(self.output_channel[2], self.output_channel[3], 2, 1, 0, bias=False), + nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True)) 
# 512 x 1 x 26 + + def forward(self, input): + return self.ConvNet(input) + +# import torch +# x = torch.randn(1, 1, 32, 400) +# net = RCNN() +# out = net(x) +# print(out.shape) \ No newline at end of file diff --git a/code/recognization/recognization.py b/code/recognization/recognization.py new file mode 100644 index 0000000000000000000000000000000000000000..2869cdbf22c86c9c0cddb402ce77d425baf2bc08 --- /dev/null +++ b/code/recognization/recognization.py @@ -0,0 +1,92 @@ +from .config import Config +from .model import Text_recognization_model +import os +import torch + +from .utils import CTCLabelConverter,Averager + +from PIL import Image +import math +import numpy as np +from .dataset import NormalizePAD +import tempfile + + +import os +import math +import numpy as np +from PIL import Image +import torch + +class TextRecognition: + def __init__(self,model_path='model/recognization_model.pth' , device='cpu' ): + # Initialize configuration + self.opt = Config() + self.opt.device = device + self.model_path = model_path + + # Load characters from UrduGlyphs.txt + current_dir = os.path.dirname(os.path.abspath(__file__)) + file_path = os.path.join(current_dir, "words.txt") + with open(file_path, "r", encoding="utf-8") as file: + content = file.readlines() + self.opt.character = ''.join([str(elem).strip('\n') for elem in content]) + " " + + # Initialize converter based on prediction type + if 'CTC' in self.opt.Prediction: + self.converter = CTCLabelConverter(self.opt.character) + else: + self.converter = AttnLabelConverter(self.opt.character) + + # Set the number of classes + self.opt.num_class = len(self.converter.character) + + # Load the model + model_path = os.path.join(current_dir, self.model_path) + self.model = Text_recognization_model(self.opt) + + self.model.load_state_dict(torch.load(model_path, map_location=self.opt.device, weights_only=True)) # Load weights + self.model = self.model.to(self.opt.device) + self.model.eval() # Set model to evaluation mode + + 
def recognize_image(self, image):
    """Recognize the text in a single word image.

    Args:
        image: path to an image file (str), or a NumPy array (RGB HxWx3 or
            grayscale HxW).

    Returns:
        The decoded text string.

    Raises:
        TypeError: if image is neither a path nor a NumPy array.
        ValueError: if a NumPy array has an unsupported shape.
    """
    # --- Convert the input to a grayscale PIL image ---
    if isinstance(image, str):
        pil_image = Image.open(image).convert('L')
    elif isinstance(image, np.ndarray):
        if len(image.shape) == 3 and image.shape[2] == 3:
            # ITU-R BT.601 luma weights for RGB -> grayscale.
            gray_array = np.dot(image[..., :3], [0.2989, 0.5870, 0.1140])
            pil_image = Image.fromarray(gray_array.astype('uint8'))
        elif len(image.shape) == 2:
            pil_image = Image.fromarray(image.astype('uint8'))
        else:
            raise ValueError("Unsupported image format!")
    else:
        raise TypeError("Input must be a file path (str) or a NumPy array.")

    # Mirror horizontally before feeding the model.
    # NOTE(review): presumably matches the right-to-left training data —
    # confirm against the training pipeline.
    pil_image = pil_image.transpose(Image.Transpose.FLIP_LEFT_RIGHT)

    # Resize to the model's input height, preserving aspect ratio but
    # capping at the model's maximum width.
    w, h = pil_image.size
    ratio = w / float(h)
    if math.ceil(self.opt.imgH * ratio) > self.opt.imgW:
        resized_w = self.opt.imgW
    else:
        resized_w = math.ceil(self.opt.imgH * ratio)
    pil_image = pil_image.resize((resized_w, self.opt.imgH), Image.Resampling.BICUBIC)

    # Normalize, pad to fixed width, add the batch dimension.
    transform = NormalizePAD((1, self.opt.imgH, self.opt.imgW))
    img = transform(pil_image).unsqueeze(0).to(self.opt.device)

    # Bug fix: run inference without autograd bookkeeping — the original
    # built a gradient graph on every call for no reason.
    with torch.no_grad():
        preds = self.model(img)

    # Greedy decode of the most likely symbol per time step.
    preds_size = torch.IntTensor([preds.size(1)])
    _, preds_index = preds.max(2)
    return self.converter.decode(preds_index.data, preds_size.data)[0]
"""ResNet feature extractor (FAN-style backbone) for text recognition."""

import torch.nn as nn


class BasicBlock(nn.Module):
    """Two 3x3 convs with BatchNorm and an identity/projection shortcut."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = self._conv3x3(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = self._conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def _conv3x3(self, in_planes, out_planes, stride=1):
        """3x3 convolution with padding and no bias (BatchNorm follows)."""
        return nn.Conv2d(in_planes, out_planes, kernel_size=3,
                         stride=stride, padding=1, bias=False)

    def forward(self, x):
        # Projection shortcut when the shape changes, identity otherwise.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut
        return self.relu(out)


class ResNet_model(nn.Module):
    """Staged ResNet trunk that collapses height while keeping width,
    yielding a wide feature map suitable for sequence decoding."""

    def __init__(self, input_channel, output_channel, block, layers):
        """
        Args:
            input_channel: channels of the input image (1 for grayscale).
            output_channel: channels of the final feature map.
            block: residual block class (e.g. BasicBlock).
            layers: number of blocks per stage, e.g. [1, 2, 5, 3].
        """
        super(ResNet_model, self).__init__()

        self.output_channel_block = [output_channel // 4, output_channel // 2,
                                     output_channel, output_channel]

        self.inplanes = output_channel // 8
        # Stem: two 3x3 convs growing channels to output_channel/8.
        self.conv0_1 = nn.Conv2d(input_channel, output_channel // 16,
                                 kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0_1 = nn.BatchNorm2d(output_channel // 16)
        self.conv0_2 = nn.Conv2d(output_channel // 16, self.inplanes,
                                 kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0_2 = nn.BatchNorm2d(self.inplanes)
        self.relu = nn.ReLU(inplace=True)

        # Stage 1: halve both H and W.
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.layer1 = self._make_layer(block, self.output_channel_block[0], layers[0])
        self.conv1 = nn.Conv2d(self.output_channel_block[0], self.output_channel_block[0],
                               kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.output_channel_block[0])

        # Stage 2: halve both H and W again.
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.layer2 = self._make_layer(block, self.output_channel_block[1], layers[1], stride=1)
        self.conv2 = nn.Conv2d(self.output_channel_block[1], self.output_channel_block[1],
                               kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(self.output_channel_block[1])

        # Stage 3: stride (2, 1) halves H only, preserving horizontal detail.
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=(2, 1), padding=(0, 1))
        self.layer3 = self._make_layer(block, self.output_channel_block[2], layers[2], stride=1)
        self.conv3 = nn.Conv2d(self.output_channel_block[2], self.output_channel_block[2],
                               kernel_size=3, stride=1, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.output_channel_block[2])

        # Stage 4: final blocks plus two convs collapsing height to 1.
        self.layer4 = self._make_layer(block, self.output_channel_block[3], layers[3], stride=1)
        self.conv4_1 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[3],
                                 kernel_size=2, stride=(2, 1), padding=(0, 1), bias=False)
        self.bn4_1 = nn.BatchNorm2d(self.output_channel_block[3])
        self.conv4_2 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[3],
                                 kernel_size=2, stride=1, padding=0, bias=False)
        self.bn4_2 = nn.BatchNorm2d(self.output_channel_block[3])

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack `blocks` residual blocks; the first may downsample/project."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        stack = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        stack.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stack)

    def forward(self, x):
        # Stem.
        x = self.relu(self.bn0_1(self.conv0_1(x)))
        x = self.relu(self.bn0_2(self.conv0_2(x)))
        # Stage 1.
        x = self.relu(self.bn1(self.conv1(self.layer1(self.maxpool1(x)))))
        # Stage 2.
        x = self.relu(self.bn2(self.conv2(self.layer2(self.maxpool2(x)))))
        # Stage 3 (height-only pooling).
        x = self.relu(self.bn3(self.conv3(self.layer3(self.maxpool3(x)))))
        # Stage 4: collapse height to 1.
        x = self.relu(self.bn4_1(self.conv4_1(self.layer4(x))))
        return self.relu(self.bn4_2(self.conv4_2(x)))


class ResNet(nn.Module):
    """FeatureExtractor of FAN (http://openaccess.thecvf.com/content_ICCV_2017/papers/Cheng_Focusing_Attention_Towards_ICCV_2017_paper.pdf)"""

    def __init__(self, input_channel=1, output_channel=512):
        super(ResNet, self).__init__()
        self.ConvNet = ResNet_model(input_channel, output_channel, BasicBlock, [1, 2, 5, 3])

    def forward(self, input):
        return self.ConvNet(input)
"""ResUNet feature extractor.

Source: https://github.com/rishikksh20/ResUnet/blob/master/core/res_unet.py
Paper:  https://arxiv.org/pdf/1711.10684.pdf
"""

import torch.nn as nn
import torch


class ResidualConv(nn.Module):
    """Pre-activation double conv with a 3x3 projection skip."""

    def __init__(self, input_dim, output_dim, stride, padding):
        super(ResidualConv, self).__init__()
        # Main branch: BN-ReLU-Conv twice (pre-activation ordering).
        self.conv_block = nn.Sequential(
            nn.BatchNorm2d(input_dim),
            nn.ReLU(),
            nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=padding),
            nn.BatchNorm2d(output_dim),
            nn.ReLU(),
            nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1),
        )
        # Skip branch: single strided conv + BN to match the main branch shape.
        self.conv_skip = nn.Sequential(
            nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1),
            nn.BatchNorm2d(output_dim),
        )

    def forward(self, x):
        return self.conv_block(x) + self.conv_skip(x)


class Upsample(nn.Module):
    """Learned upsampling via a transposed convolution."""

    def __init__(self, input_dim, output_dim, kernel, stride):
        super(Upsample, self).__init__()
        self.upsample = nn.ConvTranspose2d(
            input_dim, output_dim, kernel_size=kernel, stride=stride)

    def forward(self, x):
        return self.upsample(x)


class ResUnet(nn.Module):
    """Encoder-bridge-decoder ResUNet producing `out_channel` maps at input resolution."""

    def __init__(self, in_channel=1, out_channel=512, filters=[64, 128, 256, 512]):
        super(ResUnet, self).__init__()

        # Entry: plain double conv plus a conv skip (no pre-activation yet).
        self.input_layer = nn.Sequential(
            nn.Conv2d(in_channel, filters[0], kernel_size=3, padding=1),
            nn.BatchNorm2d(filters[0]),
            nn.ReLU(),
            nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1),
        )
        self.input_skip = nn.Sequential(
            nn.Conv2d(in_channel, filters[0], kernel_size=3, padding=1)
        )

        # Encoder (stride-2 residual convs).
        self.residual_conv_1 = ResidualConv(filters[0], filters[1], 2, 1)
        self.residual_conv_2 = ResidualConv(filters[1], filters[2], 2, 1)

        self.bridge = ResidualConv(filters[2], filters[3], 2, 1)

        # Decoder: upsample, concat the encoder skip, residual conv.
        self.upsample_1 = Upsample(filters[3], filters[3], 2, 2)
        self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], 1, 1)

        self.upsample_2 = Upsample(filters[2], filters[2], 2, 2)
        self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], 1, 1)

        self.upsample_3 = Upsample(filters[1], filters[1], 2, 2)
        self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], 1, 1)

        self.output_layer = nn.Sequential(
            nn.Conv2d(filters[0], out_channel, 1, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # Encoder.
        enc1 = self.input_layer(x) + self.input_skip(x)
        enc2 = self.residual_conv_1(enc1)
        enc3 = self.residual_conv_2(enc2)
        # Bridge.
        bottom = self.bridge(enc3)
        # Decoder with skip concatenations.
        up = self.upsample_1(bottom)
        dec = self.up_residual_conv1(torch.cat([up, enc3], dim=1))
        up = self.upsample_2(dec)
        dec = self.up_residual_conv2(torch.cat([up, enc2], dim=1))
        up = self.upsample_3(dec)
        dec = self.up_residual_conv3(torch.cat([up, enc1], dim=1))
        return self.output_layer(dec)
"""Recurrent sequence-modeling heads: (bi)LSTM, GRU, and a stacked MDLSTM."""

import torch.nn as nn


class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection."""

    def __init__(self, input_size, hidden_size, output_size):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input):
        """Map visual features [B, T, input_size] to contextual features [B, T, output_size]."""
        self.rnn.flatten_parameters()  # keep weights contiguous (multi-GPU safe)
        contextual, _ = self.rnn(input)  # [B, T, 2*hidden_size]
        return self.linear(contextual)


class LSTM(nn.Module):
    """Unidirectional LSTM followed by a linear projection."""

    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Map visual features [B, T, input_size] to contextual features [B, T, output_size]."""
        self.rnn.flatten_parameters()
        contextual, _ = self.rnn(input)  # [B, T, hidden_size]
        return self.linear(contextual)


class GRU(nn.Module):
    """Unidirectional GRU followed by a linear projection."""

    def __init__(self, input_size, hidden_size, output_size):
        super(GRU, self).__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Map visual features [B, T, input_size] to contextual features [B, T, output_size]."""
        self.rnn.flatten_parameters()
        contextual, _ = self.rnn(input)  # [B, T, hidden_size]
        return self.linear(contextual)


class MDLSTM(nn.Module):
    """Four stacked LSTM heads that expand then contract the feature width.

    Inspired by HM-LSTM (https://arxiv.org/pdf/1609.01704.pdf): each LSTM
    recursively maps the textline features into a different-dimensional space.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(MDLSTM, self).__init__()
        self.rnn = nn.Sequential(
            LSTM(input_size, hidden_size, 2 * hidden_size),
            LSTM(2 * hidden_size, hidden_size, 4 * hidden_size),
            LSTM(4 * hidden_size, hidden_size, 2 * hidden_size),
            LSTM(2 * hidden_size, hidden_size, hidden_size),
        )
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Map visual features [B, T, input_size] to contextual features [B, T, output_size]."""
        for layer in self.rnn:
            layer.rnn.flatten_parameters()
        return self.linear(self.rnn(input))
"""CLI smoke test: run TextRecognition on a sample image and print the result."""
from recognization import TextRecognition
import os
import argparse


def main():
    """Parse CLI arguments, recognize the requested image, and print the text."""
    parser = argparse.ArgumentParser(description='Process an image.')
    parser.add_argument('--image_name', type=str, required=False, default='2.png',
                        help='Path to the image file')
    parser.add_argument('--device', type=str, required=False, default='cpu',
                        help='cpu or cuda')
    args = parser.parse_args()

    # Resolve the sample image relative to this script's directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    image_path = os.path.join(current_dir, 'data', args.image_name)

    # TextRecognition resolves the weights path relative to its own module
    # directory, so the relative path is passed through unchanged.
    # (Bug fix: the original also computed a `model_path` local that was
    # never used — removed.)
    recognizer = TextRecognition('model/recognization_model.pth', device=args.device)
    print(recognizer.recognize_image(image_path))


# Guard so importing this module no longer triggers argument parsing and
# model loading as a side effect.
if __name__ == '__main__':
    main()
# UNet feature extractor — adapted from https://github.com/milesial/Pytorch-UNet

import torch
import torch.nn as nn
import torch.nn.functional as F


class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """2x2 max-pool followed by a double conv (one encoder step)."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels),
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class Up(nn.Module):
    """Transposed-conv upsample, pad to the skip's size, concat, double conv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # Pad the upsampled map (CHW) so it matches the skip connection —
        # handles inputs whose sides are not exact powers of two.
        dy = x2.size()[2] - x1.size()[2]
        dx = x2.size()[3] - x1.size()[3]
        x1 = F.pad(x1, [dx // 2, dx - dx // 2,
                        dy // 2, dy - dy // 2])
        return self.conv(torch.cat([x2, x1], dim=1))


class OutConv(nn.Module):
    """1x1 projection to the requested channel count."""

    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)


class UNet(nn.Module):
    """Four-level UNet producing n_classes feature maps at input resolution."""

    def __init__(self, n_channels=1, n_classes=512):
        super(UNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes

        # Encoder: channels 32 -> 512 while spatial size halves each level.
        self.inc = DoubleConv(n_channels, 32)
        self.down1 = Down(32, 64)
        self.down2 = Down(64, 128)
        self.down3 = Down(128, 256)
        self.down4 = Down(256, 512)
        # Decoder mirrors the encoder with skip fusion.
        self.up1 = Up(512, 256)
        self.up2 = Up(256, 128)
        self.up3 = Up(128, 64)
        self.up4 = Up(64, 32)
        self.outc = OutConv(32, n_classes)

    def forward(self, x):
        # Encoder path; each stage's output is kept as a skip connection.
        skip1 = self.inc(x)
        skip2 = self.down1(skip1)
        skip3 = self.down2(skip2)
        skip4 = self.down3(skip3)
        bottom = self.down4(skip4)
        # Decoder path: upsample and fuse with the matching encoder skip.
        y = self.up1(bottom, skip4)
        y = self.up2(y, skip3)
        y = self.up3(y, skip2)
        y = self.up4(y, skip1)
        return self.outc(y)
# Attention UNet.
# Source - https://github.com/sfczekalski/attention_unet
# Article - https://towardsdatascience.com/biomedical-image-segmentation-attention-u-net-29b6f0827405


class ConvBlock(nn.Module):
    """Two 'same' 3x3 convs, each followed by BatchNorm and ReLU."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()
        # in_channels is the filter count of the previous layer,
        # out_channels the filter count of this layer.
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.conv(x)


class UpConv(nn.Module):
    """2x nearest upsample followed by a 3x3 conv, BatchNorm and ReLU."""

    def __init__(self, in_channels, out_channels):
        super(UpConv, self).__init__()
        self.up = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.up(x)


class AttentionBlock(nn.Module):
    """Additive attention gate that re-weights an encoder skip connection.

    Args:
        F_g: feature maps (channels) of the gating signal (previous decoder layer).
        F_l: feature maps of the corresponding encoder skip connection.
        n_coefficients: number of learnable attention coefficients.
    """

    def __init__(self, F_g, F_l, n_coefficients):
        super(AttentionBlock, self).__init__()

        self.W_gate = nn.Sequential(
            nn.Conv2d(F_g, n_coefficients, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(n_coefficients),
        )
        self.W_x = nn.Sequential(
            nn.Conv2d(F_l, n_coefficients, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(n_coefficients),
        )
        self.psi = nn.Sequential(
            nn.Conv2d(n_coefficients, 1, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(1),
            nn.Sigmoid(),
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, gate, skip_connection):
        """Return the skip connection scaled by learned attention weights."""
        attn = self.relu(self.W_gate(gate) + self.W_x(skip_connection))
        return skip_connection * self.psi(attn)


class AttnUNet(nn.Module):
    """UNet whose skip connections are filtered through attention gates."""

    def __init__(self, img_ch=1, output_ch=512):
        super(AttnUNet, self).__init__()

        self.MaxPool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Encoder: channels 32 -> 512 over five conv blocks.
        self.Conv1 = ConvBlock(img_ch, 32)
        self.Conv2 = ConvBlock(32, 64)
        self.Conv3 = ConvBlock(64, 128)
        self.Conv4 = ConvBlock(128, 256)
        self.Conv5 = ConvBlock(256, 512)

        # Decoder: each level upsamples, gates the skip, then double-convs.
        self.Up5 = UpConv(512, 256)
        self.Att5 = AttentionBlock(F_g=256, F_l=256, n_coefficients=128)
        self.UpConv5 = ConvBlock(512, 256)

        self.Up4 = UpConv(256, 128)
        self.Att4 = AttentionBlock(F_g=128, F_l=128, n_coefficients=64)
        self.UpConv4 = ConvBlock(256, 128)

        self.Up3 = UpConv(128, 64)
        self.Att3 = AttentionBlock(F_g=64, F_l=64, n_coefficients=32)
        self.UpConv3 = ConvBlock(128, 64)

        self.Up2 = UpConv(64, 32)
        self.Att2 = AttentionBlock(F_g=32, F_l=32, n_coefficients=16)
        self.UpConv2 = ConvBlock(64, 32)

        self.Conv = nn.Conv2d(32, output_ch, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Encoder (enc*) then decoder (dec*) with attention-gated skips."""
        # Encoder path.
        enc1 = self.Conv1(x)
        enc2 = self.Conv2(self.MaxPool(enc1))
        enc3 = self.Conv3(self.MaxPool(enc2))
        enc4 = self.Conv4(self.MaxPool(enc3))
        enc5 = self.Conv5(self.MaxPool(enc4))

        # Decoder path: gate the skip, concatenate, then double conv.
        dec5 = self.Up5(enc5)
        dec5 = self.UpConv5(torch.cat((self.Att5(gate=dec5, skip_connection=enc4), dec5), dim=1))

        dec4 = self.Up4(dec5)
        dec4 = self.UpConv4(torch.cat((self.Att4(gate=dec4, skip_connection=enc3), dec4), dim=1))

        dec3 = self.Up3(dec4)
        dec3 = self.UpConv3(torch.cat((self.Att3(gate=dec3, skip_connection=enc2), dec3), dim=1))

        dec2 = self.Up2(dec3)
        dec2 = self.UpConv2(torch.cat((self.Att2(gate=dec2, skip_connection=enc1), dec2), dim=1))

        return self.Conv(dec2)
class VGGBlock(nn.Module):
    """VGG-style double convolution: (Conv3x3 => BN => ReLU) twice."""

    def __init__(self, in_channels, middle_channels, out_channels):
        super().__init__()
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, middle_channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(middle_channels)
        self.conv2 = nn.Conv2d(middle_channels, out_channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        return self.relu(self.bn2(self.conv2(y)))


class NestedUNet(nn.Module):
    """UNet++ (nested UNet, https://arxiv.org/abs/1807.10165).

    Node conv{i}_{j} sits at depth i (spatial scale 1/2^i) and dense-skip
    column j; each node receives all earlier columns at its depth plus an
    upsample of the node one level deeper.
    """

    def __init__(self, input_channels=1, out_channels=512):
        super().__init__()

        nb_filter = [32, 64, 128, 256, 512]

        self.pool = nn.MaxPool2d(2, 2)
        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

        # Backbone column (j = 0).
        self.conv0_0 = VGGBlock(input_channels, nb_filter[0], nb_filter[0])
        self.conv1_0 = VGGBlock(nb_filter[0], nb_filter[1], nb_filter[1])
        self.conv2_0 = VGGBlock(nb_filter[1], nb_filter[2], nb_filter[2])
        self.conv3_0 = VGGBlock(nb_filter[2], nb_filter[3], nb_filter[3])
        self.conv4_0 = VGGBlock(nb_filter[3], nb_filter[4], nb_filter[4])

        # Nested columns: input width grows with the number of dense skips.
        self.conv0_1 = VGGBlock(nb_filter[0] + nb_filter[1], nb_filter[0], nb_filter[0])
        self.conv1_1 = VGGBlock(nb_filter[1] + nb_filter[2], nb_filter[1], nb_filter[1])
        self.conv2_1 = VGGBlock(nb_filter[2] + nb_filter[3], nb_filter[2], nb_filter[2])
        self.conv3_1 = VGGBlock(nb_filter[3] + nb_filter[4], nb_filter[3], nb_filter[3])

        self.conv0_2 = VGGBlock(nb_filter[0] * 2 + nb_filter[1], nb_filter[0], nb_filter[0])
        self.conv1_2 = VGGBlock(nb_filter[1] * 2 + nb_filter[2], nb_filter[1], nb_filter[1])
        self.conv2_2 = VGGBlock(nb_filter[2] * 2 + nb_filter[3], nb_filter[2], nb_filter[2])

        self.conv0_3 = VGGBlock(nb_filter[0] * 3 + nb_filter[1], nb_filter[0], nb_filter[0])
        self.conv1_3 = VGGBlock(nb_filter[1] * 3 + nb_filter[2], nb_filter[1], nb_filter[1])

        self.conv0_4 = VGGBlock(nb_filter[0] * 4 + nb_filter[1], nb_filter[0], nb_filter[0])

        self.final = nn.Conv2d(nb_filter[0], out_channels, kernel_size=1)

    def forward(self, input):
        pool, up = self.pool, self.up

        x0_0 = self.conv0_0(input)
        x1_0 = self.conv1_0(pool(x0_0))
        x0_1 = self.conv0_1(torch.cat([x0_0, up(x1_0)], 1))

        x2_0 = self.conv2_0(pool(x1_0))
        x1_1 = self.conv1_1(torch.cat([x1_0, up(x2_0)], 1))
        x0_2 = self.conv0_2(torch.cat([x0_0, x0_1, up(x1_1)], 1))

        x3_0 = self.conv3_0(pool(x2_0))
        x2_1 = self.conv2_1(torch.cat([x2_0, up(x3_0)], 1))
        x1_2 = self.conv1_2(torch.cat([x1_0, x1_1, up(x2_1)], 1))
        x0_3 = self.conv0_3(torch.cat([x0_0, x0_1, x0_2, up(x1_2)], 1))

        x4_0 = self.conv4_0(pool(x3_0))
        x3_1 = self.conv3_1(torch.cat([x3_0, up(x4_0)], 1))
        x2_2 = self.conv2_2(torch.cat([x2_0, x2_1, up(x3_1)], 1))
        x1_3 = self.conv1_3(torch.cat([x1_0, x1_1, x1_2, up(x2_2)], 1))
        x0_4 = self.conv0_4(torch.cat([x0_0, x0_1, x0_2, x0_3, up(x1_3)], 1))

        return self.final(x0_4)
class CTCLabelConverter(object):
    """Maps text labels to index tensors for CTC training and back again."""

    def __init__(self, character):
        # `character` is the string of all recognizable symbols.
        symbols = list(character)
        # Index 0 is reserved for the CTC blank token, so real symbols start at 1.
        self.dict = {ch: idx + 1 for idx, ch in enumerate(symbols)}
        self.character = ['[CTCblank]'] + symbols

    def encode(self, text, batch_max_length=25):
        """Convert a batch of strings into a padded index tensor.

        Args:
            text: text labels of each image. [batch_size]
            batch_max_length: max label length in the batch (default 25).

        Returns:
            (LongTensor [batch_size, batch_max_length], IntTensor [batch_size]) —
            padded label indices and true label lengths. The padding index 0
            does not affect the CTC loss.
        """
        lengths = [len(label) for label in text]
        batch_text = torch.LongTensor(len(text), batch_max_length).fill_(0)
        for row, label in enumerate(text):
            indices = [self.dict[ch] for ch in label]
            batch_text[row][:len(indices)] = torch.LongTensor(indices)
        return (batch_text, torch.IntTensor(lengths))

    def decode(self, text_index, length):
        """Greedy CTC decode: drop blanks (index 0) and collapse repeats."""
        texts = []
        for row, l in enumerate(length):
            t = text_index[row, :]
            chars = [self.character[t[i]]
                     for i in range(l)
                     if t[i] != 0 and not (i > 0 and t[i - 1] == t[i])]
            texts.append(''.join(chars))
        return texts
+ dict_character = list(character) + + self.dict = {} + for i, char in enumerate(dict_character): + # NOTE: 0 is reserved for 'CTCblank' token required by CTCLoss + self.dict[char] = i + 1 + + self.character = ['[CTCblank]'] + dict_character # dummy '[CTCblank]' token for CTCLoss (index 0) + + def encode(self, text, batch_max_length=25): + """convert text-label into text-index. + input: + text: text labels of each image. [batch_size] + output: + text: concatenated text index for CTCLoss. + [sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)] + length: length of each text. [batch_size] + """ + length = [len(s) for s in text] + text = ''.join(text) + text = [self.dict[char] for char in text] + + return (torch.IntTensor(text), torch.IntTensor(length)) + + def decode(self, text_index, length): + """ convert text-index into text-label. """ + texts = [] + index = 0 + for l in length: + t = text_index[index:index + l] + + char_list = [] + for i in range(l): + if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])): # removing repeated characters and blank. + char_list.append(self.character[t[i]]) + text = ''.join(char_list) + + texts.append(text) + index += l + return texts + + +class AttnLabelConverter(object): + """ Convert between text-label and text-index """ + + def __init__(self, character): + # character (str): set of the possible characters. + # [GO] for the start token of the attention decoder. [s] for end-of-sentence token. + list_token = ['[GO]', '[s]'] # ['[s]','[UNK]','[PAD]','[GO]'] + list_character = list(character) + self.character = list_token + list_character + + self.dict = {} + for i, char in enumerate(self.character): + # print(i, char) + self.dict[char] = i + + def encode(self, text, batch_max_length=25): + """ convert text-label into text-index. + input: + text: text labels of each image. [batch_size] + batch_max_length: max length of text label in the batch. 25 by default + + output: + text : the input of attention decoder. 
[batch_size x (max_length+2)] +1 for [GO] token and +1 for [s] token. + text[:, 0] is [GO] token and text is padded with [GO] token after [s] token. + length : the length of output of attention decoder, which count [s] token also. [3, 7, ....] [batch_size] + """ + length = [len(s) + 1 for s in text] # +1 for [s] at end of sentence. + # batch_max_length = max(length) # this is not allowed for multi-gpu setting + batch_max_length += 1 + # additional +1 for [GO] at first step. batch_text is padded with [GO] token after [s] token. + batch_text = torch.LongTensor(len(text), batch_max_length + 1).fill_(0) + for i, t in enumerate(text): + text = list(t) + text.append('[s]') + + try: + text = [self.dict[char] for char in text] + except KeyError as e: + continue + batch_text[i][1:1 + len(text)] = torch.LongTensor(text) # batch_text[:, 0] = [GO] token + return (batch_text, torch.IntTensor(length)) + + def decode(self, text_index, length): + """ convert text-index into text-label. """ + texts = [] + for index, l in enumerate(length): + text = ''.join([self.character[i] for i in text_index[index, :]]) + texts.append(text) + return texts + + +def imshow(img, title,batch_size=1): + std_correction = np.asarray([0.229, 0.224, 0.225]).reshape(3, 1, 1) + mean_correction = np.asarray([0.485, 0.456, 0.406]).reshape(3, 1, 1) + npimg = np.multiply(img.numpy(), std_correction) + mean_correction + plt.figure(figsize = (batch_size * 4, 4)) + plt.axis("off") + plt.imshow(np.transpose(npimg, (1, 2, 0))) + plt.title(title) + plt.show() + + +class Averager(object): + """Compute average for torch.Tensor, used for loss average.""" + + def __init__(self): + self.reset() + + def add(self, v): + count = v.data.numel() + v = v.data.sum() + self.n_count += count + self.sum += v + + def reset(self): + self.n_count = 0 + self.sum = 0 + + def val(self): + res = 0 + if self.n_count != 0: + res = self.sum / float(self.n_count) + return res + +class Logger(object): + """For logging while training""" + def 
__init__(self, path): + self.logFile = path + datetime_now = str(datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H-%M-%S")) + with open(self.logFile,"w",encoding="utf-8") as f: + f.write("Logging at @ " + str(datetime_now) + "\n") + + def log(self,*input): + message = "" + for x in input: + message+=str(x) + " " + message = message.strip() + print(message) + with open(self.logFile,"a",encoding="utf-8") as f: + f.write(str(message)+"\n") + + +def allign_two_strings(x:str, y:str, pxy:int=1, pgap:int=1): + """ + Source: https://www.geeksforgeeks.org/sequence-alignment-problem/ + """ + i = 0 + j = 0 + m = len(x) + n = len(y) + dp = np.zeros([m+1,n+1], dtype=int) + dp[0:(m+1),0] = [ i * pgap for i in range(m+1)] + dp[0,0:(n+1)] = [ i * pgap for i in range(n+1)] + + i = 1 + while i <= m: + j = 1 + while j <= n: + if x[i - 1] == y[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + else: + dp[i][j] = min(dp[i - 1][j - 1] + pxy, + dp[i - 1][j] + pgap, + dp[i][j - 1] + pgap) + j += 1 + i += 1 + + l = n + m + i = m + j = n + + xpos = l + ypos = l + + xans = np.zeros(l+1, dtype=int) + yans = np.zeros(l+1, dtype=int) + + while not (i == 0 or j == 0): + #print(f"i: {i}, j: {j}") + if x[i - 1] == y[j - 1]: + xans[xpos] = ord(x[i - 1]) + yans[ypos] = ord(y[j - 1]) + xpos -= 1 + ypos -= 1 + i -= 1 + j -= 1 + elif (dp[i - 1][j - 1] + pxy) == dp[i][j]: + + xans[xpos] = ord(x[i - 1]) + yans[ypos] = ord(y[j - 1]) + xpos -= 1 + ypos -= 1 + i -= 1 + j -= 1 + + elif (dp[i - 1][j] + pgap) == dp[i][j]: + xans[xpos] = ord(x[i - 1]) + yans[ypos] = ord('_') + xpos -= 1 + ypos -= 1 + i -= 1 + + elif (dp[i][j - 1] + pgap) == dp[i][j]: + xans[xpos] = ord('_') + yans[ypos] = ord(y[j - 1]) + xpos -= 1 + ypos -= 1 + j -= 1 + + + while xpos > 0: + if i > 0: + i -= 1 + xans[xpos] = ord(x[i]) + xpos -= 1 + else: + xans[xpos] = ord('_') + xpos -= 1 + + while ypos > 0: + if j > 0: + j -= 1 + yans[ypos] = ord(y[j]) + ypos -= 1 + else: + yans[ypos] = ord('_') + ypos -= 1 + + id = 1 + i = l + while i 
>= 1: + if (chr(yans[i]) == '_') and chr(xans[i]) == '_': + id = i + 1 + break + + i -= 1 + + i = id + x_seq = "" + while i <= l: + x_seq += chr(xans[i]) + i += 1 + + # Y + i = id + y_seq = "" + while i <= l: + y_seq += chr(yans[i]) + i += 1 + + return x_seq, y_seq + +# Function to count the number of trainable parameters in a model in "Millions" +def count_parameters(model,precision=2): + return (round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 10.**6, precision)) + +''' +# Code for counting the number of FLOPs in the CNN backbone during inference +Source - https://github.com/fdbtrs/ElasticFace/blob/main/utils/countFLOPS.py +''' + +def count_model_flops(model,in_channels=1, input_res=[32, 400], multiply_adds=True): + list_conv = [] + + def conv_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) + bias_ops = 1 if self.bias is not None else 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = (kernel_ops * ( + 2 if multiply_adds else 1) + bias_ops) * output_channels * output_height * output_width * batch_size + list_conv.append(flops) + list_linear = [] + + def linear_hook(self, input, output): + batch_size = input[0].size(0) if input[0].dim() == 2 else 1 + + weight_ops = self.weight.nelement() * (2 if multiply_adds else 1) + if self.bias is not None: + bias_ops = self.bias.nelement() if self.bias.nelement() else 0 + flops = batch_size * (weight_ops + bias_ops) + else: + flops = batch_size * weight_ops + list_linear.append(flops) + + list_bn = [] + + def bn_hook(self, input, output): + list_bn.append(input[0].nelement() * 2) + + list_relu = [] + + def relu_hook(self, input, output): + list_relu.append(input[0].nelement()) + + list_pooling = [] + + def pooling_hook(self, input, output): + batch_size, input_channels, 
input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + # If kernel_size is a tuple type, computer ops as product of elements or else if it is int type, compute ops as square of kernel_size + kernel_ops = self.kernel_size[0] * self.kernel_size[1] if isinstance(self.kernel_size, tuple) else self.kernel_size * self.kernel_size + bias_ops = 0 + params = 0 + flops = (kernel_ops + bias_ops) * output_channels * output_height * output_width * batch_size + list_pooling.append(flops) + + def dropout_hook(self, input, output): + # calculate the number of operations for a dropout function by assuming that each operation involves one comparison and one multiplication + batch_size, input_channels, input_height, input_width = input[0].size() + list_conv.append(2*batch_size*input_channels*input_height*input_width) + + def sigmoid_hook(self,input,output): + # calculate the number of operations for a sigmoid function by assuming that each operation involves two multiplications and one addition + batch_size, input_channels, input_height, input_width = input[0].size() + list_conv.append(3*batch_size*input_channels*input_height*input_width) + + def upsample_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.scale_factor * self.scale_factor # * (self.in_channels / self.groups) + flops = (kernel_ops * ( + 2 if multiply_adds else 1)) * output_channels * output_height * output_width * batch_size + list_conv.append(flops) + + handles = [] + + def foo(net): + childrens = list(net.children()) + if not childrens: + if isinstance(net, torch.nn.Conv2d) or isinstance(net, torch.nn.ConvTranspose2d): + handles.append(net.register_forward_hook(conv_hook)) + elif isinstance(net, torch.nn.Linear): + handles.append(net.register_forward_hook(linear_hook)) + elif isinstance(net, torch.nn.BatchNorm2d) or 
isinstance(net, torch.nn.BatchNorm1d): + handles.append(net.register_forward_hook(bn_hook)) + elif isinstance(net, torch.nn.ReLU) or isinstance(net, torch.nn.PReLU): + handles.append(net.register_forward_hook(relu_hook)) + elif isinstance(net, torch.nn.MaxPool2d) or isinstance(net, torch.nn.AvgPool2d): + handles.append(net.register_forward_hook(pooling_hook)) + elif isinstance(net, torch.nn.Dropout): + handles.append(net.register_forward_hook(dropout_hook)) + elif isinstance(net,torch.nn.Upsample): + handles.append(net.register_forward_hook(upsample_hook)) + elif isinstance(net,torch.nn.Sigmoid): + handles.append(net.register_forward_hook(sigmoid_hook)) + else: + print("warning" + str(net)) + return + for c in childrens: + foo(c) + + model.eval() + foo(model) + input = Variable(torch.rand(in_channels, input_res[1], input_res[0]).unsqueeze(0), requires_grad=True) + out = model(input) + total_flops = (sum(list_conv) + sum(list_linear) + sum(list_bn) + sum(list_relu) + sum(list_pooling)) + for h in handles: + h.remove() + model.train() + + def flops_to_string(flops, units='MFLOPS', precision=4): + if units == 'GFLOPS': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPS': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPS': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPS' + + return flops_to_string(total_flops) + + +def draw_feature_map(visual_feature,vis_dir,num_channel=10): + """draws feature maps for the given visual features + Args: + visual_feature (Tensor): Shape (C, H, W) + vis_dir (String): Directory to save the feature maps + """ + if os.path.exists(vis_dir): + shutil.rmtree(vis_dir) + os.makedirs(vis_dir) + # Save visual_feature from num_channel random channels for visualization + for i in range(num_channel): + random_channel = random.randint(0, visual_feature.shape[1]-1) + visual_feature_for_visualization = visual_feature[0, random_channel, 
:, :].detach().cpu().numpy() + # Horizontal flip + visual_feature_for_visualization = visual_feature_for_visualization[:,::-1] + # Normalize + visual_feature_for_visualization = (visual_feature_for_visualization - visual_feature_for_visualization.min()) / (visual_feature_for_visualization.max() - visual_feature_for_visualization.min()) + # Draw heatmap + plt.imshow(visual_feature_for_visualization, cmap='gray', interpolation='nearest') + plt.axis("off") + plt.savefig(os.path.join(vis_dir, "channel_{}.png".format(random_channel)), bbox_inches='tight', pad_inches=0) \ No newline at end of file diff --git a/code/recognization/vgg.py b/code/recognization/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..bf2f9853f4011bb54471e6ed0c920ec185a5de33 --- /dev/null +++ b/code/recognization/vgg.py @@ -0,0 +1,40 @@ +""" +Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 +Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora +GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition +Project Website: https://abdur75648.github.io/UTRNet/ +Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial +4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) +""" + +from torch import nn + +class VGG(nn.Module): + """ FeatureExtractor of CRNN (https://arxiv.org/pdf/1507.05717.pdf) """ + def __init__(self, input_channel=1, output_channel=512): + super(VGG, self).__init__() + self.output_channel = [int(output_channel / 8), int(output_channel / 4), + int(output_channel / 2), output_channel] + self.ConvNet = nn.Sequential( + nn.Conv2d(input_channel, self.output_channel[0], 3, 1, 1), nn.ReLU(True), + nn.MaxPool2d(2, 2), + nn.Conv2d(self.output_channel[0], self.output_channel[1], 3, 1, 1), nn.ReLU(True), + nn.MaxPool2d(2, 2), + nn.Conv2d(self.output_channel[1], self.output_channel[2], 3, 1, 1), nn.ReLU(True), + 
nn.Conv2d(self.output_channel[2], self.output_channel[2], 3, 1, 1), nn.ReLU(True), + nn.MaxPool2d((2, 1), (2, 1)), + nn.Conv2d(self.output_channel[2], self.output_channel[3], 3, 1, 1, bias=False), + nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True), + nn.Conv2d(self.output_channel[3], self.output_channel[3], 3, 1, 1, bias=False), + nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True), + nn.MaxPool2d((2, 1), (2, 1)), + nn.Conv2d(self.output_channel[3], self.output_channel[3], 2, 1, 0), nn.ReLU(True)) + + def forward(self, input): + return self.ConvNet(input) + +# import torch +# x = torch.randn(1, 1, 32, 400) +# net = VGG() +# out = net(x) +# print(out.shape) \ No newline at end of file diff --git a/code/recognization/words.txt b/code/recognization/words.txt new file mode 100755 index 0000000000000000000000000000000000000000..98f22177c4191052a1bd888519d65c5450212227 --- /dev/null +++ b/code/recognization/words.txt @@ -0,0 +1,180 @@ +ا +آ +ب +ب +ت +ث +ث +ج +ج +ح +خ +د +ڈ +ذ +ر +ڑ +ز +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ک +ك +گ +ل +م +ن +ں +و +ه +ھ +ء +ى +ے +ئ +ۓ +ي +ې +ٿ +ڐ +ڙ +أ +ؤ +ۀ +ۃ +ة +ه +ۂ +ﮥ +ٴ +َ +ً +ُ +ِ +ٍ +ْ +ٗ +ٓ +ٰ +ٖ +٘ +ٔ +ّ +ؔ +۰ +١ +٢ +٣ +٣ +٥ +٦ +٧ +٨ +٩ +٪ +% ++ += +٤ +٫ +, +- +_ +٥ +٬ +" +' +/ +\ +> +< +؍ +، +؛ +: +؟ +{ +} +[ +] +( +) +. +‘ +’ +٠ +۔ +“ +” +! 
+* +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/full_information_extraction.py b/full_information_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..75378e93600dd8f2d1f4241b5094a5f75b198a43 --- /dev/null +++ b/full_information_extraction.py @@ -0,0 +1,45 @@ +from code.detection.recognize_id.detect_and_recognize_id import Recognize_ID +from code.detection.detection import detection +from code.recognization.recognization import TextRecognition +import os +import argparse + +# Define the argument parser +parser = argparse.ArgumentParser(description='Process an image.') +parser.add_argument('--image_name', type=str, required=False, default='id_1.png', help='Path to the image file') +parser.add_argument('--device', type=str, required=False, default='cpu', help='cpu or cuda') + +# Parse the arguments +args = parser.parse_args() + +# Get the image path from the command line +image_name = args.image_name +device = args.device + + +current_dir = os.path.dirname(os.path.abspath(__file__)) + +# Recognize ID +rec_id = Recognize_ID() +image_path = os.path.join(current_dir , 'data' , image_name ) +id = rec_id.give_me_id_number(image_path) + +# Detection +det = detection() +detection_list = det.full_pipeline(image_path,True) + +result = '' +# Loop on all detected images and recognize them +recognizer = TextRecognition(device=device) +for line in detection_list[2:6]: + for word in line: + recognized_word = recognizer.recognize_image(word) + result = result + recognized_word + ' ' + result += '\n' + +# Add Id number +result = result + id + + +print(result) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ced148bd0d235ba96a3b213023f7b6b6a3bf26c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +torch +numpy +tqdm 
+opencv-python +ultralytics diff --git a/samples/id_1.png b/samples/id_1.png new file mode 100644 index 0000000000000000000000000000000000000000..b4dc92b97fca873a8fd6934610c8e99cc5fcbf8e Binary files /dev/null and b/samples/id_1.png differ diff --git a/samples/id_2.jpg b/samples/id_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3f6ea35ef16162f42e20075243cf012b173c1a82 Binary files /dev/null and b/samples/id_2.jpg differ