TungDuong committed on
Commit f4dccd7 · verified · 1 Parent(s): 5215b56

Delete src

src/Text_Localization/__pycache__/prepare_dataset.cpython-312.pyc DELETED
Binary file (7.86 kB)
 
src/Text_Localization/prepare_dataset.py DELETED
@@ -1,177 +0,0 @@
- import numpy as np
- import os
- import matplotlib.pyplot as plt
- import cv2
- import xml.etree.ElementTree as ET
- import shutil
- import yaml
-
- from sklearn.model_selection import train_test_split
-
- location_path = r'Dataset/locations.xml'
-
-
- def get_gt_bboxes(location_path):
-     """Get all the ground-truth text bboxes in the dataset.
-
-     Args:
-         location_path: path to the locations.xml annotation file
-     Returns:
-         gt_imagepaths (list): path of each image
-         gt_imagesizes (list): [height, width] of each image
-         gt_locations (list): [x, y, width, height] bboxes of each image
-     """
-     # Parse the file passed in (the original read a module-level `root`
-     # and silently ignored this argument)
-     tree = ET.parse(location_path)
-     root = tree.getroot()
-
-     gt_imagepaths = []
-     gt_imagesizes = []
-     gt_locations = []
-
-     for image in root:
-         # get path to image
-         image_name = image[0].text
-         image_path = os.path.join('Dataset', image_name)
-         gt_imagepaths.append(image_path)
-
-         # get the image size (stored in the 'x'/'y' attributes)
-         w = image[1].get('x')
-         h = image[1].get('y')
-         gt_imagesizes.append([h, w])
-
-         # bboxes in the image
-         bbs = []
-         for bbox in image[2]:
-             x = np.int64(float(bbox.get('x')))
-             y = np.int64(float(bbox.get('y')))
-             width = np.int64(float(bbox.get('width')))
-             height = np.int64(float(bbox.get('height')))
-             bbs.append([x, y, width, height])
-
-         gt_locations.append(bbs)
-
-     return gt_imagepaths, gt_imagesizes, gt_locations
-
- gt_imagepaths, gt_imagesizes, gt_locations = get_gt_bboxes(location_path)
-
- def visualize_gt_bboxes(image_path, gt_locations):
-     image = cv2.imread(image_path)
-     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-     for gt_location in gt_locations:
-         x, y, width, height = map(int, gt_location)
-
-         image = cv2.rectangle(image, (x, y), (x + width, y + height), color=(255, 0, 0), thickness=2)
-
-     plt.imshow(image)
-     plt.axis('off')
-     plt.show()
-
- def visualize_gt_bboxes_yolo(image_path, gt_location_yolo):
-     image = cv2.imread(image_path)
-     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-     image_height, image_width = image.shape[:2]
-
-     # Convert normalized (xc, yc, w, h) back to pixel corners
-     for data in gt_location_yolo:
-         xc, yc, w, h = data[1:]
-         xmin = int((xc - w / 2) * image_width)
-         ymin = int((yc - h / 2) * image_height)
-         xmax = int((xc + w / 2) * image_width)
-         ymax = int((yc + h / 2) * image_height)
-
-         image = cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color=(255, 0, 0), thickness=2)
-
-     plt.imshow(image)
-     plt.axis('off')
-     plt.show()
-
-
- def convert_yolo_format(gt_locations, gt_imagesizes):
-     gt_locations_yolo = []
-
-     for image, image_size in zip(gt_locations, gt_imagesizes):
-         gt_location_yolo = []
-         for gt_location in image:
-             x, y, w, h = gt_location
-             image_height, image_width = image_size
-
-             # normalized box center and size
-             xc = (x + w / 2) / float(image_width)
-             yc = (y + h / 2) / float(image_height)
-             width = w / float(image_width)
-             height = h / float(image_height)
-
-             # class 0 means the box contains text
-             class_id = 0
-             gt_location_yolo.append([class_id, xc, yc, width, height])
-
-         gt_locations_yolo.append(gt_location_yolo)
-
-     return gt_locations_yolo
-
- gt_locations_yolo = convert_yolo_format(gt_locations, gt_imagesizes)
-
- def save_data_into_yolo_folder(data, src_img_dir, save_dir):
-     # Note: src_img_dir is currently unused; images are copied from their original paths
-     os.makedirs(save_dir, exist_ok=True)
-
-     # Make images and labels folders
-     os.makedirs(os.path.join(save_dir, 'images'), exist_ok=True)
-     os.makedirs(os.path.join(save_dir, 'labels'), exist_ok=True)
-
-     # Write data into the YOLO folder
-     for dt in data:
-         # copy the image
-         image_path = dt[0]
-         shutil.copy(image_path, os.path.join(save_dir, 'images'))
-
-         # write the labels, one "class xc yc w h" row per box
-         image_name = os.path.basename(image_path)
-         image_name = os.path.splitext(image_name)[0]
-
-         with open(os.path.join(save_dir, 'labels', f'{image_name}.txt'), "w") as f:
-             for label in dt[1]:
-                 label_str = " ".join(map(str, label))
-                 f.write(f'{label_str}\n')
-
-
- seed = 42
- val_size = 0.2
- test_size = 0.125
- dataset = [[gt_imagepath, gt_location_yolo] for gt_imagepath, gt_location_yolo in zip(gt_imagepaths, gt_locations_yolo)]
- train_data, val_data = train_test_split(dataset, test_size=val_size, random_state=seed, shuffle=True)
- train_data, test_data = train_test_split(train_data, test_size=test_size, random_state=seed, shuffle=True)
-
- save_yolo_data_dir = 'yolo_data'
- os.makedirs(save_yolo_data_dir, exist_ok=True)
- save_data_into_yolo_folder(
-     data=train_data,
-     src_img_dir=save_yolo_data_dir,
-     save_dir=os.path.join(save_yolo_data_dir, 'train')
- )
- save_data_into_yolo_folder(
-     data=val_data,
-     src_img_dir=save_yolo_data_dir,
-     save_dir=os.path.join(save_yolo_data_dir, 'val')
- )
- save_data_into_yolo_folder(
-     data=test_data,
-     src_img_dir=save_yolo_data_dir,
-     save_dir=os.path.join(save_yolo_data_dir, 'test')
- )
-
- class_label = ['text']
- # Create data.yaml file
- data_yaml = {
-     'path': '../yolo_data',
-     'train': 'train/images',
-     'test': 'test/images',
-     'val': 'val/images',
-     'nc': 1,
-     'names': class_label
- }
-
- yolo_yaml_path = os.path.join(save_yolo_data_dir, 'data.yaml')
- with open(yolo_yaml_path, "w") as f:
-     yaml.dump(data_yaml, f, default_flow_style=False)
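
A quick sanity check of the center-format conversion in the deleted script, a minimal sketch (the image size and box below are made-up values, not from the dataset):

# Sketch: verify convert_yolo_format round-trips with the corner math
# used in visualize_gt_bboxes_yolo. 640x480 and the box are assumptions.
boxes = [[[50, 60, 200, 100]]]   # one image with one [x, y, w, h] box
sizes = [[480, 640]]             # [height, width]
yolo = convert_yolo_format(boxes, sizes)
class_id, xc, yc, w, h = yolo[0][0]
assert round((xc - w / 2) * 640) == 50   # recovers xmin
assert round((yc - h / 2) * 480) == 60   # recovers ymin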
 
src/Text_Localization/text_localization.py DELETED
@@ -1,33 +0,0 @@
- import json
-
- from ultralytics import YOLO
-
- yolo_yaml_path = 'yolo_data/data.yaml'
- config_path = 'src/config.json'
-
- def load_json_config(config_path):
-     with open(config_path, "r") as f:
-         config = json.load(f)
-
-     return config
-
- config = load_json_config(config_path)
-
- # Load pretrained model
- model = YOLO('yolo11m.pt')
-
- # Train model
- results = model.train(
-     data=yolo_yaml_path,
-     epochs=config['yolov11']['epochs'],
-     imgsz=config['yolov11']['image_size'],
-     cache=config['yolov11']['cache'],
-     patience=config['yolov11']['patience'],
-     plots=config['yolov11']['plots']
- )
-
- # Evaluate the trained checkpoint
- model_path = 'checkpoints/yolov11m.pt'
- model = YOLO(model_path)
-
- metrics = model.val()
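
Once trained, the checkpoint can be used for detection directly; a minimal sketch (the image path is a placeholder):

# Sketch: run the trained detector on one image (path is an assumption).
from ultralytics import YOLO

model = YOLO('checkpoints/yolov11m.pt')
results = model('Dataset/some_image.jpg', verbose=False)[0]
print(results.boxes.xyxy)   # corner-format boxes, one row per detection
print(results.boxes.conf)   # confidence per box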
 
src/Text_Recognization/__pycache__/dataloader.cpython-312.pyc DELETED
Binary file (4.38 kB)
 
src/Text_Recognization/__pycache__/prepare_dataset.cpython-312.pyc DELETED
Binary file (11.1 kB)
 
src/Text_Recognization/__pycache__/text_recognization.cpython-312.pyc DELETED
Binary file (3.54 kB)
 
src/Text_Recognization/dataloader.py DELETED
@@ -1,96 +0,0 @@
- import json
- import cv2
-
- from torch.utils.data import Dataset, DataLoader
- from torchvision import transforms
- from sklearn.model_selection import train_test_split
- from src.Text_Recognization.prepare_dataset import *   # provides encode, build_vocab, ...
-
- # data augmentation
- data_transforms = {
-     "train": transforms.Compose(
-         [
-             transforms.ToTensor(),
-             transforms.Resize((100, 400)),
-             transforms.ColorJitter(
-                 brightness=0.5,
-                 contrast=0.5,
-                 saturation=0.5
-             ),
-             transforms.GaussianBlur(3),
-             transforms.RandomAffine(
-                 degrees=1,
-                 shear=1
-             ),
-             transforms.RandomPerspective(
-                 distortion_scale=0.3,
-                 p=0.5
-             ),
-             transforms.RandomRotation(degrees=15),
-             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-         ]
-     ),
-     "val": transforms.Compose(
-         [
-             transforms.ToTensor(),
-             transforms.Resize((100, 400)),
-             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-         ]
-     )
- }
-
- def load_json_config(config_path):
-     with open(config_path, "r") as f:
-         config = json.load(f)
-
-     return config
-
- # Dataset
- class STRDataset(Dataset):
-     def __init__(self, image_paths, labels, char_to_idx, transforms=None):
-         self.image_paths = image_paths
-         self.labels = labels
-         self.char_to_idx = char_to_idx
-         self.transforms = transforms
-
-     def __len__(self):
-         return len(self.image_paths)
-
-     def __getitem__(self, idx):
-         image = cv2.imread(self.image_paths[idx])
-         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-         if self.transforms:
-             image = self.transforms(image)
-
-         # passing all labels lets encode() pad to the global max label length
-         label_encoded, length = encode(self.labels[idx], self.char_to_idx, self.labels)
-
-         return image, label_encoded, length
-
- def get_dataloader():
-     val_size = 0.1
-     test_size = 0.1
-     root_path = 'Dataset'
-     config_path = 'src/config.json'
-
-     # get image paths and labels
-     image_paths, labels = get_imagepaths_and_labels(root_path)
-     char_to_idx, idx_to_char = build_vocab(root_path)
-
-     config = load_json_config(config_path)
-
-     X_train, X_val, y_train, y_val = train_test_split(image_paths, labels, test_size=val_size, random_state=42, shuffle=True)
-     X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_size, random_state=42, shuffle=True)
-     train_dataset = STRDataset(X_train, y_train, char_to_idx, transforms=data_transforms['train'])
-     train_loader = DataLoader(train_dataset, batch_size=config['CRNN']['batch_size'], shuffle=True)
-
-     # evaluation splits do not need shuffling
-     val_dataset = STRDataset(X_val, y_val, char_to_idx, transforms=data_transforms['val'])
-     val_loader = DataLoader(val_dataset, batch_size=config['CRNN']['batch_size'], shuffle=False)
-
-     test_dataset = STRDataset(X_test, y_test, char_to_idx, transforms=data_transforms['val'])
-     test_loader = DataLoader(test_dataset, batch_size=config['CRNN']['batch_size'], shuffle=False)
-
-     return train_loader, val_loader, test_loader
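
A minimal sketch of how these loaders are consumed (batch shapes assume the (100, 400) resize above):

# Sketch: inspect one training batch from the deleted module's loaders.
train_loader, val_loader, test_loader = get_dataloader()
images, labels, lengths = next(iter(train_loader))
print(images.shape)    # (batch_size, 3, 100, 400)
print(labels.shape)    # (batch_size, max_label_length), zero-padded
print(lengths.shape)   # (batch_size,) true label lengths, used by the CTC loss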
 
src/Text_Recognization/prepare_dataset.py DELETED
@@ -1,221 +0,0 @@
- import os
- import numpy as np
- import xml.etree.ElementTree as ET
- import cv2
- import matplotlib.pyplot as plt
- import argparse
-
- import torch
- import torch.nn.functional as F
-
-
- def extract_data_from_xml(root_path):
-     words_path = os.path.join(root_path, 'words.xml')
-     tree = ET.parse(words_path)
-     root = tree.getroot()
-
-     image_paths = []
-     image_sizes = []
-     image_labels = []
-     bboxes = []
-
-     for image in root:
-         imagename = image[0].text
-         image_path = os.path.join(root_path, imagename)
-         image_paths.append(image_path)
-
-         # image size is stored in the 'x'/'y' attributes
-         image_height = image[1].get('x')
-         image_width = image[1].get('y')
-         image_sizes.append([image_height, image_width])
-
-         bboxes_in_image = []
-         labels_in_bboxes = []
-         for bbox in image[2]:
-             x = float(bbox.get('x'))
-             y = float(bbox.get('y'))
-             width = float(bbox.get('width'))
-             height = float(bbox.get('height'))
-             bboxes_in_image.append([x, y, width, height])
-
-             # get the text in this bbox
-             labels = bbox.find('tag').text
-             labels_in_bboxes.append(labels)
-
-         bboxes.append(bboxes_in_image)
-         image_labels.append(labels_in_bboxes)
-
-     return image_paths, image_sizes, bboxes, image_labels
-
- def visualize_gt_bboxes(image_path, gt_locations, gt_labels):
-     image = cv2.imread(image_path)
-     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-     for gt_location, gt_label in zip(gt_locations, gt_labels):
-         x, y, width, height = map(int, gt_location)
-
-         image = cv2.rectangle(image, (x, y), (x + width, y + height), color=(255, 0, 0), thickness=2)
-         image = cv2.putText(image, gt_label, (x, y - 10), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=3, color=(255, 0, 0), thickness=2)
-
-     plt.imshow(image)
-     plt.axis('off')
-     plt.show()
-
-
- def split_bboxes_from_image(image_paths, image_labels, bboxes, save_dir):
-     """Create a new dataset containing cropped bboxes and their labels.
-
-     Args:
-         image_paths: paths to the source images
-         image_labels: text label of each bbox, per image
-         bboxes: [x, y, width, height] bboxes, per image
-         save_dir: directory the cropped dataset is written to
-     Returns:
-         None
-     """
-     os.makedirs(save_dir, exist_ok=True)
-     os.makedirs('unvalid_images', exist_ok=True)
-
-     bboxes_idx = 0
-     unvalid_bboxes = 0
-     new_labels = []  # list to store labels
-     for image_path, bbox, label in zip(image_paths, bboxes, image_labels):
-         # check for a failed read before using the image
-         # (kept in BGR so cv2.imwrite stores the colors correctly)
-         image = cv2.imread(image_path)
-         if image is None:
-             print(image_path)
-             continue
-
-         for bb, lb in zip(bbox, label):
-             x, y, width, height = map(int, bb)
-
-             # filter out invalid coordinates before cropping
-             if x < 0 or y < 0 or width < 0 or height < 0:
-                 continue
-
-             # filter out text containing special characters
-             if any(c in lb.lower() for c in ('é', 'ñ', '£')):
-                 continue
-
-             cropped_text = image[y:y + height, x:x + width]
-             if cropped_text.size == 0:
-                 continue
-
-             # filter out crops that are too light or too dark
-             if np.mean(cropped_text) < 30 or np.mean(cropped_text) > 230:
-                 cv2.imwrite(os.path.join('unvalid_images', f'unvalid_image{unvalid_bboxes}_{lb}.jpg'), cropped_text)
-                 unvalid_bboxes += 1
-                 continue
-
-             # filter out crops that are too small
-             if width < 10 or height < 10:
-                 cv2.imwrite(os.path.join('unvalid_images', f'unvalid_image{unvalid_bboxes}_{lb}.jpg'), cropped_text)
-                 unvalid_bboxes += 1
-                 continue
-
-             new_image_path = os.path.join(save_dir, f'cropped_image{bboxes_idx}.jpg')
-             cv2.imwrite(new_image_path, cropped_text)
-             new_labels.append(new_image_path + '\t' + lb)
-             bboxes_idx += 1
-
-     # write labels into a text file
-     with open(os.path.join(save_dir, 'labels.txt'), "w") as f:
-         for new_label in new_labels:
-             f.write(f'{new_label}\n')
-
-
- def build_vocab(root_dir):
-     img_paths = []
-     labels = []
-
-     # read labels from the text file
-     with open(os.path.join(root_dir, 'ocr_dataset', 'labels.txt'), "r") as f:
-         for label in f:
-             labels.append(label.strip().split("\t")[1])
-             img_paths.append(label.strip().split("\t")[0])
-
-     # build the vocab from all characters that appear in the labels
-     vocab = set()
-     for label in labels:
-         for char in label:
-             vocab.add(char)
-
-     # extend the vocab with 'z' and append the "blank" character for CTC
-     vocab = "".join(sorted(vocab))
-     blank_char = '@'
-     vocab = vocab + 'z'
-     vocab = vocab + blank_char
-
-     # dictionaries converting char to idx and back (0 is reserved for padding)
-     char_to_idx = {
-         char: idx + 1 for idx, char in enumerate(vocab)
-     }
-     idx_to_char = {
-         idx: char for char, idx in char_to_idx.items()
-     }
-
-     return char_to_idx, idx_to_char
-
- def get_imagepaths_and_labels(root_path):
-     img_paths = []
-     labels = []
-
-     # read labels from the text file
-     with open(os.path.join(root_path, 'ocr_dataset', 'labels.txt'), "r") as f:
-         for label in f:
-             labels.append(label.strip().split("\t")[1])
-             img_paths.append(label.strip().split("\t")[0])
-
-     return img_paths, labels
-
- def encode(label, char_to_idx, labels):
-     # pad every label to the length of the longest label in the dataset
-     max_length_label = np.max([len(lb) for lb in labels])
-
-     encoded_label = torch.tensor(
-         [char_to_idx[char.lower()] for char in label],
-         dtype=torch.int32
-     )
-     label_len = len(encoded_label)
-     length = torch.tensor(
-         label_len,
-         dtype=torch.int32
-     )
-     padded_label = F.pad(
-         encoded_label,
-         (0, max_length_label - label_len),
-         value=0
-     )
-     return padded_label, length
-
- def decode(encoded_label, idx_to_char, char_to_idx, blank_char='@'):
-     # greedy CTC decoding: stop at padding, collapse repeats, drop blanks
-     label = []
-     encoded_label = encoded_label.detach().numpy()
-     for i in range(len(encoded_label)):
-         if encoded_label[i] == 0:
-             break
-         elif (i == 0 or encoded_label[i] != encoded_label[i - 1]) and encoded_label[i] != char_to_idx[blank_char]:
-             label.append(idx_to_char[encoded_label[i]])
-
-     return "".join(label)
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--path", type=str, default=os.getcwd(), help="Path to the root directory")
-     args = parser.parse_args()
-
-     root_path = os.path.join(args.path, 'Dataset')
-
-     image_paths, image_sizes, bboxes, image_labels = extract_data_from_xml(root_path)
-     save_dir = 'Dataset/ocr_dataset'
-     split_bboxes_from_image(image_paths, image_labels, bboxes, save_dir)
-     char_to_idx, idx_to_char = build_vocab(root_path)
-
- if __name__ == '__main__':
-     main()
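
A toy round trip through encode/decode (the three-character vocab below is made up, not the dataset's):

# Sketch: encode/decode round trip with an assumed toy vocab.
char_to_idx = {'@': 1, 'a': 2, 'b': 3}           # 0 is reserved for padding
idx_to_char = {i: c for c, i in char_to_idx.items()}
labels = ['ab', 'ba']                            # padding length = longest label
encoded, length = encode('ab', char_to_idx, labels)
print(encoded, length)                           # tensor([2, 3]) and length 2
print(decode(encoded, idx_to_char, char_to_idx)) # 'ab'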
 
src/Text_Recognization/text_recognization.py DELETED
@@ -1,64 +0,0 @@
- import torch
- import torch.nn as nn
-
- from torchvision.models import resnet101
-
- class BackBone(nn.Module):
-     def __init__(self, num_unfreeze_layers=3):
-         super(BackBone, self).__init__()
-         model = resnet101(weights='IMAGENET1K_V2', progress=True)
-         feature_maps = list(model.children())[:8]
-
-         # AdaptiveAvgPool collapses the height axis: (B, 2048, H', W') -> (B, 2048, 1, W')
-         feature_maps.append(nn.AdaptiveAvgPool2d((1, None)))
-         self.backbone = nn.Sequential(*feature_maps)
-
-         # freeze the whole backbone, then unfreeze the last few parameter tensors
-         # (the original only set requires_grad = True, which is a no-op)
-         for param in self.backbone.parameters():
-             param.requires_grad = False
-         for param in list(self.backbone.parameters())[-(num_unfreeze_layers + 1):]:
-             param.requires_grad = True
-
-     def forward(self, image):
-         return self.backbone(image)
-
- class CRNN(nn.Module):
-     def __init__(self, vocab_size, hidden_size, n_layers, dropout=0.2, num_unfreeze_layers=3):
-         super(CRNN, self).__init__()
-         self.backbone = BackBone(num_unfreeze_layers=num_unfreeze_layers)
-
-         self.mapSeq = nn.Sequential(
-             nn.Linear(2048, 512),
-             nn.ReLU(),
-             nn.Dropout(p=dropout)
-         )
-
-         self.gru = nn.GRU(
-             input_size=512,
-             hidden_size=hidden_size,
-             num_layers=n_layers,
-             bidirectional=True,
-             batch_first=True,
-             dropout=dropout if n_layers > 1 else 0
-         )
-
-         self.layer_norm = nn.LayerNorm(hidden_size * 2)
-
-         # dense output layers
-         self.out = nn.Sequential(
-             nn.Linear(hidden_size * 2, vocab_size),
-             nn.LogSoftmax(dim=2)
-         )
-
-     def forward(self, x):
-         x = self.backbone(x)
-         # (B, 2048, 1, W') -> (B, W', 2048, 1)
-         x = x.permute(0, 3, 1, 2)
-         # flatten -> (B, W', 2048)
-         x = x.view(x.size(0), x.size(1), -1)
-         x = self.mapSeq(x)
-         x, _ = self.gru(x)
-         x = self.layer_norm(x)
-         x = self.out(x)
-         # (B, W', vocab_size) -> (W', B, vocab_size), the (T, N, C) layout CTC expects
-         x = x.permute(1, 0, 2)
-
-         return x
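
A quick shape check: with the (100, 400) inputs used in dataloader.py, ResNet-101 leaves a 4x13 feature map, so the output sequence length works out to 13. A minimal sketch (downloads the ImageNet weights on first run; vocab_size=74 is the extended-vocab setting):

# Sketch: dummy forward pass to confirm the (T, N, C) output layout.
import torch

model = CRNN(vocab_size=74, hidden_size=256, n_layers=3)
x = torch.randn(2, 3, 100, 400)   # dummy batch of two images
print(model(x).shape)             # torch.Size([13, 2, 74]) -> (T, N, C)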
 
src/Text_Recognization/trainer.py DELETED
@@ -1,162 +0,0 @@
- import json
- import sys
- import os
- import argparse
- import torch
- import torch.nn as nn
- import matplotlib.pyplot as plt
- from tqdm import tqdm
-
- sys.path.append(os.getcwd())
-
- from src.Text_Recognization.text_recognization import *
- from src.Text_Recognization.prepare_dataset import *
- from src.Text_Recognization.dataloader import *
-
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- def load_json_config(config_path):
-     with open(config_path, "r") as f:
-         config = json.load(f)
-
-     return config
-
- def evaluate(model, dataloader, criterion, device):
-     model.eval()
-     losses = []
-
-     with torch.no_grad():
-         for images, labels, labels_len in dataloader:
-             images = images.to(device)
-             labels = labels.to(device)
-
-             outputs = model(images)
-             # every sample uses the full sequence length T = outputs.size(0)
-             logits_lens = torch.full(
-                 size=(outputs.size(1),),
-                 fill_value=outputs.size(0),
-                 dtype=torch.long
-             ).to(device)
-
-             loss = criterion(outputs, labels, logits_lens, labels_len)
-             losses.append(loss.item())
-
-     eval_loss = sum(losses) / len(losses)
-     return eval_loss
-
-
- def training_loop(model, train_loader, val_loader, epochs, optimizer, criterion, scheduler, device):
-     model.to(device)
-
-     train_losses = []
-     val_losses = []
-
-     for epoch in range(epochs):
-         model.train()
-
-         batch_losses = []
-         for images, labels, labels_len in tqdm(train_loader):
-             images = images.to(device)
-             labels = labels.to(device)
-
-             optimizer.zero_grad()
-             outputs = model(images)
-
-             logits_lens = torch.full(
-                 size=(outputs.size(1),),
-                 fill_value=outputs.size(0),
-                 dtype=torch.long
-             ).to(device)
-
-             loss = criterion(outputs, labels, logits_lens, labels_len)
-
-             loss.backward()
-             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
-             optimizer.step()
-
-             batch_losses.append(loss.item())
-
-         train_loss = sum(batch_losses) / len(batch_losses)
-         train_losses.append(train_loss)
-
-         val_loss = evaluate(model, val_loader, criterion, device)
-         val_losses.append(val_loss)
-
-         print(f"epoch: {epoch+1}/{epochs}\ttrain_loss: {train_loss}\tval_loss: {val_loss}")
-
-         scheduler.step()
-
-     return train_losses, val_losses
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--root_path', type=str, default=os.getcwd(), help='Path to the root directory')
-     parser.add_argument('--checkpoints_path', type=str, default=os.path.join(os.getcwd(), 'checkpoints'), help='Path to the checkpoint directory')
-
-     args = parser.parse_args()
-     config_path = 'src/config.json'
-     dataset_path = os.path.join(args.root_path, 'Dataset')
-     config = load_json_config(config_path)
-
-     # char/idx dictionaries
-     char_to_idx, idx_to_char = build_vocab(dataset_path)
-
-     # model
-     model = CRNN(vocab_size=config['vocab_size'], hidden_size=config['CRNN']['hidden_size'], n_layers=config['CRNN']['n_layers'])
-
-     # dataloaders
-     train_loader, val_loader, test_loader = get_dataloader()
-
-     # define hyperparameters
-     criterion = nn.CTCLoss(
-         blank=char_to_idx[config['blank_char']],
-         zero_infinity=True,
-         reduction='mean'
-     )
-     optimizer = torch.optim.Adam(
-         model.parameters(),
-         lr=config['CRNN']['learning_rate'],
-         weight_decay=config['CRNN']['weight_decay']
-     )
-     scheduler = torch.optim.lr_scheduler.StepLR(
-         optimizer=optimizer,
-         step_size=config['CRNN']['scheduler_step_size'],
-         gamma=0.1
-     )
-
-     # training loop
-     train_losses, val_losses = training_loop(
-         model=model,
-         train_loader=train_loader,
-         val_loader=val_loader,
-         epochs=config['CRNN']['epochs'],
-         optimizer=optimizer,
-         criterion=criterion,
-         scheduler=scheduler,
-         device=device
-     )
-
-     # save model
-     os.makedirs(args.checkpoints_path, exist_ok=True)
-     os.makedirs(os.path.join(args.checkpoints_path, 'losses'), exist_ok=True)
-     torch.save(model.state_dict(), os.path.join(args.checkpoints_path, 'crnn.pt'))
-
-     # plot losses (axes kept visible so the labels show)
-     fig, axis = plt.subplots(1, 2, figsize=(8, 8))
-     axis[0].plot(train_losses, label='train_loss')
-     axis[0].set_xlabel('Epochs')
-     axis[0].set_ylabel('Loss')
-     axis[0].legend()
-
-     axis[1].plot(val_losses, label='val_loss')
-     axis[1].set_xlabel('Epochs')
-     axis[1].set_ylabel('Loss')
-     axis[1].legend()
-
-     plt.savefig(os.path.join(args.checkpoints_path, 'losses', 'losses.png'))
-
- if __name__ == '__main__':
-     main()
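
For reference, a toy sketch of the tensor shapes nn.CTCLoss consumes in the loop above (all values made up):

# Sketch: the CTC loss call with assumed toy shapes.
import torch
import torch.nn as nn

T, N, C = 13, 2, 74                        # sequence length, batch size, vocab size
log_probs = torch.randn(T, N, C).log_softmax(2)
targets = torch.randint(2, C, (N, 5))      # padded labels; 0 (padding) and 1 (blank) excluded
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.tensor([5, 3])      # true label lengths
loss = nn.CTCLoss(blank=1, zero_infinity=True)(log_probs, targets, input_lengths, target_lengths)
print(loss.item())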
 
src/__pycache__/pipeline_end2end.cpython-312.pyc DELETED
Binary file (6.49 kB)
 
src/__pycache__/predict.cpython-312.pyc DELETED
Binary file (7.9 kB)
 
src/app.py DELETED
@@ -1,46 +0,0 @@
- import gradio as gr
- import numpy as np
- import os
- import cv2
- import sys
-
- sys.path.append(os.getcwd())
- from src.predict import *
-
- def visualize_image(image, detections):
-     for bbox, detected_class, conf, text, _ in detections:
-         x1, y1, x2, y2 = map(int, bbox)
-
-         image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(255, 0, 0), thickness=2)
-         image = cv2.putText(image, f"{conf:.2f} {text}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
-
-     return image
-
- def pipeline(image):
-     image = np.array(image)
-
-     predictions = prediction(image)
-
-     # filter out low-confidence boxes
-     filter_predictions = []
-     for bbox, cls, conf, text, encoded_text in predictions:
-         if conf > 0.7:
-             filter_predictions.append([bbox, cls, conf, text, encoded_text])
-
-     image = visualize_image(image, filter_predictions)
-     return image
-
- demo = gr.Interface(
-     fn=pipeline,
-     inputs=gr.Image(type="pil", label="Input Image"),
-     outputs="image",
-     title="Scene Text Recognition",
-     description="Recognize text in scene images"
- )
-
- demo.launch()
 
src/config.json DELETED
@@ -1,24 +0,0 @@
- {
-     "yolov11": {
-         "epochs": 100,
-         "image_size": 640,
-         "cache": true,
-         "patience": 20,
-         "plots": true
-     },
-     "CRNN": {
-         "batch_size": 64,
-         "epochs": 100,
-         "hidden_size": 256,
-         "n_layers": 3,
-         "dropout": 0.2,
-         "unfreeze_layers": 3,
-         "learning_rate": 5e-4,
-         "weight_decay": 1e-5,
-         "scheduler_step_size": 30
-     },
-     "blank_char": "@",
-     "vocab_size": 73
- }
 
src/predict.py DELETED
@@ -1,137 +0,0 @@
- import os
- import sys
- import json
- import cv2
- import argparse
- import matplotlib.pyplot as plt
-
- import torch
- from torchvision import transforms
-
- sys.path.append(os.getcwd())
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- from ultralytics import YOLO
- from src.Text_Recognization.text_recognization import *
- from src.Text_Recognization.prepare_dataset import *
-
- # config
- def load_json_config(config_path):
-     with open(config_path, "r") as f:
-         config = json.load(f)
-
-     return config
-
- config = load_json_config('src/config.json')
-
- # char/idx dictionaries
- char_to_idx, idx_to_char = build_vocab('Dataset')
-
- # text detection model
- text_det_model_path = 'checkpoints/yolov11m.pt'
- yolo = YOLO(text_det_model_path)
-
- # text recognition model (extended-vocab checkpoint, hence vocab_size=74)
- text_rec_model_path = 'checkpoints/crnn_extend_vocab.pt'
-
- crnn_model = CRNN(vocab_size=74, hidden_size=config['CRNN']['hidden_size'], n_layers=config['CRNN']['n_layers'])
- crnn_model.load_state_dict(torch.load(text_rec_model_path, weights_only=True, map_location=torch.device('cpu')))
-
- def text_detection(image, text_det_model):
-     # ultralytics accepts both file paths and numpy arrays
-     text_det_results = text_det_model(image, verbose=False)[0]
-
-     bboxes = text_det_results.boxes.xyxy.tolist()
-     classes = text_det_results.boxes.cls.tolist()
-     names = text_det_results.names
-     confs = text_det_results.boxes.conf.tolist()
-
-     return bboxes, classes, names, confs
-
- def visualize_gt_bboxes_yolo(image_path, gt_location_yolo):
-     image = cv2.imread(image_path)
-     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-     # boxes are already corner-format (xmin, ymin, xmax, ymax) pixels
-     for data in gt_location_yolo:
-         xmin, ymin, xmax, ymax = map(int, data)
-
-         image = cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color=(255, 0, 0), thickness=2)
-
-     plt.imshow(image)
-     plt.axis('off')
-     plt.show()
-
- def text_recognization(image, data_transforms, text_reg_model, idx_to_char=idx_to_char, device=device):
-     transformed_image = data_transforms(image)
-     transformed_image = transformed_image.unsqueeze(0).to(device)
-     text_reg_model.to(device)
-     text_reg_model.eval()
-
-     with torch.no_grad():
-         preds = text_reg_model(transformed_image)
-         _, idx = torch.max(preds, dim=2)
-         idx = idx.view(-1).cpu()   # decode() converts to numpy, so move off the GPU
-         text = decode(idx, idx_to_char, char_to_idx)
-
-     return text, idx
-
- def visualize_detection(image, detections):
-     plt.figure(figsize=(10, 8))
-
-     for bbox, detected_classes, conf, text, _ in detections:
-         x1, y1, x2, y2 = map(int, bbox)
-
-         image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(255, 0, 0), thickness=2)
-         image = cv2.putText(image, f"{conf:.2f} {text}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
-
-     plt.imshow(image)
-     plt.axis('off')
-     plt.show()
-     return image
-
- data_transforms = transforms.Compose([
-     transforms.ToTensor(),
-     transforms.Resize((100, 400)),
-     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
- ])
-
- def prediction(image, text_det_model=yolo, text_reg_model=crnn_model, idx_to_char=idx_to_char, char_to_idx=char_to_idx, data_transforms=data_transforms, device=device):
-     # detection
-     bboxes, classes, names, confs = text_detection(image, text_det_model)
-
-     # recognition on each detected crop
-     predictions = []
-     for bbox, cls, conf in zip(bboxes, classes, confs):
-         x1, y1, x2, y2 = map(int, bbox)
-         detected_text = image[y1:y2, x1:x2]
-         text, encoded_text = text_recognization(detected_text, data_transforms, text_reg_model, idx_to_char, device)
-         predictions.append((bbox, cls, conf, text, encoded_text))
-         print(bbox, cls, conf, text)
-
-     return predictions
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--image_path', type=str, help='Path to the image')
-     parser.add_argument('--save_path', type=str, default=None, help='Path to save the image')
-     args = parser.parse_args()
-     image_path = args.image_path
-     image = cv2.imread(image_path)
-     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-     detections = prediction(image)
-     image = visualize_detection(image, detections)
-
-     if args.save_path:
-         print(f"Saving the image to {os.path.join(args.save_path, 'predicted_image.jpg')}")
-         cv2.imwrite(os.path.join(args.save_path, 'predicted_image.jpg'), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
-
- if __name__ == '__main__':
-     main()
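
A minimal programmatic use of the pipeline above (the image path is a placeholder):

# Sketch: end-to-end detection + recognition on one image (path is an assumption).
import cv2

image = cv2.cvtColor(cv2.imread('Dataset/some_image.jpg'), cv2.COLOR_BGR2RGB)
for bbox, cls, conf, text, _ in prediction(image):
    print(bbox, f'{conf:.2f}', text)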