Ehtesham123 commited on
Commit
a045aa1
·
verified ·
1 Parent(s): 2663dbe

Upload 54 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. .gradio/flagged/dataset1.csv +2 -0
  3. __pycache__/parseq_recognize.cpython-311.pyc +0 -0
  4. __pycache__/yolo_detect.cpython-311.pyc +0 -0
  5. app.py +34 -0
  6. parseq_recognize.py +18 -0
  7. pretrained_model/parseq.ckpt +3 -0
  8. pretrained_model/yolo_obb.pt +3 -0
  9. requirements.txt +8 -0
  10. samples/T1.png +3 -0
  11. samples/T2.png +3 -0
  12. samples/image_0004.png +3 -0
  13. samples/image_0082.png +3 -0
  14. strhub/__init__.py +0 -0
  15. strhub/__pycache__/__init__.cpython-311.pyc +0 -0
  16. strhub/__pycache__/__init__.cpython-312.pyc +0 -0
  17. strhub/data/__init__.py +0 -0
  18. strhub/data/__pycache__/__init__.cpython-311.pyc +0 -0
  19. strhub/data/__pycache__/__init__.cpython-312.pyc +0 -0
  20. strhub/data/__pycache__/aa_overrides.cpython-312.pyc +0 -0
  21. strhub/data/__pycache__/augment.cpython-312.pyc +0 -0
  22. strhub/data/__pycache__/dataset.cpython-311.pyc +0 -0
  23. strhub/data/__pycache__/dataset.cpython-312.pyc +0 -0
  24. strhub/data/__pycache__/module.cpython-311.pyc +0 -0
  25. strhub/data/__pycache__/module.cpython-312.pyc +0 -0
  26. strhub/data/__pycache__/utils.cpython-311.pyc +0 -0
  27. strhub/data/__pycache__/utils.cpython-312.pyc +0 -0
  28. strhub/data/aa_overrides.py +46 -0
  29. strhub/data/augment.py +112 -0
  30. strhub/data/dataset.py +148 -0
  31. strhub/data/module.py +158 -0
  32. strhub/data/utils.py +150 -0
  33. strhub/models/__init__.py +0 -0
  34. strhub/models/__pycache__/__init__.cpython-311.pyc +0 -0
  35. strhub/models/__pycache__/__init__.cpython-312.pyc +0 -0
  36. strhub/models/__pycache__/base.cpython-311.pyc +0 -0
  37. strhub/models/__pycache__/base.cpython-312.pyc +0 -0
  38. strhub/models/__pycache__/utils.cpython-311.pyc +0 -0
  39. strhub/models/__pycache__/utils.cpython-312.pyc +0 -0
  40. strhub/models/base.py +221 -0
  41. strhub/models/modules.py +20 -0
  42. strhub/models/parseq/__init__.py +0 -0
  43. strhub/models/parseq/__pycache__/__init__.cpython-311.pyc +0 -0
  44. strhub/models/parseq/__pycache__/__init__.cpython-312.pyc +0 -0
  45. strhub/models/parseq/__pycache__/model.cpython-311.pyc +0 -0
  46. strhub/models/parseq/__pycache__/model.cpython-312.pyc +0 -0
  47. strhub/models/parseq/__pycache__/modules.cpython-311.pyc +0 -0
  48. strhub/models/parseq/__pycache__/modules.cpython-312.pyc +0 -0
  49. strhub/models/parseq/__pycache__/system.cpython-311.pyc +0 -0
  50. strhub/models/parseq/__pycache__/system.cpython-312.pyc +0 -0
.gitattributes CHANGED
@@ -37,3 +37,7 @@ ocr_demo/samples/image_0004.png filter=lfs diff=lfs merge=lfs -text
37
  ocr_demo/samples/image_0082.png filter=lfs diff=lfs merge=lfs -text
38
  ocr_demo/samples/T1.png filter=lfs diff=lfs merge=lfs -text
39
  ocr_demo/samples/T2.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
37
  ocr_demo/samples/image_0082.png filter=lfs diff=lfs merge=lfs -text
38
  ocr_demo/samples/T1.png filter=lfs diff=lfs merge=lfs -text
39
  ocr_demo/samples/T2.png filter=lfs diff=lfs merge=lfs -text
40
+ samples/image_0004.png filter=lfs diff=lfs merge=lfs -text
41
+ samples/image_0082.png filter=lfs diff=lfs merge=lfs -text
42
+ samples/T1.png filter=lfs diff=lfs merge=lfs -text
43
+ samples/T2.png filter=lfs diff=lfs merge=lfs -text
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Choose a sample image,Recognized Text,timestamp
2
+ ,,2025-08-07 15:20:31.167102
__pycache__/parseq_recognize.cpython-311.pyc ADDED
Binary file (2.07 kB). View file
 
__pycache__/yolo_detect.cpython-311.pyc ADDED
Binary file (4 kB). View file
 
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from yolo_detect import OBBPredictor
from parseq_recognize import TextRecognizer
import os

# Initialize models.
# BUGFIX: use os.path.join instead of hard-coded backslashes
# ("pretrained_model\\yolo_obb.pt") — backslash paths only work on Windows
# and break on POSIX hosts (e.g. Hugging Face Spaces run Linux).
yolo_model_path = os.path.join("pretrained_model", "yolo_obb.pt")
parseq_ckpt_path = os.path.join("pretrained_model", "parseq.ckpt")

detector = OBBPredictor(yolo_model_path)
recognizer = TextRecognizer(parseq_ckpt_path, device='cpu')  # or 'cuda' if on GPU


# ==== OCR pipeline function ====
def run_pipeline(image):
    """Detect text regions with the YOLO-OBB model, then recognize each crop with PARSeq.

    Args:
        image: PIL image supplied by the Gradio input component.

    Returns:
        A newline-separated, numbered list of recognized strings, or a
        fallback message when no text region is detected.
    """
    crops = detector.predict(image)
    recognized_texts = [recognizer.recognize(crop) for crop in crops]
    final_output = "\n".join(f"{i + 1}. {txt}" for i, txt in enumerate(recognized_texts))
    return final_output if recognized_texts else "No text detected."


# ==== Get sample image paths ====
# Sorted so the example gallery has a stable, deterministic order
# (os.listdir order is filesystem-dependent).
example_images = sorted(
    os.path.join("samples", f)
    for f in os.listdir("samples")
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# ==== Gradio app with ONLY sample images ====
demo = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Image(type="pil", label="Choose a sample image"),
    outputs=gr.Textbox(label="Recognized Text"),
    examples=[[img] for img in example_images],  # list of lists required
    title="Two-Stage OCR Network for Aero Engine Blades Serial Number",
    description="Choose only one of the predefined image. The model will detect text regions and recognize their contents."
)

if __name__ == "__main__":
    demo.launch()
parseq_recognize.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from strhub.data.module import SceneTextDataModule
4
+ from strhub.models.utils import load_from_checkpoint
5
+
6
class TextRecognizer:
    """Inference-only wrapper around a PARSeq checkpoint.

    Loads the model once in eval mode and exposes :meth:`recognize`
    for single-crop text recognition.
    """

    def __init__(self, ckpt_path, device='cpu'):
        self.device = device
        # Model stays in eval mode on the requested device; no training here.
        self.parseq = load_from_checkpoint(ckpt_path).eval().to(device)
        # Preprocessing pipeline matching the checkpoint's expected input size.
        self.img_transform = SceneTextDataModule.get_transform(self.parseq.hparams.img_size)

    def recognize(self, image_pil):
        """Return the text string decoded from a single PIL image crop."""
        batch = self.img_transform(image_pil).unsqueeze(0).to(self.device)
        with torch.no_grad():
            probs = self.parseq(batch).softmax(-1)
            labels, _ = self.parseq.tokenizer.decode(probs)
        return labels[0]
pretrained_model/parseq.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c95fbe3efe9c59f71e7f75761b7b70b5ed5097e7f502cf138d6eded042f7c073
3
+ size 96584214
pretrained_model/yolo_obb.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:739cd8cd3f49a3f466cbdee965dcc3720331404d0de5787881bb8a95992dd6e1
3
+ size 5715964
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ ultralytics
3
+ torch
4
+ torchvision
5
+ strhub
6
+ Pillow
7
+ opencv-python
8
+ numpy
samples/T1.png ADDED

Git LFS Details

  • SHA256: e342674a1d65afa411806908fc186db15058e5cb76d89b64bf3b56117ce1622f
  • Pointer size: 132 Bytes
  • Size of remote file: 9.28 MB
samples/T2.png ADDED

Git LFS Details

  • SHA256: 8256e3156e5a16973574bb81879269128c0c7728000df15f7136eb65e0b1544a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.4 MB
samples/image_0004.png ADDED

Git LFS Details

  • SHA256: 2853354cd82fa5ae60c224a44c3d8fc80436c1d806587f0abd47df763460ad3e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.02 MB
samples/image_0082.png ADDED

Git LFS Details

  • SHA256: e476ee9ab4472b41e990a4c2f0d194e2f4152c4234d0897b80bb113d1ea5d50b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
strhub/__init__.py ADDED
File without changes
strhub/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
strhub/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes). View file
 
strhub/data/__init__.py ADDED
File without changes
strhub/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (171 Bytes). View file
 
strhub/data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (159 Bytes). View file
 
strhub/data/__pycache__/aa_overrides.cpython-312.pyc ADDED
Binary file (1.67 kB). View file
 
strhub/data/__pycache__/augment.cpython-312.pyc ADDED
Binary file (5.2 kB). View file
 
strhub/data/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (8.05 kB). View file
 
strhub/data/__pycache__/dataset.cpython-312.pyc ADDED
Binary file (7.07 kB). View file
 
strhub/data/__pycache__/module.cpython-311.pyc ADDED
Binary file (7.19 kB). View file
 
strhub/data/__pycache__/module.cpython-312.pyc ADDED
Binary file (6.65 kB). View file
 
strhub/data/__pycache__/utils.cpython-311.pyc ADDED
Binary file (10.9 kB). View file
 
strhub/data/__pycache__/utils.cpython-312.pyc ADDED
Binary file (8.84 kB). View file
 
strhub/data/aa_overrides.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Extends default ops to accept optional parameters."""
17
+ from functools import partial
18
+
19
+ from timm.data.auto_augment import _LEVEL_DENOM, LEVEL_TO_ARG, NAME_TO_OP, _randomly_negate, rotate
20
+
21
+
22
def rotate_expand(img, degrees, **kwargs):
    """Rotate operation with ``expand=True`` to avoid cutting off the characters."""
    return rotate(img, degrees, **dict(kwargs, expand=True))
26
+
27
+
28
def _level_to_arg(level, hparams, key, default):
    """Scale *level* by the hparam-configurable magnitude and randomly negate it."""
    magnitude = hparams.get(key, default)
    scaled = _randomly_negate(level / _LEVEL_DENOM * magnitude)
    return (scaled,)
33
+
34
+
35
def apply():
    """Install the overrides into timm's AutoAugment registries.

    Replaces the stock Rotate op with the expand-aware variant and makes
    Rotate/Shear/Translate magnitudes configurable via hparams keys.
    """
    NAME_TO_OP['Rotate'] = rotate_expand
    LEVEL_TO_ARG.update({
        'Rotate': partial(_level_to_arg, key='rotate_deg', default=30.0),
        'ShearX': partial(_level_to_arg, key='shear_x_pct', default=0.3),
        'ShearY': partial(_level_to_arg, key='shear_y_pct', default=0.3),
        'TranslateXRel': partial(_level_to_arg, key='translate_x_pct', default=0.45),
        'TranslateYRel': partial(_level_to_arg, key='translate_y_pct', default=0.45),
    })
strhub/data/augment.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from functools import partial
17
+
18
+ import imgaug.augmenters as iaa
19
+ import numpy as np
20
+ from PIL import Image, ImageFilter
21
+
22
+ from timm.data import auto_augment
23
+
24
+ from strhub.data import aa_overrides
25
+
26
aa_overrides.apply()  # install the custom Rotate/Shear/Translate overrides into timm

# Cache of instantiated augmentation ops, keyed by op name + parameter value.
_OP_CACHE = {}


def _get_op(key, factory):
    """Return the cached op for *key*, creating it with *factory* on first use."""
    if key not in _OP_CACHE:
        _OP_CACHE[key] = factory()
    return _OP_CACHE[key]
38
+
39
+
40
def _get_param(level, img, max_dim_factor, min_level=1):
    """Clamp *level* to a cap proportional to the image's larger dimension, rounded to int."""
    cap = max(min_level, max_dim_factor * max(img.size))
    return round(min(level, cap))
43
+
44
+
45
def gaussian_blur(img, radius, **__):
    """Apply PIL Gaussian blur with a radius clamped relative to the image size."""
    eff_radius = _get_param(radius, img, 0.02)
    blur = _get_op('gaussian_blur_' + str(eff_radius), lambda: ImageFilter.GaussianBlur(eff_radius))
    return img.filter(blur)
50
+
51
+
52
def motion_blur(img, k, **__):
    """Apply imgaug motion blur with an odd kernel size clamped to the image size."""
    kernel = _get_param(k, img, 0.08, 3) | 1  # bin to odd values
    blur = _get_op('motion_blur_' + str(kernel), lambda: iaa.MotionBlur(kernel))
    return Image.fromarray(blur(image=np.asarray(img)))
57
+
58
+
59
def gaussian_noise(img, scale, **_):
    """Add imgaug additive Gaussian noise with a scale clamped to the image size."""
    # NOTE(review): the odd-binning (`| 1`) mirrors the blur ops; its benefit for a
    # noise scale is unclear — kept as-is to preserve behavior.
    eff_scale = _get_param(scale, img, 0.25) | 1  # bin to odd values
    noise = _get_op('gaussian_noise_' + str(eff_scale), lambda: iaa.AdditiveGaussianNoise(scale=eff_scale))
    return Image.fromarray(noise(image=np.asarray(img)))
64
+
65
+
66
def poisson_noise(img, lam, **_):
    """Add imgaug additive Poisson noise with lambda clamped to the image size."""
    eff_lam = _get_param(lam, img, 0.2) | 1  # bin to odd values
    noise = _get_op('poisson_noise_' + str(eff_lam), lambda: iaa.AdditivePoissonNoise(eff_lam))
    return Image.fromarray(noise(image=np.asarray(img)))
71
+
72
+
73
def _level_to_arg(level, _hparams, max):  # noqa: A002 — `max` is fixed by keyword callers via partial()
    """Linearly map *level* from [0, _LEVEL_DENOM] onto [0, max]."""
    return (max * level / auto_augment._LEVEL_DENOM,)
76
+
77
+
78
# Build the RandAugment op list from timm's "increasing" transform set,
# then register the custom blur/noise ops defined above. This mutates
# timm's module-level registries, so it must run at import time, before
# rand_augment_ops() is called.
_RAND_TRANSFORMS = auto_augment._RAND_INCREASING_TRANSFORMS.copy()
_RAND_TRANSFORMS.remove('SharpnessIncreasing')  # remove, interferes with *blur ops
_RAND_TRANSFORMS.extend([
    'GaussianBlur',
    # 'MotionBlur',
    # 'GaussianNoise',
    'PoissonNoise',
])
# Map the generic augmentation level to each op's parameter range
# (e.g. blur radius up to 4, Poisson lambda up to 40).
auto_augment.LEVEL_TO_ARG.update({
    'GaussianBlur': partial(_level_to_arg, max=4),
    'MotionBlur': partial(_level_to_arg, max=20),
    'GaussianNoise': partial(_level_to_arg, max=0.1 * 255),
    'PoissonNoise': partial(_level_to_arg, max=40),
})
# Register the op implementations under the names used in _RAND_TRANSFORMS.
auto_augment.NAME_TO_OP.update({
    'GaussianBlur': gaussian_blur,
    'MotionBlur': motion_blur,
    'GaussianNoise': gaussian_noise,
    'PoissonNoise': poisson_noise,
})
98
+
99
+
100
def rand_augment_transform(magnitude=5, num_layers=3):
    """Build a RandAugment policy tuned for scene-text images.

    Args:
        magnitude: overall augmentation strength passed to timm.
        num_layers: number of ops applied per image.

    Returns:
        A timm ``RandAugment`` callable usable as a torchvision transform.
    """
    # These are tuned for magnitude=5, which means that effective magnitudes
    # are half of these values.
    hparams = dict(
        rotate_deg=30,
        shear_x_pct=0.9,
        shear_y_pct=0.2,
        translate_x_pct=0.10,
        translate_y_pct=0.30,
    )
    ra_ops = auto_augment.rand_augment_ops(magnitude, hparams=hparams, transforms=_RAND_TRANSFORMS)
    # Uniform weights disable replacement in random selection
    # (i.e. avoid applying the same op twice).
    uniform_weights = [1.0 / len(ra_ops)] * len(ra_ops)
    return auto_augment.RandAugment(ra_ops, num_layers, uniform_weights)
strhub/data/dataset.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import glob
16
+ import io
17
+ import logging
18
+ import unicodedata
19
+ from pathlib import Path, PurePath
20
+ from typing import Callable, Optional, Union
21
+
22
+ import lmdb
23
+ from PIL import Image
24
+
25
+ from torch.utils.data import ConcatDataset, Dataset
26
+
27
+ from strhub.data.utils import CharsetAdapter
28
+
29
+ log = logging.getLogger(__name__)
30
+
31
+
32
def build_tree_dataset(root: Union[PurePath, str], *args, **kwargs):
    """Recursively discover LMDB datasets under *root* and concatenate them.

    Every directory containing a ``data.mdb`` file becomes one LmdbDataset;
    extra positional/keyword arguments are forwarded to LmdbDataset.
    """
    kwargs.pop('root', None)  # prevent 'root' from being passed via kwargs
    root = Path(root).absolute()
    log.info(f'dataset root:\t{root}')
    datasets = []
    for mdb_path in glob.glob(str(root / '**/data.mdb'), recursive=True):
        ds_dir = Path(mdb_path).parent
        dataset = LmdbDataset(str(ds_dir.absolute()), *args, **kwargs)
        log.info(f'\tlmdb:\t{str(ds_dir.relative_to(root))}\tnum samples: {len(dataset)}')
        datasets.append(dataset)
    return ConcatDataset(datasets)
48
+
49
+
50
class LmdbDataset(Dataset):
    """Dataset interface to an LMDB database.

    It supports both labelled and unlabelled datasets. For unlabelled datasets, the image index itself is returned
    as the label. Unicode characters are normalized by default. Case-sensitivity is inferred from the charset.
    Labels are transformed according to the charset.
    """

    def __init__(
        self,
        root: str,
        charset: str,
        max_label_len: int,
        min_image_dim: int = 0,
        remove_whitespace: bool = True,
        normalize_unicode: bool = True,
        unlabelled: bool = False,
        transform: Optional[Callable] = None,
    ):
        self._env = None  # opened lazily via the `env` property (safe for DataLoader workers)
        self.root = root
        self.unlabelled = unlabelled
        self.transform = transform
        # BUGFIX: `min_image_dim` was never stored on the instance, yet the original
        # `_preprocess_labels` read `self.min_image_dim`, raising AttributeError
        # whenever min_image_dim > 0.
        self.min_image_dim = min_image_dim
        self.labels = []
        self.filtered_index_list = []
        self.num_samples = self._preprocess_labels(
            charset, remove_whitespace, normalize_unicode, max_label_len, min_image_dim
        )

    def __del__(self):
        # Release the LMDB environment when the dataset is garbage-collected.
        if self._env is not None:
            self._env.close()
            self._env = None

    def _create_env(self):
        """Open the LMDB environment read-only, tuned for many small random reads."""
        return lmdb.open(
            self.root, max_readers=1, readonly=True, create=False, readahead=False, meminit=False, lock=False
        )

    @property
    def env(self):
        """Lazily-opened LMDB environment."""
        if self._env is None:
            self._env = self._create_env()
        return self._env

    def _preprocess_labels(self, charset, remove_whitespace, normalize_unicode, max_label_len, min_image_dim):
        """Scan all labels once, filter unusable samples, and return the usable sample count."""
        charset_adapter = CharsetAdapter(charset)
        with self._create_env() as env, env.begin() as txn:
            num_samples = int(txn.get('num-samples'.encode()))
            if self.unlabelled:
                return num_samples
            for index in range(num_samples):
                index += 1  # lmdb starts with 1
                label_key = f'label-{index:09d}'.encode()
                label = txn.get(label_key).decode()
                # Normally, whitespace is removed from the labels.
                if remove_whitespace:
                    label = ''.join(label.split())
                # Normalize unicode composites (if any) and convert to compatible ASCII characters
                if normalize_unicode:
                    label = unicodedata.normalize('NFKD', label).encode('ascii', 'ignore').decode()
                # Filter by length before removing unsupported characters. The original label might be too long.
                if len(label) > max_label_len:
                    continue
                label = charset_adapter(label)
                # We filter out samples which don't contain any supported characters
                if not label:
                    continue
                # Filter images that are too small.
                if min_image_dim > 0:
                    img_key = f'image-{index:09d}'.encode()
                    buf = io.BytesIO(txn.get(img_key))
                    w, h = Image.open(buf).size
                    # BUGFIX: compare against the local parameter; the original read
                    # `self.min_image_dim`, which did not exist at this point.
                    if w < min_image_dim or h < min_image_dim:
                        continue
                self.labels.append(label)
                self.filtered_index_list.append(index)
        return len(self.labels)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        """Return ``(image, label)``; for unlabelled data the label is the sample index."""
        if self.unlabelled:
            label = index
        else:
            label = self.labels[index]
            index = self.filtered_index_list[index]

        img_key = f'image-{index:09d}'.encode()
        with self.env.begin() as txn:
            imgbuf = txn.get(img_key)
        buf = io.BytesIO(imgbuf)
        img = Image.open(buf).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        return img, label
strhub/data/module.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from pathlib import PurePath
17
+ from typing import Callable, Optional, Sequence
18
+
19
+ from torch.utils.data import DataLoader
20
+ from torchvision import transforms as T
21
+
22
+ import pytorch_lightning as pl
23
+
24
+ from .dataset import LmdbDataset, build_tree_dataset
25
+
26
+
27
class SceneTextDataModule(pl.LightningDataModule):
    """Lightning data module wiring LMDB scene-text datasets to DataLoaders.

    Owns the image preprocessing pipeline (see :meth:`get_transform`) and
    lazily builds the train/val datasets from directory trees of LMDBs.
    """

    # Standard STR benchmark split names; each maps to a directory under <root_dir>/test.
    TEST_BENCHMARK_SUB = ('IIIT5k', 'SVT', 'IC13_857', 'IC15_1811', 'SVTP', 'CUTE80')
    TEST_BENCHMARK = ('IIIT5k', 'SVT', 'IC13_1015', 'IC15_2077', 'SVTP', 'CUTE80')
    TEST_NEW = ('ArT', 'COCOv1.4', 'Uber')
    TEST_CUSTOM = ("blade",)  # presumably the project's engine-blade split — TODO confirm
    TEST_ALL = tuple(set(TEST_BENCHMARK_SUB + TEST_BENCHMARK + TEST_NEW))

    def __init__(
        self,
        root_dir: str,
        train_dir: str,
        img_size: Sequence[int],
        max_label_length: int,
        charset_train: str,
        charset_test: str,
        batch_size: int,
        num_workers: int,
        augment: bool,
        remove_whitespace: bool = True,
        normalize_unicode: bool = True,
        min_image_dim: int = 0,
        rotation: int = 0,
        collate_fn: Optional[Callable] = None,
    ):
        """Store configuration; datasets are built lazily on first property access."""
        super().__init__()
        self.root_dir = root_dir
        self.train_dir = train_dir
        self.img_size = tuple(img_size)
        self.max_label_length = max_label_length
        self.charset_train = charset_train
        self.charset_test = charset_test
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.augment = augment
        self.remove_whitespace = remove_whitespace
        self.normalize_unicode = normalize_unicode
        self.min_image_dim = min_image_dim
        self.rotation = rotation
        self.collate_fn = collate_fn
        # Lazily-built dataset caches (see train_dataset / val_dataset properties).
        self._train_dataset = None
        self._val_dataset = None

    @staticmethod
    def get_transform(img_size: tuple[int], augment: bool = False, rotation: int = 0):
        """Build the preprocessing pipeline: optional RandAugment and rotation,
        then bicubic resize to ``img_size``, ToTensor, and normalization
        mapping [0, 1] to [-1, 1]."""
        transforms = []
        if augment:
            # Imported lazily so inference-only use does not pull in the augmentation stack.
            from .augment import rand_augment_transform

            transforms.append(rand_augment_transform())
        if rotation:
            transforms.append(lambda img: img.rotate(rotation, expand=True))
        transforms.extend([
            T.Resize(img_size, T.InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(0.5, 0.5),  # (x - 0.5) / 0.5 -> [-1, 1]
        ])
        return T.Compose(transforms)

    @property
    def train_dataset(self):
        """Concatenated LMDBs under <root_dir>/train/<train_dir>, built once with augmentation."""
        if self._train_dataset is None:
            transform = self.get_transform(self.img_size, self.augment)
            root = PurePath(self.root_dir, 'train', self.train_dir)
            self._train_dataset = build_tree_dataset(
                root,
                self.charset_train,
                self.max_label_length,
                self.min_image_dim,
                self.remove_whitespace,
                self.normalize_unicode,
                transform=transform,
            )
        return self._train_dataset

    @property
    def val_dataset(self):
        """Concatenated LMDBs under <root_dir>/val, built once without augmentation."""
        if self._val_dataset is None:
            transform = self.get_transform(self.img_size)
            root = PurePath(self.root_dir, 'val')
            self._val_dataset = build_tree_dataset(
                root,
                self.charset_test,
                self.max_label_length,
                self.min_image_dim,
                self.remove_whitespace,
                self.normalize_unicode,
                transform=transform,
            )
        return self._val_dataset

    def train_dataloader(self):
        """Shuffled training DataLoader."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            persistent_workers=self.num_workers > 0,
            pin_memory=True,
            collate_fn=self.collate_fn,
        )

    def val_dataloader(self):
        """Validation DataLoader (no shuffling)."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=self.num_workers > 0,
            pin_memory=True,
            collate_fn=self.collate_fn,
        )

    def test_dataloaders(self, subset):
        """Return ``{split name: DataLoader}`` for each test split in *subset*.

        Each split is an LMDB directory under <root_dir>/test; the eval transform
        applies the configured rotation but no augmentation.
        """
        transform = self.get_transform(self.img_size, rotation=self.rotation)
        root = PurePath(self.root_dir, 'test')
        datasets = {
            s: LmdbDataset(
                str(root / s),
                self.charset_test,
                self.max_label_length,
                self.min_image_dim,
                self.remove_whitespace,
                self.normalize_unicode,
                transform=transform,
            )
            for s in subset
        }
        return {
            k: DataLoader(
                v, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True, collate_fn=self.collate_fn
            )
            for k, v in datasets.items()
        }
strhub/data/utils.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import re
17
+ from abc import ABC, abstractmethod
18
+ from itertools import groupby
19
+ from typing import Optional
20
+
21
+ import torch
22
+ from torch import Tensor
23
+ from torch.nn.utils.rnn import pad_sequence
24
+
25
+
26
class CharsetAdapter:
    """Transforms labels according to the target charset.

    A charset that is entirely lower- (or upper-) case implies case folding;
    any character outside the charset is stripped from labels.
    """

    def __init__(self, target_charset) -> None:
        super().__init__()
        self.lowercase_only = target_charset == target_charset.lower()
        self.uppercase_only = target_charset == target_charset.upper()
        # Matches every character NOT present in the target charset.
        self.unsupported = re.compile(f'[^{re.escape(target_charset)}]')

    def __call__(self, label):
        """Case-fold if implied by the charset, then drop unsupported characters."""
        if self.lowercase_only:
            label = label.lower()
        elif self.uppercase_only:
            label = label.upper()
        # Remove unsupported characters
        return self.unsupported.sub('', label)
43
+
44
+
45
class BaseTokenizer(ABC):
    """Bidirectional mapping between label strings and integer token ids."""

    def __init__(self, charset: str, specials_first: tuple = (), specials_last: tuple = ()) -> None:
        # id -> token lookup table; special tokens bracket the charset characters.
        self._itos = specials_first + tuple(charset) + specials_last
        # token -> id reverse mapping.
        self._stoi = {tok: idx for idx, tok in enumerate(self._itos)}

    def __len__(self):
        """Vocabulary size, special tokens included."""
        return len(self._itos)

    def _tok2ids(self, tokens: str) -> list[int]:
        """Map each token of *tokens* to its integer id."""
        return [self._stoi[tok] for tok in tokens]

    def _ids2tok(self, token_ids: list[int], join: bool = True) -> str:
        """Map ids back to tokens; joined into a single string unless *join* is False."""
        decoded = [self._itos[idx] for idx in token_ids]
        return ''.join(decoded) if join else decoded

    @abstractmethod
    def encode(self, labels: list[str], device: Optional[torch.device] = None) -> Tensor:
        """Encode a batch of labels to a representation suitable for the model.

        Args:
            labels: List of labels. Each can be of arbitrary length.
            device: Create tensor on this device.

        Returns:
            Batched tensor representation padded to the max label length. Shape: N, L
        """
        raise NotImplementedError

    @abstractmethod
    def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
        """Internal method which performs the necessary filtering prior to decoding."""
        raise NotImplementedError

    def decode(self, token_dists: Tensor, raw: bool = False) -> tuple[list[str], list[Tensor]]:
        """Decode a batch of token distributions.

        Args:
            token_dists: softmax probabilities over the token distribution. Shape: N, L, C
            raw: return unprocessed labels (will return list of list of strings)

        Returns:
            list of string labels (arbitrary length) and
            their corresponding sequence probabilities as a list of Tensors
        """
        batch_tokens, batch_probs = [], []
        for dist in token_dists:
            probs, ids = dist.max(-1)  # greedy selection per position
            if not raw:
                probs, ids = self._filter(probs, ids)
            batch_tokens.append(self._ids2tok(ids, not raw))
            batch_probs.append(probs)
        return batch_tokens, batch_probs
100
+
101
+
102
class Tokenizer(BaseTokenizer):
    """Autoregressive tokenizer with BOS/EOS/PAD special tokens (EOS at id 0)."""

    BOS = '[B]'
    EOS = '[E]'
    PAD = '[P]'

    def __init__(self, charset: str) -> None:
        # EOS takes index 0; BOS and PAD are appended after the charset.
        first = (self.EOS,)
        last = (self.BOS, self.PAD)
        super().__init__(charset, first, last)
        self.eos_id, self.bos_id, self.pad_id = (self._stoi[s] for s in first + last)

    def encode(self, labels: list[str], device: Optional[torch.device] = None) -> Tensor:
        """Encode labels as ``[BOS] tokens [EOS]``, right-padded with PAD. Shape: N, L."""
        sequences = []
        for label in labels:
            ids = [self.bos_id, *self._tok2ids(label), self.eos_id]
            sequences.append(torch.as_tensor(ids, dtype=torch.long, device=device))
        return pad_sequence(sequences, batch_first=True, padding_value=self.pad_id)

    def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
        """Truncate at the first EOS; keep the EOS probability when it exists."""
        ids = ids.tolist()
        try:
            eos_pos = ids.index(self.eos_id)
        except ValueError:
            eos_pos = len(ids)  # no EOS emitted; nothing to truncate
        # Drop EOS and everything after it from the ids, but include the
        # probability assigned to EOS itself in the confidence slice.
        return probs[: eos_pos + 1], ids[:eos_pos]
131
+
132
class CTCTokenizer(BaseTokenizer):
    """CTC tokenizer with a single BLANK special token."""

    BLANK = '[B]'

    def __init__(self, charset: str) -> None:
        # BLANK uses index == 0 by default
        super().__init__(charset, specials_first=(self.BLANK,))
        self.blank_id = self._stoi[self.BLANK]

    def encode(self, labels: list[str], device: Optional[torch.device] = None) -> Tensor:
        """Padded (not concatenated) encoding; avoids CUDNN's CTC implementation."""
        sequences = [
            torch.as_tensor(self._tok2ids(label), dtype=torch.long, device=device) for label in labels
        ]
        return pad_sequence(sequences, batch_first=True, padding_value=self.blank_id)

    def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
        """Best-path CTC decoding: collapse repeated ids, then drop BLANKs."""
        collapsed = [key for key, _ in groupby(ids.tolist())]  # remove duplicate tokens
        cleaned = [i for i in collapsed if i != self.blank_id]  # remove BLANKs
        # `probs` is just pass-through since all positions are considered part of the path
        return probs, cleaned
strhub/models/__init__.py ADDED
File without changes
strhub/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (173 Bytes). View file
 
strhub/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes). View file
 
strhub/models/__pycache__/base.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
strhub/models/__pycache__/base.cpython-312.pyc ADDED
Binary file (11.6 kB). View file
 
strhub/models/__pycache__/utils.cpython-311.pyc ADDED
Binary file (9.09 kB). View file
 
strhub/models/__pycache__/utils.cpython-312.pyc ADDED
Binary file (8.09 kB). View file
 
strhub/models/base.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scene Text Recognition Model Hub
2
+ # Copyright 2022 Darwin Bautista
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from abc import ABC, abstractmethod
18
+ from dataclasses import dataclass
19
+ from typing import Optional
20
+
21
+ from nltk import edit_distance
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from torch import Tensor
26
+ from torch.optim import Optimizer
27
+ from torch.optim.lr_scheduler import OneCycleLR
28
+
29
+ import pytorch_lightning as pl
30
+ from pytorch_lightning.utilities.types import STEP_OUTPUT
31
+ from timm.optim import create_optimizer_v2
32
+
33
+ from strhub.data.utils import BaseTokenizer, CharsetAdapter, CTCTokenizer, Tokenizer
34
+
35
+
36
@dataclass
class BatchResult:
    """Aggregated evaluation metrics for one batch."""

    num_samples: int  # samples evaluated in this batch
    correct: int  # exactly-correct predictions
    ned: float  # sum of normalized edit distances (ICDAR 2019 definition)
    confidence: float  # sum of per-sample sequence confidences
    label_length: int  # sum of predicted label lengths
    loss: Tensor  # mean batch loss (None at test time)
    loss_numel: int  # element count the loss was computed from (None at test time)


# One dict per evaluation step, each holding that step's BatchResult under 'output'.
EPOCH_OUTPUT = list[dict[str, BatchResult]]
48
+
49
+
50
class BaseSystem(pl.LightningModule, ABC):
    """Shared Lightning scaffolding for STR systems: optimization setup and evaluation."""

    def __init__(
        self,
        tokenizer: BaseTokenizer,
        charset_test: str,
        batch_size: int,
        lr: float,
        warmup_pct: float,
        weight_decay: float,
    ) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        # Restricts predictions to the (possibly narrower) test-time charset.
        self.charset_adapter = CharsetAdapter(charset_test)
        self.batch_size = batch_size
        self.lr = lr
        self.warmup_pct = warmup_pct
        self.weight_decay = weight_decay
        self.outputs: EPOCH_OUTPUT = []

    @abstractmethod
    def forward(self, images: Tensor, max_length: Optional[int] = None) -> Tensor:
        """Inference.

        Args:
            images: Batch of images. Shape: N, Ch, H, W
            max_length: Max sequence length of the output. If None, will use default.

        Returns:
            logits: N, L, C (L = sequence length, C = number of classes,
            typically len(charset_train) + num specials)
        """
        raise NotImplementedError

    @abstractmethod
    def forward_logits_loss(self, images: Tensor, labels: list[str]) -> tuple[Tensor, Tensor, int]:
        """Like forward(), but also computes the loss (calls forward() internally).

        Args:
            images: Batch of images. Shape: N, Ch, H, W
            labels: Text labels of the images

        Returns:
            logits: N, L, C (see forward())
            loss: mean loss for the batch
            loss_numel: number of elements the loss was calculated from
        """
        raise NotImplementedError

    def configure_optimizers(self):
        accum = self.trainer.accumulate_grad_batches
        # Linear scaling keeps the effective LR constant regardless of the DDP device count.
        scaled_lr = accum * math.sqrt(self.trainer.num_devices) * self.batch_size / 256.0 * self.lr
        optimizer = create_optimizer_v2(self, 'adamw', scaled_lr, self.weight_decay)
        scheduler = OneCycleLR(
            optimizer, scaled_lr, self.trainer.estimated_stepping_batches, pct_start=self.warmup_pct, cycle_momentum=False
        )
        return {'optimizer': optimizer, 'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}

    def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer) -> None:
        optimizer.zero_grad(set_to_none=True)

    def _eval_step(self, batch, validation: bool) -> Optional[STEP_OUTPUT]:
        """Run one evaluation step and collect per-batch metrics."""
        images, labels = batch

        if validation:
            logits, loss, loss_numel = self.forward_logits_loss(images, labels)
        else:
            # At test-time, we shouldn't specify a max_label_length because the test-time
            # charset might differ from the train-time charset. max_label_length in
            # forward_logits_loss() is computed from the transformed label, which could be
            # wrong if the gt label contains characters in the train-time charset but not
            # the test-time one. E.g. "aishahaleyes.blogspot.com" is 25 characters, but the
            # 36-char CharsetAdapter shrinks it to 23, truncating the model prediction.
            logits = self.forward(images)
            loss = loss_numel = None  # Only used for validation; not needed at test-time.

        num_correct = 0
        num_samples = 0
        ned_sum = 0
        conf_sum = 0
        label_len_sum = 0
        preds, probs = self.tokenizer.decode(logits.softmax(-1))
        for pred, prob, gt in zip(preds, probs, labels):
            conf_sum += prob.prod().item()
            pred = self.charset_adapter(pred)
            # Follow ICDAR 2019 definition of N.E.D.
            ned_sum += edit_distance(pred, gt) / max(len(pred), len(gt))
            num_correct += int(pred == gt)
            num_samples += 1
            label_len_sum += len(pred)
        return dict(output=BatchResult(num_samples, num_correct, ned_sum, conf_sum, label_len_sum, loss, loss_numel))

    @staticmethod
    def _aggregate_results(outputs: EPOCH_OUTPUT) -> tuple[float, float, float]:
        """Reduce per-batch results into (accuracy, 1 - mean NED, mean loss)."""
        if not outputs:
            return 0.0, 0.0, 0.0
        loss_sum = 0
        loss_count = 0
        n_correct = 0
        ned_total = 0
        n_samples = 0
        for entry in outputs:
            res = entry['output']
            loss_sum += res.loss_numel * res.loss  # weight each batch loss by its element count
            loss_count += res.loss_numel
            n_correct += res.correct
            ned_total += res.ned
            n_samples += res.num_samples
        accuracy = n_correct / n_samples
        mean_ned = 1 - ned_total / n_samples
        mean_loss = loss_sum / loss_count
        return accuracy, mean_ned, mean_loss

    def validation_step(self, batch, batch_idx) -> Optional[STEP_OUTPUT]:
        step_result = self._eval_step(batch, True)
        self.outputs.append(step_result)
        return step_result

    def on_validation_epoch_end(self) -> None:
        accuracy, mean_ned, mean_loss = self._aggregate_results(self.outputs)
        self.outputs.clear()
        self.log('val_accuracy', 100 * accuracy, sync_dist=True)
        self.log('val_NED', 100 * mean_ned, sync_dist=True)
        self.log('val_loss', mean_loss, sync_dist=True)
        self.log('hp_metric', accuracy, sync_dist=True)

    def test_step(self, batch, batch_idx) -> Optional[STEP_OUTPUT]:
        return self._eval_step(batch, False)
181
+
182
+
183
class CrossEntropySystem(BaseSystem):
    """Base class for systems trained with token-level cross-entropy."""

    def __init__(
        self, charset_train: str, charset_test: str, batch_size: int, lr: float, warmup_pct: float, weight_decay: float
    ) -> None:
        tokenizer = Tokenizer(charset_train)
        super().__init__(tokenizer, charset_test, batch_size, lr, warmup_pct, weight_decay)
        # Cache the special-token ids for convenient access in subclasses.
        self.bos_id = tokenizer.bos_id
        self.eos_id = tokenizer.eos_id
        self.pad_id = tokenizer.pad_id

    def forward_logits_loss(self, images: Tensor, labels: list[str]) -> tuple[Tensor, Tensor, int]:
        """Run the model and compute cross-entropy against the encoded labels."""
        targets = self.tokenizer.encode(labels, self.device)[:, 1:]  # Discard <bos>
        max_label_len = targets.shape[1] - 1  # exclude <eos> from count
        logits = self.forward(images, max_label_len)
        loss = F.cross_entropy(logits.flatten(end_dim=1), targets.flatten(), ignore_index=self.pad_id)
        numel = (targets != self.pad_id).sum()  # PAD positions are ignored by the loss
        return logits, loss, numel
202
+
203
+
204
class CTCSystem(BaseSystem):
    """Base class for systems trained with the CTC loss."""

    def __init__(
        self, charset_train: str, charset_test: str, batch_size: int, lr: float, warmup_pct: float, weight_decay: float
    ) -> None:
        tokenizer = CTCTokenizer(charset_train)
        super().__init__(tokenizer, charset_test, batch_size, lr, warmup_pct, weight_decay)
        self.blank_id = tokenizer.blank_id

    def forward_logits_loss(self, images: Tensor, labels: list[str]) -> tuple[Tensor, Tensor, int]:
        """Run the model and compute the CTC loss over the full output length."""
        targets = self.tokenizer.encode(labels, self.device)
        logits = self.forward(images)
        log_probs = logits.log_softmax(-1).transpose(0, 1)  # swap batch and seq. dims
        T, N, _ = log_probs.shape
        # Every output position is a valid input step for CTC.
        input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long, device=self.device)
        target_lengths = torch.as_tensor([len(label) for label in labels], dtype=torch.long, device=self.device)
        loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=self.blank_id, zero_infinity=True)
        return logits, loss, N
strhub/models/modules.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ r"""Shared modules used by CRNN and TRBA"""
2
+ from torch import nn
3
+
4
+
5
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection.

    Ref: https://github.com/clovaai/deep-text-recognition-benchmark/blob/master/modules/sequence_modeling.py
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        # Both directions are concatenated, hence the 2x input width.
        self.linear = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input):
        """Map visual features to contextual features.

        input : visual feature [batch_size x T x input_size], T = num_steps.
        output : contextual feature [batch_size x T x output_size]
        """
        contextual, _ = self.rnn(input)  # [N, T, input_size] -> [N, T, 2*hidden_size]
        return self.linear(contextual)  # [N, T, output_size]
strhub/models/parseq/__init__.py ADDED
File without changes
strhub/models/parseq/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes). View file
 
strhub/models/parseq/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (168 Bytes). View file
 
strhub/models/parseq/__pycache__/model.cpython-311.pyc ADDED
Binary file (8.97 kB). View file
 
strhub/models/parseq/__pycache__/model.cpython-312.pyc ADDED
Binary file (8.27 kB). View file
 
strhub/models/parseq/__pycache__/modules.cpython-311.pyc ADDED
Binary file (8.83 kB). View file
 
strhub/models/parseq/__pycache__/modules.cpython-312.pyc ADDED
Binary file (7.73 kB). View file
 
strhub/models/parseq/__pycache__/system.cpython-311.pyc ADDED
Binary file (9.4 kB). View file
 
strhub/models/parseq/__pycache__/system.cpython-312.pyc ADDED
Binary file (8.59 kB). View file