k0ry committed
Commit 646f45c · verified · Parent: cc8a699

Upload 20 files
.gitattributes CHANGED
@@ -1,35 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+image/architecture.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,220 @@
---
license: apache-2.0
language:
- en
- vi
pipeline_tag: image-to-text
model-index:
- name: HTR-ConvText
  results:
  - task:
      type: image-to-text
      name: Handwritten Text Recognition
    dataset:
      name: IAM
      type: iam
      split: test
    metrics:
    - type: cer
      value: 4.0
      name: Test CER
    - type: wer
      value: 12.9
      name: Test WER
  - task:
      type: image-to-text
      name: Handwritten Text Recognition
    dataset:
      name: LAM
      type: lam
      split: test
    metrics:
    - type: cer
      value: 2.7
      name: Test CER
    - type: wer
      value: 7.0
      name: Test WER
  - task:
      type: image-to-text
      name: Handwritten Text Recognition
    dataset:
      name: READ2016
      type: read2016
      split: test
    metrics:
    - type: cer
      value: 3.6
      name: Test CER
    - type: wer
      value: 15.7
      name: Test WER
  - task:
      type: image-to-text
      name: Handwritten Text Recognition
    dataset:
      name: HANDS-VNOnDB
      type: hands-vnondb
      split: test
    metrics:
    - type: cer
      value: 3.45
      name: Test CER
    - type: wer
      value: 8.9
      name: Test WER
---
# HTR-ConvText: Leveraging Convolution and Textual Information for Handwritten Text Recognition

<div align="center"> <img src="image/architecture.png" alt="HTR-ConvText Architecture" width="800"/> </div>

<p align="center">
  <a href="https://huggingface.co/DAIR-Group/HTR-ConvText">
    <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue">
  </a>
  <a href="https://github.com/DAIR-Group/HTR-ConvText">
    <img alt="GitHub" src="https://img.shields.io/badge/GitHub-Repo-181717.svg?logo=github&logoColor=white">
  </a>
  <a href="https://github.com/DAIR-Group/HTR-ConvText/blob/main/LICENSE">
    <img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-green">
  </a>
  <a href="https://arxiv.org/abs/2512.05021">
    <img alt="arXiv" src="https://img.shields.io/badge/arXiv-2512.05021-b31b1b.svg">
  </a>
</p>

## Highlights

HTR-ConvText is a novel hybrid architecture for Handwritten Text Recognition (HTR) that effectively balances local feature extraction with global contextual modeling. Designed to overcome the limitations of standard CTC-based decoding and data-hungry Transformers, HTR-ConvText delivers state-of-the-art performance with the following key features:

- **Hybrid CNN-ViT Architecture**: Integrates a ResNet backbone with MobileViT blocks (MVP) and Conditional Positional Encoding, enabling the model to capture fine-grained stroke details while maintaining global spatial awareness.
- **Hierarchical ConvText Encoder**: A U-Net-like encoder that interleaves Multi-Head Self-Attention with depthwise convolutions, efficiently modeling both long-range dependencies and local structural patterns (see the block sketch below).
- **Textual Context Module (TCM)**: A training-only auxiliary module that injects bidirectional linguistic priors into the visual encoder, mitigating the conditional-independence weakness of CTC decoding without adding any latency at inference.
- **State-of-the-Art Performance**: Outperforms existing methods on major benchmarks including IAM (English), READ2016 (German), LAM (Italian), and HANDS-VNOnDB (Vietnamese), excelling in low-resource scenarios and on scripts with complex diacritics.

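Concretely, each encoder block chains four post-norm residual sub-layers — self-attention, a half-scaled feed-forward, a depthwise-convolution module, and a second half-scaled feed-forward (Macaron-style) — as implemented in `model/htr_convtext.py`:

```python
# ConvTextBlock.forward from model/htr_convtext.py (excerpt):
# attention -> FFN -> depthwise conv -> FFN, each wrapped with
# LayerScale, DropPath and a post-LayerNorm residual connection.
def forward(self, x):
    x = self.postln_attn(x + self.ls_attn(self.dp_attn(self.attn(x))))
    x = self.postln_ffn1(x + self.ls_ffn1(0.5 * self.dp_ffn1(self.ffn1(x))))
    x = self.postln_conv(x + self.ls_conv(self.dp_conv(self.conv(x))))
    x = self.postln_ffn2(x + self.ls_ffn2(0.5 * self.dp_ffn2(self.ffn2(x))))
    return x
```
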
## Model Overview

HTR-ConvText configurations and specifications:

| Feature             | Specification                                        |
| ------------------- | ---------------------------------------------------- |
| Architecture Type   | Hybrid CNN + Vision Transformer (encoder-only)       |
| Parameters          | ~65.9M                                               |
| Backbone            | ResNet-18 + MobileViT w/ Positional Encoding (MVP)   |
| Encoder Layers      | 8 ConvText blocks (hierarchical)                     |
| Attention Heads     | 8                                                    |
| Embedding Dimension | 512                                                  |
| Image Input Size    | 512×64 (width × height)                              |
| Inference Strategy  | Standard CTC decoding (TCM is removed at inference)  |

For more details, including ablation studies and theoretical proofs, please refer to our [Technical Report](https://arxiv.org/pdf/2512.05021).

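For programmatic use, the architecture above can be built with `create_model` from `model/htr_convtext.py` (shipped in this repo). A minimal sketch — the single-channel 64×512 input layout follows the data pipeline in `data/dataset.py`, and the sequence length `T` of the output depends on the backbone's downsampling:

```python
# Minimal sketch: build HTR-ConvText as configured in model/htr_convtext.py.
import torch
from model.htr_convtext import create_model

model = create_model(nb_cls=80, img_size=[512, 64])  # 8 blocks, dim 512, 8 heads
model.eval()

x = torch.randn(1, 1, 64, 512)   # (batch, channel, height, width) grayscale line
with torch.no_grad():
    logits = model(x)            # frame-wise class logits for CTC decoding
print(logits.shape)              # (1, T, 80)
```
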
## Performance

We evaluated HTR-ConvText on four diverse datasets. The model achieves new state-of-the-art results, with the lowest Character Error Rate (CER) and Word Error Rate (WER), without requiring massive synthetic pre-training. The table reports test CER (%) for all methods; lower is better (WER figures appear in the model-index metadata above).

| Dataset  | Language   | Ours | HTR-VT | OrigamiNet | TrOCR | CRNN  |
|----------|------------|------|--------|------------|-------|-------|
| IAM      | English    | 4.0  | 4.7    | 4.8        | 7.3   | 7.8   |
| LAM      | Italian    | 2.7  | 2.8    | 3.0        | 3.6   | 3.8   |
| READ2016 | German     | 3.6  | 3.9    | -          | -     | 4.7   |
| VNOnDB   | Vietnamese | 3.45 | 4.26   | 7.6        | -     | 10.53 |

## Quickstart

### Installation

1. **Clone the repository**
   ```cmd
   git clone https://github.com/0xk0ry/HTR-ConvText.git
   cd HTR-ConvText
   ```
2. **Create and activate a Python 3.9+ Conda environment**
   ```cmd
   conda create -n htr-convtext python=3.9 -y
   conda activate htr-convtext
   ```
3. **Install PyTorch** using the wheel that matches your CUDA driver (swap the index URL for CPU-only builds):
   ```cmd
   pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
   ```
4. **Install the remaining project requirements** (everything except PyTorch, which you already installed in step 3):
   ```cmd
   pip install -r requirements.txt
   ```

The code was tested on Python 3.9 and PyTorch 2.9.1.

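To confirm the installed wheel matches your hardware, a quick check (plain PyTorch calls, nothing repo-specific):

```python
# Verify the PyTorch build and CUDA availability after installation.
import torch

print(torch.__version__)           # e.g. 2.9.1+cu126 for the CUDA wheel above
print(torch.cuda.is_available())   # True if the wheel and the driver line up
```
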
### Data Preparation

We provide split files (`train.ln`, `val.ln`, `test.ln`) for IAM, READ2016, LAM, and VNOnDB under `data/`. Each line image has a sibling `.txt` transcription. Organize your data as follows:

```
./data/iam/
├── train.ln
├── val.ln
├── test.ln
└── lines
    ├── a01-000u-00.png
    ├── a01-000u-00.txt
    └── ...
```

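These splits are consumed by `myLoadDS` in `data/dataset.py` (included in this upload); a minimal sketch, with placeholder paths:

```python
# Minimal sketch: load one IAM sample with the dataset class from data/dataset.py.
from data.dataset import myLoadDS

train_set = myLoadDS('data/iam/train.ln', '/path/to/iam/lines/',
                     img_size=[512, 64], dataset='iam')  # 'iam' selects the built-in alphabet
img, text = train_set[0]   # img: (1, 64, 512) float32 array; text: transcription string
print(len(train_set), text)
```
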
### Training

We provide training scripts in the `./run/` directory. To train on the IAM dataset with the Textual Context Module (TCM) enabled (`--nb-cls 80` covers the 79-character IAM alphabet defined in `data/dataset.py` plus one CTC blank):

```bash
# Using the provided script
bash run/iam.sh

# OR running directly via Python
python train.py \
    --use-wandb \
    --dataset iam \
    --tcm-enable \
    --exp-name "htr-convtext-iam" \
    --img-size 512 64 \
    --train-bs 32 \
    --val-bs 8 \
    --data-path /path/to/iam/lines/ \
    --train-data-list data/iam/train.ln \
    --val-data-list data/iam/val.ln \
    --test-data-list data/iam/test.ln \
    --nb-cls 80
```

### Inference / Evaluation

To evaluate a pre-trained checkpoint on the test set:

```bash
python test.py \
    --resume ./checkpoints/best_CER.pth \
    --dataset iam \
    --img-size 512 64 \
    --data-path /path/to/iam/lines/ \
    --test-data-list data/iam/test.ln \
    --nb-cls 80
```

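`test.py` reports CER/WER using standard CTC decoding over the model's frame logits. For reference, a hedged sketch of CTC greedy decoding — the blank index and the index-to-character mapping (`ralph` in `data/dataset.py`) are assumptions here; check the repo's label converter for the exact convention:

```python
# Hedged sketch of CTC greedy decoding (collapse repeats, drop blanks).
# Assumes blank = class 0 and characters occupy the remaining indices;
# the repo's converter defines the actual mapping.
import torch

def ctc_greedy_decode(logits: torch.Tensor, idx2char: dict, blank: int = 0) -> list:
    best = logits.argmax(dim=-1)              # (B, T): best class per frame
    texts = []
    for seq in best.tolist():
        chars, prev = [], blank
        for cls in seq:
            if cls != blank and cls != prev:  # collapse repeats, then drop blanks
                chars.append(idx2char.get(cls, ''))
            prev = cls
        texts.append(''.join(chars))
    return texts
```
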
## Citation

If you find our work helpful, please cite our paper:

```bibtex
@misc{truc2025htrconvtex,
      title={HTR-ConvText: Leveraging Convolution and Textual Information for Handwritten Text Recognition},
      author={Pham Thach Thanh Truc and Dang Hoai Nam and Huynh Tong Dang Khoa and Vo Nguyen Le Duy},
      year={2025},
      eprint={2512.05021},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2512.05021},
}
```

## Acknowledgement

This project is inspired by and adapted from [HTR-VT](https://github.com/Intellindust-AI-Lab/HTR-VT). We gratefully acknowledge the authors for their open-source contributions.
data/dataset.py ADDED
@@ -0,0 +1,202 @@
from torchvision.transforms import ColorJitter
from data import transform as transform
from utils import utils
from torch.utils.data import Dataset
from PIL import Image
import itertools
import os
import skimage
import torch
import numpy as np


def SameTrCollate(batch, args):

    images, labels = zip(*batch)
    images = [Image.fromarray(np.uint8(images[i][0] * 255))
              for i in range(len(images))]

    # Apply each data augmentation independently with 50% probability
    if np.random.rand() < 0.5:
        images = [transform.RandomTransform(
            args.proj)(image) for image in images]

    if np.random.rand() < 0.5:
        kernel_h = utils.randint(1, args.dila_ero_max_kernel + 1)
        kernel_w = utils.randint(1, args.dila_ero_max_kernel + 1)
        if utils.randint(0, 2) == 0:
            images = [transform.Erosion((kernel_w, kernel_h), args.dila_ero_iter)(
                image) for image in images]
        else:
            images = [transform.Dilation((kernel_w, kernel_h), args.dila_ero_iter)(
                image) for image in images]

    if np.random.rand() < 0.5:
        images = [ColorJitter(args.jitter_brightness, args.jitter_contrast,
                              args.jitter_saturation, args.jitter_hue)(image) for image in images]

    # Convert images to tensors
    image_tensors = [torch.from_numpy(
        np.array(image, copy=True)) for image in images]
    image_tensors = torch.cat([t.unsqueeze(0) for t in image_tensors], 0)
    image_tensors = image_tensors.unsqueeze(1).float()
    image_tensors = image_tensors / 255.
    return image_tensors, labels


class myLoadDS(Dataset):
    def __init__(self, flist, dpath, img_size=[512, 32], ralph=None, fmin=True, mln=None, dataset=None):
        self.fns = get_files(flist, dpath)
        self.tlbls = get_labels(self.fns)
        self.img_size = img_size
        if ralph is not None:
            self.ralph = ralph
        elif dataset is not None:
            # Fixed per-dataset alphabets (index -> character)
            if dataset == 'iam':
                self.ralph = {
                    idx: char for idx, char in enumerate(
                        ' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
                    )
                }
            elif dataset == 'lam':
                self.ralph = {
                    idx: char for idx, char in enumerate(
                        ' !"#%&\'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXZabcdefghijlmnopqrstuvwxyz|°·ÈÉàèéìòù–'
                    )
                }
            elif dataset == 'read2016':
                self.ralph = {
                    idx: char for idx, char in enumerate(
                        ' ()+,-./0123456789:<>ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyz¾Ößäöüÿāēōūȳ̄̈—'
                    )
                }
            elif dataset == 'vnondb':
                self.ralph = {
                    idx: char for idx, char in enumerate(
                        ' !"%&()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvxyzÀÁÂÔÚÝàáâãèéêìíòóôõùúýĂăĐđĩũƠơƯưạẢảẤấẦầẩẫậắằẳẵặẹẻẽếỀềỂểễỆệỉịọỏỐốỒồổỗộớờỞởỡợụỦủứừửữựỳỷỹ'
                    )
                }
            else:
                # Unknown dataset name: fall back to the alphabet of the labels
                alph = get_alphabet(self.tlbls)
                self.ralph = dict(zip(alph.values(), alph.keys()))
                self.alph = alph
        else:
            alph = get_alphabet(self.tlbls)
            self.ralph = dict(zip(alph.values(), alph.keys()))
            self.alph = alph
        if mln is not None:
            # Keep only labels shorter (fmin=True) or longer than mln characters
            filt = [len(x) <= mln if fmin else len(x) >= mln for x in self.tlbls]
            self.tlbls = np.asarray(self.tlbls)[filt].tolist()
            self.fns = np.asarray(self.fns)[filt].tolist()

    def __len__(self):
        return len(self.fns)

    def __getitem__(self, index):
        timgs = get_images(self.fns[index], self.img_size[0], self.img_size[1])
        timgs = timgs.transpose((2, 0, 1))

        return (timgs, self.tlbls[index])


def _read_text(path):
    """Read a text file with robust encoding handling.
    Try UTF-8 first, then fall back to common Windows encodings.
    """
    encodings = ['utf-8', 'utf-8-sig', 'cp1258', 'cp1252', 'latin-1']
    for enc in encodings:
        try:
            with open(path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
        except FileNotFoundError:
            raise
    # As a last resort, ignore errors to avoid crashing the training loop
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def _read_lines(path):
    txt = _read_text(path)
    return txt.splitlines()


def get_files(nfile, dpath):
    fnames = _read_lines(nfile)
    fnames = [dpath + x.strip() for x in fnames]
    return fnames


def npThum(img, max_w, max_h):
    # Resize so the height equals max_h, capping the width at max_w
    x, y = np.shape(img)[:2]

    y = min(int(y * max_h / x), max_w)
    x = max_h

    img = np.array(Image.fromarray(img).resize((y, x)))
    return img


def get_images(fname, max_w=500, max_h=500, nch=1):  # args.max_w args.max_h args.nch
    try:
        image_data = np.array(Image.open(fname).convert('L'))
        image_data = npThum(image_data, max_w, max_h)
        image_data = skimage.img_as_float32(image_data)

        h, w = np.shape(image_data)[:2]
        if image_data.ndim < 3:
            image_data = np.expand_dims(image_data, axis=-1)

        if nch == 3 and image_data.shape[2] != 3:
            image_data = np.tile(image_data, 3)

        # Pad the width to max_w with white (1.0) pixels
        image_data = np.pad(image_data, ((0, 0), (0, max_w - np.shape(image_data)[1]), (0, 0)),
                            mode='constant', constant_values=(1.0))

    except IOError as e:
        print('Could not read:', fname, ':', e)
        raise  # re-raise: image_data would otherwise be unbound below

    return image_data


def get_labels(fnames):
    labels = []
    for image_file in fnames:
        fn = os.path.splitext(image_file)[0] + '.txt'
        lbl = _read_text(fn)
        lbl = ' '.join(lbl.split())  # remove linebreaks if present
        labels.append(lbl)

    return labels


def get_alphabet(labels):
    coll = ''.join(labels)
    unq = sorted(list(set(coll)))
    unq = [''.join(i) for i in itertools.product(unq, repeat=1)]
    alph = dict(zip(unq, range(len(unq))))

    return alph


def cycle_dpp(iterable):
    # Cycle a DistributedSampler-backed loader, bumping the epoch each pass
    epoch = 0
    iterable.sampler.set_epoch(epoch)
    while True:
        for x in iterable:
            yield x
        epoch += 1
        iterable.sampler.set_epoch(epoch)


def cycle_data(iterable):
    while True:
        for x in iterable:
            yield x
data/transform.py ADDED
@@ -0,0 +1,334 @@
import itertools
import math

import cv2
import numpy as np
from skimage import transform as stf
from numpy import random, floor
from PIL import Image, ImageOps
from cv2 import erode, dilate, normalize
from torchvision.transforms import RandomCrop


class Dilation:
    """
    OCR: stroke width increasing
    """

    def __init__(self, kernel, iterations):
        self.kernel = np.ones(kernel, np.uint8)
        self.iterations = iterations

    def __call__(self, x):
        return Image.fromarray(dilate(np.array(x), self.kernel, iterations=self.iterations))


class Erosion:
    """
    OCR: stroke width decreasing
    """

    def __init__(self, kernel, iterations):
        self.kernel = np.ones(kernel, np.uint8)
        self.iterations = iterations

    def __call__(self, x):
        return Image.fromarray(erode(np.array(x), self.kernel, iterations=self.iterations))


class ElasticDistortion:
    """
    Elastic Distortion adapted from https://github.com/IntuitionMachines/OrigamiNet
    Used in "OrigamiNet: Weakly-Supervised, Segmentation-Free, One-Step, Full Page Text Recognition by learning to unfold",
    Yousef, Mohamed and Bishop, Tom E., The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020
    """

    def __init__(self, grid, magnitude, min_sep):
        self.grid_width, self.grid_height = grid
        self.xmagnitude, self.ymagnitude = magnitude
        self.min_h_sep, self.min_v_sep = min_sep

    def __call__(self, x):
        w, h = x.size

        horizontal_tiles = self.grid_width
        vertical_tiles = self.grid_height

        width_of_square = int(floor(w / float(horizontal_tiles)))
        height_of_square = int(floor(h / float(vertical_tiles)))

        width_of_last_square = w - (width_of_square * (horizontal_tiles - 1))
        height_of_last_square = h - (height_of_square * (vertical_tiles - 1))

        dimensions = []
        shift = [[(0, 0) for x in range(horizontal_tiles)] for y in range(vertical_tiles)]

        for vertical_tile in range(vertical_tiles):
            for horizontal_tile in range(horizontal_tiles):
                if vertical_tile == (vertical_tiles - 1) and horizontal_tile == (horizontal_tiles - 1):
                    dimensions.append([horizontal_tile * width_of_square,
                                       vertical_tile * height_of_square,
                                       width_of_last_square + (horizontal_tile * width_of_square),
                                       height_of_last_square + (height_of_square * vertical_tile)])
                elif vertical_tile == (vertical_tiles - 1):
                    dimensions.append([horizontal_tile * width_of_square,
                                       vertical_tile * height_of_square,
                                       width_of_square + (horizontal_tile * width_of_square),
                                       height_of_last_square + (height_of_square * vertical_tile)])
                elif horizontal_tile == (horizontal_tiles - 1):
                    dimensions.append([horizontal_tile * width_of_square,
                                       vertical_tile * height_of_square,
                                       width_of_last_square + (horizontal_tile * width_of_square),
                                       height_of_square + (height_of_square * vertical_tile)])
                else:
                    dimensions.append([horizontal_tile * width_of_square,
                                       vertical_tile * height_of_square,
                                       width_of_square + (horizontal_tile * width_of_square),
                                       height_of_square + (height_of_square * vertical_tile)])

                sm_h = min(self.xmagnitude,
                           width_of_square - (self.min_h_sep + shift[vertical_tile][horizontal_tile - 1][0])) \
                    if horizontal_tile > 0 else self.xmagnitude
                sm_v = min(self.ymagnitude,
                           height_of_square - (self.min_v_sep + shift[vertical_tile - 1][horizontal_tile][1])) \
                    if vertical_tile > 0 else self.ymagnitude

                dx = random.randint(-sm_h, self.xmagnitude)
                dy = random.randint(-sm_v, self.ymagnitude)
                shift[vertical_tile][horizontal_tile] = (dx, dy)

        shift = list(itertools.chain.from_iterable(shift))

        last_column = []
        for i in range(vertical_tiles):
            last_column.append((horizontal_tiles - 1) + horizontal_tiles * i)

        last_row = range((horizontal_tiles * vertical_tiles) - horizontal_tiles,
                         horizontal_tiles * vertical_tiles)

        polygons = []
        for x1, y1, x2, y2 in dimensions:
            polygons.append([x1, y1, x1, y2, x2, y2, x2, y1])

        polygon_indices = []
        for i in range((vertical_tiles * horizontal_tiles) - 1):
            if i not in last_row and i not in last_column:
                polygon_indices.append([i, i + 1, i + horizontal_tiles, i + 1 + horizontal_tiles])

        for id, (a, b, c, d) in enumerate(polygon_indices):
            dx = shift[id][0]
            dy = shift[id][1]

            x1, y1, x2, y2, x3, y3, x4, y4 = polygons[a]
            polygons[a] = [x1, y1,
                           x2, y2,
                           x3 + dx, y3 + dy,
                           x4, y4]

            x1, y1, x2, y2, x3, y3, x4, y4 = polygons[b]
            polygons[b] = [x1, y1,
                           x2 + dx, y2 + dy,
                           x3, y3,
                           x4, y4]

            x1, y1, x2, y2, x3, y3, x4, y4 = polygons[c]
            polygons[c] = [x1, y1,
                           x2, y2,
                           x3, y3,
                           x4 + dx, y4 + dy]

            x1, y1, x2, y2, x3, y3, x4, y4 = polygons[d]
            polygons[d] = [x1 + dx, y1 + dy,
                           x2, y2,
                           x3, y3,
                           x4, y4]

        generated_mesh = []
        for i in range(len(dimensions)):
            generated_mesh.append([dimensions[i], polygons[i]])

        self.generated_mesh = generated_mesh

        return x.transform(x.size, Image.MESH, self.generated_mesh, resample=Image.BICUBIC)


class RandomTransform:
    """
    Random Transform adapted from https://github.com/IntuitionMachines/OrigamiNet
    Used in "OrigamiNet: Weakly-Supervised, Segmentation-Free, One-Step, Full Page Text Recognition by learning to unfold",
    Yousef, Mohamed and Bishop, Tom E., The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020
    """

    def __init__(self, val):
        self.val = val

    def __call__(self, x):
        w, h = x.size

        dw, dh = (self.val, 0) if random.randint(0, 2) == 0 else (0, self.val)

        def rd(d):
            return random.uniform(-d, d)

        def fd(d):
            return random.uniform(-dw, d)

        # generate a random projective transform
        # adapted from https://navoshta.com/traffic-signs-classification/
        tl_top = rd(dh)
        tl_left = fd(dw)
        bl_bottom = rd(dh)
        bl_left = fd(dw)
        tr_top = rd(dh)
        tr_right = fd(min(w * 3 / 4 - tl_left, dw))
        br_bottom = rd(dh)
        br_right = fd(min(w * 3 / 4 - bl_left, dw))

        tform = stf.ProjectiveTransform()
        tform.estimate(np.array((  # estimate the transform matrix from corresponding points
            (tl_left, tl_top),
            (bl_left, h - bl_bottom),
            (w - br_right, h - br_bottom),
            (w - tr_right, tr_top)
        )), np.array((
            [0, 0],
            [0, h - 1],
            [w - 1, h - 1],
            [w - 1, 0]
        )))

        # determine shape of output image, to preserve size
        # trick taken from the implementation of skimage.transform.rotate
        corners = np.array([
            [0, 0],
            [0, h - 1],
            [w - 1, h - 1],
            [w - 1, 0]
        ])

        corners = tform.inverse(corners)
        minc = corners[:, 0].min()
        minr = corners[:, 1].min()
        maxc = corners[:, 0].max()
        maxr = corners[:, 1].max()
        out_rows = maxr - minr + 1
        out_cols = maxc - minc + 1
        output_shape = np.around((out_rows, out_cols))

        # fit output image in new shape
        translation = (minc, minr)
        tform4 = stf.SimilarityTransform(translation=translation)
        tform = tform4 + tform
        # normalize
        tform.params /= tform.params[2, 2]

        x = stf.warp(np.array(x), tform, output_shape=output_shape, cval=255, preserve_range=True)
        x = stf.resize(x, (h, w), preserve_range=True).astype(np.uint8)

        return Image.fromarray(x)


class SignFlipping:
    """
    Color inversion
    """

    def __init__(self):
        pass

    def __call__(self, x):
        return ImageOps.invert(x)


class DPIAdjusting:
    """
    Resolution modification
    """

    def __init__(self, factor, preserve_ratio):
        self.factor = factor

    def __call__(self, x):
        w, h = x.size
        return x.resize((int(np.ceil(w * self.factor)), int(np.ceil(h * self.factor))), Image.BILINEAR)


class GaussianNoise:
    """
    Add Gaussian noise
    """

    def __init__(self, std):
        self.std = std

    def __call__(self, x):
        x_np = np.array(x)
        mean, std = np.mean(x_np), np.std(x_np)
        std = math.copysign(max(abs(std), 0.000001), std)
        min_, max_ = np.min(x_np), np.max(x_np)
        normal_noise = np.random.randn(*x_np.shape)
        # For grayscale images stored as 3 identical channels, apply the same noise to each channel
        if len(x_np.shape) == 3 and x_np.shape[2] == 3 and np.all(x_np[:, :, 0] == x_np[:, :, 1]) and np.all(x_np[:, :, 0] == x_np[:, :, 2]):
            normal_noise[:, :, 1] = normal_noise[:, :, 2] = normal_noise[:, :, 0]
        x_np = ((x_np - mean) / std + normal_noise * self.std) * std + mean
        x_np = normalize(x_np, x_np, max_, min_, cv2.NORM_MINMAX)

        return Image.fromarray(x_np.astype(np.uint8))


class Sharpen:
    """
    Sharpen the image with an unsharp-masking style kernel
    """

    def __init__(self, alpha, strength):
        self.alpha = alpha
        self.strength = strength

    def __call__(self, x):
        x_np = np.array(x)
        id_matrix = np.array([[0, 0, 0],
                              [0, 1, 0],
                              [0, 0, 0]])
        effect_matrix = np.array([[1, 1, 1],
                                  [1, -(8 + self.strength), 1],
                                  [1, 1, 1]])
        kernel = (1 - self.alpha) * id_matrix - self.alpha * effect_matrix
        kernel = np.expand_dims(kernel, axis=2)
        kernel = np.concatenate([kernel, kernel, kernel], axis=2)
        sharpened = cv2.filter2D(x_np, -1, kernel=kernel[:, :, 0])
        return Image.fromarray(sharpened.astype(np.uint8))


class ZoomRatio:
    """
    Crop by ratio
    Preserve dimensions if keep_dim = True (= zoom)
    """

    def __init__(self, ratio_h, ratio_w, keep_dim=True):
        self.ratio_w = ratio_w
        self.ratio_h = ratio_h
        self.keep_dim = keep_dim

    def __call__(self, x):
        w, h = x.size
        x = RandomCrop((int(h * self.ratio_h), int(w * self.ratio_w)))(x)
        if self.keep_dim:
            x = x.resize((w, h), Image.BILINEAR)
        return x


class Tightening:
    """
    Reduce interline spacing by randomly dropping background-colored rows
    """

    def __init__(self, color=255, remove_proba=0.75):
        self.color = color
        self.remove_proba = remove_proba

    def __call__(self, x):
        x_np = np.array(x)
        interline_indices = [np.all(line == self.color) for line in x_np]
        indices_to_removed = np.logical_and(
            np.random.choice([True, False], size=len(x_np), replace=True,
                             p=[self.remove_proba, 1 - self.remove_proba]),
            interline_indices)
        new_x = x_np[np.logical_not(indices_to_removed)]
        return Image.fromarray(new_x.astype(np.uint8))
image/architecture.png ADDED

Git LFS Details

  • SHA256: f4e7e266e92b47867035820e9aa2529470278d11d99838574c23e6d901b77bc2
  • Pointer size: 131 Bytes
  • Size of remote file: 797 kB
model/htr_convtext.py ADDED
@@ -0,0 +1,446 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.vision_transformer import DropPath
from timm.layers import LayerScale
import numpy as np
from model import resnet18
from functools import partial
import warnings


class RelativePositionBias1D(nn.Module):
    def __init__(self, num_heads: int, max_rel_positions: int = 1024):
        super().__init__()
        self.num_heads = num_heads
        self.max_rel_positions = max(1, int(max_rel_positions))
        self.bias = nn.Embedding(2 * self.max_rel_positions - 1, num_heads)
        nn.init.zeros_(self.bias.weight)

    def forward(self, N: int) -> torch.Tensor:
        # Build a (1, heads, N, N) additive attention bias from clipped relative distances
        device = self.bias.weight.device
        coords = torch.arange(N, device=device)
        rel = coords[:, None] - coords[None, :]
        rel = rel.clamp(-self.max_rel_positions + 1,
                        self.max_rel_positions - 1)
        rel = rel + (self.max_rel_positions - 1)
        bias = self.bias(rel)
        return bias.permute(2, 0, 1).unsqueeze(0)


class Attention(nn.Module):
    def __init__(self, dim, num_patches, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        max_rel_positions = max(1, int(num_patches)) if num_patches is not None else 1024
        self.rel_pos_bias = RelativePositionBias1D(
            num_heads=num_heads, max_rel_positions=max_rel_positions)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn + self.rel_pos_bias(N)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.1, activation=nn.SiLU):
        super().__init__()
        self.lin1 = nn.Linear(dim, hidden_dim)
        self.act = activation()
        self.lin2 = nn.Linear(hidden_dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.lin2(self.act(self.lin1(x))))


class ConvModule(nn.Module):
    def __init__(self, dim, kernel_size=3, dropout=0.1, drop_path=0.0,
                 expansion=1.0, pre_norm=False, activation=nn.SiLU):
        super().__init__()
        self.pre_norm = nn.LayerNorm(dim) if pre_norm else None
        hidden = int(round(dim * expansion))

        self.pw1 = nn.Conv1d(dim, hidden, kernel_size=1, bias=True)
        self.act1 = activation()

        self.dw = nn.Conv1d(hidden, hidden, kernel_size=kernel_size,
                            padding=kernel_size // 2, groups=hidden, bias=True)
        self.gn = nn.GroupNorm(1, hidden, eps=1e-5)
        self.act2 = activation()

        self.pw2 = nn.Conv1d(hidden, dim, kernel_size=1, bias=True)
        self.dropout = nn.Dropout(dropout)
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x):
        if self.pre_norm is not None:
            x = self.pre_norm(x)
        z = x.transpose(1, 2)  # (B, N, C) -> (B, C, N) for Conv1d
        z = self.pw1(z)
        z = self.act1(z)
        z = self.dw(z)
        z = self.gn(z)
        z = self.act2(z)
        z = self.pw2(z)
        z = self.dropout(z).transpose(1, 2)
        return self.drop_path(z)


class Downsample1D(nn.Module):
    def __init__(self, dim, kernel_size=3, stride=2, lowpass_init=True):
        super().__init__()
        self.dw = nn.Conv1d(dim, dim, kernel_size=kernel_size,
                            stride=stride, padding=kernel_size // 2,
                            groups=dim, bias=False)
        self.pw = nn.Conv1d(dim, dim, kernel_size=1, bias=True)
        if lowpass_init:
            # Initialize the depthwise conv as a box (low-pass) filter
            with torch.no_grad():
                w = torch.zeros_like(self.dw.weight)
                w[:, 0, :] = 1.0 / kernel_size
                self.dw.weight.copy_(w)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.pw(self.dw(x))
        return x.transpose(1, 2)


class Upsample1D(nn.Module):
    def __init__(self, dim, mode: str = 'nearest'):
        super().__init__()
        assert mode in ('nearest', 'linear'), "Upsample1D mode must be 'nearest' or 'linear'"
        self.mode = mode
        self.proj = nn.Conv1d(dim, dim, kernel_size=1, bias=True)

    def forward(self, x, target_len: int):
        x = x.transpose(1, 2)
        if self.mode == 'nearest':
            x = F.interpolate(x, size=target_len, mode='nearest')
        else:
            x = F.interpolate(x, size=target_len, mode='linear', align_corners=False)
        x = self.proj(x)
        return x.transpose(1, 2)


class ConvTextBlock(nn.Module):
    def __init__(self,
                 dim,
                 num_heads,
                 num_patches,
                 mlp_ratio=4.0,
                 ff_dropout=0.1,
                 attn_dropout=0.0,
                 conv_dropout=0.0,
                 conv_kernel_size=3,
                 conv_expansion=1.0,
                 norm_layer=nn.LayerNorm,
                 drop_path=0.0,
                 layerscale_init=1e-5):
        super().__init__()

        ff_hidden = int(dim * mlp_ratio)

        self.attn = Attention(dim, num_patches, num_heads=num_heads,
                              qkv_bias=True, attn_drop=attn_dropout, proj_drop=ff_dropout)

        self.ffn1 = FeedForward(dim, ff_hidden, dropout=ff_dropout, activation=nn.SiLU)
        self.conv = ConvModule(dim, kernel_size=conv_kernel_size,
                               dropout=conv_dropout, drop_path=0.0,
                               expansion=conv_expansion, pre_norm=False, activation=nn.SiLU)
        self.ffn2 = FeedForward(dim, ff_hidden, dropout=ff_dropout, activation=nn.SiLU)

        self.postln_attn = norm_layer(dim, elementwise_affine=True)
        self.postln_ffn1 = norm_layer(dim, elementwise_affine=True)
        self.postln_conv = norm_layer(dim, elementwise_affine=True)
        self.postln_ffn2 = norm_layer(dim, elementwise_affine=True)

        self.dp_attn = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.dp_ffn1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.dp_conv = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.dp_ffn2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.ls_attn = LayerScale(dim, init_values=layerscale_init)
        self.ls_ffn1 = LayerScale(dim, init_values=layerscale_init)
        self.ls_conv = LayerScale(dim, init_values=layerscale_init)
        self.ls_ffn2 = LayerScale(dim, init_values=layerscale_init)

    def forward(self, x):
        # attention -> FFN -> depthwise conv -> FFN, each with LayerScale,
        # DropPath and a post-LN residual; the two FFNs are half-scaled (Macaron style)
        x = self.postln_attn(x + self.ls_attn(self.dp_attn(self.attn(x))))
        x = self.postln_ffn1(x + self.ls_ffn1(0.5 * self.dp_ffn1(self.ffn1(x))))
        x = self.postln_conv(x + self.ls_conv(self.dp_conv(self.conv(x))))
        x = self.postln_ffn2(x + self.ls_ffn2(0.5 * self.dp_ffn2(self.ffn2(x))))
        return x


def get_2d_sincos_pos_embed(embed_dim, grid_size):
    grid_h = np.arange(grid_size[0], dtype=np.float32)
    grid_w = np.arange(grid_size[1], dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])

    emb = np.concatenate([emb_h, emb_w], axis=1)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega

    pos = pos.reshape(-1)
    out = np.einsum('m,d->md', pos, omega)

    emb_sin = np.sin(out)
    emb_cos = np.cos(out)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)
    return emb


class HTR_ConvText(nn.Module):
    def __init__(
        self,
        nb_cls=80,
        img_size=[512, 64],
        patch_size=[4, 32],
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4.0,
        norm_layer=nn.LayerNorm,
        conv_kernel_size: int = 3,
        dropout: float = 0.1,
        drop_path: float = 0.1,
        down_after: int = 2,
        up_after: int = 4,
        ds_kernel: int = 3,
        max_seq_len: int = 1024,
        upsample_mode: str = 'nearest',
    ):
        super().__init__()

        self.patch_embed = resnet18.ResNet18(embed_dim)
        self.embed_dim = embed_dim
        self.max_rel_pos = int(max_seq_len)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
        self.blocks = nn.ModuleList([
            ConvTextBlock(embed_dim, num_heads, self.max_rel_pos,
                          mlp_ratio=mlp_ratio,
                          ff_dropout=dropout, attn_dropout=dropout,
                          conv_dropout=dropout, conv_kernel_size=conv_kernel_size,
                          conv_expansion=1.0,
                          norm_layer=norm_layer, drop_path=dpr[i],
                          layerscale_init=1e-5)
            for i in range(depth)
        ])

        self.norm = norm_layer(embed_dim, elementwise_affine=True)
        self.head = torch.nn.Linear(embed_dim, nb_cls)
        self.down_after = down_after
        self.up_after = up_after
        self.down1 = Downsample1D(embed_dim, kernel_size=ds_kernel)
        self.up1 = Upsample1D(embed_dim, mode=upsample_mode)
        self.initialize_weights()

    def initialize_weights(self):
        torch.nn.init.normal_(self.mask_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def mask_random_1d(self, x, ratio):
        # Keep-mask with `ratio` of the positions set to False (masked)
        B, L, _ = x.shape
        mask = torch.ones(B, L, dtype=torch.bool).to(x.device)
        if ratio <= 0.0 or ratio > 1.0:
            return mask
        num = int(round(ratio * L))
        if num <= 0:
            return mask
        noise = torch.rand(B, L).to(x.device)
        idx = noise.argsort(dim=1)[:, :num]
        mask.scatter_(1, idx, False)
        return mask

    def mask_block_1d(self, x, ratio: float, max_block_length: int):
        # Mask K contiguous blocks of random length in [1, max_block_length]
        B, L, _ = x.shape
        device = x.device

        if ratio <= 0.0:
            return torch.ones(B, L, 1, dtype=torch.bool, device=device)
        if ratio >= 1.0:
            return torch.zeros(B, L, 1, dtype=torch.bool, device=device)

        target_mask_tokens = int(round(ratio * L))
        K = target_mask_tokens // max_block_length
        K = max(K, 1)
        starts = torch.randint(0, max(1, L - max_block_length + 1), (B, K), device=device)
        lengths = torch.randint(1, max_block_length + 1, (B, K), device=device)
        positions = torch.arange(L, device=device).view(1, 1, L)
        starts_exp = starts.unsqueeze(-1)
        ends_exp = (starts + lengths).unsqueeze(-1).clamp(max=L)
        blocks_mask = (positions >= starts_exp) & (positions < ends_exp)
        masked_any = blocks_mask.any(dim=1)
        keep_mask = ~masked_any
        return keep_mask.unsqueeze(-1)

    def mask_span_1d(self, x, ratio: float, max_span_length: int):
        # Mask K contiguous spans of fixed length max_span_length
        B, L, _ = x.shape
        device = x.device

        if ratio <= 0.0:
            return torch.ones(B, L, 1, dtype=torch.bool, device=device)
        if ratio >= 1.0:
            return torch.zeros(B, L, 1, dtype=torch.bool, device=device)

        target_mask_tokens = int(round(ratio * L))
        K = target_mask_tokens // max_span_length
        K = max(K, 1)
        starts = torch.randint(0, max(1, L - max_span_length + 1), (B, K), device=device)
        lengths = torch.full((B, K), max_span_length, device=device)
        positions = torch.arange(L, device=device).view(1, 1, L)
        starts_exp = starts.unsqueeze(-1)
        ends_exp = (starts + lengths).unsqueeze(-1).clamp(max=L)
        spans_mask = (positions >= starts_exp) & (positions < ends_exp)
        masked_any = spans_mask.any(dim=1)
        keep_mask = ~masked_any
        return keep_mask.unsqueeze(-1)

    def forward_features(self, x, use_masking=False,
                         mask_mode="span",
                         mask_ratio=0.5, block_span=4, max_span_length=8):
        x = self.patch_embed(x)
        B, C, W, H = x.shape
        assert C == self.embed_dim, f"Expected embed_dim {self.embed_dim}, got {C}"
        x = x.view(B, C, -1).permute(0, 2, 1)  # flatten the spatial grid to a 1D sequence

        masked_positions_1d = None
        if use_masking:
            if mask_mode == "random":
                keep_mask_1d = self.mask_random_1d(x, mask_ratio).float()
                mask = keep_mask_1d.unsqueeze(-1)
            elif mask_mode == "block":
                keep_mask = self.mask_block_1d(x, mask_ratio, block_span).float()
                keep_mask_1d = keep_mask.squeeze(-1)
                mask = keep_mask
            elif mask_mode == "span":
                keep_mask = self.mask_span_1d(x, mask_ratio, max_span_length).float()
                keep_mask_1d = keep_mask.squeeze(-1)
                mask = keep_mask
            else:
                warnings.warn(f"Unknown mask_mode '{mask_mode}', defaulting to span.")
                keep_mask = self.mask_span_1d(x, mask_ratio, max_span_length).float()
                keep_mask_1d = keep_mask.squeeze(-1)
                mask = keep_mask
            masked_positions_1d = (1.0 - keep_mask_1d).clamp(min=0.0, max=1.0)
            x = mask * x + (1.0 - mask) * \
                self.mask_token.expand(x.size(0), x.size(1), x.size(2))
        skip_hi = None
        for i, blk in enumerate(self.blocks, 1):
            x = blk(x)
            if i == self.down_after:
                skip_hi = x
                if (x.size(1) % 2) == 1:
                    # Pad to an even length so the stride-2 downsample halves it exactly
                    x = torch.cat([x, x[:, -1:, :]], dim=1)
                x = self.down1(x)
            if i == self.up_after:
                assert skip_hi is not None, "Upsample requires a stored skip."
                x = self.up1(x, target_len=skip_hi.size(1))
                x = x + skip_hi

        x = self.norm(x)
        return x, masked_positions_1d

    def forward(self, x, use_masking=False, return_features=False, return_mask=False,
                mask_mode="span", mask_ratio=None, block_span=None, max_span_length=None):
        feats, masked_positions_1d = self.forward_features(
            x, use_masking=use_masking, mask_mode=mask_mode,
            mask_ratio=mask_ratio, block_span=block_span, max_span_length=max_span_length)
        logits = self.head(feats)
        if return_features and return_mask:
            return logits, feats, masked_positions_1d
        if return_features:
            return logits, feats
        if return_mask:
            return logits, masked_positions_1d
        return logits


def create_model(nb_cls, img_size, mlp_ratio=4, **kwargs):
    model = HTR_ConvText(
        nb_cls,
        img_size=img_size,
        patch_size=(4, 64),
        embed_dim=512,
        depth=8,
        num_heads=8,
        mlp_ratio=mlp_ratio,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        conv_kernel_size=7,
        down_after=3,
        up_after=7,
        ds_kernel=3,
        max_seq_len=128,
        upsample_mode='nearest',
        **kwargs,
    )
    return model
model/layer.py ADDED
@@ -0,0 +1,75 @@
import torch
from torch import nn
from typing import Optional, Union, Tuple


class ConvLayer2d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = False,
        use_norm: bool = True,
        use_act: bool = True,
        norm_layer: Optional[nn.Module] = None,
        act_layer: Optional[nn.Module] = None,
    ):
        super().__init__()
        layers = []
        layers.append(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                bias=bias
            )
        )
        if use_norm:
            if norm_layer is None:
                norm_layer = nn.BatchNorm2d(out_channels)
            layers.append(norm_layer)
        if use_act:
            if act_layer is None:
                act_layer = nn.ReLU(inplace=True)
            layers.append(act_layer)

        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)


# PEG from https://arxiv.org/abs/2102.10882
class PosCNN(nn.Module):
    def __init__(self, in_chans, embed_dim=None, s=1):
        super(PosCNN, self).__init__()
        self.proj = nn.Sequential(
            nn.Conv2d(in_chans, embed_dim, 3, s, 1,
                      bias=True, groups=embed_dim),
        )
        self.s = s

    def forward(self, x, H, W):
        B, N, C = x.shape

        feat_token = x
        cnn_feat = feat_token.transpose(1, 2).view(B, C, H, W)
        if self.s == 1:
            x = self.proj(cnn_feat) + cnn_feat
        else:
            x = self.proj(cnn_feat)
        x = x.flatten(2).transpose(1, 2)
        return x

    def no_weight_decay(self):
        return ["proj.%d.weight" % i for i in range(4)]
model/resnet18.py ADDED
@@ -0,0 +1,411 @@
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2023 Apple Inc. All Rights Reserved.
#

import math
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from torch import Tensor, nn
from torch.nn import functional as F
from .layer import ConvLayer2d, PosCNN
from timm.models.vision_transformer import Mlp, DropPath


class BaseModule(nn.Module):
    """Base class for all modules"""

    def __init__(self, *args, **kwargs):
        super(BaseModule, self).__init__()

    def forward(self, x: Any, *args, **kwargs) -> Any:
        raise NotImplementedError

    def __repr__(self):
        return "{}".format(self.__class__.__name__)


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Block(nn.Module):

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.,
        init_values=None,
        drop_path=0.,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm
    ):
        super().__init__()
        self.norm1 = norm_layer(dim, elementwise_affine=True)

        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        self.norm2 = norm_layer(dim, elementwise_affine=True)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x


class MobileViTBlock(BaseModule):
    """
    This class defines the `MobileViT block <https://arxiv.org/abs/2110.02178?context=cs.LG>`_

    Args:
        in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H, W)`
        transformer_dim (int): Input dimension to the transformer unit
        n_transformer_blocks (Optional[int]): Number of transformer blocks. Default: 2
        head_dim (Optional[int]): Head dimension in the multi-head attention. Default: 64
        attn_dropout (Optional[float]): Dropout in multi-head attention. Default: 0.0
        dropout (Optional[float]): Dropout rate. Default: 0.0
        patch_h (Optional[int]): Patch height for unfolding operation. Default: 2
        patch_w (Optional[int]): Patch width for unfolding operation. Default: 2
        conv_ksize (Optional[int]): Kernel size to learn local representations in MobileViT block. Default: 3
        dilation (Optional[int]): Dilation rate in convolutions. Default: 1
        no_fusion (Optional[bool]): Do not combine the input and output feature maps. Default: True
    """

    def __init__(
        self,
        in_channels=128,
        transformer_dim=128,
        n_transformer_blocks=2,
        head_dim=64,
        attn_dropout=0.0,
        dropout=0.0,
        patch_h=2,
        patch_w=2,
        conv_ksize=3,
        dilation=1,
        no_fusion=True,
    ) -> None:
        conv_3x3_in = ConvLayer2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=conv_ksize,
            stride=1,
            use_norm=True,
            use_act=True,
            dilation=dilation,
            padding=1,
        )
        conv_1x1_in = ConvLayer2d(
            in_channels=in_channels,
            out_channels=transformer_dim,
            kernel_size=1,
            stride=1,
            use_norm=False,
            use_act=False,
        )

        conv_1x1_out = ConvLayer2d(
            in_channels=transformer_dim,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            use_norm=True,
            use_act=True,
        )
        conv_3x3_out = None
        if not no_fusion:
            conv_3x3_out = ConvLayer2d(
                in_channels=2 * in_channels,
                out_channels=in_channels,
                kernel_size=conv_ksize,
                stride=1,
                padding=1,
                use_norm=True,
                use_act=True,
            )
        super().__init__()
        self.local_rep = nn.Sequential()
        self.local_rep.add_module(name="conv_3x3", module=conv_3x3_in)
        self.local_rep.add_module(name="conv_1x1", module=conv_1x1_in)
        self.pos_pe = PosCNN(in_chans=transformer_dim, embed_dim=transformer_dim)
        assert transformer_dim % head_dim == 0
        num_heads = transformer_dim // head_dim
        global_rep = [
            Block(
                dim=transformer_dim,
                num_heads=num_heads,
                mlp_ratio=4.0,
                qkv_bias=True,
                attn_drop=attn_dropout,
                drop=dropout,
                norm_layer=nn.LayerNorm,
            )
            for _ in range(n_transformer_blocks)
        ]
        global_rep.append(nn.LayerNorm(transformer_dim))

        self.global_rep = nn.Sequential(*global_rep)

        self.conv_proj = conv_1x1_out

        self.fusion = conv_3x3_out

        self.patch_h = patch_h
        self.patch_w = patch_w
        self.patch_area = self.patch_w * self.patch_h

        self.cnn_in_dim = in_channels
        self.cnn_out_dim = transformer_dim
        self.n_heads = num_heads
        self.dropout = dropout
        self.attn_dropout = attn_dropout
        self.dilation = dilation
        self.n_blocks = n_transformer_blocks
        self.conv_ksize = conv_ksize

    def unfolding(self, feature_map: Tensor) -> Tuple[Tensor, Dict]:
        patch_w, patch_h = self.patch_w, self.patch_h
        patch_area = int(patch_w * patch_h)
        batch_size, in_channels, orig_h, orig_w = feature_map.shape

        new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h)
        new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w)

        interpolate = False
        if new_w != orig_w or new_h != orig_h:
            # Note: Padding can be done, but then it needs to be handled in attention function.
            feature_map = F.interpolate(
                feature_map, size=(new_h, new_w), mode="bilinear", align_corners=False
            )
            interpolate = True

        # number of patches along width and height
        num_patch_w = new_w // patch_w  # n_w
        num_patch_h = new_h // patch_h  # n_h
        num_patches = num_patch_h * num_patch_w  # N

        # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w]
        reshaped_fm = feature_map.reshape(
            batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w
        )
        # [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w]
        transposed_fm = reshaped_fm.transpose(1, 2)
        # [B * C * n_h, n_w, p_h, p_w] --> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w
        reshaped_fm = transposed_fm.reshape(
            batch_size, in_channels, num_patches, patch_area
        )
        # [B, C, N, P] --> [B, P, N, C]
        transposed_fm = reshaped_fm.transpose(1, 3)
        # [B, P, N, C] --> [BP, N, C]
        patches = transposed_fm.reshape(batch_size * patch_area, num_patches, -1)

        info_dict = {
            "orig_size": (orig_h, orig_w),
            "batch_size": batch_size,
            "interpolate": interpolate,
            "total_patches": num_patches,
            "num_patches_w": num_patch_w,
            "num_patches_h": num_patch_h,
        }

        return patches, info_dict

    def folding(self, patches: Tensor, info_dict: Dict) -> Tensor:
        n_dim = patches.dim()
        assert n_dim == 3, "Tensor should be of shape BPxNxC. Got: {}".format(
            patches.shape
        )
        # [BP, N, C] --> [B, P, N, C]
        patches = patches.contiguous().view(
            info_dict["batch_size"], self.patch_area, info_dict["total_patches"], -1
        )

        batch_size, pixels, num_patches, channels = patches.size()
        num_patch_h = info_dict["num_patches_h"]
        num_patch_w = info_dict["num_patches_w"]

        # [B, P, N, C] --> [B, C, N, P]
        patches = patches.transpose(1, 3)

        # [B, C, N, P] --> [B*C*n_h, n_w, p_h, p_w]
        feature_map = patches.reshape(
            batch_size * channels * num_patch_h, num_patch_w, self.patch_h, self.patch_w
        )
        # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w]
        feature_map = feature_map.transpose(1, 2)
        # [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W]
        feature_map = feature_map.reshape(
            batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w
        )
        if info_dict["interpolate"]:
            feature_map = F.interpolate(
                feature_map,
                size=info_dict["orig_size"],
                mode="bilinear",
                align_corners=False,
296
+ )
297
+ return feature_map
298
+
299
+ def forward(self, x: Tensor) -> Tensor:
300
+ res = x
301
+
302
+ fm = self.local_rep(x)
303
+
304
+ # convert feature map to patches
305
+ patches, info_dict = self.unfolding(fm)
306
+ num_patch_h = info_dict["num_patches_h"]
307
+ num_patch_w = info_dict["num_patches_w"]
308
+ # learn global representations
309
+
310
+ for j, transformer_layer in enumerate(self.global_rep):
311
+ patches = transformer_layer(patches)
312
+ if j == 0:
313
+ patches = self.pos_pe(patches, num_patch_h, num_patch_w) # PEG here
314
+ # [B x Patch x Patches x C] --> [B x C x Patches x Patch]
315
+ fm = self.folding(patches=patches, info_dict=info_dict)
316
+
317
+ fm = self.conv_proj(fm)
318
+
319
+ if self.fusion is not None:
320
+ fm = self.fusion(torch.cat((res, fm), dim=1))
321
+ return fm
322
+
323
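The unfolding/folding pair above is a pure reshape pipeline, so when H and W are already multiples of the patch size, folding the unfolded patches reproduces the input exactly. A minimal round-trip check (editorial sketch, not part of the commit; it re-implements the same reshapes standalone so it runs without the module's dependencies):

# Editorial round-trip sketch of unfolding/folding (not part of the commit).
import torch

def unfold(fm, ph, pw):                       # [B, C, H, W] -> [B*P, N, C]
    B, C, H, W = fm.shape
    nh, nw = H // ph, W // pw
    x = fm.reshape(B * C * nh, ph, nw, pw).transpose(1, 2)
    x = x.reshape(B, C, nh * nw, ph * pw).transpose(1, 3)
    return x.reshape(B * ph * pw, nh * nw, C)

def fold(patches, B, C, nh, nw, ph, pw):      # exact inverse of unfold
    x = patches.reshape(B, ph * pw, nh * nw, C).transpose(1, 3)
    x = x.reshape(B * C * nh, nw, ph, pw).transpose(1, 2)
    return x.reshape(B, C, nh * ph, nw * pw)

fm = torch.randn(2, 16, 8, 64)                # H, W divisible by the 2x2 patches
p = unfold(fm, 2, 2)                          # -> [2*4, 4*32, 16] = [8, 128, 16]
assert torch.equal(fold(p, 2, 16, 4, 32, 2, 2), fm)  # exact round trip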
+ def conv3x3(in_planes, out_planes, stride=1):
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3,
+                      stride=stride, padding=1, bias=False)
+ 
+ 
+ class BasicBlock(nn.Module):
+     expansion = 1
+ 
+     def __init__(self, inplanes, planes, stride=1, downsample=None):
+         super(BasicBlock, self).__init__()
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = nn.BatchNorm2d(planes, eps=1e-05)
+         self.relu = nn.ReLU(inplace=True)
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = nn.BatchNorm2d(planes, eps=1e-05)
+         self.downsample = downsample
+         self.stride = stride
+ 
+     def forward(self, x):
+         residual = x
+ 
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+ 
+         out = self.conv2(out)
+         out = self.bn2(out)
+ 
+         if self.downsample is not None:
+             residual = self.downsample(x)
+ 
+         out += residual
+         out = self.relu(out)
+ 
+         return out
+ 
+ 
+ class ResNet18(nn.Module):
+ 
+     def __init__(self, nb_feat=384):
+         self.inplanes = nb_feat // 4
+         super(ResNet18, self).__init__()
+         self.conv1 = nn.Conv2d(
+             1, nb_feat // 4, kernel_size=3, stride=(2, 1), padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(nb_feat // 4, eps=1e-05)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=(2, 1), padding=1)
+         self.layer1 = self._make_layer(
+             BasicBlock, nb_feat // 4, 2, stride=(2, 1))
+         self.mobilevit_block1 = MobileViTBlock(
+             in_channels=nb_feat // 4, transformer_dim=nb_feat // 4,
+             n_transformer_blocks=1, head_dim=64, attn_dropout=0.0, dropout=0.0,
+             patch_h=2, patch_w=2, conv_ksize=3, dilation=1, no_fusion=True)
+         self.layer2 = self._make_layer(BasicBlock, nb_feat // 2, 2, stride=2)
+         self.mobilevit_block2 = MobileViTBlock(
+             in_channels=nb_feat // 2, transformer_dim=nb_feat // 2,
+             n_transformer_blocks=1, head_dim=64, attn_dropout=0.0, dropout=0.0,
+             patch_h=2, patch_w=2, conv_ksize=3, dilation=1, no_fusion=True)
+         self.layer3 = self._make_layer(BasicBlock, nb_feat, 2, stride=2)
+         self.mobilevit_block3 = MobileViTBlock(
+             in_channels=nb_feat, transformer_dim=nb_feat,
+             n_transformer_blocks=1, head_dim=64, attn_dropout=0.0, dropout=0.0,
+             patch_h=2, patch_w=2, conv_ksize=3, dilation=1, no_fusion=True)
+ 
+     def _make_layer(self, block, planes, blocks, stride=1):
+         downsample = None
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = nn.Sequential(
+                 nn.Conv2d(self.inplanes, planes * block.expansion,
+                           kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(planes * block.expansion, eps=1e-05),
+             )
+ 
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample))
+         self.inplanes = planes * block.expansion
+         for i in range(1, blocks):
+             layers.append(block(self.inplanes, planes, 1, None))
+ 
+         return nn.Sequential(*layers)
+ 
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+ 
+         x = self.layer1(x)
+         x = self.mobilevit_block1(x)
+         x = self.layer2(x)
+         x = self.mobilevit_block2(x)
+         x = self.layer3(x)
+         x = self.mobilevit_block3(x)
+         x = self.maxpool(x)
+ 
+         return x
model/tcm_head.py ADDED
@@ -0,0 +1,133 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ 
+ def build_tcm_vocab(converter, add_tokens=("<pad>",)):
+     base = list(converter.character)
+     stoi = {ch: i for i, ch in enumerate(base)}
+     for t in add_tokens:
+         if t not in stoi:
+             stoi[t] = len(stoi)
+     itos = [''] * len(stoi)
+     for k, v in stoi.items():
+         itos[v] = k
+     pad_id = stoi["<pad>"]
+     return stoi, itos, pad_id
+ 
+ 
+ def texts_to_ids(texts, stoi):
+     return [torch.tensor([stoi[ch] for ch in t], dtype=torch.long) for t in texts]
+ 
+ 
+ def make_context_batch(texts, stoi, sub_str_len=5, device='cuda'):
+     ids = [torch.tensor([stoi[ch] for ch in t], dtype=torch.long, device=device) for t in texts]
+     B = len(ids); Lmax = max(t.size(0) for t in ids); S = sub_str_len
+     PAD = stoi["<pad>"]
+ 
+     left = torch.full((B, Lmax, S), PAD, dtype=torch.long, device=device)
+     right = torch.full((B, Lmax, S), PAD, dtype=torch.long, device=device)
+     tgt = torch.full((B, Lmax), PAD, dtype=torch.long, device=device)
+     mask = torch.zeros((B, Lmax), dtype=torch.float32, device=device)
+ 
+     for b, seq in enumerate(ids):
+         L = seq.size(0)
+         tgt[b, :L] = seq
+         mask[b, :L] = 1.0
+         for i in range(L):
+             l_ctx = seq[max(0, i - S):i]
+             # left-pad with PAD
+             if l_ctx.numel() < S:
+                 l_ctx = torch.cat([torch.full((S - l_ctx.numel(),), PAD, device=device), l_ctx], dim=0)
+             left[b, i] = l_ctx[-S:]
+ 
+             r_ctx = seq[i + 1:min(L, i + 1 + S)]
+             # right-pad with PAD
+             if r_ctx.numel() < S:
+                 r_ctx = torch.cat([r_ctx, torch.full((S - r_ctx.numel(),), PAD, device=device)], dim=0)
+             right[b, i] = r_ctx[:S]
+ 
+     return left, right, tgt, mask
+ 
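For each target character, make_context_batch collects the S characters to its left and right, padding with <pad> wherever a window runs off the string. A small CPU walk-through of the expected output (editorial sketch, not part of the commit; the two-character vocabulary is hypothetical):

# Editorial sketch of make_context_batch (not part of the commit).
stoi = {'a': 0, 'b': 1, '<pad>': 2}
left, right, tgt, mask = make_context_batch(['ab'], stoi, sub_str_len=2, device='cpu')
# Position 0 ('a'): no left context, right context is 'b' then PAD.
# Position 1 ('b'): left context is PAD then 'a', no right context.
print(left[0].tolist())   # [[2, 2], [2, 0]]
print(right[0].tolist())  # [[1, 2], [2, 2]]
print(tgt[0].tolist())    # [0, 1]
print(mask[0].tolist())   # [1.0, 1.0]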
+ 
+ class TCMHead(nn.Module):
+     def __init__(self, d_vis, vocab_size_tcm, pad_id, d_txt=256, sub_str_len=5, p_drop=0.1):
+         super().__init__()
+         self.vocab_size = vocab_size_tcm
+         self.sub_str_len = sub_str_len
+ 
+         # critical: padding_idx zeroes the PAD row and keeps it frozen
+         self.emb = nn.Embedding(vocab_size_tcm, d_txt, padding_idx=pad_id)
+ 
+         # keep direction as learned vectors (not tokens)
+         self.dir_left = nn.Parameter(torch.randn(1, 1, d_txt))
+         self.dir_right = nn.Parameter(torch.randn(1, 1, d_txt))
+ 
+         self.ctx_conv = nn.Conv1d(d_txt, d_txt, kernel_size=3, padding=1)
+         self.txt_proj = nn.Linear(d_txt, d_vis)
+         self.q_norm = nn.LayerNorm(d_vis)
+         self.kv_norm = nn.LayerNorm(d_vis)
+         self.dropout = nn.Dropout(p_drop)
+         self.classifier = nn.Linear(d_vis, vocab_size_tcm)
+ 
+     def _context_to_query(self, ctx_ids, dir_token):
+         E = self.emb(ctx_ids)                    # [B, L, S, D]
+         B, L, S, D = E.shape
+         x = E.view(B * L, S, D).transpose(1, 2)  # [B*L, D, S] for Conv1d
+         x = self.ctx_conv(x)
+         x = x.mean(dim=-1)                       # pool over the context window
+         x = x.view(B, L, D)
+ 
+         x = x + dir_token
+         x = self.txt_proj(x)
+         return self.q_norm(x)
+ 
+     def _cross_attend(self, Q, feats):
+         # parameter renamed from `F` so it cannot shadow torch.nn.functional
+         K = self.kv_norm(feats)
+         V = K
+         attn = torch.einsum('bld,bnd->bln', Q, K) / (K.size(-1) ** 0.5)
+         A = attn.softmax(dim=-1)
+         out = torch.einsum('bln,bnd->bld', A, V)
+         return self.dropout(out)
+ 
+     def forward(self,
+                 vis_tokens,
+                 left_ctx_ids,
+                 right_ctx_ids,
+                 tgt_ids,
+                 tgt_mask,
+                 focus_mask=None):
+         Ql = self._context_to_query(left_ctx_ids, self.dir_left)
+         Qr = self._context_to_query(right_ctx_ids, self.dir_right)
+ 
+         Fl = self._cross_attend(Ql, vis_tokens)
+         Fr = self._cross_attend(Qr, vis_tokens)
+ 
+         logits_l = self.classifier(Fl)
+         logits_r = self.classifier(Fr)
+ 
+         loss_l = F.cross_entropy(
+             logits_l.view(-1, self.vocab_size),
+             tgt_ids.view(-1),
+             reduction='none'
+         ).view_as(tgt_ids)
+         loss_r = F.cross_entropy(
+             logits_r.view(-1, self.vocab_size),
+             tgt_ids.view(-1),
+             reduction='none'
+         ).view_as(tgt_ids)
+ 
+         if focus_mask is not None:
+             weights = tgt_mask * (1.0 + focus_mask)
+         else:
+             weights = tgt_mask
+ 
+         loss_masked = (loss_l + loss_r) * weights
+         denom = torch.clamp(weights.sum(), min=1.0)
+         loss_tcm = loss_masked.sum() / (2.0 * denom)
+ 
+         return {'loss_tcm': loss_tcm,
+                 'logits_l': logits_l,
+                 'logits_r': logits_r}
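A shape-level smoke test for the head (editorial sketch, not part of the commit; every dimension below is an arbitrary toy value):

# Editorial smoke test for TCMHead (not part of the commit).
import torch

stoi = {'a': 0, 'b': 1, '<pad>': 2}
head = TCMHead(d_vis=32, vocab_size_tcm=len(stoi), pad_id=stoi['<pad>'],
               d_txt=16, sub_str_len=2)
left, right, tgt, mask = make_context_batch(['ab', 'b'], stoi,
                                            sub_str_len=2, device='cpu')
vis = torch.randn(2, 7, 32)             # [B, N visual tokens, d_vis]
out = head(vis, left, right, tgt, mask)
print(out['loss_tcm'].shape)            # torch.Size([]) -- scalar loss
print(out['logits_l'].shape)            # torch.Size([2, 2, 3]) -- [B, L, vocab]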
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy>=1.24
+ pillow>=9.0
+ opencv-python>=4.8
+ scikit-image>=0.21
+ tensorboard>=2.13
+ wandb>=0.16
+ editdistance>=0.6
+ timm>=0.9
run/iam.sh ADDED
@@ -0,0 +1 @@
+ python3 train.py --use-wandb --dataset iam --tcm-enable --exp-name "htr-convtext" --wandb-project iam --num-workers 4 --max-lr 1e-3 --warm-up-iter 1000 --weight-decay 0.05 --train-bs 32 --val-bs 8 --max-span-length 8 --mask-ratio 0.4 --attn-mask-ratio 0.1 --img-size 512 64 --proj 8 --dila-ero-max-kernel 2 --dila-ero-iter 1 --proba 0.5 --alpha 1 --total-iter 100001 --data-path /kaggle/input/iam-vt-lines/lines/ --train-data-list /kaggle/input/iam-vt-lines/train.ln --val-data-list /kaggle/input/iam-vt-lines/val.ln --test-data-list /kaggle/input/iam-vt-lines/test.ln --nb-cls 80
run/lam.sh ADDED
@@ -0,0 +1 @@
+ python3 train.py --use-wandb --dataset lam --tcm-enable --exp-name "htr-convtext" --wandb-project lam --num-workers 4 --max-lr 1e-3 --warm-up-iter 1000 --weight-decay 0.05 --train-bs 32 --val-bs 8 --max-span-length 8 --mask-ratio 0.4 --attn-mask-ratio 0.1 --img-size 512 64 --proj 8 --dila-ero-max-kernel 2 --dila-ero-iter 1 --proba 0.5 --alpha 1 --total-iter 100001 --data-path /kaggle/input/lam-vt-lines/lines/ --train-data-list /kaggle/input/lam-vt-lines/train.ln --val-data-list /kaggle/input/lam-vt-lines/val.ln --test-data-list /kaggle/input/lam-vt-lines/test.ln --nb-cls 91
run/read2016.sh ADDED
@@ -0,0 +1 @@
+ python3 train.py --use-wandb --dataset read2016 --tcm-enable --exp-name "htr-convtext" --wandb-project read2016 --num-workers 4 --max-lr 1e-3 --warm-up-iter 1000 --weight-decay 0.05 --train-bs 32 --val-bs 8 --max-span-length 8 --mask-ratio 0.4 --attn-mask-ratio 0.1 --img-size 512 64 --proj 8 --dila-ero-max-kernel 2 --dila-ero-iter 1 --proba 0.5 --alpha 1 --total-iter 100001 --data-path /kaggle/input/read2016-vt-lines/lines/ --train-data-list /kaggle/input/read2016-vt-lines/train.ln --val-data-list /kaggle/input/read2016-vt-lines/val.ln --test-data-list /kaggle/input/read2016-vt-lines/test.ln --nb-cls 90
run/vnondb.sh ADDED
@@ -0,0 +1 @@
+ python3 train.py --use-wandb --dataset vnondb --tcm-enable --exp-name "htr-convtext" --wandb-project vnondb --num-workers 4 --max-lr 1e-3 --warm-up-iter 1000 --weight-decay 0.05 --train-bs 32 --val-bs 8 --max-span-length 8 --mask-ratio 0.4 --attn-mask-ratio 0.1 --img-size 512 64 --proj 8 --dila-ero-max-kernel 2 --dila-ero-iter 1 --proba 0.5 --alpha 1 --total-iter 100001 --data-path /kaggle/input/vnondb/lines/ --train-data-list /kaggle/input/vnondb/train.ln --val-data-list /kaggle/input/vnondb/val.ln --test-data-list /kaggle/input/vnondb/test.ln --nb-cls 162
test.py ADDED
@@ -0,0 +1,140 @@
+ import torch
+ 
+ import os
+ import re
+ import json
+ import valid
+ from utils import utils
+ from utils import option
+ from data import dataset
+ from model import htr_convtext
+ from collections import OrderedDict
+ 
+ 
+ def main():
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     torch.manual_seed(args.seed)
+ 
+     args.save_dir = os.path.join(args.out_dir, args.exp_name)
+     os.makedirs(args.save_dir, exist_ok=True)
+     logger = utils.get_logger(args.save_dir)
+ 
+     model = htr_convtext.create_model(
+         nb_cls=args.nb_cls, img_size=args.img_size[::-1])
+ 
+     pth_path = args.resume
+     logger.info('loading HWR checkpoint from {}'.format(pth_path))
+ 
+     ckpt = torch.load(pth_path, map_location='cpu', weights_only=False)
+     model_dict = OrderedDict()
+     pattern = re.compile('module.')
+ 
+     for k, v in ckpt['state_dict_ema'].items():
+         if re.search("module", k):
+             model_dict[re.sub(pattern, '', k)] = v
+         else:
+             model_dict[k] = v
+ 
+     model.load_state_dict(model_dict, strict=True)
+     model = model.to(device)  # was .cuda(); .to(device) also works on CPU-only hosts
+ 
+     logger.info('Loading test loader...')
+     train_dataset = dataset.myLoadDS(
+         args.train_data_list, args.data_path, args.img_size, dataset=args.dataset)
+ 
+     test_dataset = dataset.myLoadDS(
+         args.test_data_list, args.data_path, args.img_size, ralph=train_dataset.ralph, dataset=args.dataset)
+     test_loader = torch.utils.data.DataLoader(test_dataset,
+                                               batch_size=args.val_bs,
+                                               shuffle=False,
+                                               pin_memory=True,
+                                               num_workers=args.num_workers)
+ 
+     converter = utils.CTCLabelConverter(train_dataset.ralph.values())
+     criterion = torch.nn.CTCLoss(
+         reduction='none', zero_infinity=True).to(device)
+ 
+     model.eval()
+     with torch.no_grad():
+         val_loss, val_cer, val_wer, preds, labels = valid.validation(
+             model,
+             criterion,
+             test_loader,
+             converter,
+         )
+ 
+     logger.info(
+         f'Test. loss : {val_loss:0.3f} \t CER : {val_cer:0.4f} \t WER : {val_wer:0.4f} ')
+ 
+     # Save predictions as JSON
+     results = {
+         "test_metrics": {
+             "loss": float(val_loss),
+             "cer": float(val_cer),
+             "wer": float(val_wer)
+         },
+         "predictions": []
+     }
+ 
+     def _levenshtein(pred_tokens, gt_tokens):
+         if pred_tokens == gt_tokens:
+             return 0
+         lp, lg = len(pred_tokens), len(gt_tokens)
+         if lp == 0:
+             return lg
+         if lg == 0:
+             return lp
+         prev = list(range(lg + 1))
+         for i in range(1, lp + 1):
+             cur = [i]
+             pi = pred_tokens[i - 1]
+             for j in range(1, lg + 1):
+                 gj = gt_tokens[j - 1]
+                 cost = 0 if pi == gj else 1
+                 cur.append(
+                     min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost))
+             prev = cur
+         return prev[-1]
+ 
+     def _levenshtein_str(a: str, b: str):
+         return _levenshtein(list(a), list(b))
+ 
+     def _cer(pred: str, gt: str):
+         if len(gt) == 0:
+             return 0.0 if len(pred) == 0 else 1.0
+         return _levenshtein_str(pred, gt) / len(gt)
+ 
+     def _wer(pred: str, gt: str):
+         gt_words = gt.split()
+         pred_words = pred.split()
+         if len(gt_words) == 0:
+             return 0.0 if len(pred_words) == 0 else 1.0
+         return _levenshtein(pred_words, gt_words) / len(gt_words)
+ 
+     for i, (pred, label) in enumerate(zip(preds, labels)):
+         if i < len(test_dataset.fns):
+             img_path = test_dataset.fns[i]
+             img_name = os.path.basename(img_path)
+         else:
+             img_path = None
+             img_name = None
+         results["predictions"].append({
+             "sample_id": i + 1,
+             "image_filename": img_name,
+             "image_path": img_path,
+             "prediction": pred,
+             "ground_truth": label,
+             "match": pred == label,
+             "cer": round(float(_cer(pred, label)), 6),
+             "wer": round(float(_wer(pred, label)), 6)
+         })
+ 
+     pred_file = os.path.join(args.save_dir, 'predictions.json')
+     with open(pred_file, 'w', encoding='utf-8') as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+ 
+ 
+ if __name__ == '__main__':
+     args = option.get_args_parser()
+     main()
train.py ADDED
@@ -0,0 +1,441 @@
+ import torch
+ import torch.utils.data
+ import torch.backends.cudnn as cudnn
+ from torch.utils.tensorboard import SummaryWriter
+ 
+ import os
+ import json
+ import valid
+ from utils import utils
+ from utils import sam
+ from utils import option
+ from data import dataset
+ from model import htr_convtext
+ from functools import partial
+ import random
+ import numpy as np
+ import re
+ import importlib
+ from model.tcm_head import TCMHead, build_tcm_vocab, make_context_batch
+ # wandb is imported lazily inside main() via importlib so the dependency stays
+ # optional; an unconditional top-level import would defeat that guard.
+ 
+ 
+ def compute_losses(
+     args,
+     model,
+     tcm_head,
+     image,
+     texts,
+     batch_size,
+     criterion_ctc,
+     converter,
+     nb_iter,
+     ctc_lambda,
+     tcm_lambda,
+     stoi,
+     mask_mode='span',
+     mask_ratio=0.30,
+     block_span=4,
+     max_span_length=8,
+     pre_tcm_ctx=None,
+     use_masking=True,
+ ):
+     if tcm_head is None or nb_iter < args.tcm_warmup_iters:
+         preds = model(image, use_masking=use_masking, mask_mode=mask_mode,
+                       mask_ratio=mask_ratio, max_span_length=max_span_length)
+         feats = None
+     else:
+         preds, feats, vis_mask = model(
+             image,
+             use_masking=use_masking,
+             return_features=True,
+             return_mask=True,
+             mask_mode=mask_mode,
+             mask_ratio=mask_ratio,
+             block_span=block_span,
+             max_span_length=max_span_length
+         )
+     text_ctc, length_ctc = converter.encode(texts)
+     text_ctc = text_ctc.to(preds.device)
+     length_ctc = length_ctc.to(preds.device)
+     preds_sz = torch.full((batch_size,), preds.size(1),
+                           dtype=torch.int32, device=preds.device)
+     loss_ctc = criterion_ctc(preds.permute(1, 0, 2).log_softmax(2),
+                              text_ctc, preds_sz, length_ctc).mean()
+ 
+     loss_tcm = torch.zeros((), device=preds.device)
+     if tcm_head is not None and feats is not None:
+         left_ctx, right_ctx, tgt_ids, tgt_mask = pre_tcm_ctx if pre_tcm_ctx is not None else make_context_batch(
+             texts, stoi, sub_str_len=args.tcm_sub_len, device=image.device)
+         if vis_mask is not None:
+             B_v, N_v = vis_mask.shape
+             B_t, L_t = tgt_mask.shape
+             if N_v != L_t:
+                 # resample the visual mask to the label length so it can reweight the TCM loss
+                 idx = torch.linspace(0, N_v - 1, steps=L_t,
+                                      device=vis_mask.device).long()
+                 focus_mask = vis_mask[:, idx]
+             else:
+                 focus_mask = vis_mask
+         else:
+             focus_mask = None
+ 
+         out = tcm_head(
+             feats,
+             left_ctx, right_ctx,
+             tgt_ids, tgt_mask,
+             focus_mask=focus_mask
+         )
+         loss_tcm = out['loss_tcm']
+ 
+     total = ctc_lambda * loss_ctc + tcm_lambda * loss_tcm
+     return total, loss_ctc.detach(), loss_tcm.detach()
+ 
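compute_losses feeds CTC with [T, B, C] log-probabilities, per-sample input lengths all equal to the model's output width, and the converter's concatenated targets. A standalone restatement of that shape contract (editorial sketch, not part of the commit; all sizes are toy values):

# Editorial sketch of the CTC shape contract used above (not part of the commit).
import torch

B, T, C = 2, 10, 5                       # batch, model output width, classes (0 = blank)
preds = torch.randn(B, T, C)             # model output, [B, T, C]
text = torch.tensor([1, 2, 3, 2], dtype=torch.int32)  # targets concatenated: "abc" + "b"
length = torch.tensor([3, 1], dtype=torch.int32)      # per-sample target lengths
preds_sz = torch.full((B,), T, dtype=torch.int32)     # every sample uses the full width

criterion = torch.nn.CTCLoss(reduction='none', zero_infinity=True)
loss = criterion(preds.permute(1, 0, 2).log_softmax(2), text, preds_sz, length)
print(loss.shape)  # torch.Size([2]) -- one loss per sample; .mean() reduces it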
+ 
+ def tri_masked_loss(args, model, tcm_head, image, labels, batch_size,
+                     criterion, converter, nb_iter, ctc_lambda, tcm_lambda, stoi,
+                     r_rand=0.6, r_block=0.6, block_span=4, r_span=0.4, max_span=8):
+     total = 0.0
+     total_ctc = 0.0
+     total_tcm = 0.0
+     plans = [("random", r_rand), ("block", r_block), ("span", r_span)]
+ 
+     pre_tcm_ctx = None  # fix: was left unbound whenever the branch below was skipped
+     if tcm_head is not None and nb_iter >= args.tcm_warmup_iters:
+         pre_tcm_ctx = make_context_batch(
+             labels, stoi, sub_str_len=args.tcm_sub_len, device=image.device)
+ 
+     for mode, ratio in plans:
+         loss, loss_ctc, loss_tcm = compute_losses(
+             args, model, tcm_head, image, labels, batch_size, criterion, converter,
+             nb_iter, ctc_lambda, tcm_lambda, stoi,
+             mask_mode=mode, mask_ratio=ratio, block_span=block_span, max_span_length=max_span,
+             pre_tcm_ctx=pre_tcm_ctx
+         )
+         total += loss
+         total_ctc += loss_ctc
+         total_tcm += loss_tcm
+ 
+     denom = 3.0
+     return total / denom, total_ctc / denom, total_tcm / denom
+ 
+ 
+ def main():
+     args = option.get_args_parser()
+     torch.manual_seed(args.seed)
+ 
+     args.save_dir = os.path.join(args.out_dir, args.exp_name)
+     os.makedirs(args.save_dir, exist_ok=True)
+ 
+     logger = utils.get_logger(args.save_dir)
+     logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
+     writer = SummaryWriter(args.save_dir)
+ 
+     if getattr(args, 'use_wandb', False):
+         try:
+             wandb = importlib.import_module('wandb')
+             wandb.init(project=getattr(args, 'wandb_project', 'None'), name=args.exp_name,
+                        config=vars(args), dir=args.save_dir)
+             logger.info("Weights & Biases logging enabled")
+         except Exception as e:
+             logger.warning(
+                 f"Failed to initialize wandb: {e}. Continuing without wandb.")
+             wandb = None
+     else:
+         wandb = None
+ 
+     torch.backends.cudnn.benchmark = True
+ 
+     model = htr_convtext.create_model(
+         nb_cls=args.nb_cls, img_size=args.img_size[::-1])
+ 
+     total_param = sum(p.numel() for p in model.parameters())
+     logger.info('total_param is {}'.format(total_param))
+ 
+     model.train()
+     model = model.cuda()
+     ema_decay = args.ema_decay
+     logger.info(f"Using EMA decay: {ema_decay}")
+     model_ema = utils.ModelEma(model, ema_decay)
+     model.zero_grad()
+ 
+     resume_path = args.resume
+     best_cer, best_wer, start_iter, optimizer_state, train_loss, train_loss_count = utils.load_checkpoint(
+         model, model_ema, None, resume_path, logger)
+ 
+     logger.info('Loading train loader...')
+     train_dataset = dataset.myLoadDS(
+         args.train_data_list, args.data_path, args.img_size, dataset=args.dataset)
+     train_loader = torch.utils.data.DataLoader(train_dataset,
+                                                batch_size=args.train_bs,
+                                                shuffle=True,
+                                                pin_memory=True,
+                                                num_workers=args.num_workers,
+                                                collate_fn=partial(dataset.SameTrCollate, args=args))
+     train_iter = dataset.cycle_data(train_loader)
+ 
+     logger.info('Loading val loader...')
+     val_dataset = dataset.myLoadDS(
+         args.val_data_list, args.data_path, args.img_size, ralph=train_dataset.ralph, dataset=args.dataset)
+     val_loader = torch.utils.data.DataLoader(val_dataset,
+                                              batch_size=args.val_bs,
+                                              shuffle=False,
+                                              pin_memory=True,
+                                              num_workers=args.num_workers)
+ 
+     criterion = torch.nn.CTCLoss(reduction='none', zero_infinity=True)
+     converter = utils.CTCLabelConverter(train_dataset.ralph.values())
+ 
+     stoi, itos, pad_id = build_tcm_vocab(converter)
+     vocab_size_tcm = len(itos)
+     d_vis = model.embed_dim
+ 
+     if args.tcm_enable:
+         tcm_head = TCMHead(d_vis=d_vis, vocab_size_tcm=vocab_size_tcm, pad_id=pad_id,
+                            sub_str_len=args.tcm_sub_len).cuda()
+         tcm_head.train()
+     else:
+         tcm_head = None
+ 
+     param_groups = list(model.parameters())
+     if args.tcm_enable and tcm_head is not None:
+         param_groups += list(tcm_head.parameters())
+         logger.info(
+             f"Optimizing {sum(p.numel() for p in tcm_head.parameters())} tcm params in addition to model params")
+     optimizer = sam.SAM(param_groups, torch.optim.AdamW,
+                         lr=1e-7, betas=(0.9, 0.99), weight_decay=args.weight_decay)
+ 
+     if optimizer_state is not None:
+         try:
+             optimizer.load_state_dict(optimizer_state)
+             logger.info("Successfully loaded optimizer state")
+         except Exception as e:
+             logger.warning(f"Failed to load optimizer state: {e}")
+             logger.info(
+                 "Continuing training without optimizer state (will restart from initial lr/momentum)")
+     elif resume_path and os.path.isfile(resume_path):
+         try:
+             ckpt = torch.load(resume_path, map_location='cpu',
+                               weights_only=False)
+             if 'optimizer' in ckpt:
+                 optimizer.load_state_dict(ckpt['optimizer'])
+                 logger.info("Loaded optimizer state from checkpoint directly")
+         except Exception as e:
+             logger.warning(
+                 f"Could not load optimizer state from checkpoint: {e}")
+ 
+     if resume_path and os.path.isfile(resume_path) and tcm_head is not None:
+         try:
+             ckpt = torch.load(resume_path, map_location='cpu',
+                               weights_only=False)
+             if 'tcm_head' in ckpt:
+                 tcm_head.load_state_dict(ckpt['tcm_head'], strict=False)
+                 logger.info("Restored tcm head state from checkpoint")
+             else:
+                 logger.info(
+                     "No tcm head state found in checkpoint; training tcm from scratch")
+         except Exception as e:
+             logger.warning(f"Failed to restore tcm head from checkpoint: {e}")
+ 
+     def _make_checkpoint():
+         # One payload shared by the periodic / best-CER / best-WER saves below;
+         # the three inline copies in the original were identical.
+         ckpt = {
+             'model': model.state_dict(),
+             'state_dict_ema': model_ema.ema.state_dict(),
+             'optimizer': optimizer.state_dict(),
+             'nb_iter': nb_iter,
+             'best_cer': best_cer,
+             'best_wer': best_wer,
+             'args': vars(args),
+             'random_state': random.getstate(),
+             'numpy_state': np.random.get_state(),
+             'torch_state': torch.get_rng_state(),
+             'torch_cuda_state': torch.cuda.get_rng_state() if torch.cuda.is_available() else None,
+             'train_loss': train_loss,
+             'train_loss_count': train_loss_count,
+         }
+         if tcm_head is not None:
+             ckpt['tcm_head'] = tcm_head.state_dict()
+         return ckpt
+ 
+     #### ---- train & eval ---- ####
+     logger.info('Start training...')
+     accum_steps = max(1, int(getattr(args, 'accum_steps', 1)))
+     avg_loss_ctc = 0.0
+     avg_loss_tcm = 0.0
+ 
+     for nb_iter in range(start_iter, args.total_iter):
+         optimizer, current_lr = utils.update_lr_cos(
+             nb_iter, args.warm_up_iter, args.total_iter, args.max_lr, optimizer)
+         optimizer.zero_grad()
+         total_loss_this_macro = 0.0
+         avg_loss_ctc = 0.0
+         avg_loss_tcm = 0.0
+         cached_batches = []
+         for micro_step in range(accum_steps):
+             batch = next(train_iter)
+             cached_batches.append(batch)
+             image = batch[0].cuda(non_blocking=True)
+             text, length = converter.encode(batch[1])
+             batch_size = image.size(0)
+             if args.use_masking:
+                 # Alternative kept for reference: average the three masking plans via
+                 # tri_masked_loss(args, model, tcm_head, image, batch[1], batch_size,
+                 #                 criterion, converter, nb_iter, args.ctc_lambda,
+                 #                 args.tcm_lambda, stoi, r_rand=args.r_rand,
+                 #                 r_block=args.r_block, block_span=args.block_span,
+                 #                 r_span=args.r_span, max_span=args.max_span)
+                 loss, loss_ctc, loss_tcm = compute_losses(
+                     args, model, tcm_head, image, batch[1], batch_size, criterion, converter,
+                     nb_iter, args.ctc_lambda, args.tcm_lambda, stoi,
+                     mask_mode='span', mask_ratio=0.4, max_span_length=8, use_masking=True
+                 )
+             else:
+                 loss, loss_ctc, loss_tcm = compute_losses(
+                     args, model, tcm_head, image, batch[1], batch_size, criterion, converter,
+                     nb_iter, args.ctc_lambda, args.tcm_lambda, stoi, use_masking=False
+                 )
+             (loss / accum_steps).backward()
+             total_loss_this_macro += loss.item()
+             avg_loss_ctc += loss_ctc.mean().item()
+             avg_loss_tcm += loss_tcm.mean().item()
+ 
+         optimizer.first_step(zero_grad=True)
+ 
+         # Recompute with perturbed weights and accumulate again for the second step
+         for micro_step in range(accum_steps):
+             batch = cached_batches[micro_step]
+             image = batch[0].cuda(non_blocking=True)
+             text, length = converter.encode(batch[1])
+             batch_size = image.size(0)
+             if args.use_masking:
+                 # (same tri_masked_loss alternative as in the first pass)
+                 loss2, loss_ctc, loss_tcm = compute_losses(
+                     args, model, tcm_head, image, batch[1], batch_size, criterion, converter,
+                     nb_iter, args.ctc_lambda, args.tcm_lambda, stoi,
+                     mask_mode='span', mask_ratio=0.4, max_span_length=8, use_masking=True
+                 )
+             else:
+                 loss2, loss_ctc, loss_tcm = compute_losses(
+                     args, model, tcm_head, image, batch[1], batch_size, criterion, converter,
+                     nb_iter, args.ctc_lambda, args.tcm_lambda, stoi, use_masking=False
+                 )
+             (loss2 / accum_steps).backward()
+         optimizer.second_step(zero_grad=True)
+         model.zero_grad()
+         model_ema.update(model, num_updates=nb_iter / 2)
+ 
+         train_loss += total_loss_this_macro / accum_steps
+         train_loss_count += 1
+ 
+         if nb_iter % args.print_iter == 0:
+             train_loss_avg = train_loss / train_loss_count if train_loss_count > 0 else 0.0
+ 
+             logger.info(
+                 f'Iter : {nb_iter} \t LR : {current_lr:0.5f} \t total : {train_loss_avg:0.5f} \t CTC : {(avg_loss_ctc/accum_steps):0.5f} \t tcm : {(avg_loss_tcm/accum_steps):0.5f} \t ')
+ 
+             writer.add_scalar('./Train/lr', current_lr, nb_iter)
+             writer.add_scalar('./Train/train_loss', train_loss_avg, nb_iter)
+             if wandb is not None:
+                 wandb.log({
+                     'train/lr': current_lr,
+                     'train/loss': train_loss_avg,
+                     'train/CTC': (avg_loss_ctc / accum_steps),
+                     'train/tcm': (avg_loss_tcm / accum_steps),
+                     'iter': nb_iter,
+                 }, step=nb_iter)
+             train_loss = 0.0
+             train_loss_count = 0
+ 
+         if nb_iter % args.eval_iter == 0:
+             model.eval()
+             with torch.no_grad():
+                 val_loss, val_cer, val_wer, preds, labels = valid.validation(
+                     model_ema.ema, criterion, val_loader, converter)
+ 
+             if nb_iter % (args.eval_iter * 5) == 0:  # fix: `%` binds tighter than `*`, so the old test fired on every eval
+                 ckpt_name = f"checkpoint_{best_cer:.4f}_{best_wer:.4f}_{nb_iter}.pth"
+                 torch.save(_make_checkpoint(), os.path.join(args.save_dir, ckpt_name))
+ 
+             if val_cer < best_cer:
+                 logger.info(
+                     f'CER improved from {best_cer:.4f} to {val_cer:.4f}!!!')
+                 best_cer = val_cer
+                 torch.save(_make_checkpoint(), os.path.join(args.save_dir, 'best_CER.pth'))
+ 
+             if val_wer < best_wer:
+                 logger.info(
+                     f'WER improved from {best_wer:.4f} to {val_wer:.4f}!!!')
+                 best_wer = val_wer
+                 torch.save(_make_checkpoint(), os.path.join(args.save_dir, 'best_WER.pth'))
+ 
+             logger.info(
+                 f'Val. loss : {val_loss:0.3f} \t CER : {val_cer:0.4f} \t WER : {val_wer:0.4f} \t ')
+ 
+             writer.add_scalar('./VAL/CER', val_cer, nb_iter)
+             writer.add_scalar('./VAL/WER', val_wer, nb_iter)
+             writer.add_scalar('./VAL/bestCER', best_cer, nb_iter)
+             writer.add_scalar('./VAL/bestWER', best_wer, nb_iter)
+             writer.add_scalar('./VAL/val_loss', val_loss, nb_iter)
+             if wandb is not None:
+                 wandb.log({
+                     'val/loss': val_loss,
+                     'val/CER': val_cer,
+                     'val/WER': val_wer,
+                     'val/best_CER': best_cer,
+                     'val/best_WER': best_wer,
+                     'iter': nb_iter,
+                 }, step=nb_iter)
+             model.train()
+ 
+ 
+ if __name__ == '__main__':
+     main()
utils/option.py ADDED
@@ -0,0 +1,235 @@
+ import argparse
+ 
+ 
+ def get_args_parser() -> argparse.Namespace:
+     """Create and parse command-line options for HTR-ConvText.
+ 
+     This keeps all option names and defaults intact, but organizes them into
+     logical groups with clearer help messages.
+     """
+     parser = argparse.ArgumentParser(
+         description='HTR-ConvText: Leveraging Convolution and Textual Context with Mixed Masking for Handwritten Text Recognition',
+         add_help=True,
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+ 
+     # ---------------------------------------------------------------------
+     # Experiment & Logging
+     # ---------------------------------------------------------------------
+     exp = parser.add_argument_group('Experiment & Logging')
+     exp.add_argument('--out-dir', type=str, default='./output',
+                      help='Root directory to save logs, checkpoints, and outputs')
+     exp.add_argument('--exp-name', type=str, default='IAM_HTR_ORIGAMI_NET',
+                      help='Experiment name; results go to <out-dir>/<exp-name>')
+     exp.add_argument('--seed', default=123, type=int,
+                      help='Random seed for reproducibility')
+     exp.add_argument('--use-wandb', action='store_true', default=False,
+                      help='Log to Weights & Biases; otherwise use TensorBoard')
+     exp.add_argument('--wandb-project', type=str, default='None',
+                      help='W&B project name (used only if --use-wandb)')
+     exp.add_argument('--print-iter', default=100, type=int,
+                      help='Iterations between training status prints')
+     exp.add_argument('--eval-iter', default=1000, type=int,
+                      help='Iterations between validation runs')
+ 
+     # ---------------------------------------------------------------------
+     # Data & Dataloading
+     # ---------------------------------------------------------------------
+     data = parser.add_argument_group('Data & Dataloading')
+     data.add_argument('--dataset', type=str, choices=['iam', 'read2016', 'lam', 'vnondb'],
+                       help='Dataset choice')
+     data.add_argument('--data-path', type=str, default='./data/iam/lines/',
+                       help='Root directory containing image/line data')
+     data.add_argument('--train-data-list', type=str, default='./data/iam/train.ln',
+                       help='Path to training list file (e.g., .ln)')
+     data.add_argument('--val-data-list', type=str, default='./data/iam/val.ln',
+                       help='Path to validation list file (e.g., .ln)')
+     data.add_argument('--test-data-list', type=str, default='./data/iam/test.ln',
+                       help='Path to test list file (e.g., .ln)')
+     data.add_argument('--nb-cls', default=80, type=int,
+                       help='Number of classes. IAM=79+1, READ2016=89+1, LAM=90+1, VNOnDB=161+1')
+     data.add_argument('--num-workers', default=0, type=int,
+                       help='Dataloader worker processes')
+     data.add_argument('--img-size', default=[512, 64], type=int, nargs='+',
+                       help='Input image size [W, H]')
+     data.add_argument('--patch-size', default=[4, 32], type=int, nargs='+',
+                       help='Patch size [W, H] for patch embedding')
+ 
+     # ---------------------------------------------------------------------
+     # Training Schedule & Optimization
+     # ---------------------------------------------------------------------
+     train = parser.add_argument_group('Training Schedule & Optimization')
+     train.add_argument('--train-bs', default=8, type=int,
+                        help='Training batch size per iteration')
+     train.add_argument('--accum-steps', default=1, type=int,
+                        help='Gradient accumulation steps; effective batch = train-bs * accum-steps')
+     train.add_argument('--val-bs', default=1, type=int,
+                        help='Validation/test batch size')
+     train.add_argument('--total-iter', default=100000, type=int,
+                        help='Total training iterations')
+     train.add_argument('--warm-up-iter', default=1000, type=int,
+                        help='Warm-up iterations for the optimizer/scheduler')
+     train.add_argument('--max-lr', default=1e-3, type=float,
+                        help='Peak learning rate')
+     train.add_argument('--weight-decay', default=5e-1, type=float,
+                        help='Weight decay (L2) regularization')
+     train.add_argument('--ema-decay', default=0.9999, type=float,
+                        help='Exponential Moving Average (EMA) decay factor for model weights')
+     train.add_argument('--alpha', default=0, type=float,
+                        help='KL-divergence loss ratio (if applicable)')
+ 
+     # ---------------------------------------------------------------------
+     # Model & Encoder
+     # ---------------------------------------------------------------------
+     model = parser.add_argument_group('Model & Encoder')
+     model.add_argument('--model-type', default='ctc', type=str, choices=['ctc', 'encoder_decoder'],
+                        help='Model family to train/use')
+     model.add_argument('--cos-temp', default=8, type=int,
+                        help='Cosine-similarity classifier temperature')
+     model.add_argument('--proj', default=8, type=float,
+                        help='Projection dimension or scaling for classifier head')
+     model.add_argument('--attn-mask-ratio', default=0., type=float,
+                        help='Attention drop-key mask ratio')
+ 
+     # ---------------------------------------------------------------------
+     # Masking Strategy
+     # ---------------------------------------------------------------------
+     mask = parser.add_argument_group('Masking Strategy')
+     mask.add_argument('--use-masking', action='store_true', default=False,
+                       help='Enable masking strategy during training')
+     mask.add_argument('--mask-ratio', default=0.3, type=float,
+                       help='Overall proportion of tokens/patches to mask')
+     mask.add_argument('--max-span-length', default=4, type=int,
+                       help='Max length for individual span masks')
+     mask.add_argument('--spacing', default=0, type=int,
+                       help='Minimum spacing between two span masks')
+     # Tri-masking schedule ratios
+     mask.add_argument('--r-rand', dest='r_rand', default=0.6, type=float,
+                       help='Ratio for random masking in tri-masking schedule')
+     mask.add_argument('--r-block', dest='r_block', default=0.6, type=float,
+                       help='Ratio for block masking in tri-masking schedule')
+     mask.add_argument('--block-span', dest='block_span', default=4, type=int,
+                       help='Block span length for block masking')
+     mask.add_argument('--r-span', dest='r_span', default=0.4, type=float,
+                       help='Ratio for span masking in tri-masking schedule')
+     mask.add_argument('--max-span', dest='max_span', default=8, type=int,
+                       help='Max span length for span masking')
+ 
+     # ---------------------------------------------------------------------
+     # Data Augmentations
+     # ---------------------------------------------------------------------
+     aug = parser.add_argument_group('Data Augmentations')
+     aug.add_argument('--dpi-min-factor', default=0.5, type=float,
+                      help='Minimum scaling factor for DPI-based resize')
+     aug.add_argument('--dpi-max-factor', default=1.5, type=float,
+                      help='Maximum scaling factor for DPI-based resize')
+     aug.add_argument('--perspective-low', default=0., type=float,
+                      help='Lower bound for perspective transform magnitude')
+     aug.add_argument('--perspective-high', default=0.4, type=float,
+                      help='Upper bound for perspective transform magnitude')
+     aug.add_argument('--elastic-distortion-min-kernel-size', default=3, type=int,
+                      help='Minimum kernel size for elastic distortion grid')
+     aug.add_argument('--elastic-distortion-max-kernel-size', default=3, type=int,
+                      help='Maximum kernel size for elastic distortion grid')
+     aug.add_argument('--elastic-distortion-max-magnitude', default=20, type=int,  # flag spelling normalized; dest is unchanged
+                      help='Maximum distortion magnitude for elastic transforms')
+     aug.add_argument('--elastic-distortion-min-alpha', default=0.5, type=float,
+                      help='Minimum alpha for elastic distortion')
+     aug.add_argument('--elastic-distortion-max-alpha', default=1, type=float,
+                      help='Maximum alpha for elastic distortion')
+     aug.add_argument('--elastic-distortion-min-sigma', default=1, type=int,
+                      help='Minimum sigma for Gaussian in elastic distortion')
+     aug.add_argument('--elastic-distortion-max-sigma', default=10, type=int,
+                      help='Maximum sigma for Gaussian in elastic distortion')
+     aug.add_argument('--dila-ero-max-kernel', default=3, type=int,
+                      help='Max kernel size for dilation/erosion ops')
+     aug.add_argument('--dila-ero-iter', default=1, type=int,
+                      help='Iterations for dilation/erosion')
+     aug.add_argument('--jitter-contrast', default=0.4, type=float,
+                      help='ColorJitter: contrast range')
+     aug.add_argument('--jitter-brightness', default=0.4, type=float,
+                      help='ColorJitter: brightness range')
+     aug.add_argument('--jitter-saturation', default=0.4, type=float,
+                      help='ColorJitter: saturation range')
+     aug.add_argument('--jitter-hue', default=0.2, type=float,
+                      help='ColorJitter: hue range')
+     aug.add_argument('--blur-min-kernel', default=3, type=int,
+                      help='Minimum kernel size for Gaussian blur')
+     aug.add_argument('--blur-max-kernel', default=5, type=int,
+                      help='Maximum kernel size for Gaussian blur')
+     aug.add_argument('--blur-min-sigma', default=3, type=int,
+                      help='Minimum sigma for Gaussian blur')
+     aug.add_argument('--blur-max-sigma', default=5, type=int,
+                      help='Maximum sigma for Gaussian blur')
+     aug.add_argument('--sharpen-min-alpha', default=0, type=int,
+                      help='Minimum alpha/mix for sharpening')
+     aug.add_argument('--sharpen-max-alpha', default=1, type=int,
+                      help='Maximum alpha/mix for sharpening')
+     aug.add_argument('--sharpen-min-strength', default=0, type=int,
+                      help='Minimum sharpening strength')
+     aug.add_argument('--sharpen-max-strength', default=1, type=int,
+                      help='Maximum sharpening strength')
+     aug.add_argument('--zoom-min-h', default=0.8, type=float,
+                      help='Minimum vertical zoom factor')
+     aug.add_argument('--zoom-max-h', default=1, type=float,
+                      help='Maximum vertical zoom factor')
+     aug.add_argument('--zoom-min-w', default=0.99, type=float,
+                      help='Minimum horizontal zoom factor')
+     aug.add_argument('--zoom-max-w', default=1, type=float,
+                      help='Maximum horizontal zoom factor')
+     aug.add_argument('--proba', default=0.5, type=float,
+                      help='Default probability for applying stochastic augmentations')
+ 
+     # ---------------------------------------------------------------------
+     # Decoder & Inference (for encoder-decoder mode)
+     # ---------------------------------------------------------------------
+     dec = parser.add_argument_group('Decoder & Inference')
+     dec.add_argument('--decoder-layers', default=6, type=int,
+                      help='Number of Transformer decoder layers')
+     dec.add_argument('--decoder-heads', default=8, type=int,
+                      help='Number of attention heads in decoder')
+     dec.add_argument('--max-seq-len', default=256, type=int,
+                      help='Maximum output sequence length')
+     dec.add_argument('--label-smoothing', default=0.1, type=float,
+                      help='Label-smoothing factor for cross-entropy loss')
+     dec.add_argument('--beam-size', default=5, type=int,
+                      help='Beam size for beam-search decoding')
+     dec.add_argument('--generation-method', default='nucleus', type=str,
+                      choices=['greedy', 'nucleus', 'beam_search'],
+                      help='Token generation method for inference')
+     dec.add_argument('--generation-temperature', default=0.7, type=float,
+                      help='Sampling temperature (used by nucleus/greedy sampling)')
+     dec.add_argument('--repetition-penalty', default=1.3, type=float,
+                      help='Penalty to discourage token repetition during generation')
+     dec.add_argument('--top-p', default=0.9, type=float,
+                      help='Top-p threshold for nucleus sampling')
+ 
+     # ---------------------------------------------------------------------
+     # TCM (Textual Context Module)
+     # ---------------------------------------------------------------------
+     tcm = parser.add_argument_group('TCM (Textual Context Module)')
+     tcm.add_argument('--tcm-enable', action='store_true', default=False,
+                      help='Enable Textual Context Module (TCM)')
+     tcm.add_argument('--tcm-lambda', default=1.0, type=float,
+                      help='TCM loss weight (λ2 in the paper)')
+     tcm.add_argument('--ctc-lambda', default=0.1, type=float,
+                      help='CTC loss weight (λ1 in the paper)')
+     tcm.add_argument('--tcm-sub-len', default=5, type=int,
+                      help='TCM context sub-string length')
+     tcm.add_argument('--tcm-warmup-iters', default=0, type=int,
+                      help='Warm-up iterations before activating TCM (0 = start immediately)')
+ 
+     # ---------------------------------------------------------------------
+     # Checkpointing & Pretrained Weights
+     # ---------------------------------------------------------------------
+     ckpt = parser.add_argument_group('Checkpointing & Pretrained Weights')
+     ckpt.add_argument('--resume', type=str, default=None,
+                       help='Resume training from a checkpoint (alias)')
+     ckpt.add_argument('--load-model', type=str, default=None,
+                       help='Load a full pretrained model for fine-tuning')
+     ckpt.add_argument('--load-encoder-only', action='store_true', default=False,
+                       help='Load only encoder weights (transfer learning)')
+     ckpt.add_argument('--strict-loading', action='store_true', default=True,
+                       help='Use strict key matching when loading weights')
+ 
+     return parser.parse_args()
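Because get_args_parser() calls parse_args() itself, train.py and test.py share one flag surface. For quick programmatic checks the parser can be driven through sys.argv (editorial sketch, not part of the commit; assumes it runs from the repo root):

# Editorial sketch: invoking the parser programmatically (not part of the commit).
import sys
from utils import option

sys.argv = ['train.py', '--dataset', 'iam', '--tcm-enable', '--nb-cls', '80']
args = option.get_args_parser()
print(args.dataset, args.nb_cls, args.tcm_enable)  # iam 80 True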
utils/sam.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ 
+ 
+ class SAM(torch.optim.Optimizer):
+     def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs):
+         assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"
+ 
+         defaults = dict(rho=rho, adaptive=adaptive, **kwargs)
+         super(SAM, self).__init__(params, defaults)
+ 
+         self.base_optimizer = base_optimizer(self.param_groups, **kwargs)
+         self.param_groups = self.base_optimizer.param_groups
+         self.defaults.update(self.base_optimizer.defaults)
+ 
+     @torch.no_grad()
+     def first_step(self, zero_grad=False):
+         grad_norm = self._grad_norm()
+         for group in self.param_groups:
+             scale = group["rho"] / (grad_norm + 1e-12)
+ 
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+                 self.state[p]["old_p"] = p.data.clone()
+                 e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p)
+                 p.add_(e_w)  # climb to the local maximum "w + e(w)"
+ 
+         if zero_grad:
+             self.zero_grad()
+ 
+     @torch.no_grad()
+     def second_step(self, zero_grad=False):
+         for group in self.param_groups:
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+                 p.data = self.state[p]["old_p"]  # get back to "w" from "w + e(w)"
+ 
+         self.base_optimizer.step()  # do the actual "sharpness-aware" update
+ 
+         if zero_grad:
+             self.zero_grad()
+ 
+     @torch.no_grad()
+     def step(self, closure=None):
+         assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided"
+         closure = torch.enable_grad()(closure)  # the closure should do a full forward-backward pass
+ 
+         self.first_step(zero_grad=True)
+         closure()
+         self.second_step()
+ 
+     def _grad_norm(self):
+         shared_device = self.param_groups[0]["params"][0].device  # put everything on the same device, in case of model parallelism
+         norm = torch.norm(
+             torch.stack([
+                 ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device)
+                 for group in self.param_groups for p in group["params"]
+                 if p.grad is not None
+             ]),
+             p=2
+         )
+         return norm
+ 
+     def load_state_dict(self, state_dict):
+         super().load_state_dict(state_dict)
+         self.base_optimizer.param_groups = self.param_groups
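The two-phase API above needs two forward/backward passes per update, which is exactly how train.py drives it. A minimal usage sketch on a toy model (editorial, not part of the commit; train.py's accumulation loop is the real reference):

# Editorial sketch of the SAM two-step protocol (not part of the commit).
import torch
from utils.sam import SAM

model = torch.nn.Linear(4, 2)
opt = SAM(model.parameters(), torch.optim.AdamW, rho=0.05, lr=1e-3)

x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
loss_fn = torch.nn.CrossEntropyLoss()

loss_fn(model(x), y).backward()   # pass 1: gradients at w
opt.first_step(zero_grad=True)    # perturb to w + e(w)
loss_fn(model(x), y).backward()   # pass 2: gradients at the perturbed point
opt.second_step(zero_grad=True)   # restore w, then apply the base AdamW step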
utils/utils.py ADDED
@@ -0,0 +1,276 @@
+ import torch
+ import torch.distributed as dist
+ from torch.distributions.uniform import Uniform
+ 
+ import os
+ import re
+ import sys
+ import math
+ import logging
+ from copy import deepcopy
+ from collections import OrderedDict
+ import random
+ import numpy as np
+ 
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ 
+ 
+ def randint(low, high):
+     return int(torch.randint(low, high, (1, )))
+ 
+ 
+ def rand_uniform(low, high):
+     return float(Uniform(low, high).sample())
+ 
+ 
+ def get_logger(out_dir):
+     logger = logging.getLogger('Exp')
+     logger.setLevel(logging.INFO)
+     formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
+ 
+     file_path = os.path.join(out_dir, "run.log")
+     file_hdlr = logging.FileHandler(file_path)
+     file_hdlr.setFormatter(formatter)
+ 
+     strm_hdlr = logging.StreamHandler(sys.stdout)
+     strm_hdlr.setFormatter(formatter)
+ 
+     logger.addHandler(file_hdlr)
+     logger.addHandler(strm_hdlr)
+     return logger
+ 
+ 
+ def update_lr_cos(nb_iter, warm_up_iter, total_iter, max_lr, optimizer, min_lr=1e-7):
+     if nb_iter < warm_up_iter:
+         current_lr = max_lr * (nb_iter + 1) / (warm_up_iter + 1)
+     else:
+         current_lr = min_lr + (max_lr - min_lr) * 0.5 * (1. + math.cos(math.pi * nb_iter / (total_iter - warm_up_iter)))
+ 
+     for param_group in optimizer.param_groups:
+         param_group["lr"] = current_lr
+ 
+     return optimizer, current_lr
+ 
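The schedule ramps linearly to max_lr over the warm-up, then decays along a cosine toward min_lr. A quick worked check of its endpoints (editorial sketch of the same formula, not part of the commit):

# Editorial sketch: sampling the update_lr_cos schedule (not part of the commit).
import math

def lr_at(i, warm_up=1000, total=100000, max_lr=1e-3, min_lr=1e-7):
    if i < warm_up:
        return max_lr * (i + 1) / (warm_up + 1)
    return min_lr + (max_lr - min_lr) * 0.5 * (1. + math.cos(math.pi * i / (total - warm_up)))

print(lr_at(0))       # ~1e-6, start of warm-up
print(lr_at(999))     # ~1e-3, end of warm-up
print(lr_at(50_000))  # ~4.9e-4, roughly halfway down the cosine
print(lr_at(98_999))  # ~min_lr as i approaches total - warm_up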
+ 
+ class CTCLabelConverter(object):
+     def __init__(self, character):
+         dict_character = list(character)
+         self.dict = {}
+         for i, char in enumerate(dict_character):
+             self.dict[char] = i + 1  # index 0 is reserved for the CTC blank
+         if len(self.dict) == 87:  # '[' and ']' are not in the test set but in the training and validation sets.
+             self.dict['['], self.dict[']'] = 88, 89
+         self.character = ['[blank]'] + dict_character
+ 
+     def encode(self, text):
+         length = [len(s) for s in text]
+         text = ''.join(text)
+         text = [self.dict[char] for char in text]
+ 
+         return (torch.IntTensor(text).to(device), torch.IntTensor(length).to(device))
+ 
+     def decode(self, text_index, length):
+         texts = []
+         index = 0
+ 
+         for l in length:
+             t = text_index[index:index + l]
+             char_list = []
+             for i in range(l):
+                 # keep a character only if it is not blank, differs from its
+                 # predecessor, and is a valid index into the vocabulary
+                 if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])) and t[i] < len(self.character):
+                     char_list.append(self.character[t[i]])
+             text = ''.join(char_list)
+ 
+             texts.append(text)
+             index += l
+         return texts
+ 
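decode applies the standard CTC greedy rule: merge repeated indices, then drop blanks (index 0). A worked example (editorial sketch, not part of the commit; assumes a converter built over the characters 'a', 'b', 'c'):

# Editorial sketch of CTC greedy collapsing in decode (not part of the commit).
import torch
conv = CTCLabelConverter('abc')                 # a=1, b=2, c=3; 0 is [blank]
raw = torch.tensor([1, 1, 0, 2, 2, 0, 2, 3])    # per-frame argmax indices
print(conv.decode(raw, [len(raw)]))             # ['abbc'] -- repeats merged, blanks removed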
88
+
89
+ class Averager(object):
90
+ def __init__(self):
91
+ self.reset()
92
+
93
+ def add(self, v):
94
+ count = v.data.numel()
95
+ v = v.data.sum()
96
+ self.n_count += count
97
+ self.sum += v
98
+
99
+ def reset(self):
100
+ self.n_count = 0
101
+ self.sum = 0
102
+
103
+ def val(self):
104
+ res = 0
105
+ if self.n_count != 0:
106
+ res = self.sum / float(self.n_count)
107
+ return res
108
+
109
+
+ class Metric(object):
+     """Distributed running mean: each update() all-reduces the value across ranks."""
+     def __init__(self, name=''):
+         self.name = name
+         self.sum = torch.tensor(0.).double()
+         self.n = torch.tensor(0.)
+
+     def update(self, val):
+         rt = val.clone()
+         dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+         rt /= dist.get_world_size()
+         self.sum += rt.detach().cpu().double()
+         self.n += 1
+
+     @property
+     def avg(self):
+         return self.sum / self.n.double()
+
+
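Metric assumes torch.distributed is already initialized. A single-process sketch using the gloo backend, where the address, port, and values are arbitrary:

dist.init_process_group('gloo', init_method='tcp://127.0.0.1:29500',
                        rank=0, world_size=1)
m = Metric('loss')
m.update(torch.tensor(2.0))
m.update(torch.tensor(4.0))
print(m.avg)   # tensor(3., dtype=torch.float64)
dist.destroy_process_group()
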
+ class ModelEma:
+     """Keeps an exponential moving average of a model's weights for evaluation."""
+     def __init__(self, model, decay=0.9999, device='', resume=''):
+         self.ema = deepcopy(model)
+         self.ema.eval()
+         self.decay = decay
+         self.device = device
+         if device:
+             self.ema.to(device=device)
+         self.ema_has_module = hasattr(self.ema, 'module')
+         if resume:
+             self._load_checkpoint(resume)
+         for p in self.ema.parameters():
+             p.requires_grad_(False)
+
+     def _load_checkpoint(self, checkpoint_path, mapl=None):
+         checkpoint = torch.load(checkpoint_path, map_location=mapl)
+         assert isinstance(checkpoint, dict)
+         if 'state_dict_ema' in checkpoint:
+             new_state_dict = OrderedDict()
+             for k, v in checkpoint['state_dict_ema'].items():
+                 # Align key names with the (possibly DDP-wrapped) EMA copy.
+                 if self.ema_has_module:
+                     name = 'module.' + k if not k.startswith('module') else k
+                 else:
+                     name = k
+                 new_state_dict[name] = v
+             self.ema.load_state_dict(new_state_dict)
+             print("=> Loaded state_dict_ema")
+         else:
+             print("=> Failed to find state_dict_ema, starting from loaded model weights")
+
+     def update(self, model, num_updates=-1):
+         needs_module = hasattr(model, 'module') and not self.ema_has_module
+         if num_updates >= 0:
+             # Ramp the decay up early in training so the EMA tracks the model faster at first.
+             _cdecay = min(self.decay, (1 + num_updates) / (10 + num_updates))
+         else:
+             _cdecay = self.decay
+
+         with torch.no_grad():
+             msd = model.state_dict()
+             for k, ema_v in self.ema.state_dict().items():
+                 if needs_module:
+                     k = 'module.' + k
+                 model_v = msd[k].detach()
+                 if self.device:
+                     model_v = model_v.to(device=self.device)
+                 ema_v.copy_(ema_v * _cdecay + (1. - _cdecay) * model_v)
+
+
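A minimal sketch of the usual EMA workflow; the tiny model and step count are placeholders:

model = torch.nn.Linear(8, 8)
model_ema = ModelEma(model, decay=0.999)
for step in range(100):
    # ... optimizer.step() on `model` would happen here ...
    model_ema.update(model, num_updates=step)
eval_model = model_ema.ema    # evaluate with the smoothed weights
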
+ def format_string_for_wer(s):
+     # Put spaces around punctuation so each mark counts as its own "word",
+     # then collapse runs of whitespace. Raw strings avoid invalid-escape warnings.
+     s = re.sub(r'([\[\]{}/\\()"\'&+*=<>?.;:,!\-—_€#%°])', r' \1 ', s)
+     s = re.sub(r'([ \n])+', " ", s).strip()
+     return s
+
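For example, punctuation becomes its own token:

print(format_string_for_wer("Hello, world!"))   # "Hello , world !"
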
+ def load_checkpoint(model, model_ema, optimizer, checkpoint_path, logger):
+     best_cer, best_wer, start_iter = 1e+6, 1e+6, 1
+     train_loss, train_loss_count = 0.0, 0
+     optimizer_state = None
+     if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+         logger.info(f"Resuming from checkpoint: {checkpoint_path}")
+         checkpoint = torch.load(
+             checkpoint_path, map_location='cpu', weights_only=False)
+
+         # Load model state dict (handle module prefix like in test.py).
+         # Escaped dot so only the literal 'module.' prefix is stripped.
+         model_dict = OrderedDict()
+         pattern = re.compile(r'module\.')
+
+         # For main model, load from the 'model' state dict
+         # (the training checkpoint contains both 'model' and 'state_dict_ema')
+         if 'model' in checkpoint:
+             source_dict = checkpoint['model']
+             logger.info("Loading main model from 'model' state dict")
+         elif 'state_dict_ema' in checkpoint:
+             source_dict = checkpoint['state_dict_ema']
+             logger.info(
+                 "Loading main model from 'state_dict_ema' (fallback)")
+         else:
+             raise KeyError(
+                 "Neither 'model' nor 'state_dict_ema' found in checkpoint")
+
+         for k, v in source_dict.items():
+             if re.search("module", k):
+                 model_dict[re.sub(pattern, '', k)] = v
+             else:
+                 model_dict[k] = v
+
+         model.load_state_dict(model_dict, strict=True)
+         logger.info("Successfully loaded main model state dict")
+
+         # Load EMA state dict if available
+         if 'state_dict_ema' in checkpoint and model_ema is not None:
+             ema_dict = OrderedDict()
+             for k, v in checkpoint['state_dict_ema'].items():
+                 if re.search("module", k):
+                     ema_dict[re.sub(pattern, '', k)] = v
+                 else:
+                     ema_dict[k] = v
+             model_ema.ema.load_state_dict(ema_dict, strict=True)
+             logger.info("Successfully loaded EMA model state dict")
+
+         # Load optimizer state - handle SAM optimizer structure.
+         # The state is only staged here and applied after the optimizer is built.
+         if 'optimizer' in checkpoint and optimizer is not None:
+             try:
+                 optimizer_state = checkpoint['optimizer']
+                 logger.info(
+                     "Optimizer state will be loaded after optimizer initialization")
+             except Exception as e:
+                 logger.warning(f"Failed to prepare optimizer state: {e}")
+                 optimizer_state = None
+
+         # Load metrics from checkpoint if available
+         if 'best_cer' in checkpoint:
+             best_cer = checkpoint['best_cer']
+         if 'best_wer' in checkpoint:
+             best_wer = checkpoint['best_wer']
+         if 'nb_iter' in checkpoint:
+             start_iter = checkpoint['nb_iter'] + 1
+
+         # Parse CER, WER, iter from filename as fallback
+         m = re.search(
+             r'checkpoint_(?P<cer>[\d\.]+)_(?P<wer>[\d\.]+)_(?P<iter>\d+)\.pth', checkpoint_path)
+         if m and 'best_cer' not in checkpoint:
+             best_cer = float(m.group('cer'))
+             best_wer = float(m.group('wer'))
+             start_iter = int(m.group('iter')) + 1
+
+         if 'train_loss' in checkpoint:
+             train_loss = checkpoint['train_loss']
+         if 'train_loss_count' in checkpoint:
+             train_loss_count = checkpoint['train_loss_count']
+         if 'random_state' in checkpoint:
+             random.setstate(checkpoint['random_state'])
+             logger.info("Restored random state")
+         if 'numpy_state' in checkpoint:
+             np.random.set_state(checkpoint['numpy_state'])
+             logger.info("Restored numpy random state")
+         if 'torch_state' in checkpoint:
+             torch.set_rng_state(checkpoint['torch_state'])
+             logger.info("Restored torch random state")
+         if 'torch_cuda_state' in checkpoint and torch.cuda.is_available():
+             torch.cuda.set_rng_state(checkpoint['torch_cuda_state'])
+             logger.info("Restored torch cuda random state")
+
+         # Sanity check: report the parameter count of the restored model
+         total_params = sum(p.numel() for p in model.parameters())
+         logger.info(f"Model loaded with {total_params} total parameters")
+
+         logger.info(
+             f"Resumed best_cer={best_cer}, best_wer={best_wer}, start_iter={start_iter}")
+     return best_cer, best_wer, start_iter, optimizer_state, train_loss, train_loss_count
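For reference, a checkpoint that this loader can fully restore would be saved with matching keys. A hedged sketch only: the file name pattern mirrors the fallback regex above, and all variables are assumed to come from the surrounding training loop, which is not shown in this upload:

ckpt = {
    'model': model.state_dict(),
    'state_dict_ema': model_ema.ema.state_dict(),
    'optimizer': optimizer.state_dict(),
    'best_cer': best_cer, 'best_wer': best_wer, 'nb_iter': nb_iter,
    'train_loss': train_loss, 'train_loss_count': train_loss_count,
    'random_state': random.getstate(),
    'numpy_state': np.random.get_state(),
    'torch_state': torch.get_rng_state(),
}
if torch.cuda.is_available():
    ckpt['torch_cuda_state'] = torch.cuda.get_rng_state()
torch.save(ckpt, f'checkpoint_{best_cer:.4f}_{best_wer:.4f}_{nb_iter}.pth')
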
valid.py ADDED
@@ -0,0 +1,77 @@
+ import torch
+ import torch.utils.data
+ import torch.backends.cudnn as cudnn
+
+ from utils import utils
+ import editdistance
+
+
+ def validation(model, criterion, evaluation_loader, converter):
+     """ validation or evaluation """
+
+     norm_ED = 0
+     norm_ED_wer = 0
+
+     tot_ED = 0
+     tot_ED_wer = 0
+
+     valid_loss = 0.0
+     length_of_gt = 0
+     length_of_gt_wer = 0
+     count = 0
+     all_preds_str = []
+     all_labels = []
+
+     for i, (image_tensors, labels) in enumerate(evaluation_loader):
+         batch_size = image_tensors.size(0)
+         image = image_tensors.cuda()
+
+         text_for_loss, length_for_loss = converter.encode(labels)
+
+         preds = model(image)
+         preds = preds.float()
+         preds_size = torch.IntTensor([preds.size(1)] * batch_size)
+         preds = preds.permute(1, 0, 2).log_softmax(2)
+
+         # Temporarily disable cuDNN while computing the CTC loss.
+         torch.backends.cudnn.enabled = False
+         cost = criterion(preds, text_for_loss, preds_size, length_for_loss).mean()
+         torch.backends.cudnn.enabled = True
+
+         # Greedy (best-path) decoding of the CTC output.
+         _, preds_index = preds.max(2)
+         preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
+         preds_str = converter.decode(preds_index.data, preds_size.data)
+         valid_loss += cost.item()
+         count += 1
+
+         all_preds_str.extend(preds_str)
+         all_labels.extend(labels)
+
+         # Character error rate: edit distance over characters.
+         for pred_cer, gt_cer in zip(preds_str, labels):
+             tmp_ED = editdistance.eval(pred_cer, gt_cer)
+             if len(gt_cer) == 0:
+                 norm_ED += 1
+             else:
+                 norm_ED += tmp_ED / float(len(gt_cer))
+             tot_ED += tmp_ED
+             length_of_gt += len(gt_cer)
+
+         # Word error rate: edit distance over whitespace-separated tokens.
+         for pred_wer, gt_wer in zip(preds_str, labels):
+             pred_wer = utils.format_string_for_wer(pred_wer)
+             gt_wer = utils.format_string_for_wer(gt_wer)
+             pred_wer = pred_wer.split(" ")
+             gt_wer = gt_wer.split(" ")
+             tmp_ED_wer = editdistance.eval(pred_wer, gt_wer)
+
+             if len(gt_wer) == 0:
+                 norm_ED_wer += 1
+             else:
+                 norm_ED_wer += tmp_ED_wer / float(len(gt_wer))
+
+             tot_ED_wer += tmp_ED_wer
+             length_of_gt_wer += len(gt_wer)
+
+     val_loss = valid_loss / count
+     CER = tot_ED / float(length_of_gt)
+     WER = tot_ED_wer / float(length_of_gt_wer)
+
+     return val_loss, CER, WER, all_preds_str, all_labels
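A typical call site for validation; the loader, CUDA model, and charset are placeholders, and the CTCLoss reduction and zero_infinity settings are assumptions about the training script, not confirmed by this upload:

criterion = torch.nn.CTCLoss(reduction='none', zero_infinity=True).cuda()
converter = utils.CTCLabelConverter(charset)   # charset: string of all symbols
model.eval()
with torch.no_grad():
    val_loss, cer, wer, preds, gts = validation(model, criterion, val_loader, converter)
print(f"loss={val_loss:.4f} CER={cer:.4f} WER={wer:.4f}")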