init
Browse files- README.md +910 -0
- upload_to_huggingface.py +6 -1
README.md
ADDED
|
@@ -0,0 +1,910 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: mit
|
| 5 |
+
tags:
|
| 6 |
+
- handwriting-recognition
|
| 7 |
+
- ocr
|
| 8 |
+
- computer-vision
|
| 9 |
+
- pytorch
|
| 10 |
+
- crnn
|
| 11 |
+
- ctc
|
| 12 |
+
- iam-dataset
|
| 13 |
+
library_name: pytorch
|
| 14 |
+
datasets:
|
| 15 |
+
- Teklia/IAM-line
|
| 16 |
+
metrics:
|
| 17 |
+
- cer
|
| 18 |
+
- wer
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# ๐๏ธ Handwriting Recognition with Deep Learning
|
| 22 |
+
|
| 23 |
+
<div align="center">
|
| 24 |
+
|
| 25 |
+
[](https://huggingface.co/IsmatS/handwriting-recognition-iam)
|
| 26 |
+
[](https://huggingface.co/datasets/Teklia/IAM-line)
|
| 27 |
+
[](LICENSE)
|
| 28 |
+
[](https://pytorch.org/)
|
| 29 |
+
|
| 30 |
+
**A complete end-to-end handwriting recognition system using CNN-BiLSTM-CTC architecture**
|
| 31 |
+
|
| 32 |
+
[๐ฏ Model](#-trained-model) โข [๐ Dataset Analysis](#-dataset-insights) โข [๐๏ธ Architecture](#๏ธ-model-architecture) โข [๐ Performance](#-training-results) โข [๐ Quick Start](#-quick-start)
|
| 33 |
+
|
| 34 |
+
</div>
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## ๐ฏ Overview
|
| 39 |
+
|
| 40 |
+
This project implements a state-of-the-art **Handwriting Recognition** system that converts handwritten text images into digital text. The model achieves **87% character-level accuracy** on the IAM Handwriting Database.
|
| 41 |
+
|
| 42 |
+
### Key Highlights
|
| 43 |
+
|
| 44 |
+
- โ
**CNN-BiLSTM-CTC Architecture** - Industry-standard OCR architecture
|
| 45 |
+
- โ
**9.1M Parameters** - Efficient yet powerful model
|
| 46 |
+
- โ
**CER: 12.95%** - High character recognition accuracy
|
| 47 |
+
- โ
**IAM Dataset** - 10,000+ handwritten text samples
|
| 48 |
+
- โ
**Google Colab Compatible** - Train on free GPU
|
| 49 |
+
- โ
**Production Ready** - Complete inference pipeline
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## ๐ Resources
|
| 54 |
+
|
| 55 |
+
| Resource | Link | Description |
|
| 56 |
+
|----------|------|-------------|
|
| 57 |
+
| **๐ค Trained Model** | [IsmatS/handwriting-recognition-iam](https://huggingface.co/IsmatS/handwriting-recognition-iam) | Pre-trained weights (105MB) |
|
| 58 |
+
| **๐ฆ Dataset** | [Teklia/IAM-line](https://huggingface.co/datasets/Teklia/IAM-line) | IAM Handwriting Database |
|
| 59 |
+
| **๐ Training Notebook** | `train_colab.ipynb` | Full training pipeline |
|
| 60 |
+
| **๐ Analysis Notebook** | `analysis.ipynb` | Dataset exploration |
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
|
| 64 |
+
## ๐ Dataset Insights
|
| 65 |
+
|
| 66 |
+
The **IAM Handwriting Database** is one of the most widely-used datasets for handwriting recognition research. Here's what we discovered:
|
| 67 |
+
|
| 68 |
+
### Dataset Statistics
|
| 69 |
+
|
| 70 |
+
| Split | Samples | Usage |
|
| 71 |
+
|-------|---------|-------|
|
| 72 |
+
| **Train** | 6,482 | Model training |
|
| 73 |
+
| **Validation** | 976 | Hyperparameter tuning |
|
| 74 |
+
| **Test** | 2,915 | Final evaluation |
|
| 75 |
+
| **Total** | 10,373 | Complete dataset |
|
| 76 |
+
|
| 77 |
+
### ๐ธ Sample Images
|
| 78 |
+
|
| 79 |
+
Real handwritten text samples from the dataset:
|
| 80 |
+
|
| 81 |
+

|
| 82 |
+
|
| 83 |
+
**Observations:**
|
| 84 |
+
- โ๏ธ Diverse writing styles (cursive, print, mixed)
|
| 85 |
+
- ๐ Variable text lengths (10-100+ characters)
|
| 86 |
+
- ๐จ Different pen types and ink intensity
|
| 87 |
+
- ๐ Natural variations in slant and spacing
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
### ๐ Text Length Distribution
|
| 92 |
+
|
| 93 |
+

|
| 94 |
+
|
| 95 |
+
**Key Insights:**
|
| 96 |
+
- ๐ **Mean length**: ~48-60 characters per line
|
| 97 |
+
- ๐ **Peak**: 40-70 character range (most common)
|
| 98 |
+
- ๐ข **Range**: 5-150 characters
|
| 99 |
+
- ๐ฏ **Implication**: Model must handle variable-length sequences efficiently
|
| 100 |
+
|
| 101 |
+
**Why this matters:** The CTC (Connectionist Temporal Classification) loss function in our model is specifically designed to handle this variability without requiring character-level alignment annotations.
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
### ๐ Image Dimensions Analysis
|
| 106 |
+
|
| 107 |
+

|
| 108 |
+
|
| 109 |
+
**Dimensional Characteristics:**
|
| 110 |
+
|
| 111 |
+
| Metric | Width | Height | Aspect Ratio |
|
| 112 |
+
|--------|-------|--------|--------------|
|
| 113 |
+
| **Mean** | ~400-500px | ~50-100px | ~6-8:1 |
|
| 114 |
+
| **Min** | ~100px | ~30px | ~3:1 |
|
| 115 |
+
| **Max** | ~1200px | ~150px | ~15:1 |
|
| 116 |
+
|
| 117 |
+
**Engineering Decision:**
|
| 118 |
+
- ๐ **Fixed height**: Resize to 128px (preserves vertical features)
|
| 119 |
+
- ๐ **Variable width**: Maintain aspect ratio (prevents distortion)
|
| 120 |
+
- ๐ฏ **Result**: Preserves legibility while standardizing input
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
### ๐ค Character Frequency Analysis
|
| 125 |
+
|
| 126 |
+

|
| 127 |
+
|
| 128 |
+
**Character Distribution:**
|
| 129 |
+
- ๐ก **Lowercase dominates**: 'e', 't', 'a', 'o', 'n' (English frequency)
|
| 130 |
+
- ๐ **Capitals less common**: Sentence beginnings, proper nouns
|
| 131 |
+
- ๐ข **Numbers rare**: Limited numeric content
|
| 132 |
+
- โ๏ธ **Punctuation**: Periods, commas most frequent
|
| 133 |
+
|
| 134 |
+
**Implications:**
|
| 135 |
+
- ๐ **74 unique characters**: a-z, A-Z, 0-9, space, punctuation
|
| 136 |
+
- โ๏ธ **Class imbalance**: Model sees more common characters
|
| 137 |
+
- ๐ **Training strategy**: No special balancing needed (mirrors real-world text)
|
| 138 |
+
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
### ๐ Summary Statistics
|
| 142 |
+
|
| 143 |
+

|
| 144 |
+
|
| 145 |
+
**Complete Statistical Overview:**
|
| 146 |
+
- ๐ Min/Max/Mean for all features
|
| 147 |
+
- ๐ Standard deviations
|
| 148 |
+
- ๐ฏ Quartile distributions
|
| 149 |
+
- ๐ Outlier detection
|
| 150 |
+
|
| 151 |
+
---
|
| 152 |
+
|
| 153 |
+
## ๐๏ธ Model Architecture
|
| 154 |
+
|
| 155 |
+
Our **CNN-BiLSTM-CTC** architecture combines three powerful components:
|
| 156 |
+
|
| 157 |
+
```
|
| 158 |
+
Input Image (128 x Variable Width)
|
| 159 |
+
โ
|
| 160 |
+
โโโโโโโโโโโโโโโโ
|
| 161 |
+
โ CNN Layers โ โ Extract visual features
|
| 162 |
+
โ (7 blocks) โ (edges, strokes, characters)
|
| 163 |
+
โโโโโโโโโโโโโโโโ
|
| 164 |
+
โ
|
| 165 |
+
Feature Maps (512 channels)
|
| 166 |
+
โ
|
| 167 |
+
โโโโโโโโโโโโโโโโ
|
| 168 |
+
โ BiLSTM โ โ Model sequential dependencies
|
| 169 |
+
โ (2 layers) โ (left-to-right + right-to-left)
|
| 170 |
+
โโโโโโโโโโโโโโโโ
|
| 171 |
+
โ
|
| 172 |
+
โโโโโโโโโโโโโโโโ
|
| 173 |
+
โ CTC Decoder โ โ Alignment-free decoding
|
| 174 |
+
โ (75 chars) โ (handles variable lengths)
|
| 175 |
+
โโโโโโโโโโโโโโโโ
|
| 176 |
+
โ
|
| 177 |
+
Predicted Text
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
### Component Breakdown
|
| 181 |
+
|
| 182 |
+
#### 1๏ธโฃ **CNN Feature Extractor** (7 Convolutional Blocks)
|
| 183 |
+
|
| 184 |
+
| Block | Layers | Output Channels | Purpose |
|
| 185 |
+
|-------|--------|-----------------|---------|
|
| 186 |
+
| 1 | Conv + BN + ReLU + MaxPool | 64 | Basic edge detection |
|
| 187 |
+
| 2 | Conv + BN + ReLU + MaxPool | 128 | Stroke patterns |
|
| 188 |
+
| 3 | Conv + BN + ReLU | 256 | Character components |
|
| 189 |
+
| 4 | Conv + BN + ReLU + MaxPool(2,1) | 256 | Horizontal compression |
|
| 190 |
+
| 5 | Conv + BN + ReLU | 512 | Complex features |
|
| 191 |
+
| 6 | Conv + BN + ReLU + MaxPool(2,1) | 512 | Further compression |
|
| 192 |
+
| 7 | Conv + BN + ReLU | 512 | Final features |
|
| 193 |
+
|
| 194 |
+
**Key Design Choices:**
|
| 195 |
+
|
| 196 |
+
| Design Decision | Rationale |
|
| 197 |
+
|----------------|-----------|
|
| 198 |
+
| **Batch Normalization** | Normalizes activations โ faster training, prevents internal covariate shift |
|
| 199 |
+
| **Asymmetric pooling (2,1)** | Compress height but preserve width โ maintains character boundaries |
|
| 200 |
+
| **Progressive channels (64โ512)** | More filters = richer features at deeper layers |
|
| 201 |
+
| **No pooling in Conv 3,5** | Maintains spatial resolution for detail preservation |
|
| 202 |
+
|
| 203 |
+
**Why Asymmetric MaxPool (2,1)?**
|
| 204 |
+
|
| 205 |
+
```
|
| 206 |
+
Regular MaxPool (2,2):
|
| 207 |
+
Image: [128, 400] โ [64, 200] โ [32, 100] โ [16, 50]
|
| 208 |
+
Problem: Loses too much horizontal resolution โ
|
| 209 |
+
Result: Character boundaries blur together
|
| 210 |
+
|
| 211 |
+
Asymmetric MaxPool (2,1):
|
| 212 |
+
Image: [128, 400] โ [64, 400] โ [32, 400] โ [16, 400]
|
| 213 |
+
Benefit: Preserves horizontal details โ
|
| 214 |
+
Result: Each character remains distinct
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
#### 2๏ธโฃ **Bidirectional LSTM** (Sequence Modeling)
|
| 218 |
+
|
| 219 |
+
```
|
| 220 |
+
Configuration:
|
| 221 |
+
- Input Size: 256
|
| 222 |
+
- Hidden Size: 256
|
| 223 |
+
- Num Layers: 2
|
| 224 |
+
- Bidirectional: Yes (512 output)
|
| 225 |
+
- Dropout: 0.3
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
**Why BiLSTM?**
|
| 229 |
+
- โฌ
๏ธ **Forward pass**: Reads left-to-right (like humans)
|
| 230 |
+
- โก๏ธ **Backward pass**: Reads right-to-left (context from future)
|
| 231 |
+
- ๐ **Combined**: Each character sees full sentence context
|
| 232 |
+
|
| 233 |
+
#### 3๏ธโฃ **CTC Loss** (Alignment-Free Training)
|
| 234 |
+
|
| 235 |
+
**Advantages:**
|
| 236 |
+
- ๐ฏ No character-level position labels needed
|
| 237 |
+
- ๐ Handles variable-length input/output
|
| 238 |
+
- ๐ Learns temporal alignment automatically
|
| 239 |
+
- โ
Industry standard for OCR/speech recognition
|
| 240 |
+
|
| 241 |
+
**Total Parameters:** 9,139,147 (~9.1M)
|
| 242 |
+
|
| 243 |
+
---
|
| 244 |
+
|
| 245 |
+
### ๐ Deep Dive: How the Model Works
|
| 246 |
+
|
| 247 |
+
#### Step-by-Step Processing Pipeline
|
| 248 |
+
|
| 249 |
+
**1. Image Input Processing**
|
| 250 |
+
```
|
| 251 |
+
Original Image: "Hello" (handwritten)
|
| 252 |
+
โ
|
| 253 |
+
Resize: Height=128px, Width proportional
|
| 254 |
+
โ
|
| 255 |
+
Normalize: Pixel values from [0,255] โ [-1,1]
|
| 256 |
+
โ
|
| 257 |
+
Tensor Shape: [Batch=1, Channels=1, Height=128, Width=W]
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
**2. CNN Feature Extraction**
|
| 261 |
+
|
| 262 |
+
The CNN progressively extracts hierarchical visual features:
|
| 263 |
+
|
| 264 |
+
| Layer Type | What It Detects | Example |
|
| 265 |
+
|------------|-----------------|---------|
|
| 266 |
+
| **Conv1-2 (64-128 ch)** | Edges, lines, curves | Vertical strokes, horizontal bars |
|
| 267 |
+
| **Conv3-4 (256 ch)** | Stroke combinations | Letter parts: tops of 't', loops in 'e' |
|
| 268 |
+
| **Conv5-7 (512 ch)** | Character-level features | Distinguish 'o' from 'a', 'n' from 'h' |
|
| 269 |
+
|
| 270 |
+
**Output:** Feature map of shape `[Batch, 512, 7, W_reduced]`
|
| 271 |
+
- Height reduced: 128 โ 7 (18x compression)
|
| 272 |
+
- Width reduced: ~W โ W/4 (4x compression)
|
| 273 |
+
- Channels increased: 1 โ 512 (rich features)
|
| 274 |
+
|
| 275 |
+
**3. Sequence-to-Sequence Mapping**
|
| 276 |
+
|
| 277 |
+
```python
|
| 278 |
+
# Convert 2D feature map to 1D sequence
|
| 279 |
+
Feature Map: [B, 512, 7, W/4]
|
| 280 |
+
โ
|
| 281 |
+
Reshape: [B, W/4, 512*7] = [B, W/4, 3584]
|
| 282 |
+
โ
|
| 283 |
+
Linear Layer: [B, W/4, 3584] โ [B, W/4, 256]
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
Now we have a **temporal sequence** where each time step represents a horizontal segment of the image.
|
| 287 |
+
|
| 288 |
+
**4. BiLSTM Sequential Modeling**
|
| 289 |
+
|
| 290 |
+
```
|
| 291 |
+
Time step t:
|
| 292 |
+
Forward LSTM โ Reads: "H" "e" "l" "l" "o"
|
| 293 |
+
Backward LSTM โ Reads: "o" "l" "l" "e" "H"
|
| 294 |
+
โ
|
| 295 |
+
Concatenate: [forward_256, backward_256] = 512
|
| 296 |
+
โ
|
| 297 |
+
Context-aware representation for each character
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
**Why bidirectional matters:**
|
| 301 |
+
- Forward: "H" knows it's at the start of a word
|
| 302 |
+
- Backward: "H" knows "ello" comes after it
|
| 303 |
+
- Combined: Better prediction accuracy
|
| 304 |
+
|
| 305 |
+
**5. CTC Decoding**
|
| 306 |
+
|
| 307 |
+
```
|
| 308 |
+
LSTM Output: [B, W/4, 512]
|
| 309 |
+
โ
|
| 310 |
+
Linear: [B, W/4, 512] โ [B, W/4, 75] (75 = 74 chars + blank)
|
| 311 |
+
โ
|
| 312 |
+
Softmax: Probability distribution over characters
|
| 313 |
+
โ
|
| 314 |
+
CTC Decode: Remove blanks and duplicates
|
| 315 |
+
```
|
| 316 |
+
|
| 317 |
+
**Example CTC Alignment:**
|
| 318 |
+
```
|
| 319 |
+
Model output (frame by frame):
|
| 320 |
+
[-, -, H, H, H, -, e, e, -, l, l, l, -, l, -, o, o, -, -]
|
| 321 |
+
|
| 322 |
+
CTC decoding:
|
| 323 |
+
- Remove blanks (-)
|
| 324 |
+
- Collapse repeats
|
| 325 |
+
Result: "Hello" โ
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
---
|
| 329 |
+
|
| 330 |
+
### ๐ Understanding the Metrics
|
| 331 |
+
|
| 332 |
+
#### **CER (Character Error Rate)**
|
| 333 |
+
|
| 334 |
+
CER measures the **edit distance** at character level using Levenshtein distance.
|
| 335 |
+
|
| 336 |
+
**Formula:**
|
| 337 |
+
```
|
| 338 |
+
CER = (Insertions + Deletions + Substitutions) / Total_Characters_in_Ground_Truth
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
**Example Calculation:**
|
| 342 |
+
|
| 343 |
+
| Ground Truth | Prediction | Operations | CER |
|
| 344 |
+
|--------------|-----------|------------|-----|
|
| 345 |
+
| `hello` (5 chars) | `helo` | 1 deletion ('l') | 1/5 = **20%** |
|
| 346 |
+
| `hello` (5 chars) | `hallo` | 1 substitution ('e'โ'a') | 1/5 = **20%** |
|
| 347 |
+
| `hello` (5 chars) | `helloo` | 1 insertion ('o') | 1/5 = **20%** |
|
| 348 |
+
| `hello` (5 chars) | `hello` | 0 errors | 0/5 = **0%** โ
|
|
| 349 |
+
|
| 350 |
+
**Our Model Performance:**
|
| 351 |
+
```
|
| 352 |
+
CER = 12.95%
|
| 353 |
+
|
| 354 |
+
Example with 100 characters:
|
| 355 |
+
- Ground truth: 100 characters
|
| 356 |
+
- Errors: ~13 character mistakes
|
| 357 |
+
- Correct: ~87 characters โ
|
| 358 |
+
|
| 359 |
+
Character-level accuracy: 87.05%
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
**What CER tells us:**
|
| 363 |
+
- โ
Lower is better (0% = perfect)
|
| 364 |
+
- โ
Character-by-character accuracy
|
| 365 |
+
- โ
Sensitive to small mistakes
|
| 366 |
+
- โ
Good for measuring overall quality
|
| 367 |
+
|
| 368 |
+
---
|
| 369 |
+
|
| 370 |
+
#### **WER (Word Error Rate)**
|
| 371 |
+
|
| 372 |
+
WER measures the **edit distance** at word level.
|
| 373 |
+
|
| 374 |
+
**Formula:**
|
| 375 |
+
```
|
| 376 |
+
WER = (Word_Insertions + Word_Deletions + Word_Substitutions) / Total_Words_in_Ground_Truth
|
| 377 |
+
```
|
| 378 |
+
|
| 379 |
+
**Example Calculation:**
|
| 380 |
+
|
| 381 |
+
| Ground Truth | Prediction | Word Errors | WER |
|
| 382 |
+
|--------------|-----------|-------------|-----|
|
| 383 |
+
| `hello world` (2 words) | `helo world` | 1 error ('hello'โ'helo') | 1/2 = **50%** |
|
| 384 |
+
| `hello world` (2 words) | `hello world` | 0 errors | 0/2 = **0%** โ
|
|
| 385 |
+
| `the quick brown fox` (4 words) | `the quik brown fox` | 1 error ('quick'โ'quik') | 1/4 = **25%** |
|
| 386 |
+
|
| 387 |
+
**Our Model Performance:**
|
| 388 |
+
```
|
| 389 |
+
WER = 42.47%
|
| 390 |
+
|
| 391 |
+
Example with 100 words:
|
| 392 |
+
- Ground truth: 100 words
|
| 393 |
+
- Word errors: ~42 words have at least 1 character wrong
|
| 394 |
+
- Correct words: ~58 words โ
|
| 395 |
+
|
| 396 |
+
Word-level accuracy: 57.53%
|
| 397 |
+
```
|
| 398 |
+
|
| 399 |
+
**Why WER > CER?**
|
| 400 |
+
|
| 401 |
+
One character error corrupts the entire word:
|
| 402 |
+
|
| 403 |
+
```
|
| 404 |
+
Ground Truth: "The magnificent castle stood tall"
|
| 405 |
+
Prediction: "The magnifcent castle stood tall"
|
| 406 |
+
โ missing 'i'
|
| 407 |
+
|
| 408 |
+
Character errors: 1
|
| 409 |
+
Word errors: 1 (entire word "magnificent" is wrong)
|
| 410 |
+
|
| 411 |
+
CER = 1/33 = 3.0%
|
| 412 |
+
WER = 1/5 = 20% โ Much higher!
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
**What WER tells us:**
|
| 416 |
+
- โ
More strict than CER
|
| 417 |
+
- โ
Real-world usability measure
|
| 418 |
+
- โ
High WER with low CER = mostly correct characters but words incomplete
|
| 419 |
+
- โ ๏ธ Can be harsh on OCR systems
|
| 420 |
+
|
| 421 |
+
---
|
| 422 |
+
|
| 423 |
+
#### **CTC Loss**
|
| 424 |
+
|
| 425 |
+
The loss function used during training.
|
| 426 |
+
|
| 427 |
+
**What is CTC Loss?**
|
| 428 |
+
|
| 429 |
+
Connectionist Temporal Classification (CTC) solves the **alignment problem** in sequence-to-sequence tasks.
|
| 430 |
+
|
| 431 |
+
**The Problem CTC Solves:**
|
| 432 |
+
|
| 433 |
+
Traditional approaches need exact character positions:
|
| 434 |
+
```
|
| 435 |
+
Image: "Hello"
|
| 436 |
+
Required labels:
|
| 437 |
+
- 'H' at pixels 0-20
|
| 438 |
+
- 'e' at pixels 21-35
|
| 439 |
+
- 'l' at pixels 36-50
|
| 440 |
+
- 'l' at pixels 51-65
|
| 441 |
+
- 'o' at pixels 66-80
|
| 442 |
+
```
|
| 443 |
+
|
| 444 |
+
This is **impossible to annotate** for handwriting!
|
| 445 |
+
|
| 446 |
+
**CTC Solution:**
|
| 447 |
+
|
| 448 |
+
Just provide the text: `"Hello"` โ
|
| 449 |
+
|
| 450 |
+
CTC figures out the alignment automatically:
|
| 451 |
+
|
| 452 |
+
```
|
| 453 |
+
Input Frames: |---|---|---|---|---|---|---|---|---|
|
| 454 |
+
Model Output: | - | H | H | e | - | l | l | o | - |
|
| 455 |
+
โ โ โ โ โ โ โ โ โ
|
| 456 |
+
CTC Decoding: Remove blanks (-) and collapse repeats
|
| 457 |
+
Result: "Hello" โ
|
| 458 |
+
```
|
| 459 |
+
|
| 460 |
+
**How CTC Training Works:**
|
| 461 |
+
|
| 462 |
+
1. **Blank token (ฮต)**: Special symbol for "no character"
|
| 463 |
+
2. **Multiple alignments**: Many ways to align same text
|
| 464 |
+
3. **Sum probabilities**: CTC sums all valid alignments
|
| 465 |
+
|
| 466 |
+
**Example:**
|
| 467 |
+
```
|
| 468 |
+
Target: "Hi"
|
| 469 |
+
|
| 470 |
+
Valid alignments:
|
| 471 |
+
- [H, i, -, -]
|
| 472 |
+
- [-, H, i, -]
|
| 473 |
+
- [H, H, i, i]
|
| 474 |
+
- [-, H, -, i]
|
| 475 |
+
... many more!
|
| 476 |
+
|
| 477 |
+
CTC Loss = -log(sum of probabilities of all valid paths)
|
| 478 |
+
```
|
| 479 |
+
|
| 480 |
+
**Why CTC is Powerful:**
|
| 481 |
+
|
| 482 |
+
โ
**No alignment needed**: Just text labels
|
| 483 |
+
โ
**Handles variable lengths**: Input 100 frames โ Output 5 characters
|
| 484 |
+
โ
**Robust**: Learns best alignment automatically
|
| 485 |
+
โ
**Standard**: Used in speech recognition, OCR, handwriting
|
| 486 |
+
|
| 487 |
+
**CTC During Inference:**
|
| 488 |
+
|
| 489 |
+
```python
|
| 490 |
+
# Model outputs probabilities for each frame
|
| 491 |
+
output = model(image) # Shape: [time_steps, batch, num_chars]
|
| 492 |
+
|
| 493 |
+
# Greedy decoding (simple approach)
|
| 494 |
+
best_path = torch.argmax(output, dim=2) # Pick most likely char per frame
|
| 495 |
+
# Example: [-, -, H, H, e, e, -, l, l, l, o, -]
|
| 496 |
+
|
| 497 |
+
# CTC collapse
|
| 498 |
+
result = collapse_repeats_and_remove_blanks(best_path)
|
| 499 |
+
# Result: "Hello"
|
| 500 |
+
```
|
| 501 |
+
|
| 502 |
+
**Advanced: Beam Search Decoding**
|
| 503 |
+
|
| 504 |
+
Instead of greedy (picking top-1), beam search keeps top-K possibilities:
|
| 505 |
+
- More accurate but slower
|
| 506 |
+
- Can incorporate language models
|
| 507 |
+
- Used in production systems
|
| 508 |
+
|
| 509 |
+
---
|
| 510 |
+
|
| 511 |
+
### ๐ฏ Model Performance Analysis
|
| 512 |
+
|
| 513 |
+
#### Accuracy by Character Type
|
| 514 |
+
|
| 515 |
+
Based on validation results, approximate accuracy:
|
| 516 |
+
|
| 517 |
+
| Character Type | Accuracy | Notes |
|
| 518 |
+
|---------------|----------|-------|
|
| 519 |
+
| **Lowercase (a-z)** | ~90% | Most common, well-learned |
|
| 520 |
+
| **Uppercase (A-Z)** | ~85% | Less training data |
|
| 521 |
+
| **Digits (0-9)** | ~80% | Rare in dataset |
|
| 522 |
+
| **Space** | ~95% | Easy to detect |
|
| 523 |
+
| **Punctuation (.,'")** | ~75% | Often confused or missed |
|
| 524 |
+
|
| 525 |
+
#### Common Confusions
|
| 526 |
+
|
| 527 |
+
Based on error analysis:
|
| 528 |
+
|
| 529 |
+
| Ground Truth | Often Predicted As | Reason |
|
| 530 |
+
|--------------|-------------------|--------|
|
| 531 |
+
| `e` | `c`, `o` | Similar circular shapes |
|
| 532 |
+
| `n` | `u`, `r` | Stroke similarity |
|
| 533 |
+
| `a` | `o`, `e` | Loop closure ambiguity |
|
| 534 |
+
| `i` | `l`, `t` | Vertical strokes |
|
| 535 |
+
| `rn` | `m` | Combined strokes look like 'm' |
|
| 536 |
+
| `cl` | `d` | Close proximity โ merged |
|
| 537 |
+
|
| 538 |
+
**Mitigation Strategies:**
|
| 539 |
+
- ๐ Data augmentation focusing on confusable pairs
|
| 540 |
+
- ๐ Language model post-processing (spell check)
|
| 541 |
+
- ๐ฏ Attention mechanisms to focus on character boundaries
|
| 542 |
+
|
| 543 |
+
---
|
| 544 |
+
|
| 545 |
+
## ๐ Training Results
|
| 546 |
+
|
| 547 |
+
### Training Configuration
|
| 548 |
+
|
| 549 |
+
| Hyperparameter | Value | Why This Value? |
|
| 550 |
+
|----------------|-------|-----------------|
|
| 551 |
+
| **Epochs** | 10 | Sweet spot for convergence; more epochs show diminishing returns |
|
| 552 |
+
| **Batch Size** | 8 | Balanced: Large enough for stable gradients, small enough for GPU memory |
|
| 553 |
+
| **Learning Rate** | 0.001 | Standard Adam LR; reduced automatically by scheduler if plateauing |
|
| 554 |
+
| **Optimizer** | Adam | Adaptive learning rates per parameter; industry standard |
|
| 555 |
+
| **Scheduler** | ReduceLROnPlateau | Reduces LR by 50% if validation loss doesn't improve for 3 epochs |
|
| 556 |
+
| **Gradient Clip** | 5.0 | Prevents exploding gradients common in RNNs/LSTMs |
|
| 557 |
+
| **Image Height** | 128px | Balance between detail preservation and computational efficiency |
|
| 558 |
+
| **Dropout** | 0.3 | Regularization to prevent overfitting in LSTM layers |
|
| 559 |
+
|
| 560 |
+
#### Hyperparameter Rationale
|
| 561 |
+
|
| 562 |
+
**Why Batch Size = 8?**
|
| 563 |
+
```
|
| 564 |
+
Larger batch (16+):
|
| 565 |
+
โ
Faster training
|
| 566 |
+
โ Requires more GPU memory
|
| 567 |
+
โ Less gradient noise (can hurt generalization)
|
| 568 |
+
|
| 569 |
+
Smaller batch (4 or fewer):
|
| 570 |
+
โ
Fits in memory easily
|
| 571 |
+
โ
More gradient noise (better generalization)
|
| 572 |
+
โ Slower training
|
| 573 |
+
โ Unstable gradients
|
| 574 |
+
|
| 575 |
+
Batch=8: Sweet spot โ
|
| 576 |
+
```
|
| 577 |
+
|
| 578 |
+
**Why Gradient Clipping = 5.0?**
|
| 579 |
+
|
| 580 |
+
LSTMs are prone to exploding gradients:
|
| 581 |
+
```
|
| 582 |
+
Without clipping:
|
| 583 |
+
Gradient = 10,000 โ Model diverges โ
|
| 584 |
+
|
| 585 |
+
With clipping (max norm = 5.0):
|
| 586 |
+
Gradient = 10,000 โ Scaled down to 5.0 โ
|
| 587 |
+
Training remains stable
|
| 588 |
+
```
|
| 589 |
+
|
| 590 |
+
**Why ReduceLROnPlateau Scheduler?**
|
| 591 |
+
|
| 592 |
+
Automatically adjusts learning rate when training stalls:
|
| 593 |
+
```
|
| 594 |
+
Epoch 1-5: LR = 0.001 (loss decreasing rapidly)
|
| 595 |
+
Epoch 6-8: LR = 0.001 (loss plateau detected)
|
| 596 |
+
Epoch 9+: LR = 0.0005 (scheduler reduces by 50%)
|
| 597 |
+
โ Enables fine-tuning โ
|
| 598 |
+
```
|
| 599 |
+
|
| 600 |
+
### Training Progress
|
| 601 |
+
|
| 602 |
+

|
| 603 |
+
|
| 604 |
+
**Convergence Analysis:**
|
| 605 |
+
|
| 606 |
+
| Epoch | Train Loss | Val Loss | CER โ | WER โ | Status |
|
| 607 |
+
|-------|-----------|----------|-------|-------|--------|
|
| 608 |
+
| 1 | 3.2065 | 2.6728 | 100.0% | 100.0% | Random init |
|
| 609 |
+
| 2 | 1.6866 | 1.0331 | 29.3% | 71.8% | โก Rapid learning |
|
| 610 |
+
| 5 | 0.6004 | 0.5655 | 17.7% | 53.1% | ๐ฏ Good progress |
|
| 611 |
+
| 7 | 0.4868 | 0.4595 | 14.4% | 46.5% | ๐ Stable |
|
| 612 |
+
| **10** | **0.3923** | **0.3836** | **12.95%** | **42.5%** | โ
**Best** |
|
| 613 |
+
|
| 614 |
+
### Final Metrics
|
| 615 |
+
|
| 616 |
+
<div align="center">
|
| 617 |
+
|
| 618 |
+
| Metric | Value | Interpretation |
|
| 619 |
+
|--------|-------|----------------|
|
| 620 |
+
| **Character Error Rate (CER)** | **12.95%** | ๐ฏ **87% characters correct** |
|
| 621 |
+
| **Word Error Rate (WER)** | **42.47%** | โ
**57.5% words correct** |
|
| 622 |
+
| **Training Time** | ~20 minutes | โก On T4 GPU (10 epochs) |
|
| 623 |
+
|
| 624 |
+
</div>
|
| 625 |
+
|
| 626 |
+
**Why is WER higher than CER?**
|
| 627 |
+
- A single character error makes the entire word wrong
|
| 628 |
+
- Example: "splendid" โ "splondid" (1 char error = 1 word error)
|
| 629 |
+
- This is normal for OCR systems
|
| 630 |
+
|
| 631 |
+
---
|
| 632 |
+
|
| 633 |
+
## ๐ฌ Prediction Examples
|
| 634 |
+
|
| 635 |
+
### Sample Predictions (Validation Set)
|
| 636 |
+
|
| 637 |
+
| Ground Truth | Model Prediction | Analysis |
|
| 638 |
+
|--------------|------------------|----------|
|
| 639 |
+
| `It was a splendid interpretation of the` | `It was a splendid inteyetation of thatf` | โ
85% correct, minor char confusions |
|
| 640 |
+
| `sympathetic C O . Paul Daneman gave another` | `sympathetie CD. Sul abameman gave anotherf` | โ ๏ธ Struggles with names, punctuation |
|
| 641 |
+
| `part . The rest of the cast were well chosen ,` | `pat . The nit of the cast were well chosen .f .` | โ
Most words correct, extra punctuation |
|
| 642 |
+
|
| 643 |
+
**Common Error Patterns:**
|
| 644 |
+
- ๐ค Character confusions: `e`โ`c`, `r`โ`n`, `a`โ`o`
|
| 645 |
+
- ๐ค Proper nouns: Lower accuracy on names
|
| 646 |
+
- โ๏ธ Punctuation: Extra/missing spaces around symbols
|
| 647 |
+
- ๐ End-of-line artifacts: Extra `f` or `.` characters
|
| 648 |
+
|
| 649 |
+
---
|
| 650 |
+
|
| 651 |
+
## ๐ Quick Start
|
| 652 |
+
|
| 653 |
+
### 1๏ธโฃ Load Pre-trained Model
|
| 654 |
+
|
| 655 |
+
```python
|
| 656 |
+
from huggingface_hub import hf_hub_download
|
| 657 |
+
import torch
|
| 658 |
+
|
| 659 |
+
# Download model
|
| 660 |
+
model_path = hf_hub_download(
|
| 661 |
+
repo_id="IsmatS/handwriting-recognition-iam",
|
| 662 |
+
filename="best_model.pth"
|
| 663 |
+
)
|
| 664 |
+
|
| 665 |
+
# Load checkpoint
|
| 666 |
+
checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
|
| 667 |
+
print(f"Model trained for {checkpoint['epoch']} epochs")
|
| 668 |
+
print(f"Validation CER: {checkpoint['val_cer']:.4f}")
|
| 669 |
+
```
|
| 670 |
+
|
| 671 |
+
### 2๏ธโฃ Inference on Your Own Images
|
| 672 |
+
|
| 673 |
+
```python
|
| 674 |
+
from PIL import Image
|
| 675 |
+
import numpy as np
|
| 676 |
+
|
| 677 |
+
# Load your handwritten text image
|
| 678 |
+
img = Image.open('your_handwriting.png').convert('L')
|
| 679 |
+
|
| 680 |
+
# Preprocess (resize to height=128, maintain aspect ratio)
|
| 681 |
+
w, h = img.size
|
| 682 |
+
new_w = int(128 * (w / h))
|
| 683 |
+
img = img.resize((new_w, 128), Image.LANCZOS)
|
| 684 |
+
|
| 685 |
+
# Normalize
|
| 686 |
+
img_array = np.array(img, dtype=np.float32) / 255.0
|
| 687 |
+
img_array = (img_array - 0.5) / 0.5
|
| 688 |
+
|
| 689 |
+
# Convert to tensor
|
| 690 |
+
img_tensor = torch.FloatTensor(img_array).unsqueeze(0).unsqueeze(0)
|
| 691 |
+
|
| 692 |
+
# Predict (after loading model)
|
| 693 |
+
model.eval()
|
| 694 |
+
with torch.no_grad():
|
| 695 |
+
output = model(img_tensor)
|
| 696 |
+
prediction = decode_predictions(output, char_mapper)[0]
|
| 697 |
+
|
| 698 |
+
print(f"Predicted text: {prediction}")
|
| 699 |
+
```
|
| 700 |
+
|
| 701 |
+
### 3๏ธโฃ Train Your Own Model
|
| 702 |
+
|
| 703 |
+
```bash
|
| 704 |
+
# Upload train_colab.ipynb to Google Colab
|
| 705 |
+
# Set Runtime → Change runtime type → GPU (T4)
|
| 706 |
+
# Run all cells
|
| 707 |
+
|
| 708 |
+
# Training takes ~1-2 hours for 10 epochs
|
| 709 |
+
```
|
| 710 |
+
|
| 711 |
+
---
|
| 712 |
+
|
| 713 |
+
## 📦 Installation
|
| 714 |
+
|
| 715 |
+
```bash
|
| 716 |
+
# Clone repository
|
| 717 |
+
git clone https://huggingface.co/IsmatS/handwriting-recognition-iam
|
| 718 |
+
cd handwriting-recognition-iam
|
| 719 |
+
|
| 720 |
+
# Install dependencies
|
| 721 |
+
pip install -r requirements.txt
|
| 722 |
+
|
| 723 |
+
# Download dataset (automatic in notebooks)
|
| 724 |
+
# from datasets import load_dataset
|
| 725 |
+
# dataset = load_dataset("Teklia/IAM-line")
|
| 726 |
+
```
|
| 727 |
+
|
| 728 |
+
### Requirements
|
| 729 |
+
|
| 730 |
+
```
|
| 731 |
+
torch>=2.0.0
|
| 732 |
+
datasets>=2.14.0
|
| 733 |
+
pillow>=9.5.0
|
| 734 |
+
numpy>=1.24.0
|
| 735 |
+
matplotlib>=3.7.0
|
| 736 |
+
jiwer>=3.0.0
|
| 737 |
+
huggingface_hub>=0.16.0
|
| 738 |
+
```
|
| 739 |
+
|
| 740 |
+
---
|
| 741 |
+
|
| 742 |
+
## ๐ Project Structure
|
| 743 |
+
|
| 744 |
+
```
|
| 745 |
+
handwriting-recognition-iam/
|
| 746 |
+
โโโ ๐ train_colab.ipynb # Complete training pipeline
|
| 747 |
+
โโโ ๐ analysis.ipynb # Dataset exploration & EDA
|
| 748 |
+
โโโ ๐พ best_model.pth # Trained model checkpoint (105MB)
|
| 749 |
+
โโโ ๐ training_history.png # Training curves visualization
|
| 750 |
+
โโโ ๐ requirements.txt # Python dependencies
|
| 751 |
+
โโโ ๐ README.md # This file
|
| 752 |
+
โโโ ๐ charts/ # Dataset analysis visualizations
|
| 753 |
+
โโโ 01_sample_images.png
|
| 754 |
+
โโโ 02_text_length_distribution.png
|
| 755 |
+
โโโ 03_image_dimensions.png
|
| 756 |
+
โโโ 04_character_frequency.png
|
| 757 |
+
โโโ 05_summary_statistics.png
|
| 758 |
+
```
|
| 759 |
+
|
| 760 |
+
---
|
| 761 |
+
|
| 762 |
+
## 🎯 Use Cases
|
| 763 |
+
|
| 764 |
+
This model can be used for:
|
| 765 |
+
|
| 766 |
+
- ๐ **Document Digitization** - Convert handwritten notes to text
|
| 767 |
+
- 📧 **Mail Processing** - Read handwritten addresses
|
| 768 |
+
- 🏥 **Medical Records** - Digitize doctor's notes
|
| 769 |
+
- 🏫 **Educational Tools** - Auto-grade handwritten assignments
|
| 770 |
+
- ๐๏ธ **Historical Archives** - Transcribe historical documents
|
| 771 |
+
- 📱 **Mobile Apps** - Real-time handwriting recognition
|
| 772 |
+
|
| 773 |
+
---
|
| 774 |
+
|
| 775 |
+
## 🔧 Advanced Usage
|
| 776 |
+
|
| 777 |
+
### Fine-tuning on Custom Data
|
| 778 |
+
|
| 779 |
+
```python
|
| 780 |
+
# Load pre-trained model
|
| 781 |
+
checkpoint = torch.load('best_model.pth')
|
| 782 |
+
model.load_state_dict(checkpoint['model_state_dict'])
|
| 783 |
+
|
| 784 |
+
# Freeze CNN layers (optional)
|
| 785 |
+
for param in model.cnn.parameters():
|
| 786 |
+
param.requires_grad = False
|
| 787 |
+
|
| 788 |
+
# Train on your dataset
|
| 789 |
+
# ... (your training loop)
|
| 790 |
+
```
|
| 791 |
+
|
| 792 |
+
### Batch Inference
|
| 793 |
+
|
| 794 |
+
```python
|
| 795 |
+
# Process multiple images
|
| 796 |
+
predictions = []
|
| 797 |
+
for image_path in image_paths:
|
| 798 |
+
img = preprocess_image(image_path)
|
| 799 |
+
pred = model.predict(img)
|
| 800 |
+
predictions.append(pred)
|
| 801 |
+
```
|
| 802 |
+
|
| 803 |
+
---
|
| 804 |
+
|
| 805 |
+
## ๐ Performance Benchmarks
|
| 806 |
+
|
| 807 |
+
| Device | Batch Size | Inference Speed | Memory Usage |
|
| 808 |
+
|--------|-----------|-----------------|--------------|
|
| 809 |
+
| CPU (Intel i7) | 1 | ~200-500ms/image | ~500MB |
|
| 810 |
+
| GPU (T4) | 8 | ~50-100ms/image | ~2GB |
|
| 811 |
+
| GPU (V100) | 16 | ~20-40ms/image | ~4GB |
|
| 812 |
+
|
| 813 |
+
---
|
| 814 |
+
|
| 815 |
+
## ๐ Technical Details
|
| 816 |
+
|
| 817 |
+
### Why CTC Loss?
|
| 818 |
+
|
| 819 |
+
Traditional OCR requires character-level bounding boxes. CTC eliminates this:
|
| 820 |
+
|
| 821 |
+
```
|
| 822 |
+
Traditional: Need positions: [H:0-10px, e:10-18px, l:18-24px, ...]
|
| 823 |
+
CTC: Just need text: "Hello" ✅
|
| 824 |
+
```
|
| 825 |
+
|
| 826 |
+
CTC learns alignment automatically during training.
|
| 827 |
+
|
| 828 |
+
### Data Augmentation (Potential Improvements)
|
| 829 |
+
|
| 830 |
+
Currently not implemented, but could boost accuracy:
|
| 831 |
+
- ๐ Rotation (ยฑ5ยฐ)
|
| 832 |
+
- ๐ Elastic distortion
|
| 833 |
+
- ๐จ Brightness/contrast variation
|
| 834 |
+
- โ๏ธ Random crops
|
| 835 |
+
- ๐ Wave distortion
|
| 836 |
+
|
| 837 |
+
Expected gain: +2-5% accuracy
|
| 838 |
+
|
| 839 |
+
---
|
| 840 |
+
|
| 841 |
+
## 🚧 Limitations
|
| 842 |
+
|
| 843 |
+
Current known limitations:
|
| 844 |
+
|
| 845 |
+
- ❌ **Single-line only** - Doesn't handle multi-line paragraphs
|
| 846 |
+
- ❌ **English only** - Trained on English text (74 ASCII characters)
|
| 847 |
+
- ❌ **Cursive struggles** - Lower accuracy on highly cursive writing
|
| 848 |
+
- ❌ **Proper nouns** - Names and uncommon words have higher error rates
|
| 849 |
+
- ❌ **Punctuation** - Sometimes adds/removes punctuation
|
| 850 |
+
|
| 851 |
+
---
|
| 852 |
+
|
| 853 |
+
## 🔮 Future Improvements
|
| 854 |
+
|
| 855 |
+
Potential enhancements:
|
| 856 |
+
|
| 857 |
+
1. ✅ **Attention Mechanism** - Replace/augment LSTM with Transformer
|
| 858 |
+
2. ✅ **Data Augmentation** - Improve robustness
|
| 859 |
+
3. ✅ **Larger Model** - Scale to 20-50M parameters
|
| 860 |
+
4. ✅ **Multi-line Support** - Detect and process paragraphs
|
| 861 |
+
5. ✅ **Language Models** - Post-process with spelling correction
|
| 862 |
+
6. ✅ **Multilingual** - Extend to other languages
|
| 863 |
+
|
| 864 |
+
---
|
| 865 |
+
|
| 866 |
+
## ๐ References
|
| 867 |
+
|
| 868 |
+
- **IAM Database**: [Marti & Bunke, 2002](http://www.fki.inf.unibe.ch/databases/iam-handwriting-database)
|
| 869 |
+
- **CTC Loss**: [Graves et al., 2006](https://www.cs.toronto.edu/~graves/icml_2006.pdf)
|
| 870 |
+
- **CRNN**: [Shi et al., 2015](https://arxiv.org/abs/1507.05717)
|
| 871 |
+
- **Dataset on HF**: [Teklia/IAM-line](https://huggingface.co/datasets/Teklia/IAM-line)
|
| 872 |
+
|
| 873 |
+
---
|
| 874 |
+
|
| 875 |
+
## ๐ License
|
| 876 |
+
|
| 877 |
+
- **Code**: MIT License
|
| 878 |
+
- **Model Weights**: MIT License
|
| 879 |
+
- **IAM Dataset**: Free for research use (see [dataset license](https://huggingface.co/datasets/Teklia/IAM-line))
|
| 880 |
+
|
| 881 |
+
---
|
| 882 |
+
|
| 883 |
+
## ๐ Acknowledgments
|
| 884 |
+
|
| 885 |
+
- ๐ University of Bern for the IAM Database
|
| 886 |
+
- ๐ค Hugging Face for hosting dataset and model
|
| 887 |
+
- ๐ฅ PyTorch team for the framework
|
| 888 |
+
- ๐ Teklia for preparing the HF dataset version
|
| 889 |
+
|
| 890 |
+
---
|
| 891 |
+
|
| 892 |
+
## 📧 Contact
|
| 893 |
+
|
| 894 |
+
For questions, issues, or collaboration:
|
| 895 |
+
|
| 896 |
+
- ๐ค **Hugging Face**: [@IsmatS](https://huggingface.co/IsmatS)
|
| 897 |
+
- ๐ **Issues**: [GitHub Issues](https://github.com/IsmatS/handwriting-recognition-iam/issues)
|
| 898 |
+
|
| 899 |
+
---
|
| 900 |
+
|
| 901 |
+
<div align="center">
|
| 902 |
+
|
| 903 |
+
**โญ If you find this project useful, please consider giving it a star! โญ**
|
| 904 |
+
|
| 905 |
+
[](https://huggingface.co/IsmatS/handwriting-recognition-iam)
|
| 906 |
+
[](https://huggingface.co/datasets/Teklia/IAM-line)
|
| 907 |
+
|
| 908 |
+
Made with ❤️ using PyTorch and Hugging Face
|
| 909 |
+
|
| 910 |
+
</div>
|
upload_to_huggingface.py
CHANGED
|
@@ -16,7 +16,12 @@ FILES_TO_UPLOAD = [
|
|
| 16 |
"README.md",
|
| 17 |
"requirements.txt",
|
| 18 |
"train_colab.ipynb",
|
| 19 |
-
"training_history.png"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
]
|
| 21 |
|
| 22 |
def upload_model_to_hf():
|
|
|
|
| 16 |
"README.md",
|
| 17 |
"requirements.txt",
|
| 18 |
"train_colab.ipynb",
|
| 19 |
+
"training_history.png",
|
| 20 |
+
"charts/01_sample_images.png",
|
| 21 |
+
"charts/02_text_length_distribution.png",
|
| 22 |
+
"charts/03_image_dimensions.png",
|
| 23 |
+
"charts/04_character_frequency.png",
|
| 24 |
+
"charts/05_summary_statistics.png"
|
| 25 |
]
|
| 26 |
|
| 27 |
def upload_model_to_hf():
|