Upload 124 files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.
- .gitattributes +37 -0
- CRNN+CTC/.env +1 -0
- CRNN+CTC/.gitignore +14 -0
- CRNN+CTC/IAM_train.py +332 -0
- CRNN+CTC/README.md +449 -0
- CRNN+CTC/calibrate_fields.py +196 -0
- CRNN+CTC/calibrated_fields.py +7 -0
- CRNN+CTC/check_cer.py +331 -0
- CRNN+CTC/checkpoints/best_model.pth +3 -0
- CRNN+CTC/checkpoints/best_model_final.pth +3 -0
- CRNN+CTC/checkpoints/best_model_iam.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v2.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v3.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v4.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v5.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v6.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v7.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v732.pth +3 -0
- CRNN+CTC/checkpoints/checkpoint_epoch_10.pth +3 -0
- CRNN+CTC/checkpoints/latest_checkpoint.pth +3 -0
- CRNN+CTC/compare_checkpoints.py +34 -0
- CRNN+CTC/compare_live_cer.py +158 -0
- CRNN+CTC/create_test_images.py +50 -0
- CRNN+CTC/crnn_model.py +119 -0
- CRNN+CTC/dataset.py +401 -0
- CRNN+CTC/field_extractor.py +735 -0
- CRNN+CTC/finetune.py +202 -0
- CRNN+CTC/generate_ph_names.py +350 -0
- CRNN+CTC/inference.py +395 -0
- CRNN+CTC/prepare_emnist.py +97 -0
- CRNN+CTC/requirements.txt +61 -0
- CRNN+CTC/train.py +438 -0
- CRNN+CTC/train_emnist.py +15 -0
- CRNN+CTC/train_mnist.py +42 -0
- CRNN+CTC/train_with_emnist.py +169 -0
- CRNN+CTC/utils.py +397 -0
- MNB/__init__.py +4 -0
- MNB/classifier.py +292 -0
- MNB/form_classifier.py +466 -0
- MNB/keywords.py +127 -0
- MNB/mnb_metadata.json +17 -0
- MNB/models/mnb_classifier.pkl +3 -0
- MNB/models/mnb_metadata.json +13 -0
- MNB/models/tfidf_vectorizer.pkl +3 -0
- references/12 +3 -0
- references/321 +3 -0
- references/321321 +3 -0
- references/old.jpg +3 -0
- references/reference-102.png +3 -0
- references/reference-103.png +3 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
+CRNN+CTC/checkpoints/best_model_final.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_iam.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v2.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v3.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v4.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v5.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v6.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v7.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v732.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/checkpoint_epoch_10.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/latest_checkpoint.pth filter=lfs diff=lfs merge=lfs -text
+MNB/models/mnb_classifier.pkl filter=lfs diff=lfs merge=lfs -text
+MNB/models/tfidf_vectorizer.pkl filter=lfs diff=lfs merge=lfs -text
+references/12 filter=lfs diff=lfs merge=lfs -text
+references/321 filter=lfs diff=lfs merge=lfs -text
+references/321321 filter=lfs diff=lfs merge=lfs -text
+references/old.jpg filter=lfs diff=lfs merge=lfs -text
+references/reference-102.png filter=lfs diff=lfs merge=lfs -text
+references/reference-103.png filter=lfs diff=lfs merge=lfs -text
+references/reference-97.png filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
CRNN+CTC/.env
ADDED
@@ -0,0 +1 @@
+POPPLER_PATH=C:\Program Files\poppler-25.12.0\Library\bin
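Note: this `.env` entry is the path to the Poppler binaries used when rasterizing scanned PDFs. A minimal sketch of how such a value could be consumed, assuming `python-dotenv` and `pdf2image` are among the project's dependencies (both are assumptions, not confirmed by this commit):

```python
# Sketch: reading POPPLER_PATH from the .env file in the working directory.
# python-dotenv and pdf2image usage here is an assumption for illustration.
import os
from dotenv import load_dotenv
from pdf2image import convert_from_path

load_dotenv()  # loads POPPLER_PATH into the environment
poppler = os.getenv("POPPLER_PATH")

# Rasterize a scanned PDF page to a PIL image before OCR preprocessing
pages = convert_from_path("scan.pdf", dpi=300, poppler_path=poppler)
pages[0].save("scan_page1.png")
```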
CRNN+CTC/.gitignore
ADDED
@@ -0,0 +1,14 @@
+datasets/
+checkpoints/
+logs/
+test_images/
+data/
+__pycache__/
+*.png
+*.jpg
+*.jpeg
+*.npy
+*.h5
+*.pkl
+*.pyc
+iam-handwriting-word-database/
CRNN+CTC/IAM_train.py
ADDED
@@ -0,0 +1,332 @@
+"""
+IAM_train.py
+============
+Fine-tune the CRNN model using the IAM Handwriting Word Database.
+Builds on top of EMNIST-trained model (best_model_emnist.pth).
+
+FIXES vs old version:
+- IMG_WIDTH 400 -> 512 (must match pipeline)
+- Added log_softmax before CTCLoss (was missing, which caused catastrophic forgetting)
+- Phase 1: CNN FROZEN, only RNN+FC trained
+- Phase 2: Full model at very low LR
+- Loads from best_model_emnist.pth, falls back to best_model.pth
+- Uses get_crnn_model() with correct architecture from checkpoint config
+
+DATASET:
+Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database
+Expected structure:
+    data/IAM/iam_words/
+        words/       <- word image folders (a01, a02, ...)
+        words.txt    <- annotation file
+
+USAGE:
+    python IAM_train.py --prepare          # convert IAM -> annotation JSON
+    python IAM_train.py --train            # fine-tune model
+    python IAM_train.py --prepare --train  # do both
+"""
+
+import os
+import sys
+import json
+import argparse
+import random
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, ConcatDataset
+
+sys.path.append('.')
+from crnn_model import get_crnn_model
+from dataset import CivilRegistryDataset, collate_fn
+
+# ---------------------------------------------
+# CONFIG
+# ---------------------------------------------
+IAM_ROOT = "data/IAM/iam_words"
+IAM_WORDS_TXT = f"{IAM_ROOT}/words.txt"
+IAM_WORDS_DIR = f"{IAM_ROOT}/words"
+
+TRAIN_ANN = "data/iam_train_annotations.json"
+IAM_VAL_ANN = "data/iam_val_annotations.json"  # written by --prepare (IAM word images)
+SYNTH_VAL_ANN = "data/val_annotations.json"    # real civil registry val set, never overwritten
+TRAIN_IMG_DIR = "data/train/iam"
+VAL_IMG_DIR = "data/val/iam"
+
+IMG_HEIGHT = 64
+IMG_WIDTH = 512  # FIXED: was 400, must match pipeline
+BATCH_SIZE = 32
+VAL_SPLIT = 0.1
+MAX_SAMPLES = 50000
+
+# Load from EMNIST checkpoint, fall back to synthetic if not found
+CHECKPOINT_IN = "checkpoints/best_model_emnist.pth"
+CHECKPOINT_IN2 = "checkpoints/best_model.pth"  # fallback
+CHECKPOINT_OUT = "checkpoints/best_model_iam.pth"
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+# ---------------------------------------------
+# STEP 1 - PREPARE
+# ---------------------------------------------
+def prepare_iam():
+    from PIL import Image
+
+    print("\n" + "=" * 50)
+    print("STEP 1 - Preparing IAM dataset")
+    print("=" * 50)
+
+    if not os.path.exists(IAM_WORDS_TXT):
+        print(f"ERROR: {IAM_WORDS_TXT} not found!")
+        print("Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database")
+        print("Expected structure:")
+        print("  data/IAM/iam_words/words.txt")
+        print("  data/IAM/iam_words/words/")
+        sys.exit(1)
+
+    os.makedirs(TRAIN_IMG_DIR, exist_ok=True)
+    os.makedirs(VAL_IMG_DIR, exist_ok=True)
+
+    entries = []
+    print(f"  Reading {IAM_WORDS_TXT} ...")
+    with open(IAM_WORDS_TXT, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            parts = line.split(" ")
+            if len(parts) < 9:
+                continue
+            word_id = parts[0]
+            seg_result = parts[1]
+            text = parts[-1]
+            if seg_result != "ok":
+                continue
+            if len(text) < 1 or len(text) > 32:
+                continue
+            parts_id = word_id.split("-")
+            img_path = os.path.join(
+                IAM_WORDS_DIR,
+                parts_id[0],
+                f"{parts_id[0]}-{parts_id[1]}",
+                f"{word_id}.png"
+            )
+            if not os.path.exists(img_path):
+                continue
+            entries.append((img_path, text))
+
+    print(f"  Found {len(entries)} valid word entries")
+
+    if MAX_SAMPLES and len(entries) > MAX_SAMPLES:
+        random.shuffle(entries)
+        entries = entries[:MAX_SAMPLES]
+        print(f"  Limiting to {MAX_SAMPLES} samples")
+
+    random.shuffle(entries)
+    split_idx = int(len(entries) * (1 - VAL_SPLIT))
+    train_entries = entries[:split_idx]
+    val_entries = entries[split_idx:]
+    print(f"  Train: {len(train_entries)} | Val: {len(val_entries)}")
+    print("  Copying and resizing images...")
+
+    def process_entries(entry_list, out_dir, prefix):
+        annotations = []
+        for i, (src_path, text) in enumerate(entry_list):
+            try:
+                img = Image.open(src_path).convert("RGB")
+                img = img.resize((IMG_WIDTH, IMG_HEIGHT))  # FIXED: 512x64
+                fname = f"iam_{prefix}_{i:06d}.jpg"
+                out_path = os.path.join(out_dir, fname)
+                img.save(out_path, quality=90)
+                annotations.append({"image_path": f"iam/{fname}", "text": text})
+            except Exception:
+                continue
+            if i % 5000 == 0:
+                print(f"    {i}/{len(entry_list)} processed...")
+        return annotations
+
+    train_ann = process_entries(train_entries, TRAIN_IMG_DIR, "train")
+    val_ann = process_entries(val_entries, VAL_IMG_DIR, "val")
+
+    with open(TRAIN_ANN, "w") as f:
+        json.dump(train_ann, f, indent=2)
+    with open(IAM_VAL_ANN, "w") as f:
+        json.dump(val_ann, f, indent=2)
+
+    print(f"\n  Train annotations -> {TRAIN_ANN} ({len(train_ann)} entries)")
+    print(f"  Val annotations   -> {IAM_VAL_ANN} ({len(val_ann)} entries)")
+    print("\n  Done! Now run: python IAM_train.py --train")
+
+
+# ---------------------------------------------
+# STEP 2 - TRAIN
+# ---------------------------------------------
+def train_iam():
+    print("\n" + "=" * 55)
+    print("STEP 2 - Fine-tuning CRNN with IAM dataset")
+    print("=" * 55)
+    print(f"  Device : {DEVICE}")
+
+    for ann_file in [TRAIN_ANN, SYNTH_VAL_ANN]:
+        if not os.path.exists(ann_file):
+            print(f"ERROR: {ann_file} not found! Run --prepare first.")
+            sys.exit(1)
+
+    train_dataset = CivilRegistryDataset(
+        data_dir="data/train", annotations_file=TRAIN_ANN,
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+    )
+    # FIXED: mix synthetic data in so the model never forgets Filipino multi-word sequences
+    synth_dataset = CivilRegistryDataset(
+        data_dir="data/train", annotations_file="data/train_annotations.json",
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+    )
+    mixed_train = ConcatDataset([train_dataset, synth_dataset])
+    val_dataset = CivilRegistryDataset(
+        data_dir="data/val", annotations_file=SYNTH_VAL_ANN,
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
+    )
+    print(f"  IAM train      : {len(train_dataset)}")
+    print(f"  Synthetic train: {len(synth_dataset)}")
+    print(f"  Mixed train    : {len(mixed_train)}")
+    print(f"  Val            : {len(val_dataset)}")
+
+    train_loader = DataLoader(mixed_train, batch_size=BATCH_SIZE,
+                              shuffle=True, num_workers=0, collate_fn=collate_fn)
+    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
+                            shuffle=False, num_workers=0, collate_fn=collate_fn)
+
+    # -- Load checkpoint (EMNIST preferred, synthetic fallback) --
+    ckpt_path = CHECKPOINT_IN if os.path.exists(CHECKPOINT_IN) else CHECKPOINT_IN2
+    if not os.path.exists(ckpt_path):
+        print(f"ERROR: No checkpoint found at {CHECKPOINT_IN} or {CHECKPOINT_IN2}")
+        print("Run: python train.py then python train_with_emnist.py")
+        sys.exit(1)
+
+    print(f"  Loading: {ckpt_path}")
+    ckpt = torch.load(ckpt_path, map_location=DEVICE, weights_only=False)
+    config = ckpt.get('config', {})
+
+    model = get_crnn_model(
+        model_type=config.get('model_type', 'standard'),
+        img_height=config.get('img_height', 64),
+        num_chars=train_dataset.num_chars,
+        hidden_size=config.get('hidden_size', 128),
+        num_lstm_layers=config.get('num_lstm_layers', 1),
+    ).to(DEVICE)
+
+    missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+    if missing:
+        print(f"  Note: {len(missing)} layers re-initialized")
+    print(f"  Loaded epoch {ckpt.get('epoch', 'N/A')} "
+          f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")
+
+    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+    os.makedirs("checkpoints", exist_ok=True)
+
+    def run_epoch(loader, training, optimizer=None):
+        model.train() if training else model.eval()
+        total, n = 0, 0
+        ctx = torch.enable_grad() if training else torch.no_grad()
+        with ctx:
+            for images, targets, target_lengths, _ in loader:
+                images = images.to(DEVICE)
+                batch_size = images.size(0)
+                if training:
+                    optimizer.zero_grad()
+                # CRITICAL: log_softmax before CTCLoss
+                outputs = F.log_softmax(model(images), dim=2)
+                seq_len = outputs.size(0)
+                input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+                loss = criterion(outputs, targets, input_lengths, target_lengths)
+                if not torch.isnan(loss) and not torch.isinf(loss):
+                    if training:
+                        loss.backward()
+                        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                        optimizer.step()
+                    total += loss.item()
+                    n += 1
+        return total / max(n, 1)
+
+    def run_phase(num, epochs, lr, freeze_cnn, patience):
+        print(f"\n{'='*55}")
+        print(f"  PHASE {num} - "
+              f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
+              f"  LR={lr}")
+        print(f"{'='*55}")
+
+        for name, param in model.named_parameters():
+            param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f"  Trainable params : {trainable:,}")
+
+        opt = optim.Adam(
+            filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+        sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
+        best = float('inf')
+        counter = 0
+
+        for epoch in range(1, epochs + 1):
+            tr = run_epoch(train_loader, True, opt)
+            vl = run_epoch(val_loader, False, None)
+            sched.step(vl)
+
+            if vl < best:
+                best = vl
+                counter = 0
+                torch.save({
+                    'model_state_dict': model.state_dict(),
+                    'config': config,
+                    'char_to_idx': train_dataset.char_to_idx,
+                    'idx_to_char': train_dataset.idx_to_char,
+                    'epoch': epoch,
+                    'val_loss': vl,  # FIXED: renamed from val_cer, this is val loss, not CER%
+                }, CHECKPOINT_OUT)
+                print(f"  Epoch {epoch:02d}/{epochs}  "
+                      f"Train={tr:.4f}  Val={vl:.4f}  <- saved")
+            else:
+                counter += 1
+                print(f"  Epoch {epoch:02d}/{epochs}  "
+                      f"Train={tr:.4f}  Val={vl:.4f}  "
+                      f"(patience {counter}/{patience})")
+                if counter >= patience:
+                    print(f"  Early stopping at epoch {epoch}.")
+                    break
+        return best
+
+    # Phase 1: Freeze CNN
+    p1 = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
+    # Phase 2: Full model, very low LR
+    p2 = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
+
+    print(f"\n{'='*55}")
+    print("IAM fine-tuning complete!")
+    print(f"  Phase 1 best val loss : {p1:.4f}")
+    print(f"  Phase 2 best val loss : {p2:.4f}")
+    print(f"  Saved : {CHECKPOINT_OUT}")
+    print("\nNext step: collect physical certificate scans")
+
+
+# ---------------------------------------------
+# MAIN
+# ---------------------------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prepare", action="store_true")
+    parser.add_argument("--train", action="store_true")
+    args = parser.parse_args()
+
+    if not args.prepare and not args.train:
+        print("Usage:")
+        print("  python IAM_train.py --prepare          # prepare dataset")
+        print("  python IAM_train.py --train            # train model")
+        print("  python IAM_train.py --prepare --train  # do both")
+        sys.exit(0)
+
+    if args.prepare:
+        prepare_iam()
+    if args.train:
+        train_iam()
CRNN+CTC/README.md
ADDED
@@ -0,0 +1,449 @@
+# Local Civil Registry Document Digitization and Data Extraction
+
+## Using CRNN+CTC, Multinomial Naive Bayes, and Named Entity Recognition
+
+**Thesis Project by:**
+- Shane Mark C. Blanco
+- Princess A. Pasamonte
+- Irish Faith G. Ramirez
+
+**Institution:** Tarlac State University, College of Computer Studies
+
+---
+
+## Project Overview
+
+This system automates the digitization and data extraction of Philippine Civil Registry documents using advanced machine learning algorithms:
+
+### Target Documents:
+- **Form 1A** - Birth Certificate
+- **Form 2A** - Death Certificate
+- **Form 3A** - Marriage Certificate
+- **Form 90** - Application for Marriage License
+
+### Key Features:
+✅ OCR for printed and handwritten text
+✅ Automatic document classification
+✅ Named entity extraction (names, dates, places)
+✅ Auto-fill digital forms
+✅ MySQL database storage
+✅ Searchable digital archive
+✅ Data visualization dashboard
+
+---
+
+## System Architecture
+
+```
+Input: Scanned Civil Registry Form
+        ↓
+1. Image Preprocessing
+        ↓
+2. CRNN+CTC → Text Recognition
+        ↓
+3. Multinomial Naive Bayes → Document Classification
+        ↓
+4. spaCy NER → Entity Extraction
+        ↓
+5. Data Validation & Storage → MySQL Database
+        ↓
+Output: Digitized & Searchable Record
+```
+
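+These five stages map onto separate modules in this repository. As a rough sketch of how they might be chained (only `CivilRegistryOCR` is documented below; the classifier and NER calls here are illustrative assumptions based on the model files shipped in MNB/models and spacyNER/models):
+
+```python
+# Sketch of the end-to-end pipeline; interfaces other than
+# CivilRegistryOCR.predict() are assumptions for illustration.
+import pickle
+import spacy
+from inference import CivilRegistryOCR
+
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+with open('MNB/models/tfidf_vectorizer.pkl', 'rb') as f:
+    vectorizer = pickle.load(f)
+with open('MNB/models/mnb_classifier.pkl', 'rb') as f:
+    classifier = pickle.load(f)
+nlp = spacy.load('spacyNER/models/civil_registry_model/model-best')
+
+text = ocr.predict('scans/birth_cert.jpg')                        # 2. text recognition
+form_type = classifier.predict(vectorizer.transform([text]))[0]   # 3. classification
+entities = [(ent.text, ent.label_) for ent in nlp(text).ents]     # 4. extraction
+print(form_type, entities)
+```
+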
+---
+
+## Quick Start
+
+### Prerequisites
+
+- Python 3.8+
+- CUDA-capable GPU (recommended) or CPU
+- 8GB RAM minimum
+
+### Installation
+
+```bash
+# 1. Clone or download the project
+cd civil_registry_ocr
+
+# 2. Create virtual environment
+python -m venv venv
+source venv/bin/activate   # Linux/Mac
+venv\Scripts\activate      # Windows
+
+# 3. Install dependencies
+pip install -r requirements.txt
+
+# 4. Download spaCy model
+python -m spacy download en_core_web_sm
+```
+
+### Quick Test
+
+```python
+from inference import CivilRegistryOCR
+
+# Load model
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+
+# Recognize text
+text = ocr.predict('test_images/sample_name.jpg')
+print(f"Recognized: {text}")
+```
+
+---
+
+## Project Files
+
+### Core Implementation Files:
+
+1. **crnn_model.py** - CRNN+CTC neural network architecture
+2. **dataset.py** - Data loading and preprocessing
+3. **train.py** - Model training script
+4. **inference.py** - Prediction and inference
+5. **utils.py** - Helper functions and metrics
+6. **requirements.txt** - Python dependencies
+7. **IMPLEMENTATION_GUIDE.md** - Detailed implementation guide
+
+### Additional Components (To be created):
+
+8. **document_classifier.py** - Multinomial Naive Bayes classifier
+9. **ner_extractor.py** - Named Entity Recognition
+10. **web_app.py** - Web application (Flask/FastAPI)
+11. **database.py** - MySQL integration
+
+---
+
+## Training the Model
+
+### 1. Prepare Your Data
+
+Organize images and labels:
+```
+data/
+  train/
+    form1a/
+      name_001.jpg
+      name_001.txt
+    form2a/
+      ...
+  val/
+    ...
+```
+
+### 2. Create Annotations
+
+```python
+from dataset import create_annotation_file
+
+create_annotation_file('data/train', 'data/train_annotations.json')
+create_annotation_file('data/val', 'data/val_annotations.json')
+```
+
+### 3. Train Model
+
+```bash
+python train.py
+```
+
+Monitor metrics:
+- Character Error Rate (CER)
+- Word Error Rate (WER)
+- Training/Validation Loss
+
+### 4. Evaluate
+
+```python
+from utils import calculate_cer, calculate_wer
+
+predictions = [ocr.predict(img) for img in test_images]
+cer = calculate_cer(predictions, ground_truths)
+print(f"CER: {cer:.2f}%")
+```
+
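+CER is character-level edit distance divided by the total ground-truth length. A minimal sketch of such metrics using the `editdistance` package, the same approach `check_cer.py` takes (the exact signatures in utils.py may differ):
+
+```python
+import editdistance
+
+def cer_percent(predictions, ground_truths):
+    """Total character edits / total ground-truth characters, as a %."""
+    edits = sum(editdistance.eval(p, g) for p, g in zip(predictions, ground_truths))
+    chars = sum(len(g) for g in ground_truths)
+    return 100.0 * edits / max(chars, 1)
+
+def wer_percent(predictions, ground_truths):
+    """Same idea at word level: edits over whitespace-split tokens."""
+    edits = sum(editdistance.eval(p.split(), g.split())
+                for p, g in zip(predictions, ground_truths))
+    words = sum(len(g.split()) for g in ground_truths)
+    return 100.0 * edits / max(words, 1)
+```
+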
+---
+
+## Web Application
+
+### Start the Server
+
+```bash
+python web_app.py
+```
+
+### API Endpoints
+
+**POST /api/ocr** - Process document
+```bash
+curl -X POST -F "file=@birth_cert.jpg" http://localhost:8000/api/ocr
+```
+
+**Response:**
+```json
+{
+  "text": "Juan Dela Cruz\n01/15/1990\nTarlac City",
+  "form_type": "form1a",
+  "entities": {
+    "persons": ["Juan Dela Cruz"],
+    "dates": ["01/15/1990"],
+    "locations": ["Tarlac City"]
+  }
+}
+```
+
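+Since web_app.py is listed above as still to be created, the following is only one possible Flask sketch of the endpoint, not the project's actual implementation; all names and wiring here are assumptions:
+
+```python
+# Hypothetical /api/ocr endpoint sketch.
+import os
+import tempfile
+from flask import Flask, request, jsonify
+from inference import CivilRegistryOCR
+
+app = Flask(__name__)
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+
+@app.route('/api/ocr', methods=['POST'])
+def run_ocr():
+    f = request.files['file']
+    tmp = os.path.join(tempfile.gettempdir(), f.filename)
+    f.save(tmp)
+    text = ocr.predict(tmp)
+    # form_type and entities would come from the MNB and NER stages
+    return jsonify({'text': text})
+
+if __name__ == '__main__':
+    app.run(port=8000)
+```
+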
+---
+
+## Expected Performance
+
+Based on thesis objectives:
+
+### CRNN+CTC Model:
+- **Target CER:** < 5%
+- **Target Accuracy:** > 95%
+- Handles both printed and handwritten text
+
+### Document Classifier (MNB):
+- **Target Accuracy:** > 90%
+- Fast classification (< 100ms)
+
+### NER (spaCy):
+- **F1 Score:** > 85%
+- Extracts: Names, Dates, Places
+
+---
+
+## Testing
+
+### ISO 25010 Evaluation
+
+**Usability Testing:**
+```python
+# Metrics to measure:
+# - Task completion rate
+# - Average time per task
+# - User satisfaction score (SUS)
+```
+
+**Reliability Testing:**
+```python
+# Metrics to measure:
+# - System uptime %
+# - Error rate
+# - Recovery time
+```
+
+### Confusion Matrix
+
+```python
+from sklearn.metrics import confusion_matrix
+import seaborn as sns
+
+cm = confusion_matrix(true_labels, predicted_labels)
+sns.heatmap(cm, annot=True)
+```
+
+---
+
+## Database Schema
+
+### Birth Certificates Table
+```sql
+CREATE TABLE birth_certificates (
+    id INT PRIMARY KEY AUTO_INCREMENT,
+    child_name VARCHAR(255),
+    date_of_birth DATE,
+    place_of_birth VARCHAR(255),
+    sex CHAR(1),
+    father_name VARCHAR(255),
+    mother_name VARCHAR(255),
+    raw_text TEXT,
+    form_image LONGBLOB,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+```
+
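+A matching insert from database.py (also still to be created) might look like this parameterized sketch, assuming `mysql-connector-python`; the connection details and values below are illustrative only:
+
+```python
+import mysql.connector
+
+conn = mysql.connector.connect(host='localhost', user='registry',
+                               password='...', database='civil_registry')
+cur = conn.cursor()
+
+raw_text = "Juan Dela Cruz\n01/15/1990\nTarlac City"
+cur.execute(
+    "INSERT INTO birth_certificates "
+    "(child_name, date_of_birth, place_of_birth, sex, raw_text) "
+    "VALUES (%s, %s, %s, %s, %s)",          # parameterized, never string-formatted
+    ("Juan Dela Cruz", "1990-01-15", "Tarlac City", "M", raw_text),
+)
+conn.commit()
+```
+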
+---
+
+## System Requirements
+
+### Minimum:
+- CPU: Intel i5 or equivalent
+- RAM: 8GB
+- Storage: 10GB
+- OS: Windows 10, Ubuntu 18.04, macOS 10.14
+
+### Recommended:
+- CPU: Intel i7 or equivalent
+- GPU: NVIDIA GTX 1060 or better
+- RAM: 16GB
+- Storage: 50GB SSD
+
+---
+
+## Data Privacy & Security
+
+Following the Philippine Data Privacy Act (RA 10173):
+
+- ✅ Encrypted data transmission
+- ✅ Access control and authentication
+- ✅ Audit logging
+- ✅ Regular security updates
+- ✅ Data retention policies
+
+---
+
+## Key Algorithms
+
+### 1. CRNN+CTC
+**Purpose:** Text recognition from images
+**Strengths:** Handles variable-length sequences, no character segmentation needed
+**Reference:** Shi et al. (2016)
+
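+The "no segmentation" property comes from CTC decoding: collapse repeated per-frame predictions, then drop blanks. A minimal greedy decoder (check_cer.py in this repo contains an equivalent one):
+
+```python
+def ctc_greedy_decode(frame_indices, idx_to_char, blank=0):
+    """frame_indices: per-timestep argmax class ids for one image."""
+    out, prev = [], None
+    for idx in frame_indices:
+        if idx != blank and idx != prev:   # collapse repeats, skip blank
+            out.append(idx_to_char[idx])
+        prev = idx
+    return ''.join(out)
+```
+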
+### 2. Multinomial Naive Bayes
+**Purpose:** Document classification
+**Strengths:** Fast, efficient, works well with text data
+**Reference:** McCallum & Nigam (1998)
+
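+The repository already ships MNB/models/tfidf_vectorizer.pkl and mnb_classifier.pkl; a minimal training sketch of that pairing with scikit-learn (the hyperparameters and sample texts here are illustrative):
+
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+
+texts = ["certificate of live birth ...", "certificate of death ..."]
+labels = ["form1a", "form2a"]
+
+vectorizer = TfidfVectorizer(ngram_range=(1, 2))
+X = vectorizer.fit_transform(texts)      # sparse TF-IDF term-weight matrix
+clf = MultinomialNB().fit(X, labels)
+
+print(clf.predict(vectorizer.transform(["certificate of live birth of juan"])))
+```
+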
+### 3. Named Entity Recognition
+**Purpose:** Extract entities (names, dates, places)
+**Strengths:** Pre-trained, accurate, easy to use
+**Reference:** spaCy (Honnibal & Montani, 2017)
+
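+Entity extraction with the trained pipeline in spacyNER/models is a few lines (the label names in the comment are assumptions about how that model was trained):
+
+```python
+import spacy
+
+nlp = spacy.load("spacyNER/models/civil_registry_model/model-best")
+doc = nlp("Juan Dela Cruz was born on 01/15/1990 in Tarlac City.")
+for ent in doc.ents:
+    print(ent.text, ent.label_)   # e.g. PERSON / DATE / GPE, depending on training
+```
+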
+---
+
+## Troubleshooting
+
+### Low Accuracy?
+1. Increase training data (target: 10,000+ samples)
+2. Use data augmentation
+3. Train longer (100+ epochs)
+4. Clean your dataset
+
+### Out of Memory?
+1. Reduce batch size
+2. Use smaller image dimensions
+3. Use gradient accumulation (see the sketch below)
+4. Enable mixed precision
+
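+A minimal sketch of gradient accumulation for the CTC training loop. It assumes `model`, `criterion`, `optimizer`, `train_loader`, and `DEVICE` are set up as in IAM_train.py; this exact code is illustrative, not the repo's implementation:
+
+```python
+import torch
+import torch.nn.functional as F
+
+accum = 4  # effective batch size = accum * BATCH_SIZE
+optimizer.zero_grad()
+for step, (images, targets, target_lengths, _) in enumerate(train_loader):
+    outputs = F.log_softmax(model(images.to(DEVICE)), dim=2)
+    input_lengths = torch.full((images.size(0),), outputs.size(0), dtype=torch.long)
+    loss = criterion(outputs, targets, input_lengths, target_lengths) / accum
+    loss.backward()                      # gradients add up across steps
+    if (step + 1) % accum == 0:
+        optimizer.step()                 # update once per accum mini-batches
+        optimizer.zero_grad()
+```
+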
+### Slow Inference?
+1. Use GPU if available
+2. Batch process images
+3. Optimize model (ONNX)
+4. Cache frequent results
+
+---
+
+## Documentation
+
+- **IMPLEMENTATION_GUIDE.md** - Complete step-by-step guide
+- **API_DOCUMENTATION.md** - API reference (to be created)
+- **USER_MANUAL.md** - End-user guide (to be created)
+
+---
+
+## Academic References
+
+### Key Papers:
+
+1. **CRNN**
+   Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. *IEEE TPAMI*.
+
+2. **CTC Loss**
+   Graves, A., et al. (2006). Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks. *ICML*.
+
+3. **Naive Bayes**
+   McCallum, A., & Nigam, K. (1998). A comparison of event models for naive Bayes text classification. *AAAI Workshop*.
+
+4. **spaCy**
+   Honnibal, M., & Montani, I. (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.
+
+---
+
+## Contributors
+
+**Researchers:**
+- Shane Mark C. Blanco
+- Princess A. Pasamonte
+- Irish Faith G. Ramirez
+
+**Advisers:**
+- Mr. Rengel V. Corpuz (Technical Adviser)
+- Mr. Joselito T. Tan (Subject Teacher)
+
+**Institution:**
+Tarlac State University
+College of Computer Studies
+Bachelor of Science in Computer Science
+
+---
+
+## Support
+
+For questions regarding this implementation:
+
+1. Review IMPLEMENTATION_GUIDE.md
+2. Check code documentation
+3. Consult with thesis advisers
+
+---
+
+## License
+
+This project is for academic purposes as part of a thesis requirement.
+
+---
+
+## ✅ Implementation Checklist
+
+### Phase 1: Setup ✅
+- [x] Install dependencies
+- [x] Set up project structure
+- [x] Prepare development environment
+
+### Phase 2: Data Preparation
+- [ ] Collect civil registry form images
+- [ ] Create annotations
+- [ ] Split into train/val/test sets
+
+### Phase 3: Model Development
+- [ ] Train CRNN+CTC model
+- [ ] Train document classifier
+- [ ] Integrate NER system
+
+### Phase 4: Web Application
+- [ ] Develop Flask/FastAPI backend
+- [ ] Create frontend interface
+- [ ] Implement database integration
+
+### Phase 5: Testing
+- [ ] Accuracy testing
+- [ ] Black-box testing
+- [ ] ISO 25010 evaluation
+- [ ] User acceptance testing
+
+### Phase 6: Deployment
+- [ ] Optimize for production
+- [ ] Set up server
+- [ ] Deploy application
+- [ ] Monitor performance
+
+---
+
+## Success Metrics
+
+Target metrics for thesis evaluation:
+
+| Metric | Target | Status |
+|--------|--------|--------|
+| OCR Accuracy | > 95% | Pending |
+| CER | < 5% | Pending |
+| Classifier Accuracy | > 90% | Pending |
+| NER F1 Score | > 85% | Pending |
+| Response Time | < 2s | Pending |
+| System Uptime | > 99% | Pending |
+
+---
+
+**Good luck with your thesis defense!**
+
+For detailed implementation instructions, see **IMPLEMENTATION_GUIDE.md**
CRNN+CTC/calibrate_fields.py
ADDED
@@ -0,0 +1,196 @@
+"""
+calibrate_fields.py
+===================
+Click-to-measure tool for recalibrating field ratios in field_extractor.py.
+
+Usage:
+    python calibrate_fields.py --image your_scan.png --form birth
+
+Controls:
+    • Click and drag  -> draw a field box
+    • After releasing -> enter the field name in the terminal
+    • Press S         -> save all measured ratios to calibrated_fields.py
+    • Press Z         -> undo last box
+    • Press Q / ESC   -> quit without saving
+
+Output:
+    calibrated_fields.py -> copy-paste the dict into field_extractor.py
+"""
+
+import argparse
+import json
+import cv2
+import numpy as np
+from pathlib import Path
+
+# -- state ---------------------------------------------------------------------
+drawing = False
+ix, iy = -1, -1
+ex, ey = -1, -1
+boxes = []          # list of (name, rx1, ry1, rx2, ry2)
+form_name = "birth"
+
+COLOURS = [
+    (0,200,0),(0,150,255),(200,0,200),(0,200,200),(200,200,0),(220,20,60),
+    (255,140,0),(150,50,200),(0,160,80),(30,144,255),(255,20,147),(100,200,100),
+]
+
+def draw_boxes(img, bounds):
+    left, top, right, bottom = bounds
+    h, w = img.shape[:2]
+
+    vis = img.copy()
+    # form boundary
+    cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
+
+    for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
+        x1 = int(rx1 * w)
+        y1 = int(ry1 * h)
+        x2 = int(rx2 * w)
+        y2 = int(ry2 * h)
+        c = COLOURS[idx % len(COLOURS)]
+        cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
+        cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, c, 1)
+
+    # live cursor box
+    if drawing and ix >= 0 and ex >= 0:
+        cv2.rectangle(vis, (ix, iy), (ex, ey), (255, 255, 255), 1)
+
+    # instructions
+    cv2.putText(vis, "Drag=draw box | S=save | Z=undo | Q=quit",
+                (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1)
+    cv2.putText(vis, f"Boxes: {len(boxes)}",
+                (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
+    return vis
+
+
+def detect_bounds(image_bgr):
+    """Simple form boundary detection (reuses logic from FormBoundsDetector)."""
+    h, w = image_bgr.shape[:2]
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    try:
+        thresh = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY_INV, 11, 2)
+        hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
+        h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
+        h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
+        vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
+        v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
+        v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
+        if len(h_rows) == 0 or len(v_cols) == 0:
+            return (0, 0, w, h)
+        top_b, bottom_b = int(h_rows.min()), int(h_rows.max())
+        left_b, right_b = int(v_cols.min()), int(v_cols.max())
+        if (right_b - left_b) < w * 0.4 or (bottom_b - top_b) < h * 0.4:
+            return (0, 0, w, h)
+        return (left_b, top_b, right_b, bottom_b)
+    except Exception:
+        return (0, 0, w, h)
+
+
+def save_calibration(output_path, form):
+    dict_name = {
+        "birth": "BIRTH_FIELDS",
+        "death": "DEATH_FIELDS",
+        "marriage": "MARRIAGE_FIELDS",
+        "marriage_license": "MARRIAGE_LICENSE_FIELDS",
+    }.get(form, "CALIBRATED_FIELDS")
+
+    lines = ["# Auto-calibrated -> copy-paste into field_extractor.py\n",
+             f"{dict_name} = {{\n"]
+    for name, rx1, ry1, rx2, ry2 in boxes:
+        lines.append(f'    "{name}":{" " * max(1, 34 - len(name))}'
+                     f'({rx1:.4f}, {ry1:.4f}, {rx2:.4f}, {ry2:.4f}),\n')
+    lines.append("}\n")
+
+    with open(output_path, "w") as f:
+        f.writelines(lines)
+    print(f"\n  Saved {len(boxes)} fields -> {output_path}")
+
+
+def main():
+    global drawing, ix, iy, ex, ey, form_name
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--image", required=True)
+    parser.add_argument("--form", default="birth",
+                        choices=["birth","death","marriage","marriage_license"])
+    parser.add_argument("--output", default="calibrated_fields.py")
+    parser.add_argument("--scale", type=float, default=1.0,
+                        help="Scale factor to fit image on screen (e.g. 0.5)")
+    args = parser.parse_args()
+    form_name = args.form
+
+    img_orig = cv2.imread(args.image)
+    if img_orig is None:
+        print(f"ERROR: Cannot load {args.image}")
+        return
+
+    scale = args.scale
+    if scale != 1.0:
+        img_orig = cv2.resize(img_orig, None, fx=scale, fy=scale)
+
+    bounds = detect_bounds(img_orig)
+    left, top, right, bottom = bounds
+    fw = right - left
+    fh = bottom - top
+    print(f"  Form boundary detected: {bounds} ({fw}x{fh} px)")
+    print(f"  Scale: {scale}")
+    print("\n  Instructions:")
+    print("    Drag            -> draw a field box")
+    print("    After releasing -> type field name in terminal, press Enter")
+    print("    S               -> save all boxes")
+    print("    Z               -> undo last box")
+    print("    Q/ESC           -> quit\n")
+
+    win = "Calibrate Fields"
+    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
+
+    def mouse(event, x, y, flags, param):
+        global drawing, ix, iy, ex, ey
+        if event == cv2.EVENT_LBUTTONDOWN:
+            drawing = True
+            ix, iy = x, y
+            ex, ey = x, y
+        elif event == cv2.EVENT_MOUSEMOVE and drawing:
+            ex, ey = x, y
+        elif event == cv2.EVENT_LBUTTONUP:
+            drawing = False
+            ex, ey = x, y
+            ih, iw = img_orig.shape[:2]
+            x1r = min(ix, ex) / iw
+            y1r = min(iy, ey) / ih
+            x2r = max(ix, ex) / iw
+            y2r = max(iy, ey) / ih
+            x1r, y1r = max(0.0, x1r), max(0.0, y1r)
+            x2r, y2r = min(1.0, x2r), min(1.0, y2r)
+            if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
+                name = input(f"  Field name for ({x1r:.3f},{y1r:.3f},{x2r:.3f},{y2r:.3f}): ").strip()
+                if name:
+                    boxes.append((name, x1r, y1r, x2r, y2r))
+                    print(f"  Added '{name}' (total: {len(boxes)})")
+
+    cv2.setMouseCallback(win, mouse)
+
+    while True:
+        vis = draw_boxes(img_orig, bounds)
+        cv2.imshow(win, vis)
+        key = cv2.waitKey(20) & 0xFF
+
+        if key in (ord('q'), 27):
+            print("  Quit, no file saved.")
+            break
+        elif key == ord('s'):
+            save_calibration(args.output, form_name)
+            break
+        elif key == ord('z') and boxes:
+            removed = boxes.pop()
+            print(f"  Undone: '{removed[0]}'")
+
+    cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    main()
CRNN+CTC/calibrated_fields.py
ADDED
@@ -0,0 +1,7 @@
+# Auto-calibrated -> copy-paste into field_extractor.py
+BIRTH_FIELDS = {
+    "Province":          (0.0941, 0.0701, 0.6361, 0.0848),
+    "City/Municipality": (0.1621, 0.0880, 0.6429, 0.1086),
+    "first_name":        (0.0465, 0.1183, 0.3265, 0.1375),
+    "middle_name":       (0.3469, 0.1189, 0.6916, 0.1375),
+}
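Note: each tuple stores (x1, y1, x2, y2) as fractions of the image size, which is what `save_calibration()` in calibrate_fields.py writes out. A minimal sketch of turning one ratio tuple back into a pixel crop (field_extractor.py presumably does something equivalent; this snippet is illustrative only):

```python
import cv2
from calibrated_fields import BIRTH_FIELDS

img = cv2.imread("scans/birth_form.png")
h, w = img.shape[:2]

rx1, ry1, rx2, ry2 = BIRTH_FIELDS["first_name"]   # ratios relative to the full image
crop = img[int(ry1 * h):int(ry2 * h), int(rx1 * w):int(rx2 * w)]
cv2.imwrite("first_name_crop.png", crop)          # region to feed into the CRNN
```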
CRNN+CTC/check_cer.py
ADDED
@@ -0,0 +1,331 @@
+"""
+check_cer.py
+============
+Measures TRUE CER by actually running the model on images.
+
+Usage:
+    python check_cer.py                        # live CER on val set
+    python check_cer.py --saved                # old behavior (fast, unreliable)
+    python check_cer.py --images test_images/  # run on any image folder
+"""
+
+import os
+import sys
+import json
+import random
+import cv2
+import numpy as np
+import editdistance
+from pathlib import Path
+
+try:
+    import torch
+except ImportError:
+    print("ERROR: torch not installed. Run: pip install torch")
+    sys.exit(1)
+
+USE_SAVED = '--saved' in sys.argv
+IMAGE_DIR = None
+for i, arg in enumerate(sys.argv[1:], 1):
+    if arg == '--images' and i < len(sys.argv) - 1:
+        IMAGE_DIR = sys.argv[i + 1]
+    elif arg.startswith('--images='):
+        IMAGE_DIR = arg.split('=', 1)[1]
+
+CHECKPOINTS = [
+    'checkpoint_epoch_50.pth',
+    'checkpoint_epoch_60.pth',
+    'checkpoint_epoch_70.pth',
+    'checkpoint_epoch_80.pth',
+    'checkpoint_epoch_90.pth',
+    'checkpoint_epoch_100.pth',
+]
+CHECKPOINT_DIR = 'checkpoints'
+VAL_DATA_DIR = 'data/val'
+VAL_ANN_FILE = 'data/val_annotations.json'
+
+
+class AdaptiveImageNormalizer:
+    def __init__(self, target_height=64, target_width=512):
+        self.H = target_height
+        self.W = target_width
+
+    def _crop_to_text(self, gray):
+        inv = cv2.bitwise_not(gray)
+        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+        coords = np.column_stack(np.where(thresh > 0))
+        if len(coords) == 0:
+            return gray
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0] - 1, y_max + pad)
+        x_max = min(gray.shape[1] - 1, x_max + pad)
+        return gray[y_min:y_max + 1, x_min:x_max + 1]
+
+    def _smart_resize_gray(self, gray):
+        h, w = gray.shape
+        if h == 0 or w == 0:
+            return np.ones((self.H, self.W), dtype=np.uint8) * 255
+        scale = self.H / h
+        new_w = int(w * scale)
+        new_h = self.H
+        if new_w > self.W:
+            scale = self.W / w
+            new_h = int(h * scale)
+            new_w = self.W
+        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
+        y_off = (self.H - new_h) // 2
+        x_off = (self.W - new_w) // 2
+        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
+        return canvas
+
+    def _binarize(self, img):
+        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        white_ratio = np.mean(otsu == 255)
+        if white_ratio < 0.30 or white_ratio > 0.97:
+            return cv2.adaptiveThreshold(
+                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 11, 2)
+        return otsu
+
+    def normalize(self, img):
+        if len(img.shape) == 3:
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = img.copy()
+        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+        gray = self._crop_to_text(gray)
+        gray = self._smart_resize_gray(gray)
+        return self._binarize(gray)
+
+    def to_tensor(self, img):
+        return torch.FloatTensor(
+            img.astype(np.float32) / 255.0
+        ).unsqueeze(0).unsqueeze(0)
+
+
+def greedy_decode(outputs, idx_to_char):
+    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
+    results = []
+    for seq in pred_indices:
+        chars, prev = [], -1
+        for idx in seq:
+            idx = idx.item()
+            if idx != 0 and idx != prev and idx in idx_to_char:
+                chars.append(idx_to_char[idx])
+            prev = idx
+        results.append(''.join(chars))
+    return results
+
+
+def measure_live_cer(model, idx_to_char, img_h, img_w,
+                     ann_file, data_dir, device, max_samples=200):
+    if not os.path.exists(ann_file):
+        return None, 0, f"Annotation file not found: {ann_file}"
+
+    with open(ann_file, 'r', encoding='utf-8') as f:
+        annotations = json.load(f)
+
+    if len(annotations) > max_samples:
+        random.seed(42)
+        annotations = random.sample(annotations, max_samples)
+
+    normalizer = AdaptiveImageNormalizer(img_h, img_w)
+    model.eval()
+
+    total_char_dist = 0
+    total_chars = 0
+    total_word_dist = 0
+    total_words = 0
+    n_exact = 0
+    n_evaluated = 0
+    worst_errors = []
+
+    with torch.no_grad():
+        for ann in annotations:
+            img_path = os.path.join(data_dir, ann['image_path'])
+            gt = ann['text']
+            if not os.path.exists(img_path):
+                continue
+            try:
+                raw = cv2.imread(img_path)
+                if raw is None:
+                    continue
+                norm = normalizer.normalize(raw)
+                tensor = normalizer.to_tensor(norm).to(device)
+                out = model(tensor)
+                pred = greedy_decode(out.cpu(), idx_to_char)[0]
+
+                cd = editdistance.eval(pred, gt)
+                wd = editdistance.eval(pred.split(), gt.split())
+
+                total_char_dist += cd
+                total_chars += len(gt)
+                total_word_dist += wd
+                total_words += len(gt.split())
+                if pred == gt:
+                    n_exact += 1
+                if cd > 0:
+                    worst_errors.append((gt, pred, cd))
+                n_evaluated += 1
+            except Exception:
+                continue
+
+    if n_evaluated == 0:
+        return None, 0, "No images could be evaluated"
+
+    cer = (total_char_dist / total_chars * 100) if total_chars > 0 else 0
+    wer = (total_word_dist / total_words * 100) if total_words > 0 else 0
+    acc = (n_exact / n_evaluated * 100)
+
+    return {
+        'cer': cer, 'wer': wer, 'exact_match': acc,
+        'n_evaluated': n_evaluated,
+        'errors': sorted(worst_errors, key=lambda x: x[2], reverse=True)[:5]
+    }, n_evaluated, None
+
+
+def run_on_folder(model, idx_to_char, img_h, img_w, folder, device):
+    normalizer = AdaptiveImageNormalizer(img_h, img_w)
+    model.eval()
+    exts = {'.jpg', '.jpeg', '.png', '.bmp'}
+    paths = sorted(p for p in Path(folder).rglob('*') if p.suffix.lower() in exts)
+    results = []
+    with torch.no_grad():
+        for p in paths:
+            try:
+                raw = cv2.imread(str(p))
+                norm = normalizer.normalize(raw)
+                tensor = normalizer.to_tensor(norm).to(device)
+                pred = greedy_decode(model(tensor).cpu(), idx_to_char)[0]
+                results.append((p.name, pred))
+            except Exception as e:
+                results.append((p.name, f'ERROR: {e}'))
+    return results
+
+
+# -----------------------------------------------------------------------------
+# MAIN
+# -----------------------------------------------------------------------------
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+if USE_SAVED:
+    print("=" * 65)
+    print(" SAVED CER (training-time value, may not reflect real accuracy)")
+    print(" Run without --saved for true live CER.")
+    print("=" * 65)
+    print("{:<8} {:<12} {:<12} {}".format("Epoch", "CER(%)", "WER(%)", "File"))
+    print("-" * 65)
+    best_cer, best_cp = float('inf'), None
+    for cp in CHECKPOINTS:
+        path = os.path.join(CHECKPOINT_DIR, cp)
+        if not os.path.exists(path):
+            continue
+        try:
+            c = torch.load(path, weights_only=False)
+            cer = c.get('val_cer', c.get('val_loss', 0))
+            epoch = c['epoch']
+            history = c.get('history', {})
+            wer_list = history.get('val_wer', [])
+            wer = wer_list[epoch - 1] if wer_list and epoch <= len(wer_list) else None
+            wer_s = f"{wer:.4f}%" if wer else 'N/A'
+            marker = ' <-- BEST' if cer < best_cer else ''
+            print("{:<8} {:<12} {:<12} {}{}".format(
+                epoch, f"{cer:.4f}%", wer_s, cp, marker))
+            if cer < best_cer:
+                best_cer, best_cp = cer, cp
+        except Exception as e:
+            print(f"  Could not load {cp}: {e}")
+    print("=" * 65)
+    print(f"\nBEST: {best_cp}  CER={best_cer:.4f}%")
+
+else:
print("=" * 78)
|
| 249 |
+
print(" LIVE CER β model actually runs on images (true accuracy)")
|
| 250 |
+
print("=" * 78)
|
| 251 |
+
print("{:<8} {:<10} {:<10} {:<12} {:<8} {}".format(
|
| 252 |
+
"Epoch", "CER(%)", "WER(%)", "ExactMatch", "N", "File"))
|
| 253 |
+
print("-" * 78)
|
| 254 |
+
|
| 255 |
+
best_cer, best_cp, best_metrics = float('inf'), None, None
|
| 256 |
+
|
| 257 |
+
for cp in CHECKPOINTS:
|
| 258 |
+
cp_path = os.path.join(CHECKPOINT_DIR, cp)
|
| 259 |
+
if not os.path.exists(cp_path):
|
| 260 |
+
print(f" (skipping {cp} β not found)")
|
| 261 |
+
continue
|
| 262 |
+
try:
|
| 263 |
+
from crnn_model import get_crnn_model
|
| 264 |
+
c = torch.load(cp_path, map_location=device, weights_only=False)
|
| 265 |
+
epoch = c['epoch']
|
| 266 |
+
idx_to_char = c['idx_to_char']
|
| 267 |
+
config = c.get('config', {})
|
| 268 |
+
img_h = config.get('img_height', 64)
|
| 269 |
+
img_w = config.get('img_width', 512)
|
| 270 |
+
saved_cer = c.get('val_cer', c.get('val_loss', None))
|
| 271 |
+
|
| 272 |
+
model = get_crnn_model(
|
| 273 |
+
model_type=config.get('model_type', 'standard'),
|
| 274 |
+
img_height=img_h,
|
| 275 |
+
num_chars=c['model_state_dict']['fc.weight'].shape[0],
|
| 276 |
+
hidden_size=config.get('hidden_size', 128), # FIXED: was 256
|
| 277 |
+
num_lstm_layers=config.get('num_lstm_layers', 1) # FIXED: was 2
|
| 278 |
+
).to(device)
|
| 279 |
+
model.load_state_dict(c['model_state_dict'])
|
| 280 |
+
|
| 281 |
+
if IMAGE_DIR:
|
| 282 |
+
print(f"\nPredictions from {cp}:")
|
| 283 |
+
for fname, pred in run_on_folder(
|
| 284 |
+
model, idx_to_char, img_h, img_w, IMAGE_DIR, device):
|
| 285 |
+
print(f" {fname:<35} -> {pred}")
|
| 286 |
+
continue
|
| 287 |
+
|
| 288 |
+
metrics, n, err = measure_live_cer(
|
| 289 |
+
model, idx_to_char, img_h, img_w,
|
| 290 |
+
VAL_ANN_FILE, VAL_DATA_DIR, device)
|
| 291 |
+
|
| 292 |
+
if metrics is None:
|
| 293 |
+
print(f" Epoch {epoch} SKIP: {err}")
|
| 294 |
+
continue
|
| 295 |
+
|
| 296 |
+
cer = metrics['cer']
|
| 297 |
+
marker = ' <-- BEST' if cer < best_cer else ''
|
| 298 |
+
print("{:<8} {:<10} {:<10} {:<12} {:<8} {}{}".format(
|
| 299 |
+
epoch,
|
| 300 |
+
f"{cer:.2f}%",
|
| 301 |
+
f"{metrics['wer']:.2f}%",
|
| 302 |
+
f"{metrics['exact_match']:.1f}%",
|
| 303 |
+
n, cp, marker))
|
| 304 |
+
|
| 305 |
+
if saved_cer and abs(cer - saved_cer) > 2.0:
|
| 306 |
+
print(f" ^ MISMATCH: saved={saved_cer:.2f}% live={cer:.2f}%"
|
| 307 |
+
f" diff={abs(cer - saved_cer):.2f}%")
|
| 308 |
+
print(f" Cause: model trained on clean synthetic only.")
|
| 309 |
+
print(f" Fix: regenerate data with fix_data.py + retrain.")
|
| 310 |
+
|
| 311 |
+
if cer < best_cer:
|
| 312 |
+
best_cer, best_cp, best_metrics = cer, cp, metrics
|
| 313 |
+
|
| 314 |
+
except Exception as e:
|
| 315 |
+
print(f" Could not evaluate {cp}: {e}")
|
| 316 |
+
|
| 317 |
+
if not IMAGE_DIR:
|
| 318 |
+
print("=" * 78)
|
| 319 |
+
print(f"\nBEST CHECKPOINT : {best_cp}")
|
| 320 |
+
print(f"BEST LIVE CER : {best_cer:.4f}%")
|
| 321 |
+
|
| 322 |
+
if best_metrics and best_metrics['errors']:
|
| 323 |
+
print(f"\nWorst predictions (GT -> Predicted):")
|
| 324 |
+
for gt, pred, dist in best_metrics['errors']:
|
| 325 |
+
print(f" [{dist:2d}] '{gt}'")
|
| 326 |
+
print(f" '{pred}'")
|
| 327 |
+
|
| 328 |
+
print(f"\nTo use best model:")
|
| 329 |
+
print(f" import shutil")
|
| 330 |
+
print(f" shutil.copy('checkpoints/{best_cp}', 'checkpoints/best_model.pth')")
|
| 331 |
+
print("=" * 78)
|
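The CTC greedy decoder above collapses consecutive repeated indices and drops the blank index 0, which is why a blank between two identical characters preserves both. A self-contained toy run (independent of this repo, with a hypothetical 3-class vocabulary) showing that behaviour:

import torch

def collapse(path, idx_to_char, blank=0):
    # same rule as greedy_decode above: drop blanks, drop consecutive repeats
    out, prev = [], -1
    for idx in path:
        if idx != blank and idx != prev and idx in idx_to_char:
            out.append(idx_to_char[idx])
        prev = idx
    return ''.join(out)

logits = torch.zeros(6, 1, 3)               # [T=6, B=1, C=3], like model output
for t, c in enumerate([1, 1, 0, 1, 2, 2]):  # force a best path per timestep
    logits[t, 0, c] = 5.0
path = torch.argmax(logits, dim=2)[:, 0].tolist()
print(collapse(path, {1: 'A', 2: 'B'}))     # prints 'AAB': the blank splits the repeated A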
CRNN+CTC/checkpoints/best_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f98f0590b354c11f40fefcab6fe172ae57cb37e49277062a00dbbe3f5aa6b8b5
+size 19204606
CRNN+CTC/checkpoints/best_model_final.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da91ba5cd78b602eebb9c9f63175d9bc47ec8cb6fbdac6a06c78814e2e6b8f2
+size 6407143
CRNN+CTC/checkpoints/best_model_iam.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7f4cdef044a163632be2cbf7fbed9d869b4a2e85977aef60e5f88501969e257
+size 6405834
CRNN+CTC/checkpoints/best_model_v2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:047c9af89f9486553a2c17736cafcc0a7a45a99e21619064ee00299e2cd6a8df
+size 6406990
CRNN+CTC/checkpoints/best_model_v3.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b1def35ee8c623aac01004ecb9f979d51d3ed3a486d8adf7a8acd67e5b03a31
+size 6406990
CRNN+CTC/checkpoints/best_model_v4.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73a939bab133573e8b771a6d48aca10c9a98e804cedd79f06eac4e24735df1d4
+size 6406201
CRNN+CTC/checkpoints/best_model_v5.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef986cbea34d4c5dc31b32aac3bc2dfaa20720cdb133d9d6c79a5d5123700942
+size 6406201
CRNN+CTC/checkpoints/best_model_v6.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dbcdecce6d83b7f7c74ae6df05ae9222b345668e2dff84de9aa108562bd71ac
+size 6406201
CRNN+CTC/checkpoints/best_model_v7.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+size 6406201
CRNN+CTC/checkpoints/best_model_v732.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+size 6406201
CRNN+CTC/checkpoints/checkpoint_epoch_10.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08f3a40e99411b95e8a8563ed42f6998f367b84dd799b8c0cbcffac1bdd5576f
+size 19201165
CRNN+CTC/checkpoints/latest_checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b10077d433edbd5946499fef7334421d8b7ba351f55d631fbeb085592c10545
+size 19201651
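The twelve .pth entries above are Git LFS pointer stubs (spec version, oid, size), not the weights themselves; after cloning, running `git lfs install` once and then `git lfs pull` fetches the actual binaries referenced by these oids.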
CRNN+CTC/compare_checkpoints.py
ADDED
@@ -0,0 +1,34 @@
+import torch
+import sys
+sys.path.append('.')
+from crnn_model import get_crnn_model
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+def test_model(path, label):
+    c = torch.load(path, map_location=device, weights_only=False)
+    config = c.get('config', {})
+    model = get_crnn_model(
+        model_type      = config.get('model_type', 'standard'),
+        img_height      = config.get('img_height', 64),
+        num_chars       = c['model_state_dict']['fc.weight'].shape[0],
+        hidden_size     = config.get('hidden_size', 128),
+        num_lstm_layers = config.get('num_lstm_layers', 1),
+    ).to(device)
+    model.load_state_dict(c['model_state_dict'], strict=False)
+    epoch = c.get('epoch', 'N/A')
+    val_loss = c.get('val_loss', None)  # fine-tuned checkpoints (EMNIST, IAM)
+    val_cer = c.get('val_cer', None)    # synthetic baseline checkpoint
+    if val_loss is not None:
+        metric_str = f"val_loss={val_loss:.4f}"
+    elif val_cer is not None:
+        metric_str = f"val_cer={val_cer:.4f}%"
+    else:
+        metric_str = "no metric saved"
+    print(f"{label}: epoch={epoch} {metric_str}")
+
+print("=" * 55)
+test_model('checkpoints/best_model.pth', 'Synthetic ')
+test_model('checkpoints/best_model_emnist.pth', 'EMNIST    ')
+test_model('checkpoints/best_model_iam.pth', 'IAM       ')
+print("=" * 55)
CRNN+CTC/compare_live_cer.py
ADDED
@@ -0,0 +1,158 @@
+"""
+compare_live_cer.py
+===================
+Runs live CER on all three checkpoints to find the best one.
+Usage: python compare_live_cer.py
+"""
+
+import os
+import sys
+import json
+import random
+import cv2
+import numpy as np
+import editdistance
+import torch
+import torch.nn.functional as F
+sys.path.append('.')
+from crnn_model import get_crnn_model
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+VAL_ANN = 'data/val_annotations.json'
+VAL_DIR = 'data/val'
+MAX_SAMPLES = 200
+
+CHECKPOINTS = {
+    'Synthetic' : 'checkpoints/best_model.pth',
+    'EMNIST'    : 'checkpoints/best_model_emnist.pth',
+    'IAM'       : 'checkpoints/best_model_iam.pth',
+}
+
+
+def normalize(img, H=64, W=512):
+    if len(img.shape) == 3:
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    else:
+        gray = img.copy()
+    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+    inv = cv2.bitwise_not(gray)
+    _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+    coords = np.column_stack(np.where(thresh > 0))
+    if len(coords) > 0:
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0]-1, y_max + pad)
+        x_max = min(gray.shape[1]-1, x_max + pad)
+        gray = gray[y_min:y_max+1, x_min:x_max+1]
+    h, w = gray.shape
+    if h == 0 or w == 0:
+        return np.ones((H, W), dtype=np.uint8) * 255
+    scale = H / h
+    new_w = int(w * scale)
+    if new_w > W:
+        scale = W / w
+        new_w = W
+        new_h = int(h * scale)
+    else:
+        new_h = H
+    resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    canvas = np.ones((H, W), dtype=np.uint8) * 255
+    canvas[(H-new_h)//2:(H-new_h)//2+new_h,
+           (W-new_w)//2:(W-new_w)//2+new_w] = resized
+    _, otsu = cv2.threshold(canvas, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    return otsu
+
+
+def greedy_decode(outputs, idx_to_char):
+    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
+    results = []
+    for seq in pred_indices:
+        chars, prev = [], -1
+        for idx in seq:
+            idx = idx.item()
+            if idx != 0 and idx != prev and idx in idx_to_char:
+                chars.append(idx_to_char[idx])
+            prev = idx
+        results.append(''.join(chars))
+    return results
+
+
+def evaluate(checkpoint_path, label):
+    if not os.path.exists(checkpoint_path):
+        print(f"  {label:<12}: FILE NOT FOUND – skipping")
+        return
+
+    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
+    config = c.get('config', {})
+
+    # Load idx_to_char from checkpoint if available
+    idx_to_char = c.get('idx_to_char', None)
+    if idx_to_char is None:
+        from dataset import build_char_maps
+        _, idx_to_char, _ = build_char_maps()
+
+    model = get_crnn_model(
+        model_type      = config.get('model_type', 'standard'),
+        img_height      = config.get('img_height', 64),
+        num_chars       = c['model_state_dict']['fc.weight'].shape[0],
+        hidden_size     = config.get('hidden_size', 128),
+        num_lstm_layers = config.get('num_lstm_layers', 1),
+    ).to(device)
+    model.load_state_dict(c['model_state_dict'], strict=False)
+    model.eval()
+
+    with open(VAL_ANN, 'r', encoding='utf-8') as f:
+        anns = json.load(f)
+    random.seed(42)
+    if len(anns) > MAX_SAMPLES:
+        anns = random.sample(anns, MAX_SAMPLES)
+
+    total_cd, total_c = 0, 0
+    exact, n = 0, 0
+    worst = []
+
+    with torch.no_grad():
+        for ann in anns:
+            img_path = os.path.join(VAL_DIR, ann['image_path'])
+            gt = ann['text']
+            if not os.path.exists(img_path):
+                continue
+            raw = cv2.imread(img_path)
+            if raw is None:
+                continue
+            norm = normalize(raw)
+            tensor = torch.FloatTensor(
+                norm.astype(np.float32) / 255.0
+            ).unsqueeze(0).unsqueeze(0).to(device)
+            out = model(tensor)
+            pred = greedy_decode(out.cpu(), idx_to_char)[0]
+            cd = editdistance.eval(pred, gt)
+            total_cd += cd
+            total_c += len(gt)
+            if pred == gt:
+                exact += 1
+            if cd > 0:
+                worst.append((gt, pred, cd))
+            n += 1
+
+    cer = (total_cd / total_c * 100) if total_c > 0 else 0
+    acc = (exact / n * 100) if n > 0 else 0
+    print(f"  {label:<12}: CER={cer:.2f}%  ExactMatch={acc:.1f}%  (n={n})")
+
+    if worst:
+        worst = sorted(worst, key=lambda x: x[2], reverse=True)[:2]
+        for gt, pred, d in worst:
+            print(f"      [{d}] '{gt}' -> '{pred}'")
+
+
+print("=" * 60)
+print(" LIVE CER COMPARISON – all checkpoints")
+print("=" * 60)
+for label, path in CHECKPOINTS.items():
+    evaluate(path, label)
+print("=" * 60)
+print("Use the checkpoint with the lowest CER for IAM/physical fine-tuning.")
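For reference, the CER printed by this script is the aggregate character edit distance divided by the aggregate ground-truth length. A tiny standalone illustration of the per-pair arithmetic (hypothetical strings, not drawn from the validation set):

import editdistance

gt, pred = "Maria Santos", "Marla Santos"     # one substituted character
cer = editdistance.eval(pred, gt) / len(gt) * 100
print(f"CER = {cer:.2f}%")                    # 1 edit over 12 chars -> 8.33%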
CRNN+CTC/create_test_images.py
ADDED
@@ -0,0 +1,50 @@
+import os
+from PIL import Image, ImageDraw, ImageFont
+
+os.makedirs('test_images', exist_ok=True)
+
+def load_font(size=22):  # FIXED: was 20 – must match fix_data.py FONT_SIZE=22
+    """Same font loader as fix_data.py – tries multiple paths."""
+    for fp in [
+        'arial.ttf', 'Arial.ttf',
+        '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
+        '/System/Library/Fonts/Helvetica.ttc',
+        'C:/Windows/Fonts/arial.ttf',
+    ]:
+        try:
+            return ImageFont.truetype(fp, size)
+        except Exception:
+            continue
+    print("WARNING: Could not load Arial/DejaVu font. Using default – predictions may be inaccurate.")
+    return ImageFont.load_default()
+
+def create_image(text, filename):
+    """Render text exactly the same way as fix_data.py training images."""
+    img = Image.new('RGB', (512, 64), color=(255, 255, 255))
+    draw = ImageDraw.Draw(img)
+    font = load_font(22)
+
+    bbox = draw.textbbox((0, 0), text, font=font)
+    x = max((512 - (bbox[2] - bbox[0])) // 2, 2)
+    y = max((64 - (bbox[3] - bbox[1])) // 2, 2)
+    draw.text((x, y), text, fill=(0, 0, 0), font=font)
+    img.save(filename)
+    print(f'Created: {filename}')
+
+# ── Test samples ──────────────────────────────────────────────
+create_image('Juan Dela Cruz', 'test_images/demo.jpg')
+create_image('Juan Dela Cruz', 'test_images/name1.jpg')
+create_image('01/15/1990', 'test_images/date1.jpg')
+create_image('Tarlac City', 'test_images/place1.jpg')
+create_image('Maria Santos', 'test_images/form1a_sample.jpg')
+
+# ── Extra test cases (names, dates, addresses) ────────────────
+create_image('Jose Dela Cruz Jr.', 'test_images/name2.jpg')
+create_image('Ana Marie Reyes', 'test_images/name3.jpg')
+create_image('03/22/1985', 'test_images/date2.jpg')
+create_image('07/04/2000', 'test_images/date3.jpg')
+create_image('Brgy. San Jose, Capas, Tarlac', 'test_images/place2.jpg')
+create_image('78 MacArthur Hwy., Tarlac City', 'test_images/place3.jpg')
+
+print('\nAll test images created!')
+print('Font used matches training data – predictions should be accurate.')
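An optional sanity check after running the script (a sketch, assuming it ran from the repo root) that the rendered images really are 512x64 RGB, matching the training geometry the comments above reference:

import cv2

img = cv2.imread('test_images/demo.jpg')
assert img is not None, "demo.jpg was not written"
assert img.shape == (64, 512, 3), img.shape  # H x W x C, as rendered above
print("test image geometry OK")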
CRNN+CTC/crnn_model.py
ADDED
@@ -0,0 +1,119 @@
+"""
+CRNN+CTC Model – simplified for small datasets (~5000-10000 samples)
+~700K parameters, converges reliably without CTC blank collapse.
+"""
+import torch
+import torch.nn as nn
+
+
+class CRNN_CivilRegistry(nn.Module):
+
+    def __init__(self, img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1,
+                 dropout=0.3):
+        super().__init__()
+
+        # CNN – width reductions for 512px input:
+        #   MaxPool(2,2): 512→256, MaxPool(2,2): 256→128
+        #   MaxPool(2,1): 128 (height only), MaxPool(2,1): 128 (height only)
+        #   Conv(k=2,p=0): 127 → seq_len=127, fits labels up to 64 chars
+        self.cnn = nn.Sequential(
+            nn.Conv2d(1, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(inplace=True),
+            nn.MaxPool2d(2, 2),
+
+            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
+            nn.MaxPool2d(2, 2),
+
+            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
+            nn.MaxPool2d((2, 1)),
+
+            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(inplace=True),
+            nn.MaxPool2d((2, 1)),
+
+            nn.Conv2d(256, 256, kernel_size=2, padding=0),
+            nn.BatchNorm2d(256), nn.ReLU(inplace=True),
+        )
+
+        # FIXED Bug 4: derive cnn_out_h from a real forward pass instead of
+        # a hardcoded formula – safer if architecture or img_height ever changes.
+        with torch.no_grad():
+            _dummy = torch.zeros(1, 1, img_height, 32)
+            _out = self.cnn(_dummy)
+            cnn_out_h = _out.shape[2]  # actual height after all CNN layers
+        rnn_input = 256 * cnn_out_h
+
+        self.rnn = nn.LSTM(
+            input_size=rnn_input,
+            hidden_size=hidden_size,
+            num_layers=num_lstm_layers,
+            bidirectional=True,
+            batch_first=False,
+        )
+        # Dropout before FC – prevents overfitting on small datasets.
+        # Applied after BiLSTM output, before character projection.
+        # p=0.3 is standard for CRNN OCR models (disabled at inference via model.eval()).
+        self.dropout = nn.Dropout(p=dropout)
+        self.fc = nn.Linear(hidden_size * 2, num_chars)
+
+    def forward(self, x):
+        f = self.cnn(x)
+        B, C, h, w = f.size()
+        f = f.permute(3, 0, 1, 2).reshape(w, B, C * h)
+        f, _ = self.rnn(f)
+        return self.fc(self.dropout(f))
+
+
+class CRNN_Ensemble(nn.Module):
+    def __init__(self, num_models=3, **kwargs):
+        super().__init__()
+        self.models = nn.ModuleList([CRNN_CivilRegistry(**kwargs) for _ in range(num_models)])
+
+    def forward(self, x):
+        # FIXED Rec 3: average softmax probabilities across models (correct ensemble),
+        # then return log of the average so CTCLoss receives log-probabilities –
+        # the same contract as CRNN_CivilRegistry (raw logits + log_softmax in trainer).
+        # Returning raw averaged probabilities caused CTCLoss to receive un-logged values.
+        probs = [torch.nn.functional.softmax(m(x), dim=2) for m in self.models]
+        avg_probs = torch.mean(torch.stack(probs), dim=0)
+        return torch.log(avg_probs.clamp(min=1e-9))  # log-probs, safe clamp avoids log(0)
+
+
+def get_crnn_model(model_type='standard', **kwargs):
+    if model_type == 'ensemble':
+        return CRNN_Ensemble(**kwargs)
+    return CRNN_CivilRegistry(**kwargs)
+
+
+def initialize_weights(model):
+    for m in model.modules():
+        if isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.BatchNorm2d):
+            nn.init.constant_(m.weight, 1)
+            nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, 0, 0.01)
+            nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LSTM):
+            for name, param in m.named_parameters():
+                if 'weight' in name:
+                    nn.init.orthogonal_(param)
+                elif 'bias' in name:
+                    nn.init.constant_(param, 0)
+                    # Rec 1: set forget gate bias to 1.0 – helps the model
+                    # remember across long sequences at the start of training.
+                    # LSTM gate order: [input | forget | cell | output]
+                    n = param.size(0)
+                    param.data[n // 4 : n // 2].fill_(1.0)
+
+
+if __name__ == "__main__":
+    model = get_crnn_model('standard', img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1)
+    initialize_weights(model)
+    x = torch.randn(2, 1, 64, 512)
+    out = model(x)
+    params = sum(p.numel() for p in model.parameters())
+    print(f"Output: {out.shape}  seq_len={out.shape[0]}")
+    print(f"Params: {params:,} (unchanged – dropout adds no parameters)")
+    print(f"Dropout p=0.3 active during training, disabled during model.eval()")
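The width-reduction arithmetic in the CNN comment can be checked empirically. A minimal sketch (assuming crnn_model.py is importable from the current directory) confirming the 127-step output sequence for a 64x512 input, with the constructor defaults shown above:

import torch
from crnn_model import get_crnn_model

model = get_crnn_model('standard', img_height=64, num_chars=96)
out = model(torch.randn(1, 1, 64, 512))  # -> [seq_len, batch, num_chars]
print(out.shape)                         # torch.Size([127, 1, 96])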
CRNN+CTC/dataset.py
ADDED
@@ -0,0 +1,401 @@
+"""
+dataset.py
+==========
+PyTorch Dataset and DataLoader utilities for the Civil Registry OCR system.
+"""
+
+import os
+import json
+import random
+from pathlib import Path
+from typing import List, Tuple, Dict, Optional
+
+import cv2
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# CHARACTER SET
+# ─────────────────────────────────────────────────────────────────────────────
+
+PRINTABLE_CHARS = [chr(i) for i in range(32, 127)]  # space (32) to ~ (126)
+
+
+def build_char_maps(extra_chars: Optional[List[str]] = None):
+    chars = PRINTABLE_CHARS.copy()
+    if extra_chars:
+        for c in extra_chars:
+            if c not in chars:
+                chars.append(c)
+    char_to_idx = {c: i + 1 for i, c in enumerate(chars)}
+    idx_to_char = {i + 1: c for i, c in enumerate(chars)}
+    num_chars = len(chars) + 1  # +1 for blank=0
+    return char_to_idx, idx_to_char, num_chars
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# IMAGE NORMALIZER
+# ─────────────────────────────────────────────────────────────────────────────
+
+class ImageNormalizer:
+
+    def __init__(self, target_height: int = 64, target_width: int = 512):
+        self.H = target_height
+        self.W = target_width
+
+    def _to_gray(self, img):
+        if len(img.shape) == 3:
+            return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        return img.copy()
+
+    def _crop_to_text(self, gray):
+        inv = cv2.bitwise_not(gray)
+        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+        coords = np.column_stack(np.where(thresh > 0))
+        if len(coords) == 0:
+            return gray
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0] - 1, y_max + pad)
+        x_max = min(gray.shape[1] - 1, x_max + pad)
+        return gray[y_min:y_max + 1, x_min:x_max + 1]
+
+    def _aspect_resize(self, gray):
+        h, w = gray.shape
+        if h == 0 or w == 0:
+            return np.ones((self.H, self.W), dtype=np.uint8) * 255
+        scale = self.H / h
+        new_w = int(w * scale)
+        new_h = self.H
+        if new_w > self.W:
+            scale = self.W / w
+            new_h = int(h * scale)
+            new_w = self.W
+        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
+        y_off = (self.H - new_h) // 2
+        x_off = (self.W - new_w) // 2
+        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
+        return canvas
+
+    def _binarize(self, img):
+        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        white_ratio = np.mean(otsu == 255)
+        if white_ratio < 0.30 or white_ratio > 0.97:
+            return cv2.adaptiveThreshold(
+                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 11, 2)
+        return otsu
+
+    def normalize(self, img: np.ndarray, augmenter=None) -> np.ndarray:
+        gray = self._to_gray(img)
+        # NOTE: fastNlMeansDenoising intentionally removed from training pipeline.
+        # It is slow (~200ms/image) and pointless on clean synthetic images.
+        # Denoising is only applied in check_cer.py / inference.py (AdaptiveNormalizer)
+        # which runs on real scanned documents where denoising actually helps.
+        gray = self._crop_to_text(gray)
+        gray = self._aspect_resize(gray)
+        # FIXED Bug 3: augment on grayscale BEFORE binarize.
+        # Brightness/contrast augmentation has zero effect on binary (0/255) pixels.
+        if augmenter is not None:
+            gray = augmenter(gray)
+        return self._binarize(gray)
+
+    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
+        return torch.FloatTensor(
+            img.astype(np.float32) / 255.0
+        ).unsqueeze(0)  # [1, H, W]
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# AUGMENTATION
+# ─────────────────────────────────────────────────────────────────────────────
+
+class Augmenter:
+
+    def __call__(self, img: np.ndarray) -> np.ndarray:
+        img = img.copy()
+
+        # Random slight rotation (±3°)
+        if random.random() < 0.3:
+            angle = random.uniform(-3, 3)
+            h, w = img.shape
+            M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
+            img = cv2.warpAffine(img, M, (w, h),
+                                 borderMode=cv2.BORDER_CONSTANT,
+                                 borderValue=255)
+
+        # Random brightness/contrast
+        if random.random() < 0.4:
+            alpha = random.uniform(0.8, 1.2)
+            beta = random.randint(-20, 20)
+            img = np.clip(alpha * img.astype(np.float32) + beta,
+                          0, 255).astype(np.uint8)
+
+        # Gaussian blur
+        if random.random() < 0.3:
+            ksize = random.choice([3, 5])
+            img = cv2.GaussianBlur(img, (ksize, ksize), 0)
+
+        # Salt-and-pepper noise
+        if random.random() < 0.2:
+            noise = np.random.randint(0, 100, img.shape)
+            img[noise < 2] = 0
+            img[noise > 97] = 255
+
+        # Random small horizontal shift
+        if random.random() < 0.2:
+            h, w = img.shape
+            shift = random.randint(-int(w * 0.05), int(w * 0.05))
+            M = np.float32([[1, 0, shift], [0, 1, 0]])
+            img = cv2.warpAffine(img, M, (w, h),
+                                 borderMode=cv2.BORDER_CONSTANT,
+                                 borderValue=255)
+
+        # ── NEW: Horizontal line noise ───────────────────────────────────────
+        # Simulates ruled form lines bleeding through behind the text.
+        # Civil registry forms have printed horizontal grid lines – scanners
+        # often pick these up as faint grey stripes across text fields.
+        if random.random() < 0.3:
+            h, w = img.shape
+            n_lines = random.randint(1, 3)
+            for _ in range(n_lines):
+                y = random.randint(0, h - 1)
+                thickness = random.choice([1, 1, 1, 2])  # mostly 1px
+                intensity = random.randint(160, 220)     # light grey, not black
+                cv2.line(img, (0, y), (w, y),
+                         color=intensity, thickness=thickness)
+
+        # ── NEW: Perspective warp ────────────────────────────────────────────
+        # Simulates documents scanned or photographed at a slight angle.
+        # Keystone distortion is common when forms are placed unevenly on
+        # a flatbed scanner or photographed with a phone camera.
+        if random.random() < 0.25:
+            h, w = img.shape
+            d = 0.03
+            dx = int(w * d)
+            dy = int(h * d)
+            src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
+            dst = np.float32([
+                [random.randint(0, dx), random.randint(0, dy)],
+                [w - random.randint(0, dx), random.randint(0, dy)],
+                [w - random.randint(0, dx), h - random.randint(0, dy)],
+                [random.randint(0, dx), h - random.randint(0, dy)],
+            ])
+            M = cv2.getPerspectiveTransform(src, dst)
+            img = cv2.warpPerspective(img, M, (w, h),
+                                      borderMode=cv2.BORDER_CONSTANT,
+                                      borderValue=255)
+
+        return img
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DATASET
+# ─────────────────────────────────────────────────────────────────────────────
+
+class CivilRegistryDataset(Dataset):
+    """
+    Args:
+        data_dir         : root folder containing image subfolders (e.g. 'data/train')
+        annotations_file : path to JSON file with image_path + text pairs
+        img_height       : target image height (default 64)
+        img_width        : target image width (default 512)
+        augment          : True = apply augmentation (training only)
+        form_type        : 'all' or filter by form e.g. 'form1a'
+
+    Properties used by train.py:
+        .num_chars    → passed to CRNN model
+        .char_to_idx  → saved in checkpoint
+        .idx_to_char  → used for decoding predictions
+
+    __getitem__ returns:
+        image_tensor   FloatTensor [1, H, W]
+        target         LongTensor [label_length]
+        target_length  int
+        text           str (original ground truth)
+    """
+
+    def __init__(
+        self,
+        data_dir: str,
+        annotations_file: str,
+        img_height: int = 64,
+        img_width: int = 512,
+        augment: bool = False,
+        form_type: str = 'all',
+        seed: Optional[int] = None,  # Rec 2: reproducible augmentation
+    ):
+        self.data_dir = Path(data_dir)
+        self.augment = augment
+        self.normalizer = ImageNormalizer(img_height, img_width)
+        self.augmenter = Augmenter()
+        if seed is not None:  # Rec 2: seed random for reproducibility
+            random.seed(seed)
+            np.random.seed(seed)
+
+        self.char_to_idx, self.idx_to_char, self.num_chars = build_char_maps()
+
+        with open(annotations_file, 'r', encoding='utf-8') as f:
+            all_annotations = json.load(f)
+
+        if form_type != 'all':
+            all_annotations = [
+                a for a in all_annotations
+                if form_type in a.get('image_path', '')
+            ]
+
+        self.samples: List[Dict] = []
+        missing = 0
+        for ann in all_annotations:
+            img_path = self.data_dir / ann['image_path']
+            if img_path.exists():
+                text = ann['text'].strip()
+                if text:
+                    self.samples.append({
+                        'image_path': str(img_path),
+                        'text': text,
+                    })
+            else:
+                missing += 1
+
+        if missing > 0:
+            print(f"  [Dataset] WARNING: {missing} image(s) not found and skipped.")
+
+        print(f"  [Dataset] Loaded {len(self.samples)} samples "
+              f"from {annotations_file} (augment={augment})")
+
+    def __len__(self) -> int:
+        return len(self.samples)
+
+    def __getitem__(self, idx: int):
+        sample = self.samples[idx]
+        text = sample['text']
+
+        img = cv2.imread(sample['image_path'])
+        if img is None:
+            img = np.ones((64, 512, 3), dtype=np.uint8) * 255
+
+        # FIXED Bug 3: pass augmenter into normalize() so it runs on grayscale
+        # (before binarization), not on the binary output where it has no effect.
+        aug = self.augmenter if self.augment else None
+        normalized = self.normalizer.normalize(img, augmenter=aug)
+
+        image_tensor = self.normalizer.to_tensor(normalized)  # [1, H, W]
+
+        encoded = [
+            self.char_to_idx[c]
+            for c in text
+            if c in self.char_to_idx
+        ]
+        if len(encoded) == 0:
+            encoded = [self.char_to_idx.get(' ', 1)]
+
+        target = torch.LongTensor(encoded)
+        target_length = len(encoded)
+
+        return image_tensor, target, target_length, text
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# COLLATE FUNCTION
+# ─────────────────────────────────────────────────────────────────────────────
+
+def collate_fn(batch):
+    """
+    CTC loss needs all labels packed into one flat 1D tensor.
+    PyTorch's default collator can't handle variable-length labels,
+    so this custom function packs them correctly.
+
+    Returns:
+        images          FloatTensor [B, 1, H, W]
+        targets         LongTensor [sum of all label lengths]
+        target_lengths  LongTensor [B]
+        texts           List[str]
+    """
+    images, targets, target_lengths, texts = zip(*batch)
+
+    images = torch.stack(images, dim=0)
+    targets = torch.cat([t for t in targets])
+    target_lengths = torch.LongTensor(target_lengths)
+
+    return images, targets, target_lengths, list(texts)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# HELPER: CREATE ANNOTATION FILE (run once to build your JSON)
+# ─────────────────────────────────────────────────────────────────────────────
+
+def create_annotation_file(data_dir: str, output_file: str,
+                           extensions=('.jpg', '.jpeg', '.png')):
+    """
+    Auto-generate annotations JSON by scanning data_dir.
+    For each image, looks for a sidecar .txt file with the same name.
+    If not found, uses the filename stem (underscores → spaces) as label.
+
+    Usage:
+        from dataset import create_annotation_file
+        create_annotation_file('data/train', 'data/train_annotations.json')
+        create_annotation_file('data/val', 'data/val_annotations.json')
+    """
+    data_path = Path(data_dir)
+    annotations = []
+
+    for img_path in sorted(data_path.rglob('*')):
+        if img_path.suffix.lower() not in extensions:
+            continue
+        txt_path = img_path.with_suffix('.txt')
+        if txt_path.exists():
+            label = txt_path.read_text(encoding='utf-8').strip()
+        else:
+            label = img_path.stem.replace('_', ' ')
+        if not label:
+            continue
+        rel_path = img_path.relative_to(data_path)
+        annotations.append({
+            'image_path': str(rel_path).replace('\\', '/'),
+            'text': label,
+        })
+
+    os.makedirs(Path(output_file).parent, exist_ok=True)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(annotations, f, indent=2, ensure_ascii=False)
+
+    print(f"✓ Saved {len(annotations)} entries → {output_file}")
+    return annotations
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# SELF-TEST (python dataset.py)
+# ─────────────────────────────────────────────────────────────────────────────
+
+if __name__ == '__main__':
+    print("=" * 55)
+    print(" dataset.py self-test")
+    print("=" * 55)
+
+    c2i, i2c, n = build_char_maps()
+    print(f"\n Vocab size : {n} (including blank=0)")
+    print(f" 'A'={c2i['A']} '0'={c2i['0']} ' '={c2i[' ']} '.'={c2i['.']}")
+
+    dummy = np.ones((80, 300, 3), dtype=np.uint8) * 200
+    norm = ImageNormalizer(64, 512)
+    out = norm.normalize(dummy)
+    t = norm.to_tensor(out)
+    print(f"\n Normalizer : {dummy.shape} → {out.shape} → tensor {t.shape}")
+
+    fake = [
+        (torch.zeros(1, 64, 512), torch.LongTensor([1, 2, 3]), 3, "ABC"),
+        (torch.zeros(1, 64, 512), torch.LongTensor([4, 5]), 2, "DE"),
+        (torch.zeros(1, 64, 512), torch.LongTensor([6, 7, 8, 9]), 4, "FGHI"),
+    ]
+    imgs, tgts, tlens, txts = collate_fn(fake)
+    print(f"\n collate_fn : images={imgs.shape} "
+          f"targets={tgts.shape} lengths={tlens.tolist()}")
+
+    print("\n ✓ All checks passed.\n")
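Since collate_fn is what train.py must hand to the DataLoader, here is a minimal wiring sketch (assuming data/train and its annotation JSON exist as described in the docstrings above):

from torch.utils.data import DataLoader
from dataset import CivilRegistryDataset, collate_fn

train_ds = CivilRegistryDataset('data/train', 'data/train_annotations.json',
                                augment=True, seed=42)
loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)
images, targets, target_lengths, texts = next(iter(loader))
print(images.shape, targets.shape, target_lengths.shape)  # [16,1,64,512], flat 1D labels, [16]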
CRNN+CTC/field_extractor.py
ADDED
|
@@ -0,0 +1,735 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Philippine Civil Registry β Field Extractor (Dynamic)
|
| 3 |
+
======================================================
|
| 4 |
+
Automatically detects form borders on ANY scan/photo and aligns field
|
| 5 |
+
extraction to the detected boundary β no hardcoded pixel positions.
|
| 6 |
+
|
| 7 |
+
Field coordinates calibrated directly from official PDF renders at 200 DPI:
|
| 8 |
+
Form 102 (Birth): 1700 x 2800 px
|
| 9 |
+
Form 103 (Death): 1700 x 2878 px
|
| 10 |
+
Form 97 (Marriage): 1700 x 2600 px
|
| 11 |
+
Form 90 (License): 1700 x 2600 px
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python field_extractor.py --pdf FORM_102.pdf --form birth
|
| 15 |
+
python field_extractor.py --pdf FORM_97.pdf --form marriage --visualize
|
| 16 |
+
python field_extractor.py --pdf FORM_103.pdf --form death --output results.json
|
| 17 |
+
python field_extractor.py --image form102.png --form birth --visualize
|
| 18 |
+
python field_extractor.py --pdf FORM_102.pdf --form birth --checkpoint checkpoints/best_model_emnist.pth
|
| 19 |
+
|
| 20 |
+
.env file (project root) β each team member sets their own:
|
| 21 |
+
POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
import json
|
| 28 |
+
import cv2
|
| 29 |
+
import numpy as np
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
|
| 32 |
+
import torch
|
| 33 |
+
from dotenv import load_dotenv
|
| 34 |
+
|
| 35 |
+
# Load .env from same folder as this script (works regardless of cwd)
|
| 36 |
+
_script_dir = Path(__file__).parent.resolve()
|
| 37 |
+
load_dotenv(dotenv_path=_script_dir / ".env")
|
| 38 |
+
|
| 39 |
+
# Poppler path β from .env or None (Linux/Mac auto-detects)
|
| 40 |
+
POPPLER_PATH = os.environ.get("POPPLER_PATH", None)
|
| 41 |
+
DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
# FIELD RATIO MAPS
|
| 46 |
+
# Format: field_name: (x1, y1, x2, y2) β ratios 0.0β1.0
|
| 47 |
+
# Coordinates are relative to the DETECTED FORM BOUNDARY (not full image).
|
| 48 |
+
# x = leftβright, y = topβbottom
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
# Form 102 β Certificate of Live Birth (Form 1A)
|
| 52 |
+
BIRTH_FIELDS = {
|
| 53 |
+
# Header
|
| 54 |
+
"province": (0.02, 0.068, 0.30, 0.088),
|
| 55 |
+
"registry_number": (0.66, 0.068, 0.99, 0.108),
|
| 56 |
+
"city_municipality": (0.02, 0.090, 0.65, 0.108),
|
| 57 |
+
|
| 58 |
+
# Item 1 β Child Name
|
| 59 |
+
"child_first_name": (0.03, 0.109, 0.40, 0.141),
|
| 60 |
+
"child_middle_name": (0.40, 0.109, 0.64, 0.141),
|
| 61 |
+
"child_last_name": (0.64, 0.109, 0.99, 0.141),
|
| 62 |
+
|
| 63 |
+
# Items 2-3 β Sex / Date of Birth
|
| 64 |
+
"sex": (0.03, 0.142, 0.30, 0.167),
|
| 65 |
+
"dob_day": (0.40, 0.142, 0.80, 0.167),
|
| 66 |
+
"dob_month": (0.80, 0.142, 0.60, 0.167),
|
| 67 |
+
"dob_year": (0.80, 0.142, 0.99, 0.167),
|
| 68 |
+
|
| 69 |
+
# Item 4 β Place of Birth
|
| 70 |
+
"place_birth_hospital": (0.03, 0.169, 0.46, 0.197),
|
| 71 |
+
"place_birth_city": (0.47, 0.169, 0.70, 0.199),
|
| 72 |
+
"place_birth_province": (0.71, 0.169, 0.99, 0.199),
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Mother section
|
| 77 |
+
"mother_first_name": (0.03, 0.248, 0.40, 0.276),
|
| 78 |
+
"mother_middle_name": (0.40, 0.248, 0.64, 0.276),
|
| 79 |
+
"mother_last_name": (0.64, 0.248, 0.99, 0.276),
|
| 80 |
+
"mother_citizenship": (0.03, 0.277, 0.50, 0.305),
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Father section
|
| 84 |
+
"father_first_name": (0.03, 0.380, 0.40, 0.410),
|
| 85 |
+
"father_middle_name": (0.40, 0.380, 0.64, 0.410),
|
| 86 |
+
"father_last_name": (0.64, 0.380, 0.99, 0.410),
|
| 87 |
+
"father_citizenship": (0.03, 0.411, 0.28, 0.445),
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Item 20 β Marriage of Parents
|
| 91 |
+
"parents_marriage_month": (0.03, 0.496, 0.19, 0.526),
|
| 92 |
+
"parents_marriage_day": (0.19, 0.496, 0.27, 0.526),
|
| 93 |
+
"parents_marriage_year": (0.27, 0.496, 0.38, 0.526),
|
| 94 |
+
|
| 95 |
+
"parents_marriage_city": (0.41, 0.496, 0.68, 0.526),
|
| 96 |
+
"parents_marriage_province": (0.68, 0.496, 0.84, 0.526),
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
# Form 103 β Certificate of Death (Form 2A)
|
| 102 |
+
DEATH_FIELDS = {
|
| 103 |
+
# Header
|
| 104 |
+
"province": (0.04, 0.128, 0.40, 0.144),
|
| 105 |
+
"registry_number": (0.52, 0.128, 0.75, 0.144),
|
| 106 |
+
"city_municipality": (0.04, 0.145, 0.45, 0.160),
|
| 107 |
+
|
| 108 |
+
# Item 1 β Name
|
| 109 |
+
"deceased_first_name": (0.10, 0.162, 0.34, 0.178),
|
| 110 |
+
"deceased_middle_name": (0.34, 0.162, 0.56, 0.178),
|
| 111 |
+
"deceased_last_name": (0.56, 0.162, 0.75, 0.178),
|
| 112 |
+
|
| 113 |
+
# Items 2-4 β Sex / Religion / Age
|
| 114 |
+
"sex": (0.04, 0.182, 0.13, 0.220),
|
| 115 |
+
"age_years": (0.28, 0.182, 0.38, 0.202),
|
| 116 |
+
|
| 117 |
+
# Item 5 β Place of Death
|
| 118 |
+
"place_death_hospital": (0.13, 0.224, 0.42, 0.242),
|
| 119 |
+
"place_death_city": (0.42, 0.224, 0.58, 0.242),
|
| 120 |
+
"place_death_province": (0.58, 0.224, 0.75, 0.242),
|
| 121 |
+
|
| 122 |
+
# Items 6-7 β Date of Death / Citizenship
|
| 123 |
+
"dod_day": (0.10, 0.252, 0.22, 0.268),
|
| 124 |
+
"dod_month": (0.22, 0.252, 0.38, 0.268),
|
| 125 |
+
"dod_year": (0.38, 0.252, 0.52, 0.268),
|
| 126 |
+
"citizenship": (0.52, 0.252, 0.75, 0.268),
|
| 127 |
+
|
| 128 |
+
# Item 8 β Residence
|
| 129 |
+
"residence_house": (0.13, 0.278, 0.40, 0.294),
|
| 130 |
+
"residence_city": (0.40, 0.278, 0.56, 0.294),
|
| 131 |
+
"residence_province": (0.56, 0.278, 0.75, 0.294),
|
| 132 |
+
|
| 133 |
+
# Items 9-10 β Civil Status / Occupation
|
| 134 |
+
"civil_status": (0.04, 0.302, 0.38, 0.360),
|
| 135 |
+
"occupation": (0.44, 0.302, 0.75, 0.360),
|
| 136 |
+
|
| 137 |
+
# Item 17 β Causes of Death
|
| 138 |
+
"cause_immediate": (0.18, 0.402, 0.58, 0.418),
|
| 139 |
+
"cause_antecedent": (0.18, 0.424, 0.58, 0.440),
|
| 140 |
+
"cause_underlying": (0.18, 0.446, 0.58, 0.462),
|
| 141 |
+
"cause_other": (0.18, 0.468, 0.58, 0.484),
|
| 142 |
+
|
| 143 |
+
# Item 25 β Informant
|
| 144 |
+
"informant_name": (0.04, 0.808, 0.35, 0.822),
|
| 145 |
+
"informant_address": (0.04, 0.822, 0.35, 0.836),
|
| 146 |
+
"informant_date": (0.35, 0.836, 0.58, 0.850),
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
# Form 97 – Certificate of Marriage (Form 3A)
# Only the fields that flow through bridge.py → spaCy NER → SpouseOutput/Form3A.
# Removed: province, city_municipality, dob_day/month/year (×2),
#          place_birth_city/prov/country (×2), sex (×2), residence (×2),
#          religion (×2), civil_status (×2).
MARRIAGE_FIELDS = {
    # ── Header ─────────────────────────────────────────────────────────────
    "registry_number": (0.62, 0.088, 0.97, 0.104),  # → Form3A.registry_number

    # ── Item 1 – Name (HUSBAND left / WIFE right) ──────────────────────────
    "husband_first_name": (0.23, 0.121, 0.56, 0.139),
    "husband_middle_name": (0.23, 0.141, 0.56, 0.159),
    "husband_last_name": (0.23, 0.160, 0.56, 0.178),
    "wife_first_name": (0.65, 0.121, 0.98, 0.139),
    "wife_middle_name": (0.65, 0.141, 0.98, 0.159),
    "wife_last_name": (0.65, 0.160, 0.98, 0.178),

    # "husband_first_name": (0.14, 0.138, 0.47, 0.156),
    # "husband_middle_name": (0.14, 0.156, 0.47, 0.174),
    # "husband_last_name": (0.14, 0.174, 0.47, 0.192),
    # "wife_first_name": (0.53, 0.138, 0.86, 0.156),
    # "wife_middle_name": (0.53, 0.156, 0.86, 0.174),
    # "wife_last_name": (0.53, 0.174, 0.86, 0.192),

    # ── Item 2b – Age ──────────────────────────────────────────────────────
    "husband_age": (0.40, 0.198, 0.47, 0.216),  # → husband.age
    "wife_age": (0.78, 0.198, 0.86, 0.216),     # → wife.age

    # ── Item 4b – Citizenship ──────────────────────────────────────────────
    "husband_citizenship": (0.22, 0.252, 0.47, 0.270),  # → husband.nationality
    "wife_citizenship": (0.62, 0.252, 0.86, 0.270),     # → wife.nationality

    # ── Item 8 – Name of Father ────────────────────────────────────────────
    "husband_father_first": (0.14, 0.396, 0.24, 0.414),
    "husband_father_middle": (0.24, 0.396, 0.34, 0.414),
    "husband_father_last": (0.34, 0.396, 0.47, 0.414),
    "wife_father_first": (0.53, 0.396, 0.63, 0.414),
    "wife_father_middle": (0.63, 0.396, 0.73, 0.414),
    "wife_father_last": (0.73, 0.396, 0.86, 0.414),

    # ── Item 9 – Citizenship of Father ─────────────────────────────────────
    "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436),  # → husband.nationality_of_father
    "wife_father_citizenship": (0.53, 0.420, 0.86, 0.436),     # → wife.nationality_of_father

    # ── Item 10 – Name of Mother ───────────────────────────────────────────
    "husband_mother_first": (0.14, 0.444, 0.24, 0.462),
    "husband_mother_middle": (0.24, 0.444, 0.34, 0.462),
    "husband_mother_last": (0.34, 0.444, 0.47, 0.462),
    "wife_mother_first": (0.53, 0.444, 0.63, 0.462),
    "wife_mother_middle": (0.63, 0.444, 0.73, 0.462),
    "wife_mother_last": (0.73, 0.444, 0.86, 0.462),

    # ── Item 11 – Citizenship of Mother ────────────────────────────────────
    "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484),  # → husband.nationality_of_mother
    "wife_mother_citizenship": (0.53, 0.468, 0.86, 0.484),     # → wife.nationality_of_mother

    # ── Items 15–16 – Place / Date of Marriage ─────────────────────────────
    "place_marriage_office": (0.14, 0.596, 0.44, 0.614),
    "place_marriage_city": (0.44, 0.596, 0.68, 0.614),
    "place_marriage_province": (0.68, 0.596, 0.88, 0.614),
    "date_marriage_day": (0.14, 0.630, 0.24, 0.648),
    "date_marriage_month": (0.24, 0.630, 0.38, 0.648),
    "date_marriage_year": (0.38, 0.630, 0.48, 0.648),
}

# Form 90 – Application for Marriage License
MARRIAGE_LICENSE_FIELDS = {
    # Header
    "province": (0.12, 0.092, 0.48, 0.108),
    "registry_number": (0.56, 0.092, 0.97, 0.108),
    "city_municipality": (0.12, 0.108, 0.48, 0.124),
    "received_by": (0.12, 0.124, 0.48, 0.140),
    "date_of_receipt": (0.12, 0.140, 0.48, 0.156),
    "marriage_license_number": (0.56, 0.124, 0.97, 0.140),
    "date_of_issuance": (0.56, 0.140, 0.97, 0.156),

    # Item 1 – Name of Applicant (GROOM left / BRIDE right)
    "groom_first_name": (0.02, 0.278, 0.46, 0.294),
    "bride_first_name": (0.54, 0.278, 0.97, 0.294),
    "groom_middle_name": (0.02, 0.296, 0.46, 0.312),
    "bride_middle_name": (0.54, 0.296, 0.97, 0.312),
    "groom_last_name": (0.02, 0.314, 0.46, 0.330),
    "bride_last_name": (0.54, 0.314, 0.97, 0.330),

    # Item 2 – Date of Birth / Age
    "groom_dob_day": (0.02, 0.334, 0.12, 0.350),
    "groom_dob_month": (0.12, 0.334, 0.24, 0.350),
    "groom_dob_year": (0.24, 0.334, 0.34, 0.350),
    "groom_age": (0.34, 0.334, 0.46, 0.350),
    "bride_dob_day": (0.54, 0.334, 0.62, 0.350),
    "bride_dob_month": (0.62, 0.334, 0.74, 0.350),
    "bride_dob_year": (0.74, 0.334, 0.84, 0.350),
    "bride_age": (0.84, 0.334, 0.97, 0.350),

    # Item 3 – Place of Birth
    "groom_place_birth_city": (0.02, 0.354, 0.18, 0.370),
    "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
    "groom_place_birth_country": (0.32, 0.354, 0.46, 0.370),
    "bride_place_birth_city": (0.54, 0.354, 0.70, 0.370),
    "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
    "bride_place_birth_country": (0.84, 0.354, 0.97, 0.370),

    # Item 4 – Sex / Citizenship
    "groom_sex": (0.02, 0.374, 0.16, 0.390),
    "groom_citizenship": (0.16, 0.374, 0.46, 0.390),
    "bride_sex": (0.54, 0.374, 0.68, 0.390),
    "bride_citizenship": (0.68, 0.374, 0.97, 0.390),

    # Item 5 – Residence
    "groom_residence": (0.02, 0.394, 0.46, 0.412),
    "bride_residence": (0.54, 0.394, 0.97, 0.412),

    # Item 6 – Religion
    "groom_religion": (0.02, 0.424, 0.46, 0.440),
    "bride_religion": (0.54, 0.424, 0.97, 0.440),

    # Item 7 – Civil Status
    "groom_civil_status": (0.02, 0.452, 0.46, 0.468),
    "bride_civil_status": (0.54, 0.452, 0.97, 0.468),

    # Item 9 – Place where dissolved
    "groom_dissolution_city": (0.02, 0.496, 0.16, 0.512),
    "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
    "groom_dissolution_country": (0.30, 0.496, 0.46, 0.512),
    "bride_dissolution_city": (0.54, 0.496, 0.68, 0.512),
    "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
    "bride_dissolution_country": (0.82, 0.496, 0.97, 0.512),

    # Item 10 – Date when dissolved
    "groom_dissolution_day": (0.02, 0.520, 0.12, 0.536),
    "groom_dissolution_month": (0.12, 0.520, 0.24, 0.536),
    "groom_dissolution_year": (0.24, 0.520, 0.34, 0.536),
    "bride_dissolution_day": (0.54, 0.520, 0.62, 0.536),
    "bride_dissolution_month": (0.62, 0.520, 0.74, 0.536),
    "bride_dissolution_year": (0.74, 0.520, 0.84, 0.536),

    # Item 12 – Father Name
    "groom_father_first": (0.02, 0.594, 0.16, 0.610),
    "groom_father_middle": (0.16, 0.594, 0.28, 0.610),
    "groom_father_last": (0.28, 0.594, 0.46, 0.610),
    "bride_father_first": (0.54, 0.594, 0.66, 0.610),
    "bride_father_middle": (0.66, 0.594, 0.78, 0.610),
    "bride_father_last": (0.78, 0.594, 0.97, 0.610),

    # Item 13 – Father Citizenship
    "groom_father_citizenship": (0.02, 0.620, 0.46, 0.636),
    "bride_father_citizenship": (0.54, 0.620, 0.97, 0.636),

    # Item 14 – Father Residence
    "groom_father_residence": (0.02, 0.644, 0.46, 0.660),
    "bride_father_residence": (0.54, 0.644, 0.97, 0.660),

    # Item 15 – Mother Name
    "groom_mother_first": (0.02, 0.674, 0.16, 0.690),
    "groom_mother_middle": (0.16, 0.674, 0.28, 0.690),
    "groom_mother_last": (0.28, 0.674, 0.46, 0.690),
    "bride_mother_first": (0.54, 0.674, 0.66, 0.690),
    "bride_mother_middle": (0.66, 0.674, 0.78, 0.690),
    "bride_mother_last": (0.78, 0.674, 0.97, 0.690),

    # Item 16 – Mother Citizenship
    "groom_mother_citizenship": (0.02, 0.696, 0.46, 0.712),
    "bride_mother_citizenship": (0.54, 0.696, 0.97, 0.712),

    # Item 17 – Mother Residence
    "groom_mother_residence": (0.02, 0.720, 0.46, 0.736),
    "bride_mother_residence": (0.54, 0.720, 0.97, 0.736),
}

FORM_FIELDS = {
    "birth": BIRTH_FIELDS,
    "death": DEATH_FIELDS,
    "marriage": MARRIAGE_FIELDS,
    "marriage_license": MARRIAGE_LICENSE_FIELDS,
}

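# Lookup note: unknown form types fall back to BIRTH_FIELDS, mirroring
# DynamicFieldExtractor.__init__ below – e.g. FORM_FIELDS.get("passport", BIRTH_FIELDS)
# returns BIRTH_FIELDS ("passport" is a made-up key for illustration).
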
COLOURS = [
    (0, 200, 0), (0, 150, 255), (200, 0, 200), (0, 200, 200), (200, 200, 0), (220, 20, 60),
    (255, 140, 0), (150, 50, 200), (0, 160, 80), (30, 144, 255), (255, 20, 147), (100, 200, 100),
]


# ──────────────────────────────────────────────────────────────────────────────
# FORM BOUNDS DETECTOR
# Finds the outer border of a civil registry form using line detection.
# Falls back to full image if detection fails.
# ──────────────────────────────────────────────────────────────────────────────

class FormBoundsDetector:
    def __init__(self, verbose=False):
        self.verbose = verbose

    def detect(self, image_bgr):
        h, w = image_bgr.shape[:2]
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        bounds = self._detect_by_lines(gray, w, h)
        if bounds is None:
            if self.verbose:
                print(" [Bounds] Line detection failed → using full image")
            return (0, 0, w, h)
        if self.verbose:
            print(f" [Bounds] Detected: {bounds}")
        return bounds

    def _detect_by_lines(self, gray, w, h):
        try:
            thresh = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV, 11, 2)
            hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
            h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
            h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
            vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
            v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
            v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
            if len(h_rows) == 0 or len(v_cols) == 0:
                return None
            top, bottom = int(h_rows.min()), int(h_rows.max())
            left, right = int(v_cols.min()), int(v_cols.max())
            if (right - left) < w * 0.4 or (bottom - top) < h * 0.4:
                return None
            return (left, top, right, bottom)
        except Exception as e:
            if self.verbose:
                print(f" [Bounds error] {e}")
            return None

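# Minimal usage sketch (hypothetical scan path):
#   page = cv2.imread("scanned_form.jpg")
#   left, top, right, bottom = FormBoundsDetector(verbose=True).detect(page)
#   form = page[top:bottom, left:right]
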
# ──────────────────────────────────────────────────────────────────────────────
# DYNAMIC FIELD EXTRACTOR
# Crops each field region relative to the detected form boundary.
# Works on any image size, DPI, scan margin, or slight rotation.
# ──────────────────────────────────────────────────────────────────────────────

class DynamicFieldExtractor:
    def __init__(self, form_type="birth", verbose=False):
        self.form_type = form_type.lower()
        self.field_map = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
        self.detector = FormBoundsDetector(verbose=verbose)
        self.verbose = verbose
        self._last_bounds = None

    def _to_bgr(self, image):
        try:
            from PIL import Image as PILImage
            if isinstance(image, PILImage.Image):
                arr = np.array(image.convert("RGB"))
                return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        except ImportError:
            pass
        if isinstance(image, np.ndarray):
            if len(image.shape) == 2:
                return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
            if image.shape[2] == 4:
                return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
            return image
        raise TypeError(f"Unsupported image type: {type(image)}")

    def extract(self, image):
        """Returns {field_name: BGR numpy array}."""
        image = self._to_bgr(image)
        h, w = image.shape[:2]
        left, top, right, bottom = self.detector.detect(image)
        self._last_bounds = (left, top, right, bottom)
        form_w = right - left
        form_h = bottom - top
        if self.verbose:
            print(f" [Extract] Image={w}x{h} "
                  f" Form={form_w}x{form_h} @ ({left},{top})-({right},{bottom})")
        crops = {}
        for name, (rx1, ry1, rx2, ry2) in self.field_map.items():
            x1 = max(0, min(int(left + rx1 * form_w), w - 1))
            y1 = max(0, min(int(top + ry1 * form_h), h - 1))
            x2 = max(0, min(int(left + rx2 * form_w), w - 1))
            y2 = max(0, min(int(top + ry2 * form_h), h - 1))
            if x2 > x1 and y2 > y1:
                crops[name] = image[y1:y2, x1:x2]
        return crops

    def visualize(self, image, output_path=None):
        """Draw detected boundary + field boxes. Returns annotated BGR image."""
        image = self._to_bgr(image)
        vis = image.copy()
        h, w = vis.shape[:2]
        self.extract(image)
        left, top, right, bottom = self._last_bounds
        form_w = right - left
        form_h = bottom - top
        cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 3)
        cv2.putText(vis, "DETECTED FORM BOUNDARY",
                    (left, max(0, top - 8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 140, 255), 1)
        for idx, (name, (rx1, ry1, rx2, ry2)) in enumerate(self.field_map.items()):
            x1 = max(0, min(int(left + rx1 * form_w), w - 1))
            y1 = max(0, min(int(top + ry1 * form_h), h - 1))
            x2 = max(0, min(int(left + rx2 * form_w), w - 1))
            y2 = max(0, min(int(top + ry2 * form_h), h - 1))
            c = COLOURS[idx % len(COLOURS)]
            cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
            cv2.putText(vis, name[:22], (x1 + 2, max(0, y1 - 2)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, c, 1)
        if output_path:
            cv2.imwrite(str(output_path), vis)
            print(f" Field map saved -> {output_path}")
        return vis

# ──────────────────────────────────────────────────────────────────────────────
# FIELD NORMALIZER – prepares a BGR crop for CRNN inference
# ──────────────────────────────────────────────────────────────────────────────

class FieldNormalizer:
    def __init__(self, target_height=64, target_width=512):
        self.H = target_height
        self.W = target_width

    def _crop_to_text(self, gray):
        inv = cv2.bitwise_not(gray)
        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
        coords = np.column_stack(np.where(thresh > 0))
        if len(coords) == 0:
            return gray
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)
        pad = max(4, int((y_max - y_min) * 0.15))
        y_min = max(0, y_min - pad)
        x_min = max(0, x_min - pad)
        y_max = min(gray.shape[0] - 1, y_max + pad)
        x_max = min(gray.shape[1] - 1, x_max + pad)
        return gray[y_min:y_max + 1, x_min:x_max + 1]

    def _smart_resize(self, gray):
        h, w = gray.shape
        if h == 0 or w == 0:
            return np.ones((self.H, self.W), dtype=np.uint8) * 255
        scale = self.H / h
        new_w = int(w * scale)
        new_h = self.H
        if new_w > self.W:
            scale = self.W / w
            new_h = int(h * scale)
            new_w = self.W
        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
        y_off = (self.H - new_h) // 2
        x_off = (self.W - new_w) // 2
        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
        return canvas

    def _binarize(self, img):
        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        white_ratio = np.mean(otsu == 255)
        if white_ratio < 0.30 or white_ratio > 0.97:
            return cv2.adaptiveThreshold(
                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
        return otsu

    def normalize(self, crop) -> np.ndarray:
        """Accept BGR numpy array or PIL image, return normalized binary array."""
        try:
            from PIL import Image as PILImage
            if isinstance(crop, PILImage.Image):
                crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
        except ImportError:
            pass
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = self._crop_to_text(gray)
        gray = self._smart_resize(gray)
        return self._binarize(gray)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        return torch.FloatTensor(
            img.astype(np.float32) / 255.0
        ).unsqueeze(0).unsqueeze(0)

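# Shape note (illustrative): normalize() yields a (64, 512) uint8 array for the
# default sizes; to_tensor() then scales it to [0, 1] and adds batch and channel
# dims via the two unsqueeze(0) calls, giving a (1, 1, 64, 512) float tensor.
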
# ──────────────────────────────────────────────────────────────────────────────
# CRNN MODEL LOADER
# ──────────────────────────────────────────────────────────────────────────────

def load_crnn_model(checkpoint_path: str, device: torch.device):
    sys.path.insert(0, str(Path(__file__).parent))
    from crnn_model import get_crnn_model

    print(f" Loading CRNN model from: {checkpoint_path}")
    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
    config = c.get("config", {})
    idx_to_char = c["idx_to_char"]
    num_chars = c["model_state_dict"]["fc.weight"].shape[0]

    model = get_crnn_model(
        model_type=config.get("model_type", "standard"),
        img_height=config.get("img_height", 64),
        num_chars=num_chars,
        hidden_size=config.get("hidden_size", 128),
        num_lstm_layers=config.get("num_lstm_layers", 1),
    ).to(device)
    model.load_state_dict(c["model_state_dict"])
    model.eval()

    val_cer = c.get("val_cer", None)
    val_loss = c.get("val_loss", None)
    metric = f"val_cer={val_cer:.2f}%" if val_cer is not None else \
             f"val_loss={val_loss:.4f}" if val_loss is not None else "no metric"
    print(f" Model loaded | {metric} | chars={num_chars}")
    return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)

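# Usage sketch (mirrors what main() does below):
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model, idx_to_char, img_h, img_w = load_crnn_model(DEFAULT_CHECKPOINT, device)
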
# ──────────────────────────────────────────────────────────────────────────────
# GREEDY CTC DECODE
# ──────────────────────────────────────────────────────────────────────────────

def greedy_decode(outputs: torch.Tensor, idx_to_char: dict) -> str:
    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
    chars, prev = [], -1
    for idx in pred_indices[0]:
        idx = idx.item()
        if idx != 0 and idx != prev and idx in idx_to_char:
            chars.append(idx_to_char[idx])
        prev = idx
    return "".join(chars)

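# Decoding example (assumed toy vocab {1: 'A', 2: 'N'}, blank = 0): a
# frame-wise argmax sequence [1, 1, 0, 2, 2, 0, 1] collapses repeats and
# drops blanks, so greedy_decode returns "ANA".
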
# ──────────────────────────────────────────────────────────────────────────────
# PDF → PIL IMAGE
# ──────────────────────────────────────────────────────────────────────────────

def pdf_to_image(pdf_path: str, dpi: int = 200):
    from pdf2image import convert_from_path
    # Resolve to absolute path – fixes "Unable to get page count" on Windows
    pdf_path = str(Path(pdf_path).resolve())
    kwargs = {"dpi": dpi, "first_page": 1, "last_page": 1}
    if POPPLER_PATH:
        kwargs["poppler_path"] = str(Path(POPPLER_PATH).resolve())
    return convert_from_path(pdf_path, **kwargs)[0]

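# Usage sketch (hypothetical path): pdf_to_image("scan.pdf", dpi=300) renders
# only page 1, honouring POPPLER_PATH when it is set.
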
# ──────────────────────────────────────────────────────────────────────────────
# CRNN OCR – runs on extracted field crops
# ──────────────────────────────────────────────────────────────────────────────

def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
                 img_h: int, img_w: int, device: torch.device) -> dict:
    normalizer = FieldNormalizer(target_height=img_h, target_width=img_w)
    results = {}
    with torch.no_grad():
        for name, crop in crops.items():
            try:
                norm = normalizer.normalize(crop)
                tensor = normalizer.to_tensor(norm).to(device)
                text = greedy_decode(model(tensor).cpu(), idx_to_char)
                results[name] = text
            except Exception as e:
                results[name] = f"[ERROR: {e}]"
    return results

# ──────────────────────────────────────────────────────────────────────────────
# CONVENIENCE WRAPPER – for other scripts that import this module
# ──────────────────────────────────────────────────────────────────────────────

def extract_field_images(image, form_type="birth", verbose=False):
    """Extract field crops using dynamic boundary detection.

    Parameters
    ----------
    image : PIL Image or BGR numpy array
    form_type : str  'birth' | 'death' | 'marriage' | 'marriage_license'
    verbose : bool

    Returns
    -------
    dict {field_name: BGR numpy array}
    """
    return DynamicFieldExtractor(form_type=form_type, verbose=verbose).extract(image)


# Keep old name as alias so any existing code doesn't break
extract_field_images_dynamic = extract_field_images

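# Usage sketch for the wrapper (assumes a scanned image "scan.jpg" exists and
# that its "province" box detects cleanly):
#   crops = extract_field_images(cv2.imread("scan.jpg"), form_type="death")
#   print(sorted(crops))                             # field names from DEATH_FIELDS
#   cv2.imwrite("province.png", crops["province"])
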
# ──────────────────────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="PH Civil Registry Field Extractor – Dynamic CRNN OCR")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pdf", help="Path to scanned PDF")
    group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
    parser.add_argument("--form", required=True,
                        choices=["birth", "death", "marriage", "marriage_license"])
    parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
    parser.add_argument("--visualize", action="store_true",
                        help="Save annotated field-map image")
    parser.add_argument("--output", default=None,
                        help="Save extracted fields to JSON")
    parser.add_argument("--poppler", default=None,
                        help="Override Poppler bin path (overrides .env)")
    parser.add_argument("--dpi", type=int, default=200)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    global POPPLER_PATH
    if args.poppler:
        POPPLER_PATH = args.poppler

    form_labels = {
        "birth": "Form 102 – Certificate of Live Birth",
        "death": "Form 103 – Certificate of Death",
        "marriage": "Form 97 – Certificate of Marriage",
        "marriage_license": "Form 90 – Application for Marriage License",
    }
    input_file = args.pdf or args.image

    print("\nPhilippine Civil Registry OCR – Dynamic Field Extractor")
    print("=" * 65)
    print(f" Form : {form_labels[args.form]}")
    print(f" File : {input_file}")
    print(f" Checkpoint : {args.checkpoint}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device : {device}\n")

    if not os.path.exists(args.checkpoint):
        print(f"ERROR: Checkpoint not found: {args.checkpoint}")
        sys.exit(1)

    model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)

    # Load image
    if args.pdf:
        print(f" Converting PDF to image at {args.dpi} DPI...")
        try:
            pil_img = pdf_to_image(args.pdf, dpi=args.dpi)
            page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
        except Exception as e:
            print(f"\nERROR converting PDF: {e}")
            print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
            sys.exit(1)
    else:
        page_image = cv2.imread(args.image)
        if page_image is None:
            print(f"ERROR: Could not load image: {args.image}")
            sys.exit(1)

    h, w = page_image.shape[:2]
    print(f" Page size : {w} x {h} px")

    extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)

    if args.visualize:
        stem = Path(input_file).stem
        out_path = stem + "_field_map.jpg"
        extractor.visualize(page_image, output_path=out_path)
        print(f" Field map saved -> {out_path}")

    print("\n Detecting form boundary and extracting fields...")
    crops = extractor.extract(page_image)
    print(f" {len(crops)} field crops extracted")

    print(f"\n Running CRNN OCR on {len(crops)} fields...")
    results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)

    print(f"\n{'─'*65}")
    print(f" {'FIELD':<42} TEXT")
    print(f"{'─'*65}")
    for name, text in results.items():
        print(f" {name:<42} {text if text.strip() else '(empty)'}")
    print(f"{'─'*65}")
    print(f"\n Fields recognized : {sum(1 for t in results.values() if t.strip())} / {len(results)}")

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump({"form": form_labels[args.form], "file": input_file,
                       "fields": results}, f, ensure_ascii=False, indent=2)
        print(f"\n Results saved -> {args.output}")
    print()


if __name__ == "__main__":
    main()
CRNN+CTC/finetune.py
ADDED
@@ -0,0 +1,202 @@
"""
finetune.py
===========
Fine-tune CRNN+CTC on generated civil registry form crops.

Loads best_model_final.pth (pretrained), continues training on
actual_annotations.json + train_annotations.json.

Usage:
    python finetune.py

Output:
    checkpoints/best_model_v2.pth
"""

import os
import sys
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset

sys.path.append('.')
from crnn_model import get_crnn_model
from dataset import CivilRegistryDataset, collate_fn

# ── Config ────────────────────────────────────────────────────
CHECKPOINT_IN = "checkpoints/best_model_final.pth"
CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"

ACTUAL_ANN = "data/actual_annotations.json"   # real scanned forms
SYNTH_ANN = "data/train_annotations.json"     # synthetic / train split
VAL_ANN = "data/val_annotations.json"         # validation set

IMG_HEIGHT = 64
IMG_WIDTH = 512
BATCH_SIZE = 32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── Phase settings ────────────────────────────────────────────
PHASES = [
    # (name, epochs, lr, freeze_cnn, patience)
    ("Phase 1 – CNN frozen, adapt to form crops", 20, 1e-4, True, 5),
    ("Phase 2 – Full model, low LR polish", 15, 1e-5, False, 4),
]

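# Freezing sketch: with freeze_cnn=True, any parameter whose name contains
# "cnn" (e.g. a hypothetical "cnn.0.weight") gets requires_grad=False in the
# loop inside main(), so Phase 1 trains only the recurrent/FC head (assuming
# the CNN parameters are named that way); Phase 2 then unfreezes everything
# and polishes at a 10x lower learning rate.
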
# ── Main ──────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print(" Fine-tuning CRNN+CTC on civil registry form crops")
    print("=" * 60)
    print(f" Device : {DEVICE}")
    print(f" Checkpoint : {CHECKPOINT_IN}")

    # ── Check required files ──────────────────────────────────
    for f in [CHECKPOINT_IN, VAL_ANN]:
        if not os.path.exists(f):
            print(f"ERROR: {f} not found.")
            sys.exit(1)

    # ── Datasets ──────────────────────────────────────────────
    datasets_to_merge = []

    # 1. Actual scanned forms (highest priority – real data)
    if os.path.exists(ACTUAL_ANN):
        actual_dataset = CivilRegistryDataset(
            data_dir=".", annotations_file=ACTUAL_ANN,
            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
        )
        datasets_to_merge.append(actual_dataset)
        print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
    else:
        print(f" [!] {ACTUAL_ANN} not found – run extract_actual_data.py first")

    # 2. Fully synthetic – keep so model doesn't forget basic characters
    if os.path.exists(SYNTH_ANN):
        synth_dataset = CivilRegistryDataset(
            data_dir="data/train", annotations_file=SYNTH_ANN,
            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
        )
        datasets_to_merge.append(synth_dataset)
        print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")

    if not datasets_to_merge:
        print("ERROR: No training data found. Run extract_actual_data.py first.")
        sys.exit(1)

    val_dataset = CivilRegistryDataset(
        data_dir="data/val", annotations_file=VAL_ANN,
        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
    )

    train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
    print(f" Total train : {len(train_dataset)}")
    print(f" Val : {len(val_dataset)}")

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=0, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False, num_workers=0, collate_fn=collate_fn)

    # ── Load checkpoint ───────────────────────────────────────
    print(f"\n Loading {CHECKPOINT_IN}...")
    ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
    config = ckpt.get('config', {})

    ref_dataset = datasets_to_merge[0]
    model = get_crnn_model(
        model_type=config.get('model_type', 'standard'),
        img_height=config.get('img_height', 64),
        num_chars=ref_dataset.num_chars,
        hidden_size=config.get('hidden_size', 128),
        num_lstm_layers=config.get('num_lstm_layers', 1),
    ).to(DEVICE)

    missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
    if missing:
        print(f" Note: {len(missing)} layers re-initialized (expected if vocab size changed)")
    print(f" Loaded epoch {ckpt.get('epoch','?')} "
          f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")

    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
    os.makedirs("checkpoints", exist_ok=True)

    # ── Train/val loop ────────────────────────────────────────
    def run_epoch(loader, training, optimizer=None):
        model.train() if training else model.eval()
        total, n = 0, 0
        ctx = torch.enable_grad() if training else torch.no_grad()
        with ctx:
            for images, targets, target_lengths, _ in loader:
                images = images.to(DEVICE)
                batch_size = images.size(0)
                if training:
                    optimizer.zero_grad()
                outputs = F.log_softmax(model(images), dim=2)
                seq_len = outputs.size(0)
                input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
                loss = criterion(outputs, targets, input_lengths, target_lengths)
                if not torch.isnan(loss) and not torch.isinf(loss):
                    if training:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
                        optimizer.step()
                    total += loss.item()
                    n += 1
        return total / max(n, 1)

    best_overall = float('inf')

    for phase_name, epochs, lr, freeze_cnn, patience in PHASES:
        print(f"\n{'='*60}")
        print(f" {phase_name} LR={lr}")
        print(f"{'='*60}")

        for name, param in model.named_parameters():
            param.requires_grad = not (freeze_cnn and 'cnn' in name)

        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f" Trainable params : {trainable:,}")

        opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, factor=0.5)
        best = float('inf')
        wait = 0

        for epoch in range(1, epochs + 1):
            tr = run_epoch(train_loader, True, opt)
            vl = run_epoch(val_loader, False, None)
            sched.step(vl)

            if vl < best:
                best = vl
                wait = 0
                if vl < best_overall:
                    best_overall = vl
                    torch.save({
                        'model_state_dict': model.state_dict(),
                        'config': config,
                        'char_to_idx': ref_dataset.char_to_idx,
                        'idx_to_char': ref_dataset.idx_to_char,
                        'epoch': epoch,
                        'val_loss': vl,
                    }, CHECKPOINT_OUT)
                print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f} <- saved")
            else:
                wait += 1
                print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f} (patience {wait}/{patience})")
                if wait >= patience:
                    print(" Early stopping.")
                    break

    print(f"\n{'='*60}")
    print(" Fine-tuning complete!")
    print(f" Best val loss : {best_overall:.4f}")
    print(f" Saved : {CHECKPOINT_OUT}")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()
CRNN+CTC/generate_ph_names.py
ADDED
@@ -0,0 +1,350 @@
"""
generate_ph_names.py
====================
Run this file ONCE to extract Filipino names from the
names-dataset library and save them to data/ph_names.json.

Install first:
    pip install names-dataset

Usage:
    python generate_ph_names.py

Output:
    data/ph_names.json   <-- used by fix_data.py every run
"""

import json
import os

print("=" * 60)
print(" Filipino Name Extractor | names-dataset (PyPI)")
print("=" * 60)

# ── Step 1: Load NameDataset ──────────────────────────────────
print("\n[1/5] Loading NameDataset...")
print(" (This takes 30-60 seconds and needs ~3.2 GB RAM)")

try:
    from names_dataset import NameDataset
    nd = NameDataset()
    print(" OK - Dataset loaded!")
except ImportError:
    print("\n ERROR: names-dataset is not installed.")
    print(" Fix: pip install names-dataset")
    raise SystemExit(1)  # safer than bare exit(), which isn't guaranteed outside interactive runs
except MemoryError:
    print("\n ERROR: Not enough RAM. Need ~3.2 GB free.")
    raise SystemExit(1)

# ── Step 2: Extract Filipino FIRST names ──────────────────────
print("\n[2/5] Extracting Filipino first names (Male + Female)...")

ph_male = nd.get_top_names(n=300, gender='Male', country_alpha2='PH')
ph_female = nd.get_top_names(n=300, gender='Female', country_alpha2='PH')

# API returns: { 'PH': { 'M': [...] } }
male_first = ph_male.get('PH', {}).get('M', [])
female_first = ph_female.get('PH', {}).get('F', [])
all_first = male_first + female_first

print(f" Male first names : {len(male_first)}")
print(f" Female first names : {len(female_first)}")
print(f" Total first names : {len(all_first)}")
print(f" Sample (male) : {male_first[:5]}")
print(f" Sample (female) : {female_first[:5]}")

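# Illustrative call/return shape (names hypothetical), matching the comment
# above:
#   nd.get_top_names(n=2, gender='Male', country_alpha2='PH')
#   -> {'PH': {'M': ['Jose', 'Juan']}}
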
# ── Step 3: Extract Filipino LAST names ───────────────────────
print("\n[3/5] Extracting Filipino last names...")

ph_last_raw = nd.get_top_names(n=300, country_alpha2='PH', use_first_names=False)
print(f" Raw last name API type : {type(ph_last_raw)}")

ph_last_ph = ph_last_raw.get('PH', {})
print(f" PH entry type : {type(ph_last_ph)}")

raw_last = []

if isinstance(ph_last_ph, list):
    raw_last = ph_last_ph
elif isinstance(ph_last_ph, dict):
    first_val = next(iter(ph_last_ph.values()), None)
    if isinstance(first_val, list):
        for lst in ph_last_ph.values():
            raw_last.extend(lst)
    elif isinstance(first_val, dict):
        raw_last = list(ph_last_ph.keys())
    else:
        raw_last = list(ph_last_ph.keys())

# Deduplicate while preserving order
seen = set()
all_last = []
for name in raw_last:
    if isinstance(name, str) and name not in seen:
        seen.add(name)
        all_last.append(name)

print(f" Total last names : {len(all_last)}")
print(f" Sample : {all_last[:5]}")

if len(all_last) == 0:
    print("\n WARNING: Could not extract last names from API.")
    print(" Using common Filipino last names as fallback...")
    all_last = [
        'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo',
        'Garcia', 'Mendoza', 'Torres', 'Flores', 'Aquino',
        'Dela Cruz', 'Del Rosario', 'San Jose', 'De Guzman',
        'Villanueva', 'Gonzales', 'Ramos', 'Diaz', 'Castro',
        'Morales', 'Ortega', 'Gutierrez', 'Lopez', 'Ramirez',
        'Navarro', 'Aguilar', 'Espinosa', 'Mercado', 'Tolentino',
        'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong', 'Co',
        'Macaraeg', 'Macapagal', 'Magsaysay', 'Magno',
        'Pascual', 'Buenaventura', 'Concepcion', 'Resurreccion',
        'Ilagan', 'Manalo', 'Soriano', 'Evangelista', 'Salazar',
    ]
    print(f" Fallback last names: {len(all_last)}")

# ── Step 4: Build MIDDLE names pool ───────────────────────────
# Middle names in Filipino naming convention are the mother's
# maiden last name. We build a large pool by combining:
#   A) The last names pool already extracted (primary source)
#   B) A curated extended list of common Filipino surnames
#      used specifically as middle names
print("\n[4/5] Building middle names pool...")

EXTENDED_MIDDLE_NAMES = [
    # Common Filipino surnames used as middle names
    'Abad', 'Abaya', 'Abella', 'Ablaza', 'Abrera',
    'Acosta', 'Adriano', 'Afable', 'Africa', 'Agcaoili',
    'Agno', 'Agpalo', 'Aguinaldo', 'Agustin', 'Ahorro',
    'Alano', 'Alba', 'Albano', 'Alberto', 'Alcantara',
    'Alcazar', 'Alcon', 'Aldana', 'Alegre', 'Alejandro',
    'Aligaen', 'Alim', 'Alinea', 'Alipio', 'Almario',
    'Almeda', 'Almendras', 'Alminiana', 'Almodiel', 'Alonto',
    'Alvarado', 'Alvarez', 'Amante', 'Amaro', 'Ambrocio',
    'Amor', 'Amores', 'Amparo', 'Anastacio', 'Andal',
    'Andaya', 'Angeles', 'Angsioco', 'Antiporda', 'Antonio',
    'Apalisok', 'Apolinario', 'Apostol', 'Aquino', 'Araneta',
    'Aranas', 'Aranda', 'Arceo', 'Arenas', 'Arias',
    'Ariate', 'Arillo', 'Arimado', 'Arjona', 'Arlante',
    'Arnaldo', 'Arnaiz', 'Arnoco', 'Arocena', 'Arroyo',
    'Asejo', 'Asuncion', 'Austria', 'Avecilla', 'Avena',
    'Avila', 'Avinante', 'Ayala', 'Azucena', 'Azul',
    'Bacani', 'Bacunawa', 'Baguio', 'Bagunu', 'Balagtas',
    'Balangue', 'Balbin', 'Balde', 'Baldeo', 'Balgos',
    'Balili', 'Balinas', 'Balitaan', 'Balladares', 'Ballesteros',
    'Balmeo', 'Balmores', 'Banaag', 'Banaag', 'Bandola',
    'Bangayan', 'Bansil', 'Bansode', 'Bantigue', 'Bantug',
    'Barbin', 'Barcenas', 'Bareng', 'Barrion', 'Barroga',
    'Bartolome', 'Bases', 'Batac', 'Bataller', 'Batanes',
    'Batungbakal', 'Bautista', 'Bayani', 'Bayot', 'Baysic',
    'Belarmino', 'Beldia', 'Belen', 'Belgica', 'Bello',
    'Benavides', 'Bendaña', 'Benedicto', 'Benigno', 'Benitez',
    'Bernardino', 'Bernardo', 'Bernarte', 'Besares', 'Billones',
    'Binay', 'Binayas', 'Biscocho', 'Blanco', 'Bondoc',
    'Borja', 'Borromeo', 'Bravo', 'Buenaobra', 'Buenaflor',
    'Buenafe', 'Buenaseda', 'Buenconsejo', 'Buendia', 'Bugarin',
    'Bulalacao', 'Bulalacao', 'Bulatao', 'Bumanlag', 'Bunag',
    'Caballero', 'Cabigting', 'Cabral', 'Cabreros', 'Cacal',
    'Cagampan', 'Cagas', 'Caguioa', 'Cahilig', 'Cajucom',
    'Calagos', 'Calamba', 'Calasanz', 'Calatrava', 'Calderon',
    'Calimag', 'Calimutan', 'Calinawan', 'Calleja', 'Callejo',
    'Caluag', 'Calugay', 'Camacho', 'Camino', 'Campaner',
    'Camposano', 'Candelario', 'Canete', 'Caning', 'Canlas',
    'Caoile', 'Capili', 'Carandang', 'Carbonell', 'Cariaga',
    'Carino', 'Carunungan', 'Casaje', 'Casas', 'Casidsid',
    'Castañeda', 'Castillo', 'Castillo', 'Catalan', 'Catapang',
    'Cayabyab', 'Cayco', 'Celdran', 'Cerillo', 'Cervantes',
    'Chico', 'Chikiamco', 'Chiongbian', 'Cipriano', 'Clarin',
    'Claudio', 'Clavecillas', 'Climaco', 'Cobankiat', 'Colambo',
    'Collado', 'Comafay', 'Comia', 'Concepcion', 'Condino',
    'Consing', 'Contraras', 'Coquia', 'Cordero', 'Corotan',
    'Corpus', 'Cosico', 'Costales', 'Crisostomo', 'Cristobal',
    'Cueto', 'Culala', 'Cunanan', 'Cunanon', 'Curato',
    'Dadivas', 'Daep', 'Daez', 'Daguplo', 'Dalida',
    'Dalisay', 'Dalmacion', 'Dalusong', 'Damasco', 'Damo',
    'Danao', 'Dancel', 'Dandan', 'Danila', 'Daquigan',
    'Dario', 'Datoc', 'Datumanong', 'David', 'Dayao',
    'Dayrit', 'De Borja', 'De Castro', 'De Jesus', 'De Jose',
    'De La Cruz', 'De La Pena', 'De La Rosa', 'De Leon', 'De Lima',
    'De Los Angeles', 'De Los Reyes', 'De Los Santos', 'De Luna', 'De Mesa',
    'De Ocampo', 'De Paz', 'De Vera', 'De Villa', 'Delos Reyes',
    'Demaisip', 'Delos Santos', 'Demillo', 'Demonteverde', 'Denosta',
    'Derequito', 'Deri', 'Detablan', 'Deveraturda', 'Diaz',
    'Dichoso', 'Diego', 'Diesto', 'Dimaano', 'Dimabuyu',
    'Dimagiba', 'Dimaguila', 'Dimaio', 'Dimanlig', 'Dimayuga',
    'Dingal', 'Dinglasan', 'Dionisio', 'Dioquino', 'Ditan',
    'Diwata', 'Domingo', 'Dominguez', 'Donato', 'Dorado',
    'Doria', 'Duallo', 'Duenas', 'Duerme', 'Dulay',
    'Dumalaog', 'Dumpit', 'Duque', 'Duran', 'Durante',
    'Ebdane', 'Echavez', 'Echevarria', 'Edralin', 'Ejercito',
    'Elago', 'Elazegui', 'Elises', 'Elumba', 'Enage',
    'Encarnacion', 'Enriquez', 'Escobar', 'Escueta', 'Escutin',
    'Esguerra', 'Eslit', 'Espejo', 'Espeleta', 'Espinas',
    'Espino', 'Espiritu', 'Estepa', 'Esteves', 'Estrada',
    'Estrellas', 'Evangelista', 'Evasco', 'Evidente', 'Eyas',
    'Fabella', 'Fabros', 'Faelnar', 'Fajardo', 'Fajutag',
    'Famadico', 'Famador', 'Faustino', 'Favila', 'Feliciano',
    'Felipe', 'Fermin', 'Fernandez', 'Fernando', 'Ferrer',
    'Figueras', 'Fider', 'Florendo', 'Florentino', 'Floreta',
    'Flores', 'Florido', 'Floriza', 'Foja', 'Fonacier',
    'Fontanilla', 'Formoso', 'Fornier', 'Fortich', 'Fortuna',
    'Francisco', 'Frano', 'Frasco', 'Frias', 'Fuentes',
    'Gaabucayan', 'Gabutero', 'Gaerlan', 'Gaffud', 'Galapon',
    'Galera', 'Galicia', 'Galindez', 'Gallardo', 'Gallo',
    'Galvez', 'Gamalinda', 'Gamboa', 'Gammad', 'Gandionco',
    'Ganzon', 'Garado', 'Garayblas', 'Garcia', 'Garduce',
    'Garrido', 'Gatdula', 'Gatmaitan', 'Gatus', 'Gawat',
    'Gelera', 'Gelua', 'Gemora', 'Genato', 'Generoso',
    'Gequillana', 'Gerona', 'Gerundio', 'Gianan', 'Gimenez',
    'Gloria', 'Glorioso', 'Glova', 'Golez', 'Gomez',
    'Gonzaga', 'Gonzales', 'Gordoncillo', 'Gorre', 'Grafilo',
    'Gregorio', 'Griño', 'Guanzon', 'Guerrero', 'Guevara',
    'Guiao', 'Guillen', 'Guinto', 'Guison', 'Gullas',
    'Gutierrez', 'Guzman', 'Hernandez', 'Herrera', 'Hizon',
    'Honasan', 'Hontiveros', 'Horca', 'Hufana', 'Humilde',
    'Ibañez', 'Ignacio', 'Ilustre', 'Imbong', 'Imperial',
    'Infante', 'Inion', 'Inocentes', 'Inso', 'Iringan',
    'Jacinto', 'Javier', 'Jimenez', 'Jose', 'Joson',
    'Juan', 'Juico', 'Jurado', 'Kabigting', 'Kalaw',
    'Kho', 'Lacaba', 'Lacadin', 'Lacson', 'Ladesma',
    'Laderas', 'Lagman', 'Lagua', 'Laguna', 'Lainez',
    'Lajarca', 'Lamayo', 'Lambino', 'Lapid', 'Lapuz',
    'Lara', 'Largo', 'Lariza', 'Larizal', 'Laserna',
    'Latorre', 'Laurel', 'Laurente', 'Lazaro', 'Leano',
    'Legarda', 'Leonor', 'Leynes', 'Libunao', 'Licup',
    'Lim', 'Limkaichong', 'Limpag', 'Liwanag', 'Llanes',
    'Llamado', 'Llaneta', 'Locsin', 'Logarta', 'Lopez',
    'Lorenzo', 'Lorilla', 'Lozada', 'Lucero', 'Luistro',
    'Luna', 'Luneta', 'Luzon', 'Macalintal', 'Macam',
    'Maceda', 'Madera', 'Madrazo', 'Magtanggol', 'Malabanan',
    'Malacaman', 'Malajacan', 'Malanyaon', 'Malaya', 'Malbas',
    'Malcampo', 'Maldia', 'Maligalig', 'Malinao', 'Malonzo',
    'Mangahas', 'Mangubat', 'Manigbas', 'Manila', 'Manlangit',
    'Manlapaz', 'Manlongat', 'Manrique', 'Mansalay', 'Mante',
    'Manuel', 'Manzano', 'Marcelo', 'Marcos', 'Mariano',
    'Maristela', 'Marquez', 'Maravilla', 'Masangkay', 'Masapol',
    'Mateo', 'Matienzo', 'Matining', 'Matugas', 'Maula',
    'Maulion', 'Mayuga', 'Medina', 'Mejia', 'Melchor',
    'Melo', 'Menor', 'Mercado', 'Mesina', 'Miguel',
    'Miralles', 'Miranda', 'Molano', 'Molina', 'Mondejar',
    'Monreal', 'Montano', 'Montenegro', 'Montero', 'Montes',
    'Montesa', 'Montoya', 'Moraga', 'Moraleda', 'Moreno',
    'Morial', 'Muncal', 'Muñoz', 'Murillo', 'Musni',
    'Nacion', 'Nadal', 'Nagrampa', 'Nalzaro', 'Napeñas',
    'Narciso', 'Natividad', 'Navales', 'Navarro', 'Neri',
    'Nicolas', 'Nisperos', 'Nolasco', 'Noynay', 'Nuñez',
    'Oaminal', 'Ocampo', 'Ocfemia', 'Ochoa', 'Olaguera',
    'Olano', 'Oliva', 'Olivares', 'Oliveros', 'Olpindo',
    'Omadto', 'Ombion', 'Onate', 'Ong', 'Orbeta',
    'Orbita', 'Ordoño', 'Orendain', 'Orense', 'Orobia',
    'Orozco', 'Ortega', 'Osmeña', 'Osorio', 'Ostrea',
    'Ouano', 'Pabiton', 'Pableo', 'Pabriaga', 'Pacanan',
    'Padayao', 'Padilla', 'Padua', 'Paguio', 'Pagulayan',
    'Palad', 'Palacios', 'Palafox', 'Palaganas', 'Palattao',
    'Palencia', 'Palma', 'Palo', 'Paloma', 'Palomares',
    'Pamaran', 'Pamintuan', 'Panaligan', 'Panganiban', 'Pangilinan',
    'Panopio', 'Papa', 'Paqueo', 'Paras', 'Paredes',
    'Parreño', 'Pascua', 'Pascual', 'Pastor', 'Paterno',
    'Patron', 'Pavia', 'Pecaña', 'Pecho', 'Pedrosa',
    'Pelayo', 'Peña', 'Peñaflor', 'Peñaranda', 'Penarroyo',
    'Peralta', 'Perez', 'Perlas', 'Pernia', 'Pesquera',
    'Pestano', 'Piccio', 'Picardal', 'Pineda', 'Pimentel',
    'Pilapil', 'Pili', 'Piliin', 'Pillar', 'Pilorin',
    'Poblete', 'Poliquit', 'Ponce', 'Ponferrada', 'Porras',
    'Prado', 'Prieto', 'Prodigalidad', 'Prudente', 'Punsalan',
    'Quezon', 'Quiambao', 'Quiaoit', 'Quijano', 'Quimpo',
    'Quinit', 'Quinones', 'Quiogue', 'Quirino', 'Quisao',
    'Racelis', 'Rada', 'Ramirez', 'Ramon', 'Ramos',
    'Ravalo', 'Rayala', 'Razon', 'Recinto', 'Recometa',
    'Reforma', 'Regalado', 'Reganit', 'Regio', 'Regidor',
    'Regis', 'Reodica', 'Respicio', 'Revilla', 'Reyes',
    'Ricafort', 'Ricalde', 'Ridad', 'Rillo', 'Rivera',
    'Rivero', 'Rizal', 'Robles', 'Roca', 'Rocamora',
    'Rocero', 'Rodriguez', 'Rojas', 'Romero', 'Ronquillo',
    'Rosales', 'Rosario', 'Rosete', 'Rotor', 'Roxas',
    'Rubio', 'Rufino', 'Ruiz', 'Sabal', 'Sabando',
    'Sabido', 'Sabijon', 'Sabio', 'Saceda', 'Saclolo',
    'Sagum', 'Salceda', 'Salcedo', 'Salgado', 'Salinas',
    'Saludar', 'Saluta', 'Salvador', 'Sambrano', 'Samson',
    'Sanchez', 'Sandoval', 'Sangalang', 'Santiago', 'Santillan',
    'Sanz', 'Sarino', 'Sarmiento', 'Sarona', 'Savellano',
    'Sebastian', 'Segovia', 'Sendin', 'Seneres', 'Serafica',
    'Sereno', 'Senga', 'Serrano', 'Sierra', 'Sigua',
    'Silva', 'Silvestre', 'Simon', 'Sinco', 'Singson',
    'Siy', 'Sobejana', 'Soberano', 'Socrates', 'Soliman',
    'Solis', 'Soliven', 'Solomon', 'Sotto', 'Suansing',
    'Suarez', 'Subido', 'Sulit', 'Sultan', 'Sumagaysay',
    'Sunga', 'Tabamo', 'Tabinas', 'Tabuena', 'Tagle',
    'Taguba', 'Tajonera', 'Talabong', 'Talavera', 'Talento',
    'Taleon', 'Talosig', 'Tamano', 'Tambalo', 'Tanada',
    'Tandoc', 'Tañada', 'Tarriela', 'Tating', 'Tautho',
    'Tayag', 'Tayco', 'Tecson', 'Tejano', 'Tejero',
    'Teodoro', 'Tibay', 'Tigas', 'Tiglao', 'Timbol',
    'Tingzon', 'Tiongco', 'Tiongson', 'Tirol', 'Tobias',
    'Toledo', 'Tolentino', 'Tomelden', 'Tomas', 'Tomaro',
    'Tomaroy', 'Torino', 'Torralba', 'Torrente', 'Torno',
    'Trea', 'Trinidad', 'Tuazon', 'Tubig', 'Tubigan',
    'Tugade', 'Tumbocon', 'Tupas', 'Tuquero', 'Turla',
    'Umagat', 'Umali', 'Usman', 'Uson', 'Uy',
    'Valdez', 'Valencia', 'Valenciano', 'Valentin', 'Valera',
    'Valiao', 'Varela', 'Vargas', 'Vasquez', 'Velarde',
    'Velasco', 'Velasquez', 'Velez', 'Vera', 'Vergara',
    'Vibandor', 'Vicente', 'Victorino', 'Vidal', 'Viernes',
    'Villacorta', 'Villaflor', 'Villafranca', 'Villagomez', 'Villagonzalo',
    'Villanueva', 'Villar', 'Villareal', 'Villaruel', 'Villaverde',
    'Villena', 'Virata', 'Vista', 'Vivar', 'Vizconde',
    'Yabes', 'Yap', 'Yasay', 'Yatco', 'Ylagan',
    'Yñiguez', 'Yorac', 'Yulo', 'Zabala', 'Zaldivar',
    'Zamora', 'Zapanta', 'Zaragoza', 'Zosa', 'Zulueta',
]

# Combine last names pool + extended middle names, deduplicated
|
| 304 |
+
middle_seen = set()
|
| 305 |
+
all_middle = []
|
| 306 |
+
for name in (all_last + EXTENDED_MIDDLE_NAMES):
|
| 307 |
+
if isinstance(name, str) and name not in middle_seen:
|
| 308 |
+
middle_seen.add(name)
|
| 309 |
+
all_middle.append(name)
|
| 310 |
+
|
| 311 |
+
print(f" Total middle names : {len(all_middle)}")
|
| 312 |
+
print(f" Sample : {all_middle[:5]}")
|
| 313 |
+
|
| 314 |
+
# ββ Step 5: Save to JSON ββββββββββββββββββββββββββββββββββββββ
|
| 315 |
+
print("\n[5/5] Saving to data/ph_names.json ...")
|
| 316 |
+
|
| 317 |
+
os.makedirs('data', exist_ok=True)
|
| 318 |
+
|
| 319 |
+
output = {
|
| 320 |
+
"first_names": {
|
| 321 |
+
"male": male_first,
|
| 322 |
+
"female": female_first,
|
| 323 |
+
"all": all_first
|
| 324 |
+
},
|
| 325 |
+
"last_names": all_last,
|
| 326 |
+
"middle_names": all_middle,
|
| 327 |
+
"metadata": {
|
| 328 |
+
"source": "names-dataset (PyPI) -- country_alpha2='PH'",
|
| 329 |
+
"total_first": len(all_first),
|
| 330 |
+
"total_last": len(all_last),
|
| 331 |
+
"total_middle": len(all_middle),
|
| 332 |
+
"total_name_combos": len(all_first) * len(all_middle) * len(all_last),
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
with open('data/ph_names.json', 'w', encoding='utf-8') as f:
|
| 337 |
+
json.dump(output, f, indent=2, ensure_ascii=False)
|
| 338 |
+
|
| 339 |
+
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 340 |
+
print("\n" + "=" * 60)
|
| 341 |
+
print(" DONE!")
|
| 342 |
+
print("=" * 60)
|
| 343 |
+
print(f" Male first names : {len(male_first)}")
|
| 344 |
+
print(f" Female first names : {len(female_first)}")
|
| 345 |
+
print(f" Last names : {len(all_last)}")
|
| 346 |
+
print(f" Middle names : {len(all_middle)}")
|
| 347 |
+
print(f" Possible 3-part name combos : {len(all_first) * len(all_middle) * len(all_last):,}")
|
| 348 |
+
print(f"\n Saved to: data/ph_names.json")
|
| 349 |
+
print(f"\n Next step: python fix_data.py")
|
| 350 |
+
print("=" * 60)
|
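For downstream scripts, a minimal consumer sketch of the JSON written above; the schema is exactly the `output` dict, but the sampling helper itself is illustrative and not part of the repo:

import json
import random

with open('data/ph_names.json', encoding='utf-8') as f:
    names = json.load(f)

def random_full_name(rng=random):
    # Draw one FIRST MIDDLE LAST combo from the saved pools.
    return " ".join([
        rng.choice(names['first_names']['all']),
        rng.choice(names['middle_names']),
        rng.choice(names['last_names']),
    ])

print(random_full_name())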
CRNN+CTC/inference.py
ADDED
@@ -0,0 +1,395 @@
"""
Inference Script for CRNN+CTC Civil Registry OCR

TWO NORMALIZERS:
    SimpleNormalizer   → for PIL-rendered synthetic images (matches training exactly)
    AdaptiveNormalizer → for physical/scanned images (any zoom, any size)

AUTO-DETECT MODE: automatically decides which pipeline to use based on
text density in the image → zoomed-in images get adaptive treatment,
clean synthetic images get simple treatment.
"""

import torch
import cv2
import numpy as np
from pathlib import Path
from typing import Dict, List

from crnn_model import get_crnn_model
from utils import decode_ctc_predictions, extract_form_fields


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def _to_gray(img: np.ndarray) -> np.ndarray:
    if len(img.shape) == 3:
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return img.copy()


def _binarize(gray: np.ndarray) -> np.ndarray:
    """Otsu thresholding; falls back to adaptive for uneven backgrounds."""
    _, otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    white_ratio = np.mean(otsu == 255)
    if white_ratio < 0.30 or white_ratio > 0.97:
        return cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2)
    return otsu


def _crop_to_text(gray: np.ndarray, pad_ratio=0.15) -> np.ndarray:
    """Crop tightly around dark pixels (the text)."""
    inv = cv2.bitwise_not(gray)
    _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
    coords = np.column_stack(np.where(thresh > 0))
    if len(coords) == 0:
        return gray
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
    pad = max(4, int((y_max - y_min) * pad_ratio))
    y_min = max(0, y_min - pad)
    x_min = max(0, x_min - pad)
    y_max = min(gray.shape[0] - 1, y_max + pad)
    x_max = min(gray.shape[1] - 1, x_max + pad)
    return gray[y_min:y_max+1, x_min:x_max+1]


def _aspect_resize(gray: np.ndarray, H: int, W: int) -> np.ndarray:
    """Resize preserving aspect ratio, pad with white to fill the canvas."""
    h, w = gray.shape
    if h == 0 or w == 0:
        return np.ones((H, W), dtype=np.uint8) * 255
    scale = H / h
    new_w = int(w * scale)
    new_h = H
    if new_w > W:
        scale = W / w
        new_h = int(h * scale)
        new_w = W
    resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
    canvas = np.ones((H, W), dtype=np.uint8) * 255
    y_off = (H - new_h) // 2
    x_off = (W - new_w) // 2
    canvas[y_off:y_off+new_h, x_off:x_off+new_w] = resized
    return canvas


def _detect_mode(gray: np.ndarray) -> str:
    """
    Auto-detect whether an image needs adaptive or simple normalization.

    Logic:
    - If >25% of pixels are dark, the text is very large/zoomed → adaptive.
    - If the image size is far from training size (512x64) → adaptive.
    - Otherwise → simple (matches the training pipeline).
    """
    h, w = gray.shape
    _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    dark_px = np.mean(bw == 0)

    # Text fills too much of the image → zoomed in (like shane.jpg)
    if dark_px > 0.25:
        return 'adaptive'

    # Image is far from expected training size (allow 0.5x-2x of 512x64)
    if not (256 <= w <= 1024 and 32 <= h <= 128):
        return 'adaptive'

    return 'simple'
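
# Worked examples for _detect_mode (a sketch in comments, not executed on import):
#   np.full((64, 512), 255, np.uint8)   → 'simple'   (training-sized, no dark pixels)
#   np.zeros((64, 512), np.uint8)       → 'adaptive' (100% dark pixels > 25% threshold)
#   np.full((320, 1600), 255, np.uint8) → 'adaptive' (1600 px wide, outside 256-1024)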


def _to_tensor(img: np.ndarray) -> torch.Tensor:
    return torch.FloatTensor(
        img.astype(np.float32) / 255.0
    ).unsqueeze(0).unsqueeze(0)


# ─────────────────────────────────────────────────────────────────────────────
# SIMPLE NORMALIZER → for PIL-rendered / training-matched images
# ─────────────────────────────────────────────────────────────────────────────

class SimpleNormalizer:
    """
    Matches the fix_data.py training pipeline exactly:
        grayscale → resize → binarize
    Best for test images created by create_test_images.py.
    """
    def __init__(self, H=64, W=512):
        self.H, self.W = H, W

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        resized = cv2.resize(gray, (self.W, self.H), interpolation=cv2.INTER_LANCZOS4)
        return _binarize(resized)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        return self.normalize(img)


# ─────────────────────────────────────────────────────────────────────────────
# ADAPTIVE NORMALIZER → for real / physical / scanned images
# ─────────────────────────────────────────────────────────────────────────────

class AdaptiveNormalizer:
    """
    For physical documents or images with non-standard zoom/size:
        grayscale → denoise → crop text → aspect-ratio resize → binarize

    Crops to the actual text first, so a zoomed-in image like shane.jpg
    gets scaled down to training size instead of being squeezed/stretched.
    """
    def __init__(self, H=64, W=512):
        self.H, self.W = H, W

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = _crop_to_text(gray)
        canvas = _aspect_resize(gray, self.H, self.W)
        return _binarize(canvas)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        return self.normalize(img)


# ─────────────────────────────────────────────────────────────────────────────
# AUTO NORMALIZER → detects which pipeline to use per image automatically
# ─────────────────────────────────────────────────────────────────────────────

class AutoNormalizer:
    """
    Automatically picks Simple or Adaptive based on image characteristics.

    Examples:
        demo.jpg  (clean 512x64 PIL)  → Simple (matches training)
        name1.jpg (clean 512x64 PIL)  → Simple
        shane.jpg (huge zoomed text)  → Adaptive (crop then resize)
        real scan (any size/zoom)    → Adaptive
    """
    def __init__(self, H=64, W=512, verbose=False):
        self.H, self.W = H, W
        self.verbose = verbose
        self._simple = SimpleNormalizer(H, W)
        self._adaptive = AdaptiveNormalizer(H, W)

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        mode = _detect_mode(gray)
        if self.verbose:
            print(f"  auto → {mode}")
        return self._simple.normalize(img) if mode == 'simple' \
            else self._adaptive.normalize(img)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        gray = _to_gray(img)
        mode = _detect_mode(gray)
        if self.verbose:
            print(f"  [{Path(path).name}] → {mode}")
        return self._simple.normalize(img) if mode == 'simple' \
            else self._adaptive.normalize(img)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        return _to_tensor(img)


# ─────────────────────────────────────────────────────────────────────────────
# MAIN OCR CLASS
# ─────────────────────────────────────────────────────────────────────────────

class CivilRegistryOCR:

    def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
        """
        Args:
            checkpoint_path : path to best_model_v6.pth
            device          : 'cuda' or 'cpu'
            mode            : 'auto'     → auto-detect per image (recommended)
                              'simple'   → always use the simple pipeline
                              'adaptive' → always use the adaptive pipeline
            verbose         : print which mode was chosen per image
        """
        if device == 'cuda' and not torch.cuda.is_available():
            device = 'cpu'

        self.device = torch.device(device)
        self.verbose = verbose
        print(f"Loading model from {checkpoint_path}...")

        checkpoint = torch.load(checkpoint_path, map_location=self.device,
                                weights_only=False)

        self.char_to_idx = checkpoint['char_to_idx']
        self.idx_to_char = checkpoint['idx_to_char']
        self.config = checkpoint.get('config', {})

        img_height = self.config.get('img_height', 64)
        img_width = self.config.get('img_width', 512)

        if mode == 'simple':
            self.normalizer = SimpleNormalizer(img_height, img_width)
        elif mode == 'adaptive':
            self.normalizer = AdaptiveNormalizer(img_height, img_width)
        else:
            self.normalizer = AutoNormalizer(img_height, img_width, verbose=verbose)

        self.model = get_crnn_model(
            model_type=self.config.get('model_type', 'standard'),
            img_height=img_height,
            num_chars=checkpoint['model_state_dict']['fc.weight'].shape[0],
            hidden_size=self.config.get('hidden_size', 128),
            num_lstm_layers=self.config.get('num_lstm_layers', 1)
        )
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()

        print("Model loaded successfully")
        # Support both key names: val_loss (fine-tuned) and val_cer (synthetic baseline)
        # FIXED Bug 5: removed incorrect `val_cer < 10` heuristic that mislabelled
        # the metric. The key name alone is the reliable indicator.
        val_loss = checkpoint.get('val_loss', None)
        val_cer = checkpoint.get('val_cer', None)
        if val_loss is not None and val_cer is not None:
            print(f"  Val Loss : {val_loss:.4f} | Val CER: {val_cer:.2f}%")
        elif val_loss is not None:
            print(f"  Val Loss : {val_loss:.4f} (fine-tuned checkpoint → run compare_live_cer.py for true CER)")
        elif val_cer is not None:
            print(f"  Val CER  : {val_cer:.2f}%")
        else:
            print("  Val CER  : N/A (run check_cer.py for true CER)")
        print(f"  Device   : {self.device}")
        print(f"  Mode     : {mode} ({img_height}x{img_width})")

    def _preprocess(self, image_path) -> torch.Tensor:
        normalized = self.normalizer.normalize_from_path(str(image_path))
        return _to_tensor(normalized)

    def predict(self, image_path, decode_method='greedy') -> str:
        img = self._preprocess(image_path).to(self.device)
        with torch.no_grad():
            outputs = self.model(img)
        decoded = decode_ctc_predictions(
            outputs.cpu(), self.idx_to_char, method=decode_method)
        return decoded[0]

    def predict_batch(self, image_paths, decode_method='greedy') -> List[Dict]:
        results = []
        for image_path in image_paths:
            try:
                text = self.predict(image_path, decode_method)
                results.append({'image_path': str(image_path),
                                'text': text, 'success': True})
            except Exception as e:
                results.append({'image_path': str(image_path),
                                'error': str(e), 'success': False})
        return results

    def process_form(self, form_image_path, form_type) -> Dict:
        text = self.predict(form_image_path)
        fields = extract_form_fields(text, form_type)
        fields['raw_text'] = text
        return fields


# ─────────────────────────────────────────────────────────────────────────────
# FORM FIELD EXTRACTOR
# ─────────────────────────────────────────────────────────────────────────────

class FormFieldExtractor:
    def __init__(self, ocr_model: CivilRegistryOCR):
        self.ocr = ocr_model

    def extract_form1a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 1A - Birth Certificate', 'raw_text': text}

    def extract_form2a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 2A - Death Certificate', 'raw_text': text}

    def extract_form3a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 3A - Marriage Certificate', 'raw_text': text}

    def extract_form90_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 90 - Marriage License Application',
                'raw_text': text}


# ─────────────────────────────────────────────────────────────────────────────
# DEMO
# ─────────────────────────────────────────────────────────────────────────────

def demo_inference():
    print("=" * 70)
    print("Civil Registry OCR (auto-adaptive normalizer)")
    print("=" * 70)

    ocr = CivilRegistryOCR(
        checkpoint_path='checkpoints/best_model_v6.pth',
        device='cuda',
        mode='auto',
        verbose=True  # shows which mode each image triggers
    )

    print("\n1. Single Prediction:")
    try:
        result = ocr.predict('test_images/date1.jpg')
        print(f"   Recognized text: {result}")
    except Exception as e:
        print(f"   Error: {e}")

    print("\n2. Batch Prediction:")
    '''batch_results = ocr.predict_batch([
        'test_images/name1.jpg',
        'test_images/shane.jpg',
        'test_images/date1.jpg',
        'test_images/place1.jpg',
    ])
    for r in batch_results:
        status = r['text'] if r['success'] else f"ERROR - {r['error']}"
        print(f"   {r['image_path']}: {status}")'''

    print("\n3. Form Processing:")
    try:
        form_data = ocr.process_form('test_images/form1a_sample.jpg', 'form1a')
        print("   Form Type: Form 1A - Birth Certificate")
        print(f"   Raw Text: {form_data['raw_text']}")
    except Exception as e:
        print(f"   Error: {e}")


def create_inference_api():
    class OCR_API:
        def __init__(self, checkpoint_path, mode='auto'):
            self.ocr = CivilRegistryOCR(checkpoint_path, mode=mode)
            self.extractor = FormFieldExtractor(self.ocr)
        def recognize_text(self, p):
            return {'text': self.ocr.predict(p), 'success': True}
        def process_birth_certificate(self, p):
            return self.extractor.extract_form1a_fields(p)
        def process_death_certificate(self, p):
            return self.extractor.extract_form2a_fields(p)
        def process_marriage_certificate(self, p):
            return self.extractor.extract_form3a_fields(p)
        def process_marriage_license(self, p):
            return self.extractor.extract_form90_fields(p)
    return OCR_API


if __name__ == "__main__":
    demo_inference()
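A brief usage sketch for the `create_inference_api` factory above, assuming it runs from the CRNN+CTC directory so that `inference` is importable; the checkpoint and image paths reuse the ones from `demo_inference`:

from inference import create_inference_api

OCR_API = create_inference_api()
api = OCR_API('checkpoints/best_model_v6.pth', mode='auto')
print(api.recognize_text('test_images/date1.jpg'))
print(api.process_birth_certificate('test_images/form1a_sample.jpg'))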
CRNN+CTC/prepare_emnist.py
ADDED
@@ -0,0 +1,97 @@
| 1 |
+
ο»Ώimport torchvision
|
| 2 |
+
import torchvision.transforms as transforms
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
print("Preparing EMNIST data for CRNN training...")
|
| 9 |
+
print("Using 'balanced' split (47 classes β digits, uppercase, selected lowercase)")
|
| 10 |
+
|
| 11 |
+
# MAX_SAMPLES: how many EMNIST images to use out of 112,800 available.
|
| 12 |
+
# 50,000 chosen deliberately:
|
| 13 |
+
# - ~1,064 images per class (47 classes) β enough for solid character recognition
|
| 14 |
+
# - Keeps a healthy ~3:1 ratio vs synthetic data (16,000) in mixed training
|
| 15 |
+
# - Going higher (e.g. full 112,800) would drown out synthetic Filipino-specific
|
| 16 |
+
# patterns since EMNIST would be 88% of the mixed dataset
|
| 17 |
+
# - IAM fine-tuning and physical scans handle remaining handwriting gaps
|
| 18 |
+
MAX_SAMPLES = 50000
|
| 19 |
+
VAL_RATIO = 0.10 # 90% train, 10% val β proper percentage split
|
| 20 |
+
|
| 21 |
+
train_data = torchvision.datasets.EMNIST(
|
| 22 |
+
root='datasets/emnist',
|
| 23 |
+
split='balanced', # balanced split β already downloaded
|
| 24 |
+
train=True,
|
| 25 |
+
download=False, # files already exist, skip download
|
| 26 |
+
transform=transforms.ToTensor()
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# balanced split has 47 classes:
|
| 30 |
+
# 0-9 digits, A-Z uppercase, and selected lowercase
|
| 31 |
+
# mapping follows EMNIST balanced label order
|
| 32 |
+
LABELS = [
|
| 33 |
+
'0','1','2','3','4','5','6','7','8','9',
|
| 34 |
+
'A','B','C','D','E','F','G','H','I','J','K','L','M',
|
| 35 |
+
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
|
| 36 |
+
'a','b','d','e','f','g','h','n','q','r','t',
|
| 37 |
+
] # 47 classes exactly matching balanced split label indices
|
| 38 |
+
|
| 39 |
+
os.makedirs('data/train/emnist', exist_ok=True)
|
| 40 |
+
os.makedirs('data/val/emnist', exist_ok=True)
|
| 41 |
+
|
| 42 |
+
annotations_train = []
|
| 43 |
+
annotations_val = []
|
| 44 |
+
|
| 45 |
+
val_cutoff = int(MAX_SAMPLES * (1 - VAL_RATIO)) # 45,000 train / 5,000 val
|
| 46 |
+
|
| 47 |
+
print(f"Dataset size : {len(train_data)} images available")
|
| 48 |
+
print(f"Using : {MAX_SAMPLES} ({MAX_SAMPLES/len(train_data)*100:.1f}% of full dataset)")
|
| 49 |
+
print(f"Train / Val : {val_cutoff} / {MAX_SAMPLES - val_cutoff} (90/10 split)")
|
| 50 |
+
print("Saving images...")
|
| 51 |
+
|
| 52 |
+
saved = 0 # count of successfully saved images (skips bad label indices)
|
| 53 |
+
for i, (img_tensor, label_idx) in enumerate(train_data):
|
| 54 |
+
if saved >= MAX_SAMPLES:
|
| 55 |
+
break
|
| 56 |
+
|
| 57 |
+
# Safety check β skip if label index is out of range for our LABELS list
|
| 58 |
+
if label_idx >= len(LABELS):
|
| 59 |
+
continue
|
| 60 |
+
|
| 61 |
+
char = LABELS[label_idx]
|
| 62 |
+
img = img_tensor.squeeze().numpy()
|
| 63 |
+
img = (img * 255).astype(np.uint8)
|
| 64 |
+
|
| 65 |
+
# EMNIST images are transposed β rotate and flip to correct orientation
|
| 66 |
+
img = np.rot90(img, k=3)
|
| 67 |
+
img = np.fliplr(img)
|
| 68 |
+
|
| 69 |
+
pil_img = Image.fromarray(img).convert('RGB')
|
| 70 |
+
pil_img = pil_img.resize((512, 64)) # must match IMG_WIDTH=512
|
| 71 |
+
|
| 72 |
+
fname = f'emnist_{saved:05d}.jpg' # sequential filenames based on saved count
|
| 73 |
+
|
| 74 |
+
# FIXED: proper percentage-based split (was hardcoded `if i < 5000`)
|
| 75 |
+
if saved < val_cutoff:
|
| 76 |
+
pil_img.save(f'data/train/emnist/{fname}')
|
| 77 |
+
annotations_train.append({'image_path': f'emnist/{fname}', 'text': char})
|
| 78 |
+
else:
|
| 79 |
+
pil_img.save(f'data/val/emnist/{fname}')
|
| 80 |
+
annotations_val.append({'image_path': f'emnist/{fname}', 'text': char})
|
| 81 |
+
|
| 82 |
+
saved += 1
|
| 83 |
+
if saved % 5000 == 0:
|
| 84 |
+
print(f" Processed {saved}/{MAX_SAMPLES} images...")
|
| 85 |
+
|
| 86 |
+
with open('data/emnist_train_annotations.json', 'w') as f:
|
| 87 |
+
json.dump(annotations_train, f, indent=2)
|
| 88 |
+
with open('data/emnist_val_annotations.json', 'w') as f:
|
| 89 |
+
json.dump(annotations_val, f, indent=2)
|
| 90 |
+
|
| 91 |
+
print(f"\nDone!")
|
| 92 |
+
print(f" Train : {len(annotations_train)} images (~{len(annotations_train)//47} per class)")
|
| 93 |
+
print(f" Val : {len(annotations_val)} images")
|
| 94 |
+
print(f" Total : {len(annotations_train) + len(annotations_val)} / {len(train_data)} used")
|
| 95 |
+
print(f" Labels: {sorted(set(a['text'] for a in annotations_train))}")
|
| 96 |
+
print(f"\nClass coverage: {len(set(a['text'] for a in annotations_train))}/47 classes in train")
|
| 97 |
+
print("\nNext step: python train_with_emnist.py")
|
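A quick check of the orientation fix above: rotating by 270 degrees (`k=3`) and then flipping horizontally is exactly a transpose, which undoes EMNIST's transposed storage. This snippet is illustrative and not part of the repo:

import numpy as np

a = np.arange(6).reshape(2, 3)
fixed = np.fliplr(np.rot90(a, k=3))
assert np.array_equal(fixed, a.T)  # the two-step EMNIST fix equals a transpose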
CRNN+CTC/requirements.txt
ADDED
@@ -0,0 +1,61 @@
# Core Deep Learning
torch>=2.0.0
torchvision>=0.15.0

# Image Processing
opencv-python>=4.8.0
Pillow>=10.0.0
albumentations>=1.3.0
pdf2image>=1.17.0
pytesseract>=0.3.13

# Data Processing
numpy>=1.24.0
pandas>=2.0.0

# Metrics
editdistance>=0.6.2

# Progress Bars
tqdm>=4.65.0

# Web Framework (for deployment)
flask>=3.0.0
flask-cors>=4.0.0
fastapi>=0.104.0
uvicorn>=0.24.0
python-multipart>=0.0.6

# Database
pymysql>=1.1.0
sqlalchemy>=2.0.0

# NLP for Named Entity Recognition
spacy>=3.7.0
# Download model: python -m spacy download en_core_web_sm

# Document Classification
scikit-learn>=1.3.0

# Visualization
matplotlib>=3.7.0
seaborn>=0.12.0

# Configuration
pyyaml>=6.0

# Utilities
python-dotenv>=1.0.0
requests>=2.31.0

# Document Processing
python-docx>=1.1.0

# Optional: For production deployment
gunicorn>=21.2.0
celery>=5.3.0
redis>=5.0.0

# Testing
pytest>=7.4.0
pytest-cov>=4.1.0
CRNN+CTC/train.py
ADDED
@@ -0,0 +1,438 @@
# Training Script for CRNN+CTC Civil Registry OCR.
# Includes CTC loss, learning rate scheduling, and model checkpointing.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
import numpy as np
from pathlib import Path
import json

from crnn_model import get_crnn_model, initialize_weights
from dataset import CivilRegistryDataset, collate_fn
from utils import (
    decode_ctc_predictions,
    calculate_cer,
    calculate_wer,
    EarlyStopping
)


class CRNNTrainer:
    """
    Trainer class for the CRNN+CTC model.
    """

    def __init__(self, config):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Create directories
        self.checkpoint_dir = Path(config['checkpoint_dir'])
        self.log_dir = Path(config['log_dir'])
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # Initialize datasets
        print("Loading datasets...")
        self.train_dataset = CivilRegistryDataset(
            data_dir=config['train_data_dir'],
            annotations_file=config['train_annotations'],
            img_height=config['img_height'],
            img_width=config['img_width'],
            augment=True,
            form_type=config.get('form_type', 'all')
        )

        self.val_dataset = CivilRegistryDataset(
            data_dir=config['val_data_dir'],
            annotations_file=config['val_annotations'],
            img_height=config['img_height'],
            img_width=config['img_width'],
            augment=False,
            form_type=config.get('form_type', 'all')
        )

        # Create data loaders
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=config['batch_size'],
            shuffle=True,
            num_workers=config['num_workers'],
            collate_fn=collate_fn,
            pin_memory=False
        )

        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=config['batch_size'],
            shuffle=False,
            num_workers=config['num_workers'],
            collate_fn=collate_fn,
            pin_memory=False
        )

        # Initialize model
        print(f"Initializing model on {self.device}...")
        self.model = get_crnn_model(
            model_type=config.get('model_type', 'standard'),
            img_height=config['img_height'],
            num_chars=self.train_dataset.num_chars,
            hidden_size=config['hidden_size'],
            num_lstm_layers=config['num_lstm_layers']
        )

        self.model = self.model.to(self.device)

        # Loss function - CTC Loss
        self.criterion = nn.CTCLoss(blank=0, zero_infinity=True)

        # Optimizer → a lower LR prevents CTC collapse on epoch 1
        self.optimizer = optim.Adam(
            self.model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config.get('weight_decay', 1e-4)  # FIXED: fallback was 1e-5
        )

        # Warmup scheduler: ramp LR from near-zero to target over the first N epochs,
        # then hand off to ReduceLROnPlateau.
        # This is the single most effective fix for CTC blank collapse.
        warmup_epochs = config.get('warmup_epochs', 5)

        def warmup_lambda(epoch):
            if epoch < warmup_epochs:
                return (epoch + 1) / warmup_epochs  # gradual: 0.2 → 0.4 → 0.6 → 0.8 → 1.0
            return 1.0

        self.warmup_scheduler = optim.lr_scheduler.LambdaLR(
            self.optimizer, lr_lambda=warmup_lambda)
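        # Worked trace (a sketch) with the defaults from main() below
        # (learning_rate=1e-4, warmup_epochs=5): the lambda returns 0.2, 0.4,
        # 0.6, 0.8, 1.0 across epochs 1-5, i.e. the effective LR climbs
        # 2e-5 → 4e-5 → 6e-5 → 8e-5 → 1e-4, after which the factor stays at
        # 1.0 and ReduceLROnPlateau (below) owns any further decay.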

        # ReduceLROnPlateau kicks in after warmup
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=config.get('lr_patience', 5),
            min_lr=1e-6
        )
        self._warmup_epochs = warmup_epochs

        # Early stopping
        self.early_stopping = EarlyStopping(
            patience=config.get('early_stopping_patience', 10),
            min_delta=config.get('min_delta', 0.001)
        )

        # Training history
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_cer': [],
            'val_wer': [],
            'learning_rates': []
        }

        # ── Resume from checkpoint if available ──────────────
        self.start_epoch = 1
        self.best_val_loss = float('inf')
        resume_path = self.checkpoint_dir / 'latest_checkpoint.pth'

        if resume_path.exists():
            print(f"\n  Found checkpoint: {resume_path}")
            print("  Resuming training from the last saved epoch...")
            ckpt = torch.load(resume_path, map_location=self.device, weights_only=False)
            self.model.load_state_dict(ckpt['model_state_dict'])
            self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            self.scheduler.load_state_dict(ckpt['scheduler_state_dict'])
            if 'warmup_scheduler_state_dict' in ckpt:
                self.warmup_scheduler.load_state_dict(ckpt['warmup_scheduler_state_dict'])
            self.start_epoch = ckpt['epoch'] + 1
            self.best_val_loss = ckpt.get('val_loss', float('inf'))
            self.history = ckpt.get('history', self.history)
            print(f"  ✓ Resumed from Epoch {ckpt['epoch']} "
                  f"(Val Loss: {ckpt['val_loss']:.4f}, CER: {ckpt['val_cer']:.2f}%)")
        else:
            print("  No checkpoint found → starting fresh.")
            initialize_weights(self.model)

        print(f"✓ Model ready with {sum(p.numel() for p in self.model.parameters()):,} parameters")

    def train_epoch(self, epoch):
        """Train for one epoch."""
        self.model.train()
        total_loss = 0

        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch}/{self.config['epochs']}")

        nan_count = 0
        for batch_idx, (images, targets, target_lengths, _) in enumerate(pbar):
            images = images.to(self.device)
            targets = targets.to(self.device)

            # FIXED: zero_grad before the forward pass (was incorrectly placed after loss)
            self.optimizer.zero_grad()

            # Forward pass
            outputs = self.model(images)  # [seq_len, batch, num_chars]

            # Apply log_softmax for CTC
            log_probs = nn.functional.log_softmax(outputs, dim=2)

            # Calculate sequence lengths
            batch_size = images.size(0)
            input_lengths = torch.full(
                size=(batch_size,),
                fill_value=outputs.size(0),
                dtype=torch.long
            ).to(self.device)

            # CTC loss
            loss = self.criterion(
                log_probs,
                targets,
                input_lengths,
                target_lengths
            )

            # FIXED: skip NaN/Inf batches → accumulating them corrupts gradients
            if torch.isnan(loss) or torch.isinf(loss):
                nan_count += 1
                continue

            # Backward pass
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)

            self.optimizer.step()

            total_loss += loss.item()

            # Update progress bar
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'avg_loss': f'{total_loss / (batch_idx + 1):.4f}'
            })
        if nan_count > 0:
            print(f"  [WARNING] {nan_count} NaN/Inf batches skipped this epoch.")

        avg_loss = total_loss / len(self.train_loader)
        return avg_loss

    def validate(self):
        """Validate the model."""
        self.model.eval()
        total_loss = 0
        all_predictions = []
        all_ground_truths = []

        with torch.no_grad():
            for images, targets, target_lengths, texts in tqdm(self.val_loader, desc="Validating"):
                images = images.to(self.device)
                targets_gpu = targets.to(self.device)

                # Forward pass
                outputs = self.model(images)
                log_probs = nn.functional.log_softmax(outputs, dim=2)

                # CTC loss
                batch_size = images.size(0)
                input_lengths = torch.full(
                    size=(batch_size,),
                    fill_value=outputs.size(0),
                    dtype=torch.long
                ).to(self.device)

                loss = self.criterion(log_probs, targets_gpu, input_lengths, target_lengths)
                total_loss += loss.item()

                # Decode predictions
                predictions = decode_ctc_predictions(
                    outputs.cpu(),
                    self.train_dataset.idx_to_char
                )

                all_predictions.extend(predictions)
                all_ground_truths.extend(texts)

        avg_loss = total_loss / len(self.val_loader)

        # Calculate metrics
        cer = calculate_cer(all_predictions, all_ground_truths)
        wer = calculate_wer(all_predictions, all_ground_truths)

        return avg_loss, cer, wer, all_predictions, all_ground_truths

    def train(self):
        """Main training loop."""
        print("\n" + "=" * 70)
        print("Starting Training")
        print("=" * 70)

        best_val_loss = self.best_val_loss

        for epoch in range(self.start_epoch, self.config['epochs'] + 1):
            print(f"\nEpoch {epoch}/{self.config['epochs']}")
            print("-" * 70)

            # Train
            train_loss = self.train_epoch(epoch)

            # Validate
            val_loss, val_cer, val_wer, predictions, ground_truths = self.validate()

            # Learning rate scheduling:
            # use warmup for the first N epochs, then ReduceLROnPlateau
            if epoch <= self._warmup_epochs:
                self.warmup_scheduler.step()
            else:
                self.scheduler.step(val_loss)
            current_lr = self.optimizer.param_groups[0]['lr']

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['val_cer'].append(val_cer)
            self.history['val_wer'].append(val_wer)
            self.history['learning_rates'].append(current_lr)

            # Print metrics
            print("\nMetrics:")
            print(f"  Train Loss: {train_loss:.4f}")
            print(f"  Val Loss:   {val_loss:.4f}")
            print(f"  Val CER:    {val_cer:.2f}%")
            print(f"  Val WER:    {val_wer:.2f}%")
            print(f"  LR:         {current_lr:.6f}")

            # Print sample predictions
            print("\nSample Predictions:")
            for i in range(min(3, len(predictions))):
                print(f"  GT:   {ground_truths[i]}")
                print(f"  Pred: {predictions[i]}")
                print()

            # Show raw model output for one validation sample
            with torch.no_grad():
                sample_img = self.val_dataset[0][0].unsqueeze(0).to(self.device)
                raw_out = self.model(sample_img)
                probs = torch.softmax(raw_out, dim=2)
                best_idx = probs[:, 0, :].argmax(dim=1)
                best_prob = probs[:, 0, :].max(dim=1).values
                blank_pct = (best_idx == 0).float().mean().item() * 100
                avg_conf = best_prob.mean().item()
                non_blank = [self.train_dataset.idx_to_char.get(i.item(), '?')
                             for i in best_idx if i.item() != 0]
                print(f"  blank={blank_pct:.0f}%  conf={avg_conf:.3f}  "
                      f"chars={''.join(non_blank[:20])!r}")

            # Save checkpoint
            is_best = val_loss < best_val_loss
            if is_best:
                best_val_loss = val_loss

            self.save_checkpoint(epoch, val_loss, val_cer, is_best)

            # Early stopping
            if self.early_stopping(val_loss):
                print(f"\nEarly stopping triggered at epoch {epoch}")
                break

        print("\n" + "=" * 70)
        print("Training Complete!")
        print(f"Best validation loss: {best_val_loss:.4f}")
        print("=" * 70)

        # Save final training history
        self.save_history()

    def save_checkpoint(self, epoch, val_loss, val_cer, is_best=False):
        """Save model checkpoint."""
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'warmup_scheduler_state_dict': self.warmup_scheduler.state_dict(),
            'val_loss': val_loss,
            'val_cer': val_cer,
            'char_to_idx': self.train_dataset.char_to_idx,
            'idx_to_char': self.train_dataset.idx_to_char,
            'config': self.config,
            'history': self.history
        }

        # Save latest checkpoint
        checkpoint_path = self.checkpoint_dir / 'latest_checkpoint.pth'
        torch.save(checkpoint, checkpoint_path)

        # Save best checkpoint
        if is_best:
            best_path = self.checkpoint_dir / 'best_model.pth'
            torch.save(checkpoint, best_path)
            print(f"  ✓ Best model saved (Val Loss: {val_loss:.4f}, CER: {val_cer:.2f}%)")

        # Save epoch checkpoint (history omitted to save disk space → it's in latest_checkpoint.pth)
        if epoch % self.config.get('save_freq', 10) == 0:
            epoch_path = self.checkpoint_dir / f'checkpoint_epoch_{epoch}.pth'
            epoch_ckpt = {k: v for k, v in checkpoint.items() if k != 'history'}
            torch.save(epoch_ckpt, epoch_path)

    def save_history(self):
        """Save training history."""
        history_path = self.log_dir / 'training_history.json'
        with open(history_path, 'w') as f:
            json.dump(self.history, f, indent=2)
        print(f"\n✓ Training history saved to {history_path}")


def main():
    """Main training function."""

    # Configuration
    config = {
        # Data
        'train_data_dir': 'data/train',
        'train_annotations': 'data/train_annotations.json',
        'val_data_dir': 'data/val',
        'val_annotations': 'data/val_annotations.json',
        'form_type': 'all',  # 'all', 'form1a', 'form2a', 'form3a', 'form90'

        # Model
        'model_type': 'standard',  # 'standard', 'ensemble', 'lightweight'
        'img_height': 64,
        'img_width': 512,
        'hidden_size': 128,
        'num_lstm_layers': 1,

        # Training
        'batch_size': 32,
        'epochs': 100,
        'learning_rate': 0.0001,
        'weight_decay': 1e-4,  # FIXED: was 1e-5 → stronger L2 regularisation to reduce overfitting
        'num_workers': 0,
        'warmup_epochs': 5,  # ramp LR gradually for the first 5 epochs

        # Scheduling & Early Stopping
        'lr_patience': 5,  # FIXED: was 3 → give the model more time before halving LR
        'early_stopping_patience': 20,  # FIXED: was 10 → more patience during zoom training
        'min_delta': 0.001,

        # Saving
        'checkpoint_dir': 'checkpoints',
        'log_dir': 'logs',
        'save_freq': 10,
    }

    # Initialize trainer
    trainer = CRNNTrainer(config)

    # Start training
    trainer.train()


if __name__ == "__main__":
    main()
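One operational note on the auto-resume block in `__init__`: rerunning `python train.py` silently continues from `checkpoints/latest_checkpoint.pth`. A minimal reset sketch (the path matches the trainer's defaults above) for deliberately starting fresh:

from pathlib import Path

# Delete the auto-resume checkpoint so CRNNTrainer starts from epoch 1 again.
Path('checkpoints/latest_checkpoint.pth').unlink(missing_ok=True)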
CRNN+CTC/train_emnist.py
ADDED
@@ -0,0 +1,15 @@
import torchvision
import torchvision.transforms as transforms

print("Loading EMNIST dataset...")

train_data = torchvision.datasets.EMNIST(
    root='datasets/emnist',
    split='byclass',
    train=True,
    download=False,
    transform=transforms.ToTensor()
)

print(f"Training samples: {len(train_data)}")
print("EMNIST loaded successfully!")
CRNN+CTC/train_mnist.py
ADDED
@@ -0,0 +1,42 @@
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models

# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize pixel values to 0-1
x_train = x_train / 255.0
x_test = x_test / 255.0

# Add channel dimension (28, 28) -> (28, 28, 1)
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

# Build simple CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(2, 2),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax')  # 10 digits (0-9)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Train
model.fit(x_train, y_train, epochs=5, validation_split=0.1)

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"\nTest accuracy: {test_acc:.4f}")

# Save model
model.save("mnist_model.h5")
print("Model saved as mnist_model.h5")
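A brief follow-up sketch: reloading the saved mnist_model.h5 and classifying one held-out digit. This is illustrative only and not part of the repo:

import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model("mnist_model.h5")
(_, _), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Same preprocessing as training: scale to 0-1 and add batch + channel dims.
sample = (x_test[0] / 255.0)[np.newaxis, ..., np.newaxis]
pred = int(np.argmax(model.predict(sample, verbose=0)))
print(f"predicted={pred}  actual={y_test[0]}")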
CRNN+CTC/train_with_emnist.py
ADDED
@@ -0,0 +1,169 @@
+"""
+train_with_emnist.py
+====================
+Fine-tune the CRNN model with EMNIST character data.
+
+FIXES vs old version:
+- Phase 1: CNN FROZEN → only RNN+FC trained (prevents catastrophic forgetting)
+- Phase 2: Full model at a 100x lower LR (1e-6) for final polish
+- log_softmax applied before CTCLoss (was missing → caused garbage loss)
+- Loads from best_model.pth (synthetic, 0.12% CER baseline)
+- Saves best_model_emnist.pth only when val improves
+"""
+
+import os
+import sys
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, ConcatDataset
+
+sys.path.append('.')
+from crnn_model import get_crnn_model
+from dataset import CivilRegistryDataset, collate_fn
+
+print("=" * 55)
+print("Fine-tuning CRNN with EMNIST dataset")
+print("=" * 55)
+
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Device: {DEVICE}")
+
+emnist_dataset = CivilRegistryDataset(
+    data_dir='data/train',
+    annotations_file='data/emnist_train_annotations.json',
+    img_height=64, img_width=512, augment=True
+)
+# FIXED: mix synthetic data in so the model never forgets multi-word sequences
+synth_dataset = CivilRegistryDataset(
+    data_dir='data/train',
+    annotations_file='data/train_annotations.json',
+    img_height=64, img_width=512, augment=True
+)
+train_dataset = emnist_dataset  # keep reference for char_to_idx / num_chars
+mixed_train = ConcatDataset([emnist_dataset, synth_dataset])
+val_dataset = CivilRegistryDataset(
+    data_dir='data/val',
+    annotations_file='data/val_annotations.json',  # FIXED: was emnist_val → must match real task
+    img_height=64, img_width=512, augment=False
+)
+print(f"EMNIST train   : {len(emnist_dataset)}")
+print(f"Synthetic train: {len(synth_dataset)}")
+print(f"Mixed train    : {len(mixed_train)}")
+print(f"Val            : {len(val_dataset)}")
+
+train_loader = DataLoader(mixed_train, batch_size=32, shuffle=True,
+                          num_workers=0, collate_fn=collate_fn)
+val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
+                        num_workers=0, collate_fn=collate_fn)
+
+# ── Load best synthetic checkpoint ───────────────────────────
+BASE = 'checkpoints/best_model.pth'
+if not os.path.exists(BASE):
+    print(f"ERROR: {BASE} not found. Run: python train.py")
+    sys.exit(1)
+
+ckpt = torch.load(BASE, map_location=DEVICE, weights_only=False)
+config = ckpt.get('config', {})
+
+model = get_crnn_model(
+    model_type      = config.get('model_type', 'standard'),
+    img_height      = config.get('img_height', 64),
+    num_chars       = train_dataset.num_chars,
+    hidden_size     = config.get('hidden_size', 128),
+    num_lstm_layers = config.get('num_lstm_layers', 1),
+).to(DEVICE)
+
+missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+if missing:
+    print(f"  Note: {len(missing)} layers re-initialized (expected for fc layer)")
+print(f"  Loaded epoch {ckpt.get('epoch')} "
+      f"(val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f})")
+
+criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+
+
+def run_epoch(loader, training, optimizer=None):
+    model.train() if training else model.eval()
+    total, n = 0, 0
+    ctx = torch.enable_grad() if training else torch.no_grad()
+    with ctx:
+        for images, targets, target_lengths, _ in loader:
+            images = images.to(DEVICE)
+            batch_size = images.size(0)
+            if training:
+                optimizer.zero_grad()
+            # CRITICAL: log_softmax before CTCLoss
+            outputs = F.log_softmax(model(images), dim=2)
+            seq_len = outputs.size(0)
+            input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+            loss = criterion(outputs, targets, input_lengths, target_lengths)
+            if not torch.isnan(loss) and not torch.isinf(loss):
+                if training:
+                    loss.backward()
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                    optimizer.step()
+                total += loss.item()
+                n += 1
+    return total / max(n, 1)
+
+
+def run_phase(num, epochs, lr, freeze_cnn, patience):
+    print(f"\n{'='*55}")
+    print(f" PHASE {num} → "
+          f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
+          f"  LR={lr}")
+    print(f"{'='*55}")
+
+    # Freeze or unfreeze CNN
+    for name, param in model.named_parameters():
+        param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"  Trainable params : {trainable:,}")
+
+    opt = optim.Adam(
+        filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+    sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
+    best = float('inf')
+    counter = 0
+
+    for epoch in range(1, epochs + 1):
+        tr = run_epoch(train_loader, True, opt)
+        vl = run_epoch(val_loader, False, None)
+        sched.step(vl)
+
+        if vl < best:
+            best = vl
+            counter = 0
+            torch.save({
+                'model_state_dict': model.state_dict(),
+                'config': config,
+                'char_to_idx': train_dataset.char_to_idx,
+                'idx_to_char': train_dataset.idx_to_char,
+                'epoch': epoch,
+                'val_loss': vl,  # FIXED: renamed from val_cer → this is val loss, not CER%
+            }, 'checkpoints/best_model_emnist.pth')
+            print(f"  Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}  <- saved")
+        else:
+            counter += 1
+            print(f"  Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}"
+                  f"  (patience {counter}/{patience})")
+            if counter >= patience:
+                print(f"  Early stopping at epoch {epoch}.")
+                break
+    return best
+
+
+# ── Phase 1: Freeze CNN → teach RNN+FC to handle EMNIST chars ─
+p1_best = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
+
+# ── Phase 2: Unfreeze all → gentle full-model polish ──────────
+p2_best = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
+
+print(f"\n{'='*55}")
+print("EMNIST fine-tuning complete!")
+print(f"  Phase 1 best val loss : {p1_best:.4f}")
+print(f"  Phase 2 best val loss : {p2_best:.4f}")
+print("  Saved : checkpoints/best_model_emnist.pth")
+print("\nNext step: python IAM_train.py --prepare --train")
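
For reference, the log_softmax fix called out in the header is a shape-and-scale contract of torch.nn.CTCLoss: the loss expects log-probabilities of shape [seq_len, batch, num_chars], a 1-D concatenated target tensor, and per-sample input/target lengths. A minimal standalone sketch (the sizes 128/2/40 and the target indices are illustrative, not the repo's real dimensions):

import torch
import torch.nn.functional as F

logits = torch.randn(128, 2, 40)              # fake CRNN output: [seq_len, batch, num_chars]
log_probs = F.log_softmax(logits, dim=2)      # CTCLoss expects log-probs, not raw logits

targets = torch.tensor([5, 12, 7, 3, 9])      # both labels concatenated: [5,12,7] and [3,9]
target_lengths = torch.tensor([3, 2])         # length of each label
input_lengths = torch.full((2,), 128)         # every sample uses all 128 timesteps

criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
loss = criterion(log_probs, targets, input_lengths, target_lengths)
print(loss.item())                            # finite and well-scaled; raw logits give garbage values
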
CRNN+CTC/utils.py
ADDED
@@ -0,0 +1,397 @@
+"""
+Utility Functions for CRNN+CTC Civil Registry OCR
+Includes CTC decoding, metrics calculation, and helper functions
+"""
+
+import torch
+import numpy as np
+from typing import List, Dict, Tuple
+
+
+def _editdistance(a, b):
+    """Pure-Python Levenshtein distance (replaces the editdistance C extension)."""
+    m, n = len(a), len(b)
+    dp = list(range(n + 1))
+    for i in range(1, m + 1):
+        prev, dp[0] = dp[0], i
+        for j in range(1, n + 1):
+            prev, dp[j] = dp[j], prev if a[i-1] == b[j-1] else 1 + min(prev, dp[j], dp[j-1])
+    return dp[n]
+
+
+def decode_ctc_predictions(outputs, idx_to_char, method='greedy'):
+    """
+    Decode CTC predictions to text
+
+    Args:
+        outputs: Model outputs [seq_len, batch, num_chars]
+        idx_to_char: Dictionary mapping indices to characters
+        method: 'greedy' or 'beam_search'
+
+    Returns:
+        List of decoded strings
+    """
+    if method == 'greedy':
+        return greedy_decode(outputs, idx_to_char)
+    elif method == 'beam_search':
+        return beam_search_decode(outputs, idx_to_char)
+    else:
+        raise ValueError(f"Unknown decoding method: {method}")
+
+
+def greedy_decode(outputs, idx_to_char):
+    """
+    Greedy CTC decoding - fast but less accurate
+    """
+    # Get most probable characters
+    pred_indices = torch.argmax(outputs, dim=2)  # [seq_len, batch]
+    pred_indices = pred_indices.permute(1, 0)    # [batch, seq_len]
+
+    decoded_texts = []
+
+    for sequence in pred_indices:
+        chars = []
+        prev_idx = -1
+
+        for idx in sequence:
+            idx = idx.item()
+            # Skip blank (0) and consecutive duplicates
+            if idx != 0 and idx != prev_idx:
+                if idx in idx_to_char:
+                    chars.append(idx_to_char[idx])
+            prev_idx = idx
+
+        decoded_texts.append(''.join(chars))
+
+    return decoded_texts
+
+
+def beam_search_decode(outputs, idx_to_char, beam_width=10):
+    """
+    Beam search CTC decoding - slower but more accurate.
+
+    FIXED Bug 6: the previous code mixed list-of-chars and string representations.
+    After sorting new_beams (a dict keyed by strings), it called `list(seq)` on the
+    string key, splitting a string like "AB" into ['A','B']; this accidentally
+    works for ASCII but is fragile and confusing. Rewritten to use strings
+    throughout: beams are now List[Tuple[str, float]] with the sequence always
+    kept as a plain string, eliminating the list/string ambiguity entirely.
+
+    Note: this simplified search does not track blank-ended vs non-blank-ended
+    prefixes separately, so genuine doubled letters (e.g. "oo") cannot be
+    produced; a full CTC prefix beam search keeps both states.
+    """
+    outputs = torch.nn.functional.softmax(outputs, dim=2)
+    outputs = outputs.permute(1, 0, 2).cpu().numpy()  # [batch, seq_len, num_chars]
+
+    decoded_texts = []
+
+    for output in outputs:
+        # Each beam is (sequence_string, cumulative_probability)
+        beams: list = [('', 1.0)]
+
+        for timestep in output:
+            new_beams: dict = {}
+
+            for sequence, prob in beams:
+                for idx, char_prob in enumerate(timestep):
+                    if idx == 0:  # blank token: sequence unchanged
+                        new_seq = sequence
+                    elif idx in idx_to_char:
+                        char = idx_to_char[idx]
+                        # CTC rule: merge consecutive duplicate characters
+                        if sequence and sequence[-1] == char:
+                            new_seq = sequence  # duplicate: stay the same
+                        else:
+                            new_seq = sequence + char  # append directly to string
+                    else:
+                        continue
+
+                    new_prob = prob * char_prob
+                    # Merge beams that produce the same string
+                    if new_seq in new_beams:
+                        new_beams[new_seq] = max(new_beams[new_seq], new_prob)
+                    else:
+                        new_beams[new_seq] = new_prob
+
+            # Keep top-k beams; keys are already strings, so no list() conversion needed
+            beams = sorted(new_beams.items(), key=lambda x: x[1], reverse=True)[:beam_width]
+
+        # Best sequence is the string with highest probability
+        best_sequence = max(beams, key=lambda x: x[1])[0]
+        decoded_texts.append(best_sequence)
+
+    return decoded_texts
+
+
+def calculate_cer(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate Character Error Rate (CER)
+
+    CER = (Substitutions + Deletions + Insertions) / Total Characters
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    total_distance = 0
+    total_length = 0
+
+    for pred, gt in zip(predictions, ground_truths):
+        distance = _editdistance(pred, gt)
+        total_distance += distance
+        total_length += len(gt)
+
+    cer = (total_distance / total_length * 100) if total_length > 0 else 0
+    return cer
+
+
+def calculate_wer(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate Word Error Rate (WER)
+
+    WER = (Substitutions + Deletions + Insertions) / Total Words
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    total_distance = 0
+    total_length = 0
+
+    for pred, gt in zip(predictions, ground_truths):
+        pred_words = pred.split()
+        gt_words = gt.split()
+
+        distance = _editdistance(pred_words, gt_words)
+        total_distance += distance
+        total_length += len(gt_words)
+
+    wer = (total_distance / total_length * 100) if total_length > 0 else 0
+    return wer
+
+
+def calculate_accuracy(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate exact match accuracy
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    correct = sum(1 for pred, gt in zip(predictions, ground_truths) if pred == gt)
+    accuracy = (correct / len(predictions) * 100) if len(predictions) > 0 else 0
+
+    return accuracy
+
+
+class EarlyStopping:
+    """
+    Early stopping to stop training when validation loss stops improving
+    """
+
+    def __init__(self, patience=10, min_delta=0.001):
+        self.patience = patience
+        self.min_delta = min_delta
+        self.counter = 0
+        self.best_loss = None
+        self.early_stop = False
+
+    def __call__(self, val_loss):
+        if self.best_loss is None:
+            self.best_loss = val_loss
+        elif val_loss > self.best_loss - self.min_delta:
+            self.counter += 1
+            if self.counter >= self.patience:
+                self.early_stop = True
+        else:
+            self.best_loss = val_loss
+            self.counter = 0
+
+        return self.early_stop
+
+
+class AverageMeter:
+    """
+    Computes and stores the average and current value
+    """
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def calculate_confusion_matrix(predictions: List[str], ground_truths: List[str], char_set: List[str]) -> np.ndarray:
+    """
+    Calculate character-level confusion matrix
+
+    Args:
+        predictions: List of predicted strings
+        ground_truths: List of ground truth strings
+        char_set: List of all possible characters
+
+    Returns:
+        Confusion matrix [num_chars, num_chars]
+    """
+    char_to_idx = {char: idx for idx, char in enumerate(char_set)}
+    n_chars = len(char_set)
+
+    confusion = np.zeros((n_chars, n_chars), dtype=np.int64)
+
+    for pred, gt in zip(predictions, ground_truths):
+        # Align sequences (simple positional alignment, padded with spaces)
+        max_len = max(len(pred), len(gt))
+        pred_padded = pred + ' ' * (max_len - len(pred))
+        gt_padded = gt + ' ' * (max_len - len(gt))
+
+        for p_char, g_char in zip(pred_padded, gt_padded):
+            if p_char in char_to_idx and g_char in char_to_idx:
+                confusion[char_to_idx[g_char], char_to_idx[p_char]] += 1
+
+    return confusion
+
+
+def extract_form_fields(text: str, form_type: str) -> Dict[str, str]:
+    """
+    Extract specific fields from recognized text based on form type
+
+    Args:
+        text: Recognized text
+        form_type: 'form1a', 'form2a', 'form3a', 'form90'
+
+    Returns:
+        Dictionary of extracted fields
+    """
+    fields = {}
+
+    if form_type == 'form1a':  # Birth Certificate
+        # Extract common fields (simplified)
+        # In practice, use NER or regex patterns
+        fields['type'] = 'Birth Certificate'
+        # Add more field extraction logic
+
+    elif form_type == 'form2a':  # Death Certificate
+        fields['type'] = 'Death Certificate'
+
+    elif form_type == 'form3a':  # Marriage Certificate
+        fields['type'] = 'Marriage Certificate'
+
+    elif form_type == 'form90':  # Marriage License Application
+        fields['type'] = 'Marriage License Application'
+
+    return fields
+
+
+def validate_extracted_data(data: Dict[str, str], form_type: str) -> Tuple[bool, List[str]]:
+    """
+    Validate extracted data for completeness and format
+
+    Args:
+        data: Extracted data dictionary
+        form_type: Form type
+
+    Returns:
+        (is_valid, list_of_errors)
+    """
+    errors = []
+
+    # Define required fields per form type
+    required_fields = {
+        'form1a': ['name', 'date_of_birth', 'place_of_birth'],
+        'form2a': ['name', 'date_of_death', 'place_of_death'],
+        'form3a': ['husband_name', 'wife_name', 'date_of_marriage'],
+        'form90': ['husband_name', 'wife_name', 'date_of_application']
+    }
+
+    # Check required fields
+    for field in required_fields.get(form_type, []):
+        if field not in data or not data[field]:
+            errors.append(f"Missing required field: {field}")
+
+    # Additional validation can be added here
+    # - Date format validation
+    # - Name format validation
+    # - etc.
+
+    is_valid = len(errors) == 0
+    return is_valid, errors
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None, device='cpu'):
+    """
+    Load model checkpoint
+
+    Args:
+        checkpoint_path: Path to checkpoint file
+        model: Model instance
+        optimizer: Optimizer instance (optional)
+        device: Device to load to
+
+    Returns:
+        (model, optimizer, checkpoint_dict)
+    """
+    checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    model.load_state_dict(checkpoint['model_state_dict'])
+
+    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    print(f"✓ Loaded checkpoint from {checkpoint_path}")
+    print(f"  Epoch: {checkpoint.get('epoch', 'N/A')}")
+    if 'val_cer' in checkpoint:
+        print(f"  Val CER : {checkpoint['val_cer']:.4f}%")
+    elif 'val_loss' in checkpoint:
+        print(f"  Val Loss : {checkpoint['val_loss']:.4f} (run compare_live_cer.py for true CER)")
+    else:
+        print("  Val CER : N/A (run compare_live_cer.py for true CER)")
+
+    return model, optimizer, checkpoint
+
+
+def save_predictions_to_file(predictions: List[str], ground_truths: List[str], output_file: str):
+    """
+    Save predictions and ground truths to file for analysis
+    """
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("Ground Truth\tPrediction\tMatch\n")
+        f.write("=" * 80 + "\n")
+
+        for gt, pred in zip(ground_truths, predictions):
+            match = "✓" if gt == pred else "✗"
+            f.write(f"{gt}\t{pred}\t{match}\n")
+
+    print(f"✓ Predictions saved to {output_file}")
+
+
+if __name__ == "__main__":
+    # Test utility functions
+    print("=" * 60)
+    print("Testing Utility Functions")
+    print("=" * 60)
+
+    # Test CER calculation
+    predictions = ["Hello World", "Test", "Sample Text"]
+    ground_truths = ["Hello World", "Tset", "Sample Txt"]
+
+    cer = calculate_cer(predictions, ground_truths)
+    wer = calculate_wer(predictions, ground_truths)
+    accuracy = calculate_accuracy(predictions, ground_truths)
+
+    print("\nMetrics:")
+    print(f"  CER: {cer:.2f}%")
+    print(f"  WER: {wer:.2f}%")
+    print(f"  Accuracy: {accuracy:.2f}%")
+
+    # Test early stopping
+    print("\nTesting Early Stopping:")
+    early_stopping = EarlyStopping(patience=3, min_delta=0.001)
+
+    val_losses = [1.0, 0.9, 0.85, 0.84, 0.84, 0.84, 0.84]
+    for epoch, loss in enumerate(val_losses, 1):
+        should_stop = early_stopping(loss)
+        print(f"  Epoch {epoch}: Loss = {loss:.2f}, Stop = {should_stop}")
+        if should_stop:
+            break
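
The intended flow of these helpers is decode first, then score. A small self-contained check using the greedy_decode and calculate_cer defined above (the three-letter charset and hand-built tensor are made up for illustration; the `from utils import` line assumes you run from the CRNN+CTC directory):

import torch
from utils import greedy_decode, calculate_cer

idx_to_char = {1: 'C', 2: 'A', 3: 'T'}            # index 0 is reserved for the CTC blank

# Hand-built output whose argmax path is C C <blank> A T T, which decodes to "CAT"
outputs = torch.full((6, 1, 4), -5.0)             # [seq_len=6, batch=1, num_chars=4]
for t, idx in enumerate([1, 1, 0, 2, 3, 3]):
    outputs[t, 0, idx] = 5.0

pred = greedy_decode(outputs, idx_to_char)        # ['CAT']: repeats collapsed, blanks dropped
print(pred)
print(calculate_cer(pred, ['CAT']))               # 0.0 (CER is returned as a percentage)
print(calculate_cer(pred, ['COT']))               # 33.33...: one substitution out of 3 chars
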
MNB/__init__.py
ADDED
@@ -0,0 +1,4 @@
+# mnb/__init__.py
+from .classifier import MNBClassifier
+
+__all__ = ["MNBClassifier"]
MNB/classifier.py
ADDED
@@ -0,0 +1,292 @@
+# mnb/classifier.py
+# ============================================================
+# MNB CLASSIFIER - wraps the trained DocumentClassifier
+#
+# TWO SEPARATE CONCERNS:
+#
+#   PATH A - Certifications Page
+#     User uploads a certification scan.
+#     MNB identifies which form it is:
+#       form102 → Form 102 (Certificate of Live Birth)
+#       form103 → Form 103 (Certificate of Death)
+#       form97  → Form 97 (Certificate of Marriage)
+#
+#   PATH B - Application for Marriage License Page (Form 90)
+#     User uploads TWO birth certificates:
+#       - Groom's Birth Cert (PSA/NSO sealed)
+#       - Bride's Birth Cert (PSA/NSO sealed)
+#     MNB is NOT used for form type here; the upload page
+#     already tells us it's a birth cert.
+#     classify_sex() reads the SEX field → GROOM (Male) or BRIDE (Female)
+#     and routes each cert to the correct Form 90 slot.
+#
+# Files needed:
+#   form_classifier.py → training + DocumentClassifier
+#   models/mnb_classifier.pkl
+#   models/tfidf_vectorizer.pkl
+#   models/mnb_metadata.json
+# ============================================================
+
+import sys
+import os
+
+_mnb_dir = os.path.dirname(os.path.abspath(__file__))
+if _mnb_dir not in sys.path:
+    sys.path.insert(0, _mnb_dir)
+
+_root_dir = os.path.dirname(_mnb_dir)
+if _root_dir not in sys.path:
+    sys.path.insert(0, _root_dir)
+
+try:
+    from form_classifier import DocumentClassifier
+    _HAVE_DOC_CLASSIFIER = True
+except ImportError:
+    _HAVE_DOC_CLASSIFIER = False
+
+
+# ── Keyword fallback (used if .pkl files not found) ─────────
+# Uses exact Philippine civil registry form headers
+_FORM_KEYWORDS = {
+    "form102": [
+        "Municipal Form No. 102",
+        "Municipal Form No.102",
+        "Certificate of Live Birth",
+        "live birth",
+        "name of child",
+        "date of birth",
+        "place of birth",
+        "birth certificate",
+        "mother", "father",
+        "infant", "newborn",
+        "attendant at birth",
+    ],
+    "form103": [
+        "Municipal Form No. 103",
+        "Municipal Form No.103",
+        "Certificate of Death",
+        "death certificate",
+        "name of deceased",
+        "date of death",
+        "place of death",
+        "cause of death",
+        "burial", "deceased",
+        "immediate cause",
+    ],
+    "form97": [
+        "Municipal Form No. 97",
+        "Municipal Form No.97",
+        "Certificate of Marriage",
+        "marriage certificate",
+        "name of husband",
+        "name of wife",
+        "date of marriage",
+        "place of marriage",
+        "solemnizing officer",
+        "contracting parties",
+        "witnesses",
+    ],
+}
+
+# Sex keywords for Form 90 routing (Groom/Bride)
+_SEX_KEYWORDS = {
+    "GROOM": [
+        "sex: male",
+        "sex male",
+        "2. sex: male",
+        " male",   # leading space so "female" never matches
+        "sex m",
+    ],
+    "BRIDE": [
+        "sex: female",
+        "sex female",
+        "2. sex: female",
+        " female",
+        "sex f",
+    ],
+}
+
+def _keyword_classify_form(text: str) -> str:
+    """Keyword fallback for Certifications page classification."""
+    t = text.lower()
+    scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _FORM_KEYWORDS.items()}
+    return max(scores, key=scores.get)
+
+def _keyword_classify_sex(text: str) -> str:
+    """Keyword-based sex classifier for Form 90 routing."""
+    t = text.lower()
+    scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _SEX_KEYWORDS.items()}
+    return max(scores, key=scores.get)
+
+
+# ── Form code → NER hint map ────────────────────────────────
+_FORM_CODE_TO_HINT = {
+    "form102": "birth",
+    "form103": "death",
+    "form97": "marriage",
+    # Form 90 is handled by classify_sex(), not this map
+}
+
+
+class MNBClassifier:
+    """
+    MNB Classifier for the Civil Registry Digitization System.
+
+    PATH A - Certifications Page:
+        mnb = MNBClassifier()
+        form_code = mnb.classify_form_type(ocr_text)
+        # → 'form102' | 'form103' | 'form97'
+
+        hint = mnb.get_ner_hint(ocr_text)
+        # → 'birth' | 'death' | 'marriage'
+
+        result = mnb.classify_full(ocr_text)
+        # → {'label': 'Form 102 - Certificate of Live Birth',
+        #    'form_code': 'form102', 'confidence': 0.97, 'probabilities': {...}}
+
+    PATH B - Application for Marriage License Page (Form 90):
+        sex_role = mnb.classify_sex(ocr_text)
+        # → 'GROOM' (Male birth cert) | 'BRIDE' (Female birth cert)
+    """
+
+    def __init__(self, model_dir: str = "models"):
+        # NOTE: model_dir is resolved relative to the current working directory
+        self._doc_clf = None
+        if _HAVE_DOC_CLASSIFIER:
+            try:
+                self._doc_clf = DocumentClassifier(model_dir=model_dir)
+                print(f" [MNB] Loaded DocumentClassifier from {model_dir}/")
+            except FileNotFoundError as e:
+                print(f" [MNB] {e}")
+                print(" [MNB] Using keyword fallback; run: python mnb/form_classifier.py")
+        else:
+            print(" [MNB] form_classifier.py not found; using keyword fallback")
+
+    # ── PATH A: Certifications Page ─────────────────────────
+
+    def classify_form_type(self, ocr_text: str) -> str:
+        """
+        Certifications page: identify which form was uploaded.
+        Returns: 'form102' | 'form103' | 'form97'
+        """
+        if self._doc_clf is not None:
+            return self._doc_clf.predict(ocr_text)["form_code"]
+        return _keyword_classify_form(ocr_text)
+
+    def classify_full(self, ocr_text: str) -> dict:
+        """
+        Certifications page: full result with confidence scores.
+        Returns:
+            {
+              'label': 'Form 102 - Certificate of Live Birth',
+              'form_code': 'form102',
+              'confidence': 0.97,
+              'probabilities': { ... }
+            }
+        """
+        if self._doc_clf is not None:
+            return self._doc_clf.predict(ocr_text)
+        winner = _keyword_classify_form(ocr_text)
+        return {
+            "label": winner,
+            "form_code": winner,
+            "confidence": 1.0,
+            "probabilities": {k: (1.0 if k == winner else 0.0) for k in _FORM_KEYWORDS},
+        }
+
+    def get_ner_hint(self, ocr_text: str) -> str:
+        """
+        Returns NER hint string for bridge.py:
+            'birth' | 'death' | 'marriage'
+        """
+        code = self.classify_form_type(ocr_text)
+        return _FORM_CODE_TO_HINT.get(code, "birth")
+
+    # ── PATH B: Marriage License Page (Form 90) ─────────────
+
+    def classify_sex(self, ocr_text: str) -> str:
+        """
+        Form 90 upload page only.
+        Reads the SEX field on a PSA/NSO birth certificate.
+        Returns: 'GROOM' (Male) | 'BRIDE' (Female)
+        """
+        return _keyword_classify_sex(ocr_text)
+
+    def classify_sex_proba(self, ocr_text: str) -> dict:
+        """
+        Returns confidence scores for sex classification.
+        The keyword classifier is hard: e.g. {'GROOM': 1.0, 'BRIDE': 0.0}
+        """
+        winner = _keyword_classify_sex(ocr_text)
+        return {k: (1.0 if k == winner else 0.0) for k in _SEX_KEYWORDS}
+
+
+# ── Quick test ───────────────────────────────────────────────
+if __name__ == "__main__":
+    mnb = MNBClassifier()
+
+    print("\n ── PATH A: Certifications Page Tests ──")
+    cert_tests = [
+        (
+            "Municipal Form No. 102 Certificate of Live Birth "
+            "Name of child Maria Santos Date of birth 01/15/1990 "
+            "Place of birth Brgy. San Jose Tarlac City "
+            "Name of mother Lani Santos Name of father Jose Santos "
+            "Sex Female birth certificate infant",
+            "form102"
+        ),
+        (
+            "Municipal Form No.102 Certificate of Live Birth "
+            "PSA Child Juan Dela Cruz born 03/22/1985 Capas Tarlac "
+            "mother Rosa father Pedro Sex Male",
+            "form102"
+        ),
+        (
+            "Municipal Form No. 103 Certificate of Death "
+            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
+            "Cause of death Cardiac Arrest death certificate burial",
+            "form103"
+        ),
+        (
+            "Municipal Form No.103 Certificate of Death "
+            "Deceased Ana Torres died 07/04/2000 Pneumonia burial permit",
+            "form103"
+        ),
+        (
+            "Municipal Form No. 97 Certificate of Marriage "
+            "Name of husband Carlos Bautista Name of wife Ana Torres "
+            "Date of marriage 07/04/2005 solemnizing officer witnesses",
+            "form97"
+        ),
+        (
+            "Municipal Form No.97 Certificate of Marriage "
+            "Husband Jose Santos wife Maria Reyes married 11/30/1995 "
+            "contracting parties",
+            "form97"
+        ),
+    ]
+
+    for text, expected in cert_tests:
+        result = mnb.classify_full(text)
+        mark = "✓" if result["form_code"] == expected else "✗"
+        print(f" {mark} Expected={expected:<8} Got={result['form_code']:<8} "
+              f"Confidence={result['confidence']:.1%} ({result['label']})")
+
+    print("\n ── PATH B: Form 90 Marriage License Sex-Routing Tests ──")
+    sex_tests = [
+        (
+            "Municipal Form No.102 Certificate of Live Birth PSA "
+            "CHILD (First): Juan Dela Cruz SEX: Male "
+            "Date of Birth March 15 1990 Mother Maria Dela Cruz",
+            "GROOM"
+        ),
+        (
+            "Municipal Form No.102 Certificate of Live Birth NSO "
+            "CHILD (First): Ana Santos SEX: Female "
+            "Date of Birth August 21 1995 Mother Gloria Santos",
+            "BRIDE"
+        ),
+    ]
+    for text, expected in sex_tests:
+        pred = mnb.classify_sex(text)
+        mark = "✓" if pred == expected else "✗"
+        print(f" {mark} Expected={expected} Got={pred}")
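
Wiring-wise, the two paths attach to different upload pages. A minimal sketch of a caller (the model_dir value and the placeholder OCR strings are assumptions; only the method names come from the class above):

from MNB.classifier import MNBClassifier

mnb = MNBClassifier(model_dir="MNB/models")   # assumed path; keyword fallback kicks in if .pkl missing

# PATH A: Certifications page upload
cert_text = "Municipal Form No. 103 Certificate of Death Name of deceased Pedro Reyes ..."
print(mnb.classify_form_type(cert_text))      # 'form103'
print(mnb.get_ner_hint(cert_text))            # 'death' (passed on to the NER stage)

# PATH B: Form 90 page upload (two PSA birth certs)
groom_cert = "Certificate of Live Birth CHILD: Juan Dela Cruz SEX: Male ..."
print(mnb.classify_sex(groom_cert))           # 'GROOM': routed to the groom slot of Form 90
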
MNB/form_classifier.py
ADDED
@@ -0,0 +1,466 @@
| 1 |
+
"""
|
| 2 |
+
form_classifier.py
|
| 3 |
+
=======================
|
| 4 |
+
Multinomial Naive Bayes (MNB) Document Classifier
|
| 5 |
+
for Local Civil Registry Document Digitization System
|
| 6 |
+
|
| 7 |
+
Classifies extracted OCR text into:
|
| 8 |
+
- Form 102 (Certificate of Live Birth) β Certifications page
|
| 9 |
+
- Form 103 (Certificate of Death) β Certifications page
|
| 10 |
+
- Form 97 (Certificate of Marriage) β Certifications page
|
| 11 |
+
|
| 12 |
+
NOTE: Form 90 (Application for Marriage License) is NOT classified here.
|
| 13 |
+
Form 90 has its OWN upload page where the user uploads:
|
| 14 |
+
- Groom's Birth Certificate (PSA/NSO sealed)
|
| 15 |
+
- Bride's Birth Certificate (PSA/NSO sealed)
|
| 16 |
+
The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
|
| 17 |
+
See classify_sex() in classifier.py for that routing.
|
| 18 |
+
|
| 19 |
+
Usage:
|
| 20 |
+
python form_classifier.py # trains and saves model
|
| 21 |
+
python form_classifier.py --test # runs test predictions
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import json
|
| 26 |
+
import random
|
| 27 |
+
import argparse
|
| 28 |
+
import pickle
|
| 29 |
+
import numpy as np
|
| 30 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 31 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 32 |
+
from sklearn.model_selection import train_test_split
|
| 33 |
+
from sklearn.metrics import (
|
| 34 |
+
accuracy_score, classification_report, confusion_matrix
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
# 1. LABEL MAP (Certifications page only β NO Form 90 here)
|
| 39 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
LABEL_MAP = {
|
| 41 |
+
0: 'Form 102 - Certificate of Live Birth',
|
| 42 |
+
1: 'Form 103 - Certificate of Death',
|
| 43 |
+
2: 'Form 97 - Certificate of Marriage',
|
| 44 |
+
}
|
| 45 |
+
LABEL_NAMES = list(LABEL_MAP.values())
|
| 46 |
+
|
| 47 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
# 2. VOCABULARY POOLS (Filipino civil registry)
|
| 49 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
FIRST_NAMES = [
|
| 51 |
+
'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani',
|
| 52 |
+
'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo',
|
| 53 |
+
'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael',
|
| 54 |
+
'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary'
|
| 55 |
+
]
|
| 56 |
+
LAST_NAMES = [
|
| 57 |
+
'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores',
|
| 58 |
+
'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim',
|
| 59 |
+
'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez',
|
| 60 |
+
'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual'
|
| 61 |
+
]
|
| 62 |
+
MUNICIPALITIES = [
|
| 63 |
+
'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling',
|
| 64 |
+
'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia',
|
| 65 |
+
'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao'
|
| 66 |
+
]
|
| 67 |
+
PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales']
|
| 68 |
+
BARANGAYS = [
|
| 69 |
+
'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo',
|
| 70 |
+
'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas',
|
| 71 |
+
'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan'
|
| 72 |
+
]
|
| 73 |
+
DATES = [
|
| 74 |
+
'01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995',
|
| 75 |
+
'05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980',
|
| 76 |
+
'12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970',
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
def _name():
|
| 80 |
+
return f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}"
|
| 81 |
+
|
| 82 |
+
def _date():
|
| 83 |
+
return random.choice(DATES)
|
| 84 |
+
|
| 85 |
+
def _place():
|
| 86 |
+
return f"{random.choice(BARANGAYS)}, {random.choice(MUNICIPALITIES)}, {random.choice(PROVINCES)}"
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
# 3. SAMPLE GENERATORS
|
| 91 |
+
# Each generator uses the EXACT Philippine form header
|
| 92 |
+
# so MNB learns the real keywords from actual documents.
|
| 93 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
+
|
| 95 |
+
def generate_form102():
|
| 96 |
+
"""
|
| 97 |
+
Form 102 β Certificate of Live Birth
|
| 98 |
+
Header keywords: 'Municipal Form No. 102', 'Certificate of Live Birth'
|
| 99 |
+
"""
|
| 100 |
+
templates = [
|
| 101 |
+
# Template A: Exact header present
|
| 102 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 103 |
+
f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} "
|
| 104 |
+
f"Name of mother {_name()} Name of father {_name()} "
|
| 105 |
+
f"Sex {random.choice(['Male', 'Female'])} "
|
| 106 |
+
f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} "
|
| 107 |
+
f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} "
|
| 108 |
+
f"birth certificate registry birth registration infant newborn child",
|
| 109 |
+
|
| 110 |
+
# Template B: No. without space
|
| 111 |
+
f"Municipal Form No.102 Certificate of Live Birth "
|
| 112 |
+
f"Child {_name()} born {_date()} at {_place()} "
|
| 113 |
+
f"mother {_name()} father {_name()} "
|
| 114 |
+
f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg "
|
| 115 |
+
f"birth order {random.choice(['First', 'Second', 'Third'])} "
|
| 116 |
+
f"birth certificate Form 102",
|
| 117 |
+
|
| 118 |
+
# Template C: Registry number format
|
| 119 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 120 |
+
f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 121 |
+
f"name of child {_name()} date of birth {_date()} "
|
| 122 |
+
f"place of birth {_place()} birth certificate municipal civil registrar",
|
| 123 |
+
|
| 124 |
+
# Template D: PSA/NSO sealed copy (used when filing Form 90)
|
| 125 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 126 |
+
f"PSA {_name()} born on {_date()} "
|
| 127 |
+
f"place of birth {_place()} "
|
| 128 |
+
f"mother maiden name {_name()} father {_name()} "
|
| 129 |
+
f"type of birth {random.choice(['Single', 'Twin'])} infant newborn",
|
| 130 |
+
|
| 131 |
+
# Template E: NSO variation
|
| 132 |
+
f"Municipal Form No.102 Certificate of Live Birth "
|
| 133 |
+
f"NSO birth registration {_name()} "
|
| 134 |
+
f"birth date {_date()} birthplace {_place()} "
|
| 135 |
+
f"parents mother {_name()} father {_name()} "
|
| 136 |
+
f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} "
|
| 137 |
+
f"sex {random.choice(['male', 'female'])}",
|
| 138 |
+
]
|
| 139 |
+
return random.choice(templates)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def generate_form103():
|
| 143 |
+
"""
|
| 144 |
+
Form 103 β Certificate of Death
|
| 145 |
+
Header keywords: 'Municipal Form No. 103', 'Certificate of Death'
|
| 146 |
+
"""
|
| 147 |
+
causes = [
|
| 148 |
+
'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
|
| 149 |
+
'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
|
| 150 |
+
'Septicemia', 'Renal Failure'
|
| 151 |
+
]
|
| 152 |
+
templates = [
|
| 153 |
+
# Template A: Exact header
|
| 154 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 155 |
+
f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} "
|
| 156 |
+
f"Cause of death {random.choice(causes)} Age at death {random.randint(1, 95)} "
|
| 157 |
+
f"Sex {random.choice(['Male', 'Female'])} "
|
| 158 |
+
f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} "
|
| 159 |
+
f"death certificate deceased burial interment",
|
| 160 |
+
|
| 161 |
+
# Template B: No space
|
| 162 |
+
f"Municipal Form No.103 Certificate of Death "
|
| 163 |
+
f"Deceased {_name()} died on {_date()} at {_place()} "
|
| 164 |
+
f"cause {random.choice(causes)} corpse informant {_name()} "
|
| 165 |
+
f"death certificate Form 103 municipal civil registrar",
|
| 166 |
+
|
| 167 |
+
# Template C: Registry format
|
| 168 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 169 |
+
f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 170 |
+
f"name of deceased {_name()} date of death {_date()} "
|
| 171 |
+
f"place of death {_place()} cause of death {random.choice(causes)} "
|
| 172 |
+
f"death certificate burial permit",
|
| 173 |
+
|
| 174 |
+
# Template D: Clinical format
|
| 175 |
+
f"Municipal Form No.103 Certificate of Death "
|
| 176 |
+
f"{_name()} died {_date()} "
|
| 177 |
+
f"place {_place()} cause of death {random.choice(causes)} "
|
| 178 |
+
f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} "
|
| 179 |
+
f"death deceased cadaver",
|
| 180 |
+
|
| 181 |
+
# Template E: Full form
|
| 182 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 183 |
+
f"Form 103 death registration {_name()} "
|
| 184 |
+
f"date of death {_date()} place of death {_place()} "
|
| 185 |
+
f"immediate cause {random.choice(causes)} "
|
| 186 |
+
f"attending physician {_name()} certificate of death",
|
| 187 |
+
]
|
| 188 |
+
return random.choice(templates)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def generate_form97():
|
| 192 |
+
"""
|
| 193 |
+
Form 97 β Certificate of Marriage
|
| 194 |
+
Header keywords: 'Municipal Form No. 97', 'Certificate of Marriage'
|
| 195 |
+
"""
|
| 196 |
+
officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.']
|
| 197 |
+
licenses = [f"{random.randint(10000, 99999)}", f"ML-{random.randint(1000, 9999)}"]
|
| 198 |
+
templates = [
|
| 199 |
+
# Template A: Exact header
|
| 200 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 201 |
+
f"Name of husband {_name()} Name of wife {_name()} "
|
| 202 |
+
f"Date of marriage {_date()} Place of marriage {_place()} "
|
| 203 |
+
f"Solemnizing officer {random.choice(officers)} {_name()} "
|
| 204 |
+
f"Marriage license number {random.choice(licenses)} witnesses {_name()} {_name()} "
|
| 205 |
+
f"marriage certificate contracting parties wedding",
|
| 206 |
+
|
| 207 |
+
# Template B: No space
|
| 208 |
+
f"Municipal Form No.97 Certificate of Marriage "
|
| 209 |
+
f"Husband {_name()} wife {_name()} "
|
| 210 |
+
f"married on {_date()} at {_place()} "
|
| 211 |
+
f"officiated by {random.choice(officers)} {_name()} "
|
| 212 |
+
f"marriage certificate Form 97 solemnizing officer",
|
| 213 |
+
|
| 214 |
+
# Template C: Registry format
|
| 215 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 216 |
+
f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 217 |
+
f"husband {_name()} wife {_name()} "
|
| 218 |
+
f"date of marriage {_date()} place {_place()} "
|
| 219 |
+
f"marriage license {random.choice(licenses)} issued at {_place()} "
|
| 220 |
+
f"marriage certificate civil registrar",
|
| 221 |
+
|
| 222 |
+
# Template D: Ceremony format
|
| 223 |
+
f"Municipal Form No.97 Certificate of Marriage "
|
| 224 |
+
f"{_name()} and {_name()} "
|
| 225 |
+
f"solemnized {_date()} at {_place()} "
|
| 226 |
+
f"solemnizing officer {random.choice(officers)} {_name()} "
|
| 227 |
+
f"witnesses {_name()} {_name()} "
|
| 228 |
+
f"marriage contracting parties husband wife ceremony",
|
| 229 |
+
|
| 230 |
+
# Template E: Full form
|
| 231 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 232 |
+
f"Form 97 marriage registration husband {_name()} "
|
| 233 |
+
f"wife {_name()} date of marriage {_date()} "
|
| 234 |
+
f"place of marriage {_place()} "
|
| 235 |
+
f"license number {random.choice(licenses)} marriage nuptial wed",
|
| 236 |
+
]
|
| 237 |
+
return random.choice(templates)
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 241 |
+
# 4. DATASET GENERATOR (3 classes only β no Form 90)
|
| 242 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
+
def generate_dataset(samples_per_class=150):
|
| 244 |
+
generators = [generate_form102, generate_form103, generate_form97]
|
| 245 |
+
labels_map = [0, 1, 2] # 0=Form102, 1=Form103, 2=Form97
|
| 246 |
+
|
| 247 |
+
texts, labels = [], []
|
| 248 |
+
for gen, label in zip(generators, labels_map):
|
| 249 |
+
for _ in range(samples_per_class):
|
| 250 |
+
texts.append(gen())
|
| 251 |
+
labels.append(label)
|
| 252 |
+
|
| 253 |
+
combined = list(zip(texts, labels))
|
| 254 |
+
random.shuffle(combined)
|
| 255 |
+
texts, labels = zip(*combined)
|
| 256 |
+
return list(texts), list(labels)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 260 |
+
# 5. TRAIN & SAVE
|
| 261 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 262 |
+
def train(samples_per_class=150, save_dir='models'):
|
| 263 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 264 |
+
|
| 265 |
+
print("=" * 60)
|
| 266 |
+
print(" MNB Document Classifier | Filipino Civil Registry")
|
| 267 |
+
print(" Certifications Page: Form 102 / 103 / 97 ONLY")
|
| 268 |
+
print(" (Form 90 routing is handled separately via SEX field)")
|
| 269 |
+
print("=" * 60)
|
| 270 |
+
|
| 271 |
+
print(f"\n Generating dataset ({samples_per_class} samples Γ 3 forms = {samples_per_class * 3} total)...")
|
| 272 |
+
texts, labels = generate_dataset(samples_per_class)
|
| 273 |
+
|
| 274 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 275 |
+
texts, labels, test_size=0.2, random_state=42, stratify=labels
|
| 276 |
+
)
|
| 277 |
+
print(f" Train: {len(X_train)} | Test: {len(X_test)}")
|
| 278 |
+
|
| 279 |
+
# TF-IDF vectorizer
|
| 280 |
+
vectorizer = TfidfVectorizer(
|
| 281 |
+
ngram_range=(1, 2),
|
| 282 |
+
max_features=5000,
|
| 283 |
+
sublinear_tf=True,
|
| 284 |
+
min_df=1,
|
| 285 |
+
)
|
| 286 |
+
X_train_vec = vectorizer.fit_transform(X_train)
|
| 287 |
+
X_test_vec = vectorizer.transform(X_test)
|
| 288 |
+
|
| 289 |
+
# Train MNB
|
| 290 |
+
clf = MultinomialNB(alpha=0.1)
|
| 291 |
+
clf.fit(X_train_vec, y_train)
|
| 292 |
+
|
| 293 |
+
# Evaluate
|
| 294 |
+
y_pred = clf.predict(X_test_vec)
|
| 295 |
+
acc = accuracy_score(y_test, y_pred)
|
| 296 |
+
|
| 297 |
+
print(f"\n Accuracy : {acc * 100:.2f}%")
|
| 298 |
+
print("\n Classification Report:")
|
| 299 |
+
print(classification_report(y_test, y_pred, target_names=LABEL_NAMES))
|
| 300 |
+
|
| 301 |
+
print(" Confusion Matrix:")
|
| 302 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 303 |
+
headers = ['Form102', 'Form103', 'Form97']
|
| 304 |
+
print(f" {'':30s} " + " ".join(headers))
|
| 305 |
+
for i, row in enumerate(cm):
|
| 306 |
+
print(f" Actual {headers[i]}: {str(row)}")
|
| 307 |
+
|
| 308 |
+
# Save
|
| 309 |
+
model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
|
| 310 |
+
vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
|
| 311 |
+
with open(model_path, 'wb') as f:
|
| 312 |
+
pickle.dump(clf, f)
|
| 313 |
+
with open(vec_path, 'wb') as f:
|
| 314 |
+
pickle.dump(vectorizer, f)
|
| 315 |
+
|
| 316 |
+
meta = {
|
| 317 |
+
'accuracy': round(acc * 100, 2),
|
| 318 |
+
'samples_per_class': samples_per_class,
|
| 319 |
+
'total_samples': samples_per_class * 3,
|
| 320 |
+
'labels': LABEL_MAP,
|
| 321 |
+
        'note': 'Form 90 routing is handled by classify_sex() - not this model',
        'model_path': model_path,
        'vectorizer_path': vec_path,
    }
    with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w') as f:
        json.dump(meta, f, indent=2)

    print(f"\n Model saved     : {model_path}")
    print(f" Vectorizer saved: {vec_path}")
    print(f"\n Target accuracy : >90%")
    print(f" Achieved        : {acc * 100:.2f}% {'✓' if acc >= 0.90 else '✗ (try increasing samples_per_class)'}")
    print("=" * 60)

    return clf, vectorizer, acc


# ─────────────────────────────────────────────────────────────
# 6. DOCUMENT CLASSIFIER CLASS
# ─────────────────────────────────────────────────────────────
class DocumentClassifier:
    """Load trained MNB model and classify OCR text from Certifications page."""

    def __init__(self, model_dir='models'):
        model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
        vec_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl')

        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Model not found at {model_path}. Run: python form_classifier.py"
            )

        with open(model_path, 'rb') as f:
            self.clf = pickle.load(f)
        with open(vec_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def predict(self, text: str) -> dict:
        """
        Classify OCR text from Certifications page.

        Returns:
            {
                'label': 'Form 102 - Certificate of Live Birth',
                'form_code': 'form102',
                'confidence': 0.95,
                'probabilities': { ... }
            }
        """
        vec = self.vectorizer.transform([text])
        probs = self.clf.predict_proba(vec)[0]
        idx = int(np.argmax(probs))

        form_codes = ['form102', 'form103', 'form97']
        return {
            'label': LABEL_MAP[idx],
            'form_code': form_codes[idx],
            'confidence': round(float(probs[idx]), 4),
            'probabilities': {
                LABEL_MAP[i]: round(float(p), 4)
                for i, p in enumerate(probs)
            }
        }


# ─────────────────────────────────────────────────────────────
# 7. TEST DEMO
# ─────────────────────────────────────────────────────────────
def run_test():
    print("\n" + "=" * 60)
    print(" Testing DocumentClassifier - Certifications Page")
    print("=" * 60)

    classifier = DocumentClassifier()

    test_cases = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 "
            "Place of birth Capas Tarlac mother Rosa Dela Cruz "
            "father Pedro Dela Cruz Sex Male",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
            "Age at death 75 death certificate deceased burial",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 "
            "cause Pneumonia burial permit interment",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
            "Solemnizing officer Rev. Santos witnesses marriage certificate",
            "Form 97 - Certificate of Marriage"
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes "
            "married 11/30/1995 contracting parties solemnizing officer",
            "Form 97 - Certificate of Marriage"
        ),
    ]

    correct = 0
    for text, expected in test_cases:
        result = classifier.predict(text)
        status = '✓' if expected in result['label'] else '✗'
        if expected in result['label']:
            correct += 1
        print(f"\n {status} Expected : {expected}")
        print(f"   Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)")

    print(f"\n Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)")
    print("=" * 60)


# ─────────────────────────────────────────────────────────────
# 8. MAIN
# ─────────────────────────────────────────────────────────────
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test', action='store_true', help='Run test predictions only')
    parser.add_argument('--samples', type=int, default=150, help='Samples per class (default: 150)')
    args = parser.parse_args()

    if args.test:
        run_test()
    else:
        train(samples_per_class=args.samples)
        print("\nTo test predictions, run:")
        print("  python form_classifier.py --test")
MNB/keywords.py
ADDED
@@ -0,0 +1,127 @@
# mnb/keywords.py
# ============================================================
# Keyword fallback lists used by classifier.py when the
# trained .pkl models are not available.
#
# Uses EXACT Philippine civil registry form headers:
#   Form 102 → "Municipal Form No. 102 / Certificate of Live Birth"
#   Form 103 → "Municipal Form No. 103 / Certificate of Death"
#   Form 97  → "Municipal Form No. 97 / Certificate of Marriage"
#
# NOTE: Form 90 is NOT classified here.
# Form 90 has its own upload page (Application for Marriage License).
# The SEX field on the uploaded birth cert determines routing:
#   Male   → GROOM slot in Form 90
#   Female → BRIDE slot in Form 90
# ============================================================

# ── PATH A: Certifications Page ──────────────────────────────
FORM_KEYWORDS = {

    "form102": [
        # Exact header variants
        "Municipal Form No. 102",
        "Municipal Form No.102",
        "Certificate of Live Birth",
        # Field-level keywords
        "name of child",
        "date of birth",
        "place of birth",
        "birth certificate",
        "name of mother",
        "name of father",
        "attendant at birth",
        "type of birth",
        "birth order",
        "legitimacy",
        "infant",
        "newborn",
        # PSA/NSO sealed copy keywords
        "PSA",
        "NSO",
        "bc registry",
    ],

    "form103": [
        # Exact header variants
        "Municipal Form No. 103",
        "Municipal Form No.103",
        "Certificate of Death",
        # Field-level keywords
        "name of deceased",
        "date of death",
        "place of death",
        "cause of death",
        "death certificate",
        "immediate cause",
        "antecedent cause",
        "underlying cause",
        "burial",
        "deceased",
        "died",
        "burial permit",
        "interment",
    ],

    "form97": [
        # Exact header variants
        "Municipal Form No. 97",
        "Municipal Form No.97",
        "Certificate of Marriage",
        # Field-level keywords
        "name of husband",
        "name of wife",
        "date of marriage",
        "place of marriage",
        "marriage certificate",
        "solemnizing officer",
        "contracting parties",
        "witnesses",
        "marriage license number",
        "mc registry",
        "nuptial",
        "wed",
    ],

    "form90": [
        # Exact header variants
        "Municipal Form 90",
        "Municipal Form No. 90",
        "Municipal Form No.90",
        "Application for Marriage License",
        "APPLICATION FOR MARRIAGE LICENSE",
        "Form No. 2",
        # Field-level keywords
        "name of applicant",
        "marriage license no",
        "marriage license number",
        "date of issuance",
        "date of issuance of marriage license",
        "groom",
        "bride",
        "may i apply for a license",
        "accountable form no. 54",
    ],
}

# ── PATH B: Form 90 Marriage License Page ────────────────────
# Used ONLY on the Marriage License upload page.
# Reads the SEX field from the uploaded PSA/NSO birth certificate.
#   Male   → GROOM (routed to Groom slot in Form 90)
#   Female → BRIDE (routed to Bride slot in Form 90)
SEX_KEYWORDS = {
    "GROOM": [
        "sex: male",
        "sex male",
        "2. sex: male",
        " male",
        "sex m",
    ],
    "BRIDE": [
        "sex: female",
        "sex female",
        "2. sex: female",
        " female",
        "sex f",
    ],
}
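Since the header comment positions these lists as a keyword fallback for classifier.py, here is a minimal sketch of how such a fallback vote could work; the function name keyword_fallback is illustrative and may not match classifier.py's actual logic:

# Illustrative fallback sketch: count case-insensitive keyword hits per form
# and return the best-scoring form code, or 'unknown' when nothing matches.
from MNB.keywords import FORM_KEYWORDS

def keyword_fallback(ocr_text: str) -> str:
    text = ocr_text.lower()
    scores = {
        form: sum(1 for kw in kws if kw.lower() in text)
        for form, kws in FORM_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else 'unknown'

# e.g. keyword_fallback("Municipal Form No. 103 ... cause of death ...") -> 'form103'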
MNB/mnb_metadata.json
ADDED
@@ -0,0 +1,17 @@
{
  "accuracy": 100.0,
  "samples_per_class": 150,
  "total_samples": 450,
  "labels": {
    "0": "Form 102 - Certificate of Live Birth",
    "1": "Form 103 - Certificate of Death",
    "2": "Form 97 - Certificate of Marriage"
  },
  "note": "Form 90 routing is handled separately by classify_sex() using the SEX field on uploaded PSA/NSO birth certificates. Male = GROOM, Female = BRIDE.",
  "pages": {
    "certifications": "Classifies Form 102 / 103 / 97 from uploaded certification scan",
    "marriage_license": "classify_sex() routes birth cert to GROOM or BRIDE slot in Form 90"
  },
  "model_path": "models/mnb_classifier.pkl",
  "vectorizer_path": "models/tfidf_vectorizer.pkl"
}
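The note field above delegates Form 90 routing to classify_sex(); a hedged sketch of that routing built on SEX_KEYWORDS from keywords.py follows (the shipped classify_sex() in classifier.py may match differently):

# Illustrative routing sketch: scan the OCR'd birth certificate for sex cues
# and map Male -> GROOM slot, Female -> BRIDE slot in Form 90.
from MNB.keywords import SEX_KEYWORDS

def classify_sex(birth_cert_text: str) -> str:
    text = birth_cert_text.lower()
    for slot, cues in SEX_KEYWORDS.items():  # 'GROOM' checked before 'BRIDE'
        if any(cue in text for cue in cues):
            return slot
    return 'UNKNOWN'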
MNB/models/mnb_classifier.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d62d9cbdd7d76b60d17787b93bcc22f51c5602934ac60117e15279c3a22c519
size 200089
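The three lines above are a Git LFS pointer (spec v1) that stands in for the real binary until `git lfs pull` fetches it; a small sketch for checking whether a checkout still holds a pointer, with an illustrative helper name:

# Parse an LFS pointer file into its fields; raises UnicodeDecodeError (or
# yields no 'oid' key) when the path already holds the real binary payload.
def read_lfs_pointer(path: str) -> dict:
    with open(path, encoding='utf-8') as f:
        lines = f.read().splitlines()
    return dict(line.split(' ', 1) for line in lines if line)

# read_lfs_pointer('MNB/models/mnb_classifier.pkl')
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '200089'}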
MNB/models/mnb_metadata.json
ADDED
@@ -0,0 +1,13 @@
{
  "accuracy": 100.0,
  "samples_per_class": 150,
  "total_samples": 600,
  "labels": {
    "0": "Form 1A - Birth Certificate",
    "1": "Form 2A - Death Certificate",
    "2": "Form 3A - Marriage Certificate",
    "3": "Form 90 - Application for Marriage License"
  },
  "model_path": "models\\mnb_classifier.pkl",
  "vectorizer_path": "models\\tfidf_vectorizer.pkl"
}
MNB/models/tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:217cd506c7d9d7bfcfef73fc107273c129d4d55ab7dfddc1190e2863ee381ec4
size 129497
references/12
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d9e1c47ea7a15f7ff1e14a3b34db3f2eb690c15c45c2a5b8174d964633d0f6f
size 1924369
references/321
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc0159e24fa6735aeed7153ecf0092ba6d7bec510c57c8ec52a28328083d2e61
size 957650
references/321321
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3e4cf4da9290a262997067fda2640da298979a5f3dc069b88177104d1a629ce
size 3225794
references/old.jpg
ADDED
Git LFS Details
references/reference-102.png
ADDED
Git LFS Details
references/reference-103.png
ADDED
Git LFS Details