ifgr003 committed on
Commit
4bfc055
·
verified ·
1 Parent(s): 091afb2

Upload 124 files

Browse files
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +37 -0
  2. CRNN+CTC/.env +1 -0
  3. CRNN+CTC/.gitignore +14 -0
  4. CRNN+CTC/IAM_train.py +332 -0
  5. CRNN+CTC/README.md +449 -0
  6. CRNN+CTC/calibrate_fields.py +196 -0
  7. CRNN+CTC/calibrated_fields.py +7 -0
  8. CRNN+CTC/check_cer.py +331 -0
  9. CRNN+CTC/checkpoints/best_model.pth +3 -0
  10. CRNN+CTC/checkpoints/best_model_final.pth +3 -0
  11. CRNN+CTC/checkpoints/best_model_iam.pth +3 -0
  12. CRNN+CTC/checkpoints/best_model_v2.pth +3 -0
  13. CRNN+CTC/checkpoints/best_model_v3.pth +3 -0
  14. CRNN+CTC/checkpoints/best_model_v4.pth +3 -0
  15. CRNN+CTC/checkpoints/best_model_v5.pth +3 -0
  16. CRNN+CTC/checkpoints/best_model_v6.pth +3 -0
  17. CRNN+CTC/checkpoints/best_model_v7.pth +3 -0
  18. CRNN+CTC/checkpoints/best_model_v732.pth +3 -0
  19. CRNN+CTC/checkpoints/checkpoint_epoch_10.pth +3 -0
  20. CRNN+CTC/checkpoints/latest_checkpoint.pth +3 -0
  21. CRNN+CTC/compare_checkpoints.py +34 -0
  22. CRNN+CTC/compare_live_cer.py +158 -0
  23. CRNN+CTC/create_test_images.py +50 -0
  24. CRNN+CTC/crnn_model.py +119 -0
  25. CRNN+CTC/dataset.py +401 -0
  26. CRNN+CTC/field_extractor.py +735 -0
  27. CRNN+CTC/finetune.py +202 -0
  28. CRNN+CTC/generate_ph_names.py +350 -0
  29. CRNN+CTC/inference.py +395 -0
  30. CRNN+CTC/prepare_emnist.py +97 -0
  31. CRNN+CTC/requirements.txt +61 -0
  32. CRNN+CTC/train.py +438 -0
  33. CRNN+CTC/train_emnist.py +15 -0
  34. CRNN+CTC/train_mnist.py +42 -0
  35. CRNN+CTC/train_with_emnist.py +169 -0
  36. CRNN+CTC/utils.py +397 -0
  37. MNB/__init__.py +4 -0
  38. MNB/classifier.py +292 -0
  39. MNB/form_classifier.py +466 -0
  40. MNB/keywords.py +127 -0
  41. MNB/mnb_metadata.json +17 -0
  42. MNB/models/mnb_classifier.pkl +3 -0
  43. MNB/models/mnb_metadata.json +13 -0
  44. MNB/models/tfidf_vectorizer.pkl +3 -0
  45. references/12 +3 -0
  46. references/321 +3 -0
  47. references/321321 +3 -0
  48. references/old.jpg +3 -0
  49. references/reference-102.png +3 -0
  50. references/reference-103.png +3 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ CRNN+CTC/checkpoints/best_model_final.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_iam.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v2.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v3.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v4.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v5.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v6.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v7.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model_v732.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/best_model.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/checkpoint_epoch_10.pth filter=lfs diff=lfs merge=lfs -text
+ CRNN+CTC/checkpoints/latest_checkpoint.pth filter=lfs diff=lfs merge=lfs -text
+ MNB/models/mnb_classifier.pkl filter=lfs diff=lfs merge=lfs -text
+ MNB/models/tfidf_vectorizer.pkl filter=lfs diff=lfs merge=lfs -text
+ references/12 filter=lfs diff=lfs merge=lfs -text
+ references/321 filter=lfs diff=lfs merge=lfs -text
+ references/321321 filter=lfs diff=lfs merge=lfs -text
+ references/old.jpg filter=lfs diff=lfs merge=lfs -text
+ references/reference-102.png filter=lfs diff=lfs merge=lfs -text
+ references/reference-103.png filter=lfs diff=lfs merge=lfs -text
+ references/reference-97.png filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/civil_registry_model/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+ spacyNER/models/phase1_funsd/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
CRNN+CTC/.env ADDED
@@ -0,0 +1 @@
+ POPPLER_PATH=C:\Program Files\poppler-25.12.0\Library\bin
CRNN+CTC/.gitignore ADDED
@@ -0,0 +1,14 @@
+ datasets/
+ checkpoints/
+ logs/
+ test_images/
+ data/
+ __pycache__/
+ *.png
+ *.jpg
+ *.jpeg
+ *.npy
+ *.h5
+ *.pkl
+ *.pyc
+ iam-handwriting-word-database/
CRNN+CTC/IAM_train.py ADDED
@@ -0,0 +1,332 @@
+ """
+ IAM_train.py
+ ============
+ Fine-tune the CRNN model using the IAM Handwriting Word Database.
+ Builds on top of EMNIST-trained model (best_model_emnist.pth).
+
+ FIXES vs old version:
+ - IMG_WIDTH 400 -> 512 (must match pipeline)
+ - Added log_softmax before CTCLoss (was missing — caused catastrophic forgetting)
+ - Phase 1: CNN FROZEN — only RNN+FC trained
+ - Phase 2: Full model at very low LR
+ - Loads from best_model_emnist.pth, falls back to best_model.pth
+ - Uses get_crnn_model() with correct architecture from checkpoint config
+
+ DATASET:
+     Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database
+     Expected structure:
+         data/IAM/iam_words/
+             words/       <- word image folders (a01, a02, ...)
+             words.txt    <- annotation file
+
+ USAGE:
+     python IAM_train.py --prepare          # convert IAM -> annotation JSON
+     python IAM_train.py --train            # fine-tune model
+     python IAM_train.py --prepare --train  # do both
+ """
+
+ import os
+ import sys
+ import json
+ import argparse
+ import random
+ from pathlib import Path
+
+ import torch
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from torch.utils.data import DataLoader, ConcatDataset
+
+ sys.path.append('.')
+ from crnn_model import get_crnn_model
+ from dataset import CivilRegistryDataset, collate_fn
+
+ # ─────────────────────────────────────────────
+ # CONFIG
+ # ─────────────────────────────────────────────
+ IAM_ROOT      = "data/IAM/iam_words"
+ IAM_WORDS_TXT = f"{IAM_ROOT}/words.txt"
+ IAM_WORDS_DIR = f"{IAM_ROOT}/words"
+
+ TRAIN_ANN     = "data/iam_train_annotations.json"
+ IAM_VAL_ANN   = "data/iam_val_annotations.json"  # written by --prepare (IAM word images)
+ SYNTH_VAL_ANN = "data/val_annotations.json"      # real civil registry val set — never overwritten
+ TRAIN_IMG_DIR = "data/train/iam"
+ VAL_IMG_DIR   = "data/val/iam"
+
+ IMG_HEIGHT  = 64
+ IMG_WIDTH   = 512  # FIXED: was 400 — must match pipeline
+ BATCH_SIZE  = 32
+ VAL_SPLIT   = 0.1
+ MAX_SAMPLES = 50000
+
+ # Load from EMNIST checkpoint, fall back to synthetic if not found
+ CHECKPOINT_IN  = "checkpoints/best_model_emnist.pth"
+ CHECKPOINT_IN2 = "checkpoints/best_model.pth"  # fallback
+ CHECKPOINT_OUT = "checkpoints/best_model_iam.pth"
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ # ─────────────────────────────────────────────
+ # STEP 1 — PREPARE
+ # ─────────────────────────────────────────────
+ def prepare_iam():
+     from PIL import Image
+
+     print("\n" + "=" * 50)
+     print("STEP 1 — Preparing IAM dataset")
+     print("=" * 50)
+
+     if not os.path.exists(IAM_WORDS_TXT):
+         print(f"ERROR: {IAM_WORDS_TXT} not found!")
+         print("Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database")
+         print("Expected structure:")
+         print("  data/IAM/iam_words/words.txt")
+         print("  data/IAM/iam_words/words/")
+         sys.exit(1)
+
+     os.makedirs(TRAIN_IMG_DIR, exist_ok=True)
+     os.makedirs(VAL_IMG_DIR, exist_ok=True)
+
+     entries = []
+     print(f" Reading {IAM_WORDS_TXT} ...")
+     with open(IAM_WORDS_TXT, "r") as f:
+         for line in f:
+             line = line.strip()
+             if not line or line.startswith("#"):
+                 continue
+             parts = line.split(" ")
+             if len(parts) < 9:
+                 continue
+             word_id = parts[0]
+             seg_result = parts[1]
+             text = parts[-1]
+             if seg_result != "ok":
+                 continue
+             if len(text) < 1 or len(text) > 32:
+                 continue
+             parts_id = word_id.split("-")
+             img_path = os.path.join(
+                 IAM_WORDS_DIR,
+                 parts_id[0],
+                 f"{parts_id[0]}-{parts_id[1]}",
+                 f"{word_id}.png"
+             )
+             if not os.path.exists(img_path):
+                 continue
+             entries.append((img_path, text))
+
+     print(f" Found {len(entries)} valid word entries")
+
+     if MAX_SAMPLES and len(entries) > MAX_SAMPLES:
+         random.shuffle(entries)
+         entries = entries[:MAX_SAMPLES]
+         print(f" Limiting to {MAX_SAMPLES} samples")
+
+     random.shuffle(entries)
+     split_idx = int(len(entries) * (1 - VAL_SPLIT))
+     train_entries = entries[:split_idx]
+     val_entries = entries[split_idx:]
+     print(f" Train: {len(train_entries)} | Val: {len(val_entries)}")
+     print(" Copying and resizing images...")
+
+     def process_entries(entry_list, out_dir, prefix):
+         annotations = []
+         for i, (src_path, text) in enumerate(entry_list):
+             try:
+                 img = Image.open(src_path).convert("RGB")
+                 img = img.resize((IMG_WIDTH, IMG_HEIGHT))  # FIXED: 512x64
+                 fname = f"iam_{prefix}_{i:06d}.jpg"
+                 out_path = os.path.join(out_dir, fname)
+                 img.save(out_path, quality=90)
+                 annotations.append({"image_path": f"iam/{fname}", "text": text})
+             except Exception:
+                 continue
+             if i % 5000 == 0:
+                 print(f"   {i}/{len(entry_list)} processed...")
+         return annotations
+
+     train_ann = process_entries(train_entries, TRAIN_IMG_DIR, "train")
+     val_ann = process_entries(val_entries, VAL_IMG_DIR, "val")
+
+     with open(TRAIN_ANN, "w") as f:
+         json.dump(train_ann, f, indent=2)
+     with open(IAM_VAL_ANN, "w") as f:
+         json.dump(val_ann, f, indent=2)
+
+     print(f"\n Train annotations -> {TRAIN_ANN} ({len(train_ann)} entries)")
+     print(f" Val annotations   -> {IAM_VAL_ANN} ({len(val_ann)} entries)")
+     print("\n Done! Now run: python IAM_train.py --train")
+
+
+ # ─────────────────────────────────────────────
+ # STEP 2 — TRAIN
+ # ─────────────────────────────────────────────
+ def train_iam():
+     print("\n" + "=" * 55)
+     print("STEP 2 — Fine-tuning CRNN with IAM dataset")
+     print("=" * 55)
+     print(f" Device : {DEVICE}")
+
+     for ann_file in [TRAIN_ANN, SYNTH_VAL_ANN]:
+         if not os.path.exists(ann_file):
+             print(f"ERROR: {ann_file} not found! Run --prepare first.")
+             sys.exit(1)
+
+     train_dataset = CivilRegistryDataset(
+         data_dir="data/train", annotations_file=TRAIN_ANN,
+         img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+     )
+     # FIXED: mix synthetic data in so the model never forgets Filipino multi-word sequences
+     synth_dataset = CivilRegistryDataset(
+         data_dir="data/train", annotations_file="data/train_annotations.json",
+         img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+     )
+     mixed_train = ConcatDataset([train_dataset, synth_dataset])
+     val_dataset = CivilRegistryDataset(
+         data_dir="data/val", annotations_file=SYNTH_VAL_ANN,
+         img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
+     )
+     print(f" IAM train      : {len(train_dataset)}")
+     print(f" Synthetic train: {len(synth_dataset)}")
+     print(f" Mixed train    : {len(mixed_train)}")
+     print(f" Val            : {len(val_dataset)}")
+
+     train_loader = DataLoader(mixed_train, batch_size=BATCH_SIZE,
+                               shuffle=True, num_workers=0, collate_fn=collate_fn)
+     val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
+                             shuffle=False, num_workers=0, collate_fn=collate_fn)
+
+     # ── Load checkpoint (EMNIST preferred, synthetic fallback) ──
+     ckpt_path = CHECKPOINT_IN if os.path.exists(CHECKPOINT_IN) else CHECKPOINT_IN2
+     if not os.path.exists(ckpt_path):
+         print(f"ERROR: No checkpoint found at {CHECKPOINT_IN} or {CHECKPOINT_IN2}")
+         print("Run: python train.py then python train_with_emnist.py")
+         sys.exit(1)
+
+     print(f" Loading: {ckpt_path}")
+     ckpt = torch.load(ckpt_path, map_location=DEVICE, weights_only=False)
+     config = ckpt.get('config', {})
+
+     model = get_crnn_model(
+         model_type      = config.get('model_type', 'standard'),
+         img_height      = config.get('img_height', 64),
+         num_chars       = train_dataset.num_chars,
+         hidden_size     = config.get('hidden_size', 128),
+         num_lstm_layers = config.get('num_lstm_layers', 1),
+     ).to(DEVICE)
+
+     missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+     if missing:
+         print(f" Note: {len(missing)} layers re-initialized")
+     print(f" Loaded epoch {ckpt.get('epoch', 'N/A')} "
+           f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")
+
+     criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+     os.makedirs("checkpoints", exist_ok=True)
+
+     def run_epoch(loader, training, optimizer=None):
+         model.train() if training else model.eval()
+         total, n = 0, 0
+         ctx = torch.enable_grad() if training else torch.no_grad()
+         with ctx:
+             for images, targets, target_lengths, _ in loader:
+                 images = images.to(DEVICE)
+                 batch_size = images.size(0)
+                 if training:
+                     optimizer.zero_grad()
+                 # CRITICAL: log_softmax before CTCLoss
+                 outputs = F.log_softmax(model(images), dim=2)
+                 seq_len = outputs.size(0)
+                 input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+                 loss = criterion(outputs, targets, input_lengths, target_lengths)
+                 if not torch.isnan(loss) and not torch.isinf(loss):
+                     if training:
+                         loss.backward()
+                         torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                         optimizer.step()
+                     total += loss.item()
+                     n += 1
+         return total / max(n, 1)
+
+     def run_phase(num, epochs, lr, freeze_cnn, patience):
+         print(f"\n{'='*55}")
+         print(f" PHASE {num} — "
+               f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
+               f" LR={lr}")
+         print(f"{'='*55}")
+
+         for name, param in model.named_parameters():
+             param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+         trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+         print(f" Trainable params : {trainable:,}")
+
+         opt = optim.Adam(
+             filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+         sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
+         best = float('inf')
+         counter = 0
+
+         for epoch in range(1, epochs + 1):
+             tr = run_epoch(train_loader, True, opt)
+             vl = run_epoch(val_loader, False, None)
+             sched.step(vl)
+
+             if vl < best:
+                 best = vl
+                 counter = 0
+                 torch.save({
+                     'model_state_dict': model.state_dict(),
+                     'config': config,
+                     'char_to_idx': train_dataset.char_to_idx,
+                     'idx_to_char': train_dataset.idx_to_char,
+                     'epoch': epoch,
+                     'val_loss': vl,  # FIXED: renamed from val_cer — this is val loss, not CER%
+                 }, CHECKPOINT_OUT)
+                 print(f" Epoch {epoch:02d}/{epochs} "
+                       f"Train={tr:.4f} Val={vl:.4f} <- saved")
+             else:
+                 counter += 1
+                 print(f" Epoch {epoch:02d}/{epochs} "
+                       f"Train={tr:.4f} Val={vl:.4f} "
+                       f"(patience {counter}/{patience})")
+                 if counter >= patience:
+                     print(f" Early stopping at epoch {epoch}.")
+                     break
+         return best
+
+     # Phase 1: Freeze CNN
+     p1 = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
+     # Phase 2: Full model, very low LR
+     p2 = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
+
+     print(f"\n{'='*55}")
+     print(f"IAM fine-tuning complete!")
+     print(f" Phase 1 best val loss : {p1:.4f}")
+     print(f" Phase 2 best val loss : {p2:.4f}")
+     print(f" Saved : {CHECKPOINT_OUT}")
+     print(f"\nNext step: collect physical certificate scans")
+
+
+ # ─────────────────────────────────────────────
+ # MAIN
+ # ─────────────────────────────────────────────
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--prepare", action="store_true")
+     parser.add_argument("--train", action="store_true")
+     args = parser.parse_args()
+
+     if not args.prepare and not args.train:
+         print("Usage:")
+         print("  python IAM_train.py --prepare          # prepare dataset")
+         print("  python IAM_train.py --train            # train model")
+         print("  python IAM_train.py --prepare --train  # do both")
+         sys.exit(0)
+
+     if args.prepare:
+         prepare_iam()
+     if args.train:
+         train_iam()
CRNN+CTC/README.md ADDED
@@ -0,0 +1,449 @@
+ # Local Civil Registry Document Digitization and Data Extraction
+
+ ## Using CRNN+CTC, Multinomial Naive Bayes, and Named Entity Recognition
+
+ **Thesis Project by:**
+ - Shane Mark C. Blanco
+ - Princess A. Pasamonte
+ - Irish Faith G. Ramirez
+
+ **Institution:** Tarlac State University, College of Computer Studies
+
+ ---
+
+ ## 📋 Project Overview
+
+ This system automates the digitization and data extraction of Philippine Civil Registry documents using advanced machine learning algorithms:
+
+ ### Target Documents:
+ - **Form 1A** - Birth Certificate
+ - **Form 2A** - Death Certificate
+ - **Form 3A** - Marriage Certificate
+ - **Form 90** - Application for Marriage License
+
+ ### Key Features:
+ ✅ OCR for printed and handwritten text
+ ✅ Automatic document classification
+ ✅ Named entity extraction (names, dates, places)
+ ✅ Auto-fill digital forms
+ ✅ MySQL database storage
+ ✅ Searchable digital archive
+ ✅ Data visualization dashboard
+
+ ---
+
+ ## 🏗️ System Architecture
+
+ ```
+ Input: Scanned Civil Registry Form
+         ↓
+ 1. Image Preprocessing
+         ↓
+ 2. CRNN+CTC → Text Recognition
+         ↓
+ 3. Multinomial Naive Bayes → Document Classification
+         ↓
+ 4. spaCy NER → Entity Extraction
+         ↓
+ 5. Data Validation & Storage → MySQL Database
+         ↓
+ Output: Digitized & Searchable Record
+ ```
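+
+ Driving the whole pipeline from one function might look like the sketch below. `CivilRegistryOCR` is the real class from `inference.py` (see Quick Test); the classifier call is a hypothetical stand-in for the `MNB/` component, and the spaCy model path points at the trained model shipped under `spacyNER/models/`:
+
+ ```python
+ import spacy
+ from inference import CivilRegistryOCR
+
+ def digitize(image_path):
+     """Run one scanned form through steps 1-5 (illustrative sketch)."""
+     ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+     text = ocr.predict(image_path)              # steps 1-2: preprocess + recognize
+
+     # step 3 (hypothetical wrapper around MNB/form_classifier.py):
+     # form_type = classify_form(text)
+
+     nlp = spacy.load('spacyNER/models/civil_registry_model/model-best')
+     entities = [(e.text, e.label_) for e in nlp(text).ents]   # step 4
+
+     return text, entities                       # step 5: validate + store in MySQL
+ ```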
+
+ ---
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+
+ - Python 3.8+
+ - CUDA-capable GPU (recommended) or CPU
+ - 8GB RAM minimum
+
+ ### Installation
+
+ ```bash
+ # 1. Clone or download the project
+ cd civil_registry_ocr
+
+ # 2. Create virtual environment
+ python -m venv venv
+ source venv/bin/activate   # Linux/Mac
+ venv\Scripts\activate      # Windows
+
+ # 3. Install dependencies
+ pip install -r requirements.txt
+
+ # 4. Download spaCy model
+ python -m spacy download en_core_web_sm
+ ```
+
+ ### Quick Test
+
+ ```python
+ from inference import CivilRegistryOCR
+
+ # Load model
+ ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+
+ # Recognize text
+ text = ocr.predict('test_images/sample_name.jpg')
+ print(f"Recognized: {text}")
+ ```
+
+ ---
+
+ ## 📁 Project Files
+
+ ### Core Implementation Files:
+
+ 1. **crnn_model.py** - CRNN+CTC neural network architecture
+ 2. **dataset.py** - Data loading and preprocessing
+ 3. **train.py** - Model training script
+ 4. **inference.py** - Prediction and inference
+ 5. **utils.py** - Helper functions and metrics
+ 6. **requirements.txt** - Python dependencies
+ 7. **IMPLEMENTATION_GUIDE.md** - Detailed implementation guide
+
+ ### Additional Components (To be created):
+
+ 8. **document_classifier.py** - Multinomial Naive Bayes classifier
+ 9. **ner_extractor.py** - Named Entity Recognition
+ 10. **web_app.py** - Web application (Flask/FastAPI)
+ 11. **database.py** - MySQL integration
+
+ ---
+
+ ## 📊 Training the Model
+
+ ### 1. Prepare Your Data
+
+ Organize images and labels:
+ ```
+ data/
+   train/
+     form1a/
+       name_001.jpg
+       name_001.txt
+     form2a/
+     ...
+   val/
+     ...
+ ```
+
+ ### 2. Create Annotations
+
+ ```python
+ from dataset import create_annotation_file
+
+ create_annotation_file('data/train', 'data/train_annotations.json')
+ create_annotation_file('data/val', 'data/val_annotations.json')
+ ```
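+
+ The resulting annotation file is a JSON list of `image_path`/`text` records, the same shape that `IAM_train.py` writes during `--prepare`. Illustrative entries:
+
+ ```json
+ [
+   {"image_path": "iam/iam_train_000001.jpg", "text": "Juan"},
+   {"image_path": "iam/iam_train_000002.jpg", "text": "Santos"}
+ ]
+ ```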
+
+ ### 3. Train Model
+
+ ```bash
+ python train.py
+ ```
+
+ Monitor metrics:
+ - Character Error Rate (CER)
+ - Word Error Rate (WER)
+ - Training/Validation Loss
+
+ ### 4. Evaluate
+
+ ```python
+ from utils import calculate_cer, calculate_wer
+
+ predictions = [ocr.predict(img) for img in test_images]
+ cer = calculate_cer(predictions, ground_truths)
+ print(f"CER: {cer:.2f}%")
+ ```
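+
+ CER here is the total edit distance divided by the total number of ground-truth characters. A minimal sketch of that computation, matching how `check_cer.py` measures it (the exact signature of `utils.calculate_cer` may differ):
+
+ ```python
+ import editdistance  # pip install editdistance
+
+ def cer_percent(predictions, ground_truths):
+     """Levenshtein distance summed over pairs, as % of ground-truth length."""
+     dist = sum(editdistance.eval(p, g) for p, g in zip(predictions, ground_truths))
+     chars = sum(len(g) for g in ground_truths)
+     return dist / chars * 100 if chars else 0.0
+ ```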
+
+ ---
+
+ ## 🌐 Web Application
+
+ ### Start the Server
+
+ ```bash
+ python web_app.py
+ ```
+
+ ### API Endpoints
+
+ **POST /api/ocr** - Process document
+ ```bash
+ curl -X POST -F "file=@birth_cert.jpg" http://localhost:8000/api/ocr
+ ```
+
+ **Response:**
+ ```json
+ {
+   "text": "Juan Dela Cruz\n01/15/1990\nTarlac City",
+   "form_type": "form1a",
+   "entities": {
+     "persons": ["Juan Dela Cruz"],
+     "dates": ["01/15/1990"],
+     "locations": ["Tarlac City"]
+   }
+ }
+ ```
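+
+ `web_app.py` is still to be created, so the endpoint above is the target contract. A minimal Flask sketch of one way to back it (every name here is illustrative, not a finished API):
+
+ ```python
+ from flask import Flask, request, jsonify
+ from inference import CivilRegistryOCR
+
+ app = Flask(__name__)
+ ocr = CivilRegistryOCR('checkpoints/best_model.pth')  # load the model once
+
+ @app.route('/api/ocr', methods=['POST'])
+ def api_ocr():
+     upload = request.files['file']
+     upload.save('upload.jpg')
+     text = ocr.predict('upload.jpg')
+     # classification (MNB) and NER would be chained here
+     return jsonify({'text': text})
+
+ if __name__ == '__main__':
+     app.run(port=8000)
+ ```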
+
+ ---
+
+ ## 🎯 Expected Performance
+
+ Based on thesis objectives:
+
+ ### CRNN+CTC Model:
+ - **Target CER:** < 5%
+ - **Target Accuracy:** > 95%
+ - Handles both printed and handwritten text
+
+ ### Document Classifier (MNB):
+ - **Target Accuracy:** > 90%
+ - Fast classification (< 100ms)
+
+ ### NER (spaCy):
+ - **F1 Score:** > 85%
+ - Extracts: Names, Dates, Places
+
+ ---
+
+ ## 🧪 Testing
+
+ ### ISO 25010 Evaluation
+
+ **Usability Testing:**
+ ```
+ # Metrics to measure:
+ - Task completion rate
+ - Average time per task
+ - User satisfaction score (SUS)
+ ```
+
+ **Reliability Testing:**
+ ```
+ # Metrics to measure:
+ - System uptime %
+ - Error rate
+ - Recovery time
+ ```
+
+ ### Confusion Matrix
+
+ ```python
+ from sklearn.metrics import confusion_matrix
+ import seaborn as sns
+
+ cm = confusion_matrix(true_labels, predicted_labels)
+ sns.heatmap(cm, annot=True)
+ ```
+
+ ---
+
+ ## 💾 Database Schema
+
+ ### Birth Certificates Table
+ ```sql
+ CREATE TABLE birth_certificates (
+     id INT PRIMARY KEY AUTO_INCREMENT,
+     child_name VARCHAR(255),
+     date_of_birth DATE,
+     place_of_birth VARCHAR(255),
+     sex CHAR(1),
+     father_name VARCHAR(255),
+     mother_name VARCHAR(255),
+     raw_text TEXT,
+     form_image LONGBLOB,
+     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ );
+ ```
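+
+ A sketch of how `database.py` (still to be created) might insert an extracted record into this table with `mysql-connector-python`; the connection details are placeholders:
+
+ ```python
+ import mysql.connector
+
+ conn = mysql.connector.connect(host='localhost', user='registry',
+                                password='change-me', database='civil_registry')
+ cur = conn.cursor()
+ cur.execute(
+     "INSERT INTO birth_certificates "
+     "(child_name, date_of_birth, place_of_birth, raw_text) "
+     "VALUES (%s, %s, %s, %s)",
+     ("Juan Dela Cruz", "1990-01-15", "Tarlac City",
+      "Juan Dela Cruz\n01/15/1990\nTarlac City"))  # parameterized: no SQL injection
+ conn.commit()
+ ```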
+
+ ---
+
+ ## 📈 System Requirements
+
+ ### Minimum:
+ - CPU: Intel i5 or equivalent
+ - RAM: 8GB
+ - Storage: 10GB
+ - OS: Windows 10, Ubuntu 18.04, macOS 10.14
+
+ ### Recommended:
+ - CPU: Intel i7 or equivalent
+ - GPU: NVIDIA GTX 1060 or better
+ - RAM: 16GB
+ - Storage: 50GB SSD
+
+ ---
+
+ ## 🔒 Data Privacy & Security
+
+ Following the Philippine Data Privacy Act (RA 10173):
+
+ - ✅ Encrypted data transmission
+ - ✅ Access control and authentication
+ - ✅ Audit logging
+ - ✅ Regular security updates
+ - ✅ Data retention policies
+
+ ---
+
+ ## 📚 Key Algorithms
+
+ ### 1. CRNN+CTC
+ **Purpose:** Text recognition from images
+ **Strengths:** Handles variable-length sequences, no character segmentation needed
+ **Reference:** Shi et al. (2016)
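+
+ CTC consumes per-timestep log-probabilities of shape `(T, N, C)` and needs no per-character alignment. A minimal sketch of how this repo applies it in `IAM_train.py` (`log_softmax` first, blank index 0; `model`, `images`, `targets`, and `target_lengths` come from the training loop):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ criterion = torch.nn.CTCLoss(blank=0, zero_infinity=True)
+ log_probs = F.log_softmax(model(images), dim=2)        # (T, N, num_chars)
+ input_lengths = torch.full((images.size(0),), log_probs.size(0), dtype=torch.long)
+ loss = criterion(log_probs, targets, input_lengths, target_lengths)
+ ```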
+
+ ### 2. Multinomial Naive Bayes
+ **Purpose:** Document classification
+ **Strengths:** Fast, efficient, works well with text data
+ **Reference:** McCallum & Nigam (1998)
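+
+ The `MNB/models/` folder ships a fitted TF-IDF vectorizer and classifier as pickles. The underlying scikit-learn pattern, sketched with illustrative variable names:
+
+ ```python
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+
+ vectorizer = TfidfVectorizer()
+ X = vectorizer.fit_transform(train_texts)      # OCR text of labeled forms
+ clf = MultinomialNB().fit(X, train_labels)     # labels like 'form1a', 'form2a', ...
+
+ form_type = clf.predict(vectorizer.transform([ocr_text]))[0]
+ ```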
+
+ ### 3. Named Entity Recognition
+ **Purpose:** Extract entities (names, dates, places)
+ **Strengths:** Pre-trained, accurate, easy to use
+ **Reference:** spaCy (Honnibal & Montani, 2017)
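+
+ A minimal extraction sketch with the stock English model (the custom model under `spacyNER/models/` can be loaded by path instead):
+
+ ```python
+ import spacy
+
+ nlp = spacy.load('en_core_web_sm')
+ doc = nlp("Juan Dela Cruz was born on January 15, 1990 in Tarlac City")
+ for ent in doc.ents:
+     print(ent.text, ent.label_)   # e.g. PERSON, DATE, GPE
+ ```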
+
+ ---
+
+ ## 🛠️ Troubleshooting
+
+ ### Low Accuracy?
+ 1. Increase training data (target: 10,000+ samples)
+ 2. Use data augmentation
+ 3. Train longer (100+ epochs)
+ 4. Clean your dataset
+
+ ### Out of Memory?
+ 1. Reduce batch size
+ 2. Use smaller image dimensions
+ 3. Use gradient accumulation (see the sketch after this list)
+ 4. Enable mixed precision
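+
+ A sketch of gradient accumulation combined with mixed precision, phrased in terms of the `IAM_train.py` training loop (`accum_steps` is illustrative; CTC loss under autocast may need extra care):
+
+ ```python
+ scaler = torch.cuda.amp.GradScaler()
+ accum_steps = 4  # effective batch = BATCH_SIZE * accum_steps
+
+ for step, (images, targets, target_lengths, _) in enumerate(train_loader):
+     with torch.cuda.amp.autocast():
+         outputs = F.log_softmax(model(images.to(DEVICE)), dim=2)
+         lengths = torch.full((images.size(0),), outputs.size(0), dtype=torch.long)
+         loss = criterion(outputs, targets, lengths, target_lengths) / accum_steps
+     scaler.scale(loss).backward()
+     if (step + 1) % accum_steps == 0:
+         scaler.step(optimizer)
+         scaler.update()
+         optimizer.zero_grad()
+ ```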
+
+ ### Slow Inference?
+ 1. Use GPU if available
+ 2. Batch process images
+ 3. Optimize model (ONNX; see the sketch after this list)
+ 4. Cache frequent results
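+
+ A sketch of the ONNX route, assuming the pipeline's fixed 64×512 grayscale input (the dynamic batch axis is illustrative, and recurrent layers may need export tweaks):
+
+ ```python
+ import torch
+
+ dummy = torch.randn(1, 1, 64, 512)  # (N, C, H, W) as used by the pipeline
+ torch.onnx.export(model, dummy, "crnn.onnx",
+                   input_names=["image"], output_names=["logits"],
+                   dynamic_axes={"image": {0: "batch"}})
+ ```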
+
+ ---
+
+ ## 📖 Documentation
+
+ - **IMPLEMENTATION_GUIDE.md** - Complete step-by-step guide
+ - **API_DOCUMENTATION.md** - API reference (to be created)
+ - **USER_MANUAL.md** - End-user guide (to be created)
+
+ ---
+
+ ## 🎓 Academic References
+
+ ### Key Papers:
+
+ 1. **CRNN**
+    Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. *IEEE TPAMI*.
+
+ 2. **CTC Loss**
+    Graves, A., et al. (2006). Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks. *ICML*.
+
+ 3. **Naive Bayes**
+    McCallum, A., & Nigam, K. (1998). A comparison of event models for Naive Bayes text classification. *AAAI Workshop*.
+
+ 4. **spaCy**
+    Honnibal, M., & Montani, I. (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.
+
+ ---
+
+ ## 👥 Contributors
+
+ **Researchers:**
+ - Shane Mark C. Blanco
+ - Princess A. Pasamonte
+ - Irish Faith G. Ramirez
+
+ **Advisers:**
+ - Mr. Rengel V. Corpuz (Technical Adviser)
+ - Mr. Joselito T. Tan (Subject Teacher)
+
+ **Institution:**
+ Tarlac State University
+ College of Computer Studies
+ Bachelor of Science in Computer Science
+
+ ---
+
+ ## 📞 Support
+
+ For questions regarding this implementation:
+
+ 1. Review IMPLEMENTATION_GUIDE.md
+ 2. Check code documentation
+ 3. Consult with thesis advisers
+
+ ---
+
+ ## 📄 License
+
+ This project is for academic purposes as part of a thesis requirement.
+
+ ---
+
+ ## ✅ Implementation Checklist
+
+ ### Phase 1: Setup ✓
+ - [x] Install dependencies
+ - [x] Set up project structure
+ - [x] Prepare development environment
+
+ ### Phase 2: Data Preparation
+ - [ ] Collect civil registry form images
+ - [ ] Create annotations
+ - [ ] Split into train/val/test sets
+
+ ### Phase 3: Model Development
+ - [ ] Train CRNN+CTC model
+ - [ ] Train document classifier
+ - [ ] Integrate NER system
+
+ ### Phase 4: Web Application
+ - [ ] Develop Flask/FastAPI backend
+ - [ ] Create frontend interface
+ - [ ] Implement database integration
+
+ ### Phase 5: Testing
+ - [ ] Accuracy testing
+ - [ ] Black-box testing
+ - [ ] ISO 25010 evaluation
+ - [ ] User acceptance testing
+
+ ### Phase 6: Deployment
+ - [ ] Optimize for production
+ - [ ] Set up server
+ - [ ] Deploy application
+ - [ ] Monitor performance
+
+ ---
+
+ ## 🎯 Success Metrics
+
+ Target metrics for thesis evaluation:
+
+ | Metric | Target | Status |
+ |--------|--------|--------|
+ | OCR Accuracy | > 95% | Pending |
+ | CER | < 5% | Pending |
+ | Classifier Accuracy | > 90% | Pending |
+ | NER F1 Score | > 85% | Pending |
+ | Response Time | < 2s | Pending |
+ | System Uptime | > 99% | Pending |
+
+ ---
+
+ **Good luck with your thesis defense! 🎓✨**
+
+ For detailed implementation instructions, see **IMPLEMENTATION_GUIDE.md**
CRNN+CTC/calibrate_fields.py ADDED
@@ -0,0 +1,196 @@
+ """
+ calibrate_fields.py
+ ===================
+ Click-to-measure tool for recalibrating field ratios in field_extractor.py.
+
+ Usage:
+     python calibrate_fields.py --image your_scan.png --form birth
+
+ Controls:
+     • Click and drag → draw a field box
+     • After releasing → enter the field name in the terminal
+     • Press S → save all measured ratios to calibrated_fields.py
+     • Press Z → undo last box
+     • Press Q / ESC → quit without saving
+
+ Output:
+     calibrated_fields.py — copy-paste the dict into field_extractor.py
+ """
+
+ import argparse
+ import json
+ import cv2
+ import numpy as np
+ from pathlib import Path
+
+ # ── state ─────────────────────────────────────────────────────────────────────
+ drawing = False
+ ix, iy = -1, -1
+ ex, ey = -1, -1
+ boxes = []  # list of (name, rx1, ry1, rx2, ry2)
+ form_name = "birth"
+
+ COLOURS = [
+     (0,200,0),(0,150,255),(200,0,200),(0,200,200),(200,200,0),(220,20,60),
+     (255,140,0),(150,50,200),(0,160,80),(30,144,255),(255,20,147),(100,200,100),
+ ]
+
+ def draw_boxes(img, bounds):
+     left, top, right, bottom = bounds
+     h, w = img.shape[:2]
+
+     vis = img.copy()
+     # form boundary
+     cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
+
+     for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
+         x1 = int(rx1 * w)
+         y1 = int(ry1 * h)
+         x2 = int(rx2 * w)
+         y2 = int(ry2 * h)
+         c = COLOURS[idx % len(COLOURS)]
+         cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
+         cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.35, c, 1)
+
+     # live cursor box
+     if drawing and ix >= 0 and ex >= 0:
+         cv2.rectangle(vis, (ix, iy), (ex, ey), (255, 255, 255), 1)
+
+     # instructions
+     cv2.putText(vis, "Drag=draw box | S=save | Z=undo | Q=quit",
+                 (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1)
+     cv2.putText(vis, f"Boxes: {len(boxes)}",
+                 (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
+     return vis
+
+
+ def detect_bounds(image_bgr):
+     """Simple form boundary detection (reuses logic from FormBoundsDetector)."""
+     h, w = image_bgr.shape[:2]
+     gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+     try:
+         thresh = cv2.adaptiveThreshold(
+             gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+             cv2.THRESH_BINARY_INV, 11, 2)
+         hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
+         h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
+         h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
+         vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
+         v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
+         v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
+         if len(h_rows) == 0 or len(v_cols) == 0:
+             return (0, 0, w, h)
+         top_b, bottom_b = int(h_rows.min()), int(h_rows.max())
+         left_b, right_b = int(v_cols.min()), int(v_cols.max())
+         if (right_b - left_b) < w * 0.4 or (bottom_b - top_b) < h * 0.4:
+             return (0, 0, w, h)
+         return (left_b, top_b, right_b, bottom_b)
+     except Exception:
+         return (0, 0, w, h)
+
+
+ def save_calibration(output_path, form):
+     dict_name = {
+         "birth": "BIRTH_FIELDS",
+         "death": "DEATH_FIELDS",
+         "marriage": "MARRIAGE_FIELDS",
+         "marriage_license": "MARRIAGE_LICENSE_FIELDS",
+     }.get(form, "CALIBRATED_FIELDS")
+
+     lines = [f"# Auto-calibrated — copy-paste into field_extractor.py\n",
+              f"{dict_name} = {{\n"]
+     for name, rx1, ry1, rx2, ry2 in boxes:
+         lines.append(f'    "{name}":{" " * max(1, 34 - len(name))}'
+                      f'({rx1:.4f}, {ry1:.4f}, {rx2:.4f}, {ry2:.4f}),\n')
+     lines.append("}\n")
+
+     with open(output_path, "w") as f:
+         f.writelines(lines)
+     print(f"\n Saved {len(boxes)} fields → {output_path}")
+
+
+ def main():
+     global drawing, ix, iy, ex, ey, form_name
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--image", required=True)
+     parser.add_argument("--form", default="birth",
+                         choices=["birth","death","marriage","marriage_license"])
+     parser.add_argument("--output", default="calibrated_fields.py")
+     parser.add_argument("--scale", type=float, default=1.0,
+                         help="Scale factor to fit image on screen (e.g. 0.5)")
+     args = parser.parse_args()
+     form_name = args.form
+
+     img_orig = cv2.imread(args.image)
+     if img_orig is None:
+         print(f"ERROR: Cannot load {args.image}")
+         return
+
+     scale = args.scale
+     if scale != 1.0:
+         img_orig = cv2.resize(img_orig, None, fx=scale, fy=scale)
+
+     bounds = detect_bounds(img_orig)
+     left, top, right, bottom = bounds
+     fw = right - left
+     fh = bottom - top
+     print(f" Form boundary detected: {bounds} ({fw}×{fh} px)")
+     print(f" Scale: {scale}")
+     print("\n Instructions:")
+     print("   Drag → draw a field box")
+     print("   After releasing → type field name in terminal, press Enter")
+     print("   S → save all boxes")
+     print("   Z → undo last box")
+     print("   Q/ESC → quit\n")
+
+     win = "Calibrate Fields"
+     cv2.namedWindow(win, cv2.WINDOW_NORMAL)
+
+     def mouse(event, x, y, flags, param):
+         global drawing, ix, iy, ex, ey
+         if event == cv2.EVENT_LBUTTONDOWN:
+             drawing = True
+             ix, iy = x, y
+             ex, ey = x, y
+         elif event == cv2.EVENT_MOUSEMOVE and drawing:
+             ex, ey = x, y
+         elif event == cv2.EVENT_LBUTTONUP:
+             drawing = False
+             ex, ey = x, y
+             ih, iw = img_orig.shape[:2]
+             x1r = min(ix, ex) / iw
+             y1r = min(iy, ey) / ih
+             x2r = max(ix, ex) / iw
+             y2r = max(iy, ey) / ih
+             x1r, y1r = max(0.0, x1r), max(0.0, y1r)
+             x2r, y2r = min(1.0, x2r), min(1.0, y2r)
+             if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
+                 name = input(f" Field name for ({x1r:.3f},{y1r:.3f},{x2r:.3f},{y2r:.3f}): ").strip()
+                 if name:
+                     boxes.append((name, x1r, y1r, x2r, y2r))
+                     print(f" ✓ '{name}' added (total: {len(boxes)})")
+
+     cv2.setMouseCallback(win, mouse)
+
+     while True:
+         vis = draw_boxes(img_orig, bounds)
+         cv2.imshow(win, vis)
+         key = cv2.waitKey(20) & 0xFF
+
+         if key in (ord('q'), 27):
+             print(" Quit — no file saved.")
+             break
+         elif key == ord('s'):
+             save_calibration(args.output, form_name)
+             break
+         elif key == ord('z') and boxes:
+             removed = boxes.pop()
+             print(f" Undone: '{removed[0]}'")
+
+     cv2.destroyAllWindows()
+
+
+ if __name__ == "__main__":
+     main()
CRNN+CTC/calibrated_fields.py ADDED
@@ -0,0 +1,7 @@
+ # Auto-calibrated — copy-paste into field_extractor.py
+ BIRTH_FIELDS = {
+     "Province":          (0.0941, 0.0701, 0.6361, 0.0848),
+     "City/Municipality": (0.1621, 0.0880, 0.6429, 0.1086),
+     "first_name":        (0.0465, 0.1183, 0.3265, 0.1375),
+     "middle_name":       (0.3469, 0.1189, 0.6916, 0.1375),
+ }
CRNN+CTC/check_cer.py ADDED
@@ -0,0 +1,331 @@
+ """
+ check_cer.py
+ ============
+ Measures TRUE CER by actually running the model on images.
+
+ Usage:
+     python check_cer.py                         # live CER on val set
+     python check_cer.py --saved                 # old behavior (fast, unreliable)
+     python check_cer.py --images test_images/   # run on any image folder
+ """
+
+ import os
+ import sys
+ import json
+ import random
+ import cv2
+ import numpy as np
+ import editdistance
+ from pathlib import Path
+
+ try:
+     import torch
+ except ImportError:
+     print("ERROR: torch not installed. Run: pip install torch")
+     exit(1)
+
+ USE_SAVED = '--saved' in sys.argv
+ IMAGE_DIR = None
+ for i, arg in enumerate(sys.argv[1:], 1):
+     if arg == '--images' and i < len(sys.argv) - 1:
+         IMAGE_DIR = sys.argv[i + 1]
+     elif arg.startswith('--images='):
+         IMAGE_DIR = arg.split('=', 1)[1]
+
+ CHECKPOINTS = [
+     'checkpoint_epoch_50.pth',
+     'checkpoint_epoch_60.pth',
+     'checkpoint_epoch_70.pth',
+     'checkpoint_epoch_80.pth',
+     'checkpoint_epoch_90.pth',
+     'checkpoint_epoch_100.pth',
+ ]
+ CHECKPOINT_DIR = 'checkpoints'
+ VAL_DATA_DIR = 'data/val'
+ VAL_ANN_FILE = 'data/val_annotations.json'
+
+
+ class AdaptiveImageNormalizer:
+     def __init__(self, target_height=64, target_width=512):
+         self.H = target_height
+         self.W = target_width
+
+     def _crop_to_text(self, gray):
+         inv = cv2.bitwise_not(gray)
+         _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+         coords = np.column_stack(np.where(thresh > 0))
+         if len(coords) == 0:
+             return gray
+         y_min, x_min = coords.min(axis=0)
+         y_max, x_max = coords.max(axis=0)
+         pad = max(4, int((y_max - y_min) * 0.15))
+         y_min = max(0, y_min - pad)
+         x_min = max(0, x_min - pad)
+         y_max = min(gray.shape[0] - 1, y_max + pad)
+         x_max = min(gray.shape[1] - 1, x_max + pad)
+         return gray[y_min:y_max + 1, x_min:x_max + 1]
+
+     def _smart_resize_gray(self, gray):
+         h, w = gray.shape
+         if h == 0 or w == 0:
+             return np.ones((self.H, self.W), dtype=np.uint8) * 255
+         scale = self.H / h
+         new_w = int(w * scale)
+         new_h = self.H
+         if new_w > self.W:
+             scale = self.W / w
+             new_h = int(h * scale)
+             new_w = self.W
+         resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+         canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
+         y_off = (self.H - new_h) // 2
+         x_off = (self.W - new_w) // 2
+         canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
+         return canvas
+
+     def _binarize(self, img):
+         _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+         white_ratio = np.mean(otsu == 255)
+         if white_ratio < 0.30 or white_ratio > 0.97:
+             return cv2.adaptiveThreshold(
+                 img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                 cv2.THRESH_BINARY, 11, 2)
+         return otsu
+
+     def normalize(self, img):
+         if len(img.shape) == 3:
+             gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         else:
+             gray = img.copy()
+         gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+         gray = self._crop_to_text(gray)
+         gray = self._smart_resize_gray(gray)
+         return self._binarize(gray)
+
+     def to_tensor(self, img):
+         return torch.FloatTensor(
+             img.astype(np.float32) / 255.0
+         ).unsqueeze(0).unsqueeze(0)
+
+
+ def greedy_decode(outputs, idx_to_char):
+     pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
+     results = []
+     for seq in pred_indices:
+         chars, prev = [], -1
+         for idx in seq:
+             idx = idx.item()
+             if idx != 0 and idx != prev and idx in idx_to_char:
+                 chars.append(idx_to_char[idx])
+             prev = idx
+         results.append(''.join(chars))
+     return results
+
+
+ def measure_live_cer(model, idx_to_char, img_h, img_w,
+                      ann_file, data_dir, device, max_samples=200):
+     if not os.path.exists(ann_file):
+         return None, 0, f"Annotation file not found: {ann_file}"
+
+     with open(ann_file, 'r', encoding='utf-8') as f:
+         annotations = json.load(f)
+
+     if len(annotations) > max_samples:
+         random.seed(42)
+         annotations = random.sample(annotations, max_samples)
+
+     normalizer = AdaptiveImageNormalizer(img_h, img_w)
+     model.eval()
+
+     total_char_dist = 0
+     total_chars = 0
+     total_word_dist = 0
+     total_words = 0
+     n_exact = 0
+     n_evaluated = 0
+     worst_errors = []
+
+     with torch.no_grad():
+         for ann in annotations:
+             img_path = os.path.join(data_dir, ann['image_path'])
+             gt = ann['text']
+             if not os.path.exists(img_path):
+                 continue
+             try:
+                 raw = cv2.imread(img_path)
+                 if raw is None:
+                     continue
+                 norm = normalizer.normalize(raw)
+                 tensor = normalizer.to_tensor(norm).to(device)
+                 out = model(tensor)
+                 pred = greedy_decode(out.cpu(), idx_to_char)[0]
+
+                 cd = editdistance.eval(pred, gt)
+                 wd = editdistance.eval(pred.split(), gt.split())
+
+                 total_char_dist += cd
+                 total_chars += len(gt)
+                 total_word_dist += wd
+                 total_words += len(gt.split())
+                 if pred == gt:
+                     n_exact += 1
+                 if cd > 0:
+                     worst_errors.append((gt, pred, cd))
+                 n_evaluated += 1
+             except Exception:
+                 continue
+
+     if n_evaluated == 0:
+         return None, 0, "No images could be evaluated"
+
+     cer = (total_char_dist / total_chars * 100) if total_chars > 0 else 0
+     wer = (total_word_dist / total_words * 100) if total_words > 0 else 0
+     acc = (n_exact / n_evaluated * 100)
+
+     return {
+         'cer': cer, 'wer': wer, 'exact_match': acc,
+         'n_evaluated': n_evaluated,
+         'errors': sorted(worst_errors, key=lambda x: x[2], reverse=True)[:5]
+     }, n_evaluated, None
+
+
+ def run_on_folder(model, idx_to_char, img_h, img_w, folder, device):
+     normalizer = AdaptiveImageNormalizer(img_h, img_w)
+     model.eval()
+     exts = {'.jpg', '.jpeg', '.png', '.bmp'}
+     paths = sorted(p for p in Path(folder).rglob('*') if p.suffix.lower() in exts)
+     results = []
+     with torch.no_grad():
+         for p in paths:
+             try:
+                 raw = cv2.imread(str(p))
+                 norm = normalizer.normalize(raw)
+                 tensor = normalizer.to_tensor(norm).to(device)
+                 pred = greedy_decode(model(tensor).cpu(), idx_to_char)[0]
+                 results.append((p.name, pred))
+             except Exception as e:
+                 results.append((p.name, f'ERROR: {e}'))
+     return results
+
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # MAIN
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ if USE_SAVED:
+     print("=" * 65)
+     print(" SAVED CER (training-time value — may not reflect real accuracy)")
+     print(" Run without --saved for true live CER.")
+     print("=" * 65)
+     print("{:<8} {:<12} {:<12} {}".format("Epoch", "CER(%)", "WER(%)", "File"))
+     print("-" * 65)
+     best_cer, best_cp = float('inf'), None
+     for cp in CHECKPOINTS:
+         path = os.path.join(CHECKPOINT_DIR, cp)
+         if not os.path.exists(path):
+             continue
+         try:
+             c = torch.load(path, weights_only=False)
+             cer = c.get('val_cer', c.get('val_loss', 0))
+             epoch = c['epoch']
+             history = c.get('history', {})
+             wer_list = history.get('val_wer', [])
+             wer = wer_list[epoch - 1] if wer_list and epoch <= len(wer_list) else None
+             wer_s = f"{wer:.4f}%" if wer else 'N/A'
+             marker = ' <-- BEST' if cer < best_cer else ''
+             print("{:<8} {:<12} {:<12} {}{}".format(
+                 epoch, f"{cer:.4f}%", wer_s, cp, marker))
+             if cer < best_cer:
+                 best_cer, best_cp = cer, cp
+         except Exception as e:
+             print(f" Could not load {cp}: {e}")
+     print("=" * 65)
+     print(f"\nBEST: {best_cp} CER={best_cer:.4f}%")
+
+ else:
+     print("=" * 78)
+     print(" LIVE CER — model actually runs on images (true accuracy)")
+     print("=" * 78)
+     print("{:<8} {:<10} {:<10} {:<12} {:<8} {}".format(
+         "Epoch", "CER(%)", "WER(%)", "ExactMatch", "N", "File"))
+     print("-" * 78)
+
+     best_cer, best_cp, best_metrics = float('inf'), None, None
+
+     for cp in CHECKPOINTS:
+         cp_path = os.path.join(CHECKPOINT_DIR, cp)
+         if not os.path.exists(cp_path):
+             print(f" (skipping {cp} — not found)")
+             continue
+         try:
+             from crnn_model import get_crnn_model
+             c = torch.load(cp_path, map_location=device, weights_only=False)
+             epoch = c['epoch']
+             idx_to_char = c['idx_to_char']
+             config = c.get('config', {})
+             img_h = config.get('img_height', 64)
+             img_w = config.get('img_width', 512)
+             saved_cer = c.get('val_cer', c.get('val_loss', None))
+
+             model = get_crnn_model(
+                 model_type=config.get('model_type', 'standard'),
+                 img_height=img_h,
+                 num_chars=c['model_state_dict']['fc.weight'].shape[0],
+                 hidden_size=config.get('hidden_size', 128),     # FIXED: was 256
+                 num_lstm_layers=config.get('num_lstm_layers', 1)  # FIXED: was 2
+             ).to(device)
+             model.load_state_dict(c['model_state_dict'])
+
+             if IMAGE_DIR:
+                 print(f"\nPredictions from {cp}:")
+                 for fname, pred in run_on_folder(
+                         model, idx_to_char, img_h, img_w, IMAGE_DIR, device):
+                     print(f"  {fname:<35} -> {pred}")
+                 continue
+
+             metrics, n, err = measure_live_cer(
+                 model, idx_to_char, img_h, img_w,
+                 VAL_ANN_FILE, VAL_DATA_DIR, device)
+
+             if metrics is None:
+                 print(f" Epoch {epoch} SKIP: {err}")
+                 continue
+
+             cer = metrics['cer']
+             marker = ' <-- BEST' if cer < best_cer else ''
+             print("{:<8} {:<10} {:<10} {:<12} {:<8} {}{}".format(
+                 epoch,
+                 f"{cer:.2f}%",
+                 f"{metrics['wer']:.2f}%",
+                 f"{metrics['exact_match']:.1f}%",
+                 n, cp, marker))
+
+             if saved_cer and abs(cer - saved_cer) > 2.0:
+                 print(f"   ^ MISMATCH: saved={saved_cer:.2f}% live={cer:.2f}%"
+                       f" diff={abs(cer - saved_cer):.2f}%")
+                 print(f"     Cause: model trained on clean synthetic only.")
+                 print(f"     Fix: regenerate data with fix_data.py + retrain.")
+
+             if cer < best_cer:
+                 best_cer, best_cp, best_metrics = cer, cp, metrics
+
+         except Exception as e:
+             print(f" Could not evaluate {cp}: {e}")
+
+     if not IMAGE_DIR:
+         print("=" * 78)
+         print(f"\nBEST CHECKPOINT : {best_cp}")
+         print(f"BEST LIVE CER   : {best_cer:.4f}%")
+
+         if best_metrics and best_metrics['errors']:
+             print(f"\nWorst predictions (GT -> Predicted):")
+             for gt, pred, dist in best_metrics['errors']:
+                 print(f"  [{dist:2d}] '{gt}'")
+                 print(f"       '{pred}'")
+
+         print(f"\nTo use best model:")
+         print(f"  import shutil")
+         print(f"  shutil.copy('checkpoints/{best_cp}', 'checkpoints/best_model.pth')")
+         print("=" * 78)
CRNN+CTC/checkpoints/best_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f98f0590b354c11f40fefcab6fe172ae57cb37e49277062a00dbbe3f5aa6b8b5
+ size 19204606
CRNN+CTC/checkpoints/best_model_final.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7da91ba5cd78b602eebb9c9f63175d9bc47ec8cb6fbdac6a06c78814e2e6b8f2
+ size 6407143
CRNN+CTC/checkpoints/best_model_iam.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7f4cdef044a163632be2cbf7fbed9d869b4a2e85977aef60e5f88501969e257
+ size 6405834
CRNN+CTC/checkpoints/best_model_v2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:047c9af89f9486553a2c17736cafcc0a7a45a99e21619064ee00299e2cd6a8df
+ size 6406990
CRNN+CTC/checkpoints/best_model_v3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1def35ee8c623aac01004ecb9f979d51d3ed3a486d8adf7a8acd67e5b03a31
+ size 6406990
CRNN+CTC/checkpoints/best_model_v4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73a939bab133573e8b771a6d48aca10c9a98e804cedd79f06eac4e24735df1d4
+ size 6406201
CRNN+CTC/checkpoints/best_model_v5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef986cbea34d4c5dc31b32aac3bc2dfaa20720cdb133d9d6c79a5d5123700942
+ size 6406201
CRNN+CTC/checkpoints/best_model_v6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dbcdecce6d83b7f7c74ae6df05ae9222b345668e2dff84de9aa108562bd71ac
+ size 6406201
CRNN+CTC/checkpoints/best_model_v7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+ size 6406201
CRNN+CTC/checkpoints/best_model_v732.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+ size 6406201
CRNN+CTC/checkpoints/checkpoint_epoch_10.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08f3a40e99411b95e8a8563ed42f6998f367b84dd799b8c0cbcffac1bdd5576f
+ size 19201165
CRNN+CTC/checkpoints/latest_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b10077d433edbd5946499fef7334421d8b7ba351f55d631fbeb085592c10545
+ size 19201651
CRNN+CTC/compare_checkpoints.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ import sys
+ sys.path.append('.')
+ from crnn_model import get_crnn_model
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ def test_model(path, label):
+     c = torch.load(path, map_location=device, weights_only=False)
+     config = c.get('config', {})
+     model = get_crnn_model(
+         model_type      = config.get('model_type', 'standard'),
+         img_height      = config.get('img_height', 64),
+         num_chars       = c['model_state_dict']['fc.weight'].shape[0],
+         hidden_size     = config.get('hidden_size', 128),
+         num_lstm_layers = config.get('num_lstm_layers', 1),
+     ).to(device)
+     model.load_state_dict(c['model_state_dict'], strict=False)
+     epoch = c.get('epoch', 'N/A')
+     val_loss = c.get('val_loss', None)  # fine-tuned checkpoints (EMNIST, IAM)
+     val_cer = c.get('val_cer', None)    # synthetic baseline checkpoint
+     if val_loss is not None:
+         metric_str = f"val_loss={val_loss:.4f}"
+     elif val_cer is not None:
+         metric_str = f"val_cer={val_cer:.4f}%"
+     else:
+         metric_str = "no metric saved"
+     print(f"{label}: epoch={epoch} {metric_str}")
+
+ print("=" * 55)
+ test_model('checkpoints/best_model.pth', 'Synthetic ')
+ test_model('checkpoints/best_model_emnist.pth', 'EMNIST    ')
+ test_model('checkpoints/best_model_iam.pth', 'IAM       ')
+ print("=" * 55)
CRNN+CTC/compare_live_cer.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ compare_live_cer.py
3
+ ===================
4
+ Runs live CER on all three checkpoints to find the best one.
5
+ Usage: python compare_live_cer.py
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import random
12
+ import cv2
13
+ import numpy as np
14
+ import editdistance
15
+ import torch
16
+ import torch.nn.functional as F
17
+ sys.path.append('.')
18
+ from crnn_model import get_crnn_model
19
+
20
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
+
22
+ VAL_ANN = 'data/val_annotations.json'
23
+ VAL_DIR = 'data/val'
24
+ MAX_SAMPLES = 200
25
+
26
+ CHECKPOINTS = {
27
+ 'Synthetic' : 'checkpoints/best_model.pth',
28
+ 'EMNIST' : 'checkpoints/best_model_emnist.pth',
29
+ 'IAM' : 'checkpoints/best_model_iam.pth',
30
+ }
31
+
32
+
33
+ def normalize(img, H=64, W=512):
34
+ if len(img.shape) == 3:
35
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
36
+ else:
37
+ gray = img.copy()
38
+ gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
39
+ inv = cv2.bitwise_not(gray)
40
+ _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
41
+ coords = np.column_stack(np.where(thresh > 0))
42
+ if len(coords) > 0:
43
+ y_min, x_min = coords.min(axis=0)
44
+ y_max, x_max = coords.max(axis=0)
45
+ pad = max(4, int((y_max - y_min) * 0.15))
46
+ y_min = max(0, y_min - pad)
47
+ x_min = max(0, x_min - pad)
48
+ y_max = min(gray.shape[0]-1, y_max + pad)
49
+ x_max = min(gray.shape[1]-1, x_max + pad)
50
+ gray = gray[y_min:y_max+1, x_min:x_max+1]
51
+ h, w = gray.shape
52
+ if h == 0 or w == 0:
53
+ return np.ones((H, W), dtype=np.uint8) * 255
54
+ scale = H / h
55
+ new_w = int(w * scale)
56
+ if new_w > W:
57
+ scale = W / w
58
+ new_w = W
59
+ new_h = int(h * scale)
60
+ else:
61
+ new_h = H
62
+ resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
63
+ canvas = np.ones((H, W), dtype=np.uint8) * 255
64
+ canvas[(H-new_h)//2:(H-new_h)//2+new_h,
65
+ (W-new_w)//2:(W-new_w)//2+new_w] = resized
66
+ _, otsu = cv2.threshold(canvas, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
67
+ return otsu
68
+
69
+
70
+ def greedy_decode(outputs, idx_to_char):
71
+ pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
72
+ results = []
73
+ for seq in pred_indices:
74
+ chars, prev = [], -1
75
+ for idx in seq:
76
+ idx = idx.item()
77
+ if idx != 0 and idx != prev and idx in idx_to_char:
78
+ chars.append(idx_to_char[idx])
79
+ prev = idx
80
+ results.append(''.join(chars))
81
+ return results
82
+
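The loop above is the standard CTC best-path collapse: merge repeats, then drop blanks (index 0). A tiny worked example with a toy vocabulary (not the real 95-character map):

import torch

idx_to_char = {1: 'A', 2: 'B'}            # toy vocab; 0 = CTC blank
seq = torch.tensor([1, 1, 0, 1, 2, 2])    # per-timestep argmax indices
chars, prev = [], -1
for idx in seq.tolist():
    if idx != 0 and idx != prev and idx in idx_to_char:
        chars.append(idx_to_char[idx])
    prev = idx
print(''.join(chars))   # -> 'AAB': repeats collapse, but the blank keeps the two A's distinct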
83
+
84
+ def evaluate(checkpoint_path, label):
85
+ if not os.path.exists(checkpoint_path):
86
+ print(f" {label:<12}: FILE NOT FOUND β€” skipping")
87
+ return
88
+
89
+ c = torch.load(checkpoint_path, map_location=device, weights_only=False)
90
+ config = c.get('config', {})
91
+
92
+ # Load idx_to_char from checkpoint if available
93
+ idx_to_char = c.get('idx_to_char', None)
94
+ if idx_to_char is None:
95
+ from dataset import build_char_maps
96
+ _, idx_to_char, _ = build_char_maps()
97
+
98
+ model = get_crnn_model(
99
+ model_type = config.get('model_type', 'standard'),
100
+ img_height = config.get('img_height', 64),
101
+ num_chars = c['model_state_dict']['fc.weight'].shape[0],
102
+ hidden_size = config.get('hidden_size', 128),
103
+ num_lstm_layers = config.get('num_lstm_layers', 1),
104
+ ).to(device)
105
+ model.load_state_dict(c['model_state_dict'], strict=False)
106
+ model.eval()
107
+
108
+ with open(VAL_ANN, 'r', encoding='utf-8') as f:
109
+ anns = json.load(f)
110
+ random.seed(42)
111
+ if len(anns) > MAX_SAMPLES:
112
+ anns = random.sample(anns, MAX_SAMPLES)
113
+
114
+ total_cd, total_c = 0, 0
115
+ exact, n = 0, 0
116
+ worst = []
117
+
118
+ with torch.no_grad():
119
+ for ann in anns:
120
+ img_path = os.path.join(VAL_DIR, ann['image_path'])
121
+ gt = ann['text']
122
+ if not os.path.exists(img_path):
123
+ continue
124
+ raw = cv2.imread(img_path)
125
+ if raw is None:
126
+ continue
127
+ norm = normalize(raw)
128
+ tensor = torch.FloatTensor(
129
+ norm.astype(np.float32) / 255.0
130
+ ).unsqueeze(0).unsqueeze(0).to(device)
131
+ out = model(tensor)
132
+ pred = greedy_decode(out.cpu(), idx_to_char)[0]
133
+ cd = editdistance.eval(pred, gt)
134
+ total_cd += cd
135
+ total_c += len(gt)
136
+ if pred == gt:
137
+ exact += 1
138
+ if cd > 0:
139
+ worst.append((gt, pred, cd))
140
+ n += 1
141
+
142
+ cer = (total_cd / total_c * 100) if total_c > 0 else 0
143
+ acc = (exact / n * 100) if n > 0 else 0
144
+ print(f" {label:<12}: CER={cer:.2f}% ExactMatch={acc:.1f}% (n={n})")
145
+
146
+ if worst:
147
+ worst = sorted(worst, key=lambda x: x[2], reverse=True)[:2]
148
+ for gt, pred, d in worst:
149
+ print(f" [{d}] '{gt}' -> '{pred}'")
150
+
151
+
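The CER reported below is total edit distance over total ground-truth characters. A quick sanity check of that formula on one hypothetical pair:

import editdistance

gt, pred = 'Juan Dela Cruz', 'Juan Dele Cruz'
cd = editdistance.eval(pred, gt)     # 1 substitution ('a' -> 'e')
print(f"{cd / len(gt) * 100:.2f}%")  # 1/14 -> 7.14%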
152
+ print("=" * 60)
153
+ print(" LIVE CER COMPARISON β€” all checkpoints")
154
+ print("=" * 60)
155
+ for label, path in CHECKPOINTS.items():
156
+ evaluate(path, label)
157
+ print("=" * 60)
158
+ print("Use the checkpoint with the lowest CER for IAM/physical fine-tuning.")
CRNN+CTC/create_test_images.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ from PIL import Image, ImageDraw, ImageFont
3
+
4
+ os.makedirs('test_images', exist_ok=True)
5
+
6
+ def load_font(size=22): # FIXED: was 20 — must match fix_data.py FONT_SIZE=22
7
+ """Same font loader as fix_data.py — tries multiple paths."""
8
+ for fp in [
9
+ 'arial.ttf', 'Arial.ttf',
10
+ '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
11
+ '/System/Library/Fonts/Helvetica.ttc',
12
+ 'C:/Windows/Fonts/arial.ttf',
13
+ ]:
14
+ try:
15
+ return ImageFont.truetype(fp, size)
16
+ except Exception:
17
+ continue
18
+ print("WARNING: Could not load Arial/DejaVu font. Using default β€” predictions may be inaccurate.")
19
+ return ImageFont.load_default()
20
+
21
+ def create_image(text, filename):
22
+ """Render text exactly the same way as fix_data.py training images."""
23
+ img = Image.new('RGB', (512, 64), color=(255, 255, 255))
24
+ draw = ImageDraw.Draw(img)
25
+ font = load_font(22)
26
+
27
+ bbox = draw.textbbox((0, 0), text, font=font)
28
+ x = max((512 - (bbox[2] - bbox[0])) // 2, 2)
29
+ y = max((64 - (bbox[3] - bbox[1])) // 2, 2)
30
+ draw.text((x, y), text, fill=(0, 0, 0), font=font)
31
+ img.save(filename)
32
+ print(f'Created: {filename}')
33
+
34
+ # ── Test samples ──────────────────────────────────────────────
35
+ create_image('Juan Dela Cruz', 'test_images/demo.jpg')
36
+ create_image('Juan Dela Cruz', 'test_images/name1.jpg')
37
+ create_image('01/15/1990', 'test_images/date1.jpg')
38
+ create_image('Tarlac City', 'test_images/place1.jpg')
39
+ create_image('Maria Santos', 'test_images/form1a_sample.jpg')
40
+
41
+ # ── Extra test cases (names, dates, addresses) ────────────────
42
+ create_image('Jose Dela Cruz Jr.', 'test_images/name2.jpg')
43
+ create_image('Ana Marie Reyes', 'test_images/name3.jpg')
44
+ create_image('03/22/1985', 'test_images/date2.jpg')
45
+ create_image('07/04/2000', 'test_images/date3.jpg')
46
+ create_image('Brgy. San Jose, Capas, Tarlac', 'test_images/place2.jpg')
47
+ create_image('78 MacArthur Hwy., Tarlac City', 'test_images/place3.jpg')
48
+
49
+ print('\nAll test images created!')
50
+ print('Font used matches training data — predictions should be accurate.')
CRNN+CTC/crnn_model.py ADDED
@@ -0,0 +1,119 @@
1
+ """
2
+ CRNN+CTC Model — simplified for small datasets (~5000-10000 samples)
3
+ ~700K parameters, converges reliably without CTC blank collapse.
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
+ class CRNN_CivilRegistry(nn.Module):
10
+
11
+ def __init__(self, img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1,
12
+ dropout=0.3):
13
+ super().__init__()
14
+
15
+ # CNN — width reductions for 512px input:
16
+ # MaxPool(2,2): 512→256, MaxPool(2,2): 256→128
17
+ # MaxPool(2,1): 128 (height only), MaxPool(2,1): 128 (height only)
18
+ # Conv(k=2,p=0): 127 → seq_len=127, fits labels up to 64 chars
19
+ self.cnn = nn.Sequential(
20
+ nn.Conv2d(1, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(inplace=True),
21
+ nn.MaxPool2d(2, 2),
22
+
23
+ nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
24
+ nn.MaxPool2d(2, 2),
25
+
26
+ nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
27
+ nn.MaxPool2d((2, 1)),
28
+
29
+ nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(inplace=True),
30
+ nn.MaxPool2d((2, 1)),
31
+
32
+ nn.Conv2d(256, 256, kernel_size=2, padding=0),
33
+ nn.BatchNorm2d(256), nn.ReLU(inplace=True),
34
+ )
35
+
36
+ # FIXED Bug 4: derive cnn_out_h from a real forward pass instead of
37
+ # a hardcoded formula — safer if architecture or img_height ever changes.
38
+ with torch.no_grad():
39
+ _dummy = torch.zeros(1, 1, img_height, 32)
40
+ _out = self.cnn(_dummy)
41
+ cnn_out_h = _out.shape[2] # actual height after all CNN layers
42
+ rnn_input = 256 * cnn_out_h
43
+
44
+ self.rnn = nn.LSTM(
45
+ input_size=rnn_input,
46
+ hidden_size=hidden_size,
47
+ num_layers=num_lstm_layers,
48
+ bidirectional=True,
49
+ batch_first=False,
50
+ )
51
+ # Dropout before FC — prevents overfitting on small datasets.
52
+ # Applied after BiLSTM output, before character projection.
53
+ # p=0.3 is standard for CRNN OCR models (disabled at inference via model.eval()).
54
+ self.dropout = nn.Dropout(p=dropout)
55
+ self.fc = nn.Linear(hidden_size * 2, num_chars)
56
+
57
+ def forward(self, x):
58
+ f = self.cnn(x)
59
+ B, C, h, w = f.size()
60
+ f = f.permute(3, 0, 1, 2).reshape(w, B, C * h)
61
+ f, _ = self.rnn(f)
62
+ return self.fc(self.dropout(f))
63
+
64
+
65
+ class CRNN_Ensemble(nn.Module):
66
+ def __init__(self, num_models=3, **kwargs):
67
+ super().__init__()
68
+ self.models = nn.ModuleList([CRNN_CivilRegistry(**kwargs) for _ in range(num_models)])
69
+
70
+ def forward(self, x):
71
+ # FIXED Rec 3: average softmax probabilities across models (correct ensemble),
72
+ # then return log of the average so CTCLoss receives log-probabilities —
73
+ # the same contract as CRNN_CivilRegistry (raw logits + log_softmax in trainer).
74
+ # Returning raw averaged probabilities caused CTCLoss to receive un-logged values.
75
+ probs = [torch.nn.functional.softmax(m(x), dim=2) for m in self.models]
76
+ avg_probs = torch.mean(torch.stack(probs), dim=0)
77
+ return torch.log(avg_probs.clamp(min=1e-9)) # log-probs, safe clamp avoids log(0)
78
+
79
+
80
+ def get_crnn_model(model_type='standard', **kwargs):
81
+ if model_type == 'ensemble':
82
+ return CRNN_Ensemble(**kwargs)
83
+ return CRNN_CivilRegistry(**kwargs)
84
+
85
+
86
+ def initialize_weights(model):
87
+ for m in model.modules():
88
+ if isinstance(m, nn.Conv2d):
89
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
90
+ if m.bias is not None:
91
+ nn.init.constant_(m.bias, 0)
92
+ elif isinstance(m, nn.BatchNorm2d):
93
+ nn.init.constant_(m.weight, 1)
94
+ nn.init.constant_(m.bias, 0)
95
+ elif isinstance(m, nn.Linear):
96
+ nn.init.normal_(m.weight, 0, 0.01)
97
+ nn.init.constant_(m.bias, 0)
98
+ elif isinstance(m, nn.LSTM):
99
+ for name, param in m.named_parameters():
100
+ if 'weight' in name:
101
+ nn.init.orthogonal_(param)
102
+ elif 'bias' in name:
103
+ nn.init.constant_(param, 0)
104
+ # Rec 1: set forget gate bias to 1.0 — helps the model
105
+ # remember across long sequences at the start of training.
106
+ # LSTM gate order: [input | forget | cell | output]
107
+ n = param.size(0)
108
+ param.data[n // 4 : n // 2].fill_(1.0)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ model = get_crnn_model('standard', img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1)
113
+ initialize_weights(model)
114
+ x = torch.randn(2, 1, 64, 512)
115
+ out = model(x)
116
+ params = sum(p.numel() for p in model.parameters())
117
+ print(f"Output: {out.shape} seq_len={out.shape[0]}")
118
+ print(f"Params: {params:,} (unchanged β€” dropout adds no parameters)")
119
+ print(f"Dropout p=0.3 active during training, disabled during model.eval()")
CRNN+CTC/dataset.py ADDED
@@ -0,0 +1,401 @@
1
+ """
2
+ dataset.py
3
+ ==========
4
+ PyTorch Dataset and DataLoader utilities for the Civil Registry OCR system.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import random
10
+ from pathlib import Path
11
+ from typing import List, Tuple, Dict, Optional
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import torch
16
+ from torch.utils.data import Dataset
17
+
18
+
19
+ # ─────────────────────────────────────────────────────────────────────────────
20
+ # CHARACTER SET
21
+ # ─────────────────────────────────────────────────────────────────────────────
22
+
23
+ PRINTABLE_CHARS = [chr(i) for i in range(32, 127)] # space (32) to ~ (126)
24
+
25
+
26
+ def build_char_maps(extra_chars: Optional[List[str]] = None):
27
+ chars = PRINTABLE_CHARS.copy()
28
+ if extra_chars:
29
+ for c in extra_chars:
30
+ if c not in chars:
31
+ chars.append(c)
32
+ char_to_idx = {c: i + 1 for i, c in enumerate(chars)}
33
+ idx_to_char = {i + 1: c for i, c in enumerate(chars)}
34
+ num_chars = len(chars) + 1 # +1 for blank=0
35
+ return char_to_idx, idx_to_char, num_chars
36
+
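Index 0 is reserved for the CTC blank, so the 95 printable ASCII characters map to 1..95 in code-point order. A few spot checks (these follow directly from the chr(32)..chr(126) ordering; run with dataset.py importable):

from dataset import build_char_maps

c2i, i2c, n = build_char_maps()
assert n == 96                    # 95 printable chars + 1 blank
assert c2i[' '] == 1              # chr(32) comes first
assert c2i['.'] == ord('.') - 31  # 15
assert c2i['0'] == ord('0') - 31  # 17
assert c2i['A'] == ord('A') - 31  # 34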
37
+
38
+ # ─────────────────────────────────────────────────────────────────────────────
39
+ # IMAGE NORMALIZER
40
+ # ─────────────────────────────────────────────────────────────────────────────
41
+
42
+ class ImageNormalizer:
43
+
44
+ def __init__(self, target_height: int = 64, target_width: int = 512):
45
+ self.H = target_height
46
+ self.W = target_width
47
+
48
+ def _to_gray(self, img):
49
+ if len(img.shape) == 3:
50
+ return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
51
+ return img.copy()
52
+
53
+ def _crop_to_text(self, gray):
54
+ inv = cv2.bitwise_not(gray)
55
+ _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
56
+ coords = np.column_stack(np.where(thresh > 0))
57
+ if len(coords) == 0:
58
+ return gray
59
+ y_min, x_min = coords.min(axis=0)
60
+ y_max, x_max = coords.max(axis=0)
61
+ pad = max(4, int((y_max - y_min) * 0.15))
62
+ y_min = max(0, y_min - pad)
63
+ x_min = max(0, x_min - pad)
64
+ y_max = min(gray.shape[0] - 1, y_max + pad)
65
+ x_max = min(gray.shape[1] - 1, x_max + pad)
66
+ return gray[y_min:y_max + 1, x_min:x_max + 1]
67
+
68
+ def _aspect_resize(self, gray):
69
+ h, w = gray.shape
70
+ if h == 0 or w == 0:
71
+ return np.ones((self.H, self.W), dtype=np.uint8) * 255
72
+ scale = self.H / h
73
+ new_w = int(w * scale)
74
+ new_h = self.H
75
+ if new_w > self.W:
76
+ scale = self.W / w
77
+ new_h = int(h * scale)
78
+ new_w = self.W
79
+ resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
80
+ canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
81
+ y_off = (self.H - new_h) // 2
82
+ x_off = (self.W - new_w) // 2
83
+ canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
84
+ return canvas
85
+
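Worked numbers for this resize, using the 80x300 grayscale input from the self-test at the bottom of the file:

H, W = 64, 512
h, w = 80, 300
scale = H / h                       # 0.8
new_w, new_h = int(w * scale), H    # 240 x 64 -> fits, no width clamp needed
x_off = (W - new_w) // 2            # 136: text block centered on the 512-wide canvas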
86
+ def _binarize(self, img):
87
+ _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
88
+ white_ratio = np.mean(otsu == 255)
89
+ if white_ratio < 0.30 or white_ratio > 0.97:
90
+ return cv2.adaptiveThreshold(
91
+ img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
92
+ cv2.THRESH_BINARY, 11, 2)
93
+ return otsu
94
+
95
+ def normalize(self, img: np.ndarray, augmenter=None) -> np.ndarray:
96
+ gray = self._to_gray(img)
97
+ # NOTE: fastNlMeansDenoising intentionally removed from training pipeline.
98
+ # It is slow (~200ms/image) and pointless on clean synthetic images.
99
+ # Denoising is only applied in check_cer.py / inference.py (AdaptiveNormalizer)
100
+ # which runs on real scanned documents where denoising actually helps.
101
+ gray = self._crop_to_text(gray)
102
+ gray = self._aspect_resize(gray)
103
+ # FIXED Bug 3: augment on grayscale BEFORE binarize.
104
+ # Brightness/contrast augmentation has zero effect on binary (0/255) pixels.
105
+ if augmenter is not None:
106
+ gray = augmenter(gray)
107
+ return self._binarize(gray)
108
+
109
+ def to_tensor(self, img: np.ndarray) -> torch.Tensor:
110
+ return torch.FloatTensor(
111
+ img.astype(np.float32) / 255.0
112
+ ).unsqueeze(0) # [1, H, W]
113
+
114
+
115
+ # ─────────────────────────────────────────────────────────────────────────────
116
+ # AUGMENTATION
117
+ # ─────────────────────────────────────────────────────────────────────────────
118
+
119
+ class Augmenter:
120
+
121
+ def __call__(self, img: np.ndarray) -> np.ndarray:
122
+ img = img.copy()
123
+
124
+ # Random slight rotation (±3°)
125
+ if random.random() < 0.3:
126
+ angle = random.uniform(-3, 3)
127
+ h, w = img.shape
128
+ M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
129
+ img = cv2.warpAffine(img, M, (w, h),
130
+ borderMode=cv2.BORDER_CONSTANT,
131
+ borderValue=255)
132
+
133
+ # Random brightness/contrast
134
+ if random.random() < 0.4:
135
+ alpha = random.uniform(0.8, 1.2)
136
+ beta = random.randint(-20, 20)
137
+ img = np.clip(alpha * img.astype(np.float32) + beta,
138
+ 0, 255).astype(np.uint8)
139
+
140
+ # Gaussian blur
141
+ if random.random() < 0.3:
142
+ ksize = random.choice([3, 5])
143
+ img = cv2.GaussianBlur(img, (ksize, ksize), 0)
144
+
145
+ # Salt-and-pepper noise
146
+ if random.random() < 0.2:
147
+ noise = np.random.randint(0, 100, img.shape)
148
+ img[noise < 2] = 0
149
+ img[noise > 97] = 255
150
+
151
+ # Random small horizontal shift
152
+ if random.random() < 0.2:
153
+ h, w = img.shape
154
+ shift = random.randint(-int(w * 0.05), int(w * 0.05))
155
+ M = np.float32([[1, 0, shift], [0, 1, 0]])
156
+ img = cv2.warpAffine(img, M, (w, h),
157
+ borderMode=cv2.BORDER_CONSTANT,
158
+ borderValue=255)
159
+
160
+ # ── NEW: Horizontal line noise ────────────────────────────────────────
161
+ # Simulates ruled form lines bleeding through behind the text.
162
+ # Civil registry forms have printed horizontal grid lines — scanners
163
+ # often pick these up as faint grey stripes across text fields.
164
+ if random.random() < 0.3:
165
+ h, w = img.shape
166
+ n_lines = random.randint(1, 3)
167
+ for _ in range(n_lines):
168
+ y = random.randint(0, h - 1)
169
+ thickness = random.choice([1, 1, 1, 2]) # mostly 1px
170
+ intensity = random.randint(160, 220) # light grey, not black
171
+ cv2.line(img, (0, y), (w, y),
172
+ color=intensity, thickness=thickness)
173
+
174
+ # ── NEW: Perspective warp ─────────────────────────────────────────────
175
+ # Simulates documents scanned or photographed at a slight angle.
176
+ # Keystone distortion is common when forms are placed unevenly on
177
+ # a flatbed scanner or photographed with a phone camera.
178
+ if random.random() < 0.25:
179
+ h, w = img.shape
180
+ d = 0.03
181
+ dx = int(w * d)
182
+ dy = int(h * d)
183
+ src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
184
+ dst = np.float32([
185
+ [random.randint(0, dx), random.randint(0, dy)],
186
+ [w - random.randint(0, dx), random.randint(0, dy)],
187
+ [w - random.randint(0, dx), h - random.randint(0, dy)],
188
+ [random.randint(0, dx), h - random.randint(0, dy)],
189
+ ])
190
+ M = cv2.getPerspectiveTransform(src, dst)
191
+ img = cv2.warpPerspective(img, M, (w, h),
192
+ borderMode=cv2.BORDER_CONSTANT,
193
+ borderValue=255)
194
+
195
+ return img
196
+
197
+
198
+ # ─────────────────────────────────────────────────────────────────────────────
199
+ # DATASET
200
+ # ─────────────────────────────────────────────────────────────────────────────
201
+
202
+ class CivilRegistryDataset(Dataset):
203
+ """
204
+ Args:
205
+ data_dir : root folder containing image subfolders (e.g. 'data/train')
206
+ annotations_file : path to JSON file with image_path + text pairs
207
+ img_height : target image height (default 64)
208
+ img_width : target image width (default 512)
209
+ augment : True = apply augmentation (training only)
210
+ form_type : 'all' or filter by form e.g. 'form1a'
211
+
212
+ Properties used by train.py:
213
+ .num_chars β†’ passed to CRNN model
214
+ .char_to_idx β†’ saved in checkpoint
215
+ .idx_to_char β†’ used for decoding predictions
216
+
217
+ __getitem__ returns:
218
+ image_tensor FloatTensor [1, H, W]
219
+ target LongTensor [label_length]
220
+ target_length int
221
+ text str (original ground truth)
222
+ """
223
+
224
+ def __init__(
225
+ self,
226
+ data_dir: str,
227
+ annotations_file: str,
228
+ img_height: int = 64,
229
+ img_width: int = 512,
230
+ augment: bool = False,
231
+ form_type: str = 'all',
232
+ seed: Optional[int] = None, # Rec 2: reproducible augmentation
233
+ ):
234
+ self.data_dir = Path(data_dir)
235
+ self.augment = augment
236
+ self.normalizer = ImageNormalizer(img_height, img_width)
237
+ self.augmenter = Augmenter()
238
+ if seed is not None: # Rec 2: seed random for reproducibility
239
+ random.seed(seed)
240
+ np.random.seed(seed)
241
+
242
+ self.char_to_idx, self.idx_to_char, self.num_chars = build_char_maps()
243
+
244
+ with open(annotations_file, 'r', encoding='utf-8') as f:
245
+ all_annotations = json.load(f)
246
+
247
+ if form_type != 'all':
248
+ all_annotations = [
249
+ a for a in all_annotations
250
+ if form_type in a.get('image_path', '')
251
+ ]
252
+
253
+ self.samples: List[Dict] = []
254
+ missing = 0
255
+ for ann in all_annotations:
256
+ img_path = self.data_dir / ann['image_path']
257
+ if img_path.exists():
258
+ text = ann['text'].strip()
259
+ if text:
260
+ self.samples.append({
261
+ 'image_path': str(img_path),
262
+ 'text': text,
263
+ })
264
+ else:
265
+ missing += 1
266
+
267
+ if missing > 0:
268
+ print(f" [Dataset] WARNING: {missing} image(s) not found and skipped.")
269
+
270
+ print(f" [Dataset] Loaded {len(self.samples)} samples "
271
+ f"from {annotations_file} (augment={augment})")
272
+
273
+ def __len__(self) -> int:
274
+ return len(self.samples)
275
+
276
+ def __getitem__(self, idx: int):
277
+ sample = self.samples[idx]
278
+ text = sample['text']
279
+
280
+ img = cv2.imread(sample['image_path'])
281
+ if img is None:
282
+ img = np.ones((64, 512, 3), dtype=np.uint8) * 255
283
+
284
+ # FIXED Bug 3: pass augmenter into normalize() so it runs on grayscale
285
+ # (before binarization), not on the binary output where it has no effect.
286
+ aug = self.augmenter if self.augment else None
287
+ normalized = self.normalizer.normalize(img, augmenter=aug)
288
+
289
+ image_tensor = self.normalizer.to_tensor(normalized) # [1, H, W]
290
+
291
+ encoded = [
292
+ self.char_to_idx[c]
293
+ for c in text
294
+ if c in self.char_to_idx
295
+ ]
296
+ if len(encoded) == 0:
297
+ encoded = [self.char_to_idx.get(' ', 1)]
298
+
299
+ target = torch.LongTensor(encoded)
300
+ target_length = len(encoded)
301
+
302
+ return image_tensor, target, target_length, text
303
+
304
+
305
+ # ─────────────────────────────────────────────────────────────────────────────
306
+ # COLLATE FUNCTION
307
+ # ─────────────────────────────────────────────────────────────────────────────
308
+
309
+ def collate_fn(batch):
310
+ """
311
+ CTC loss needs all labels packed into one flat 1D tensor.
312
+ PyTorch's default collator can't handle variable-length labels,
313
+ so this custom function packs them correctly.
314
+
315
+ Returns:
316
+ images FloatTensor [B, 1, H, W]
317
+ targets LongTensor [sum of all label lengths]
318
+ target_lengths LongTensor [B]
319
+ texts List[str]
320
+ """
321
+ images, targets, target_lengths, texts = zip(*batch)
322
+
323
+ images = torch.stack(images, dim=0)
324
+ targets = torch.cat([t for t in targets])
325
+ target_lengths = torch.LongTensor(target_lengths)
326
+
327
+ return images, targets, target_lengths, list(texts)
328
+
329
+
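For context, this flat layout is exactly what nn.CTCLoss consumes; a minimal sketch of the call with toy labels (shapes follow the model's [seq_len, B, num_chars] output):

import torch
import torch.nn as nn

t1, t2 = torch.LongTensor([1, 2, 3]), torch.LongTensor([4, 5])  # 'ABC', 'DE'
targets = torch.cat([t1, t2])                 # tensor([1, 2, 3, 4, 5]) - flat
target_lengths = torch.LongTensor([3, 2])
log_probs = torch.randn(127, 2, 96).log_softmax(2)
input_lengths = torch.full((2,), 127, dtype=torch.long)
loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)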
330
+ # ─────────────────────────────────────────────────────────────────────────────
331
+ # HELPER: CREATE ANNOTATION FILE (run once to build your JSON)
332
+ # ─────────────────────────────────────────────────────────────────────────────
333
+
334
+ def create_annotation_file(data_dir: str, output_file: str,
335
+ extensions=('.jpg', '.jpeg', '.png')):
336
+ """
337
+ Auto-generate annotations JSON by scanning data_dir.
338
+ For each image, looks for a sidecar .txt file with the same name.
339
+ If not found, uses the filename stem (underscores → spaces) as label.
340
+
341
+ Usage:
342
+ from dataset import create_annotation_file
343
+ create_annotation_file('data/train', 'data/train_annotations.json')
344
+ create_annotation_file('data/val', 'data/val_annotations.json')
345
+ """
346
+ data_path = Path(data_dir)
347
+ annotations = []
348
+
349
+ for img_path in sorted(data_path.rglob('*')):
350
+ if img_path.suffix.lower() not in extensions:
351
+ continue
352
+ txt_path = img_path.with_suffix('.txt')
353
+ if txt_path.exists():
354
+ label = txt_path.read_text(encoding='utf-8').strip()
355
+ else:
356
+ label = img_path.stem.replace('_', ' ')
357
+ if not label:
358
+ continue
359
+ rel_path = img_path.relative_to(data_path)
360
+ annotations.append({
361
+ 'image_path': str(rel_path).replace('\\', '/'),
362
+ 'text': label,
363
+ })
364
+
365
+ os.makedirs(Path(output_file).parent, exist_ok=True)
366
+ with open(output_file, 'w', encoding='utf-8') as f:
367
+ json.dump(annotations, f, indent=2, ensure_ascii=False)
368
+
369
+ print(f"βœ“ Saved {len(annotations)} entries β†’ {output_file}")
370
+ return annotations
371
+
372
+
373
+ # ─────────────────────────────────────────────────────────────────────────────
374
+ # SELF-TEST (python dataset.py)
375
+ # ─────────────────────────────────────────────────────────────────────────────
376
+
377
+ if __name__ == '__main__':
378
+ print("=" * 55)
379
+ print(" dataset.py self-test")
380
+ print("=" * 55)
381
+
382
+ c2i, i2c, n = build_char_maps()
383
+ print(f"\n Vocab size : {n} (including blank=0)")
384
+ print(f" 'A'={c2i['A']} '0'={c2i['0']} ' '={c2i[' ']} '.'={c2i['.']}")
385
+
386
+ dummy = np.ones((80, 300, 3), dtype=np.uint8) * 200
387
+ norm = ImageNormalizer(64, 512)
388
+ out = norm.normalize(dummy)
389
+ t = norm.to_tensor(out)
390
+ print(f"\n Normalizer : {dummy.shape} β†’ {out.shape} β†’ tensor {t.shape}")
391
+
392
+ fake = [
393
+ (torch.zeros(1, 64, 512), torch.LongTensor([1, 2, 3]), 3, "ABC"),
394
+ (torch.zeros(1, 64, 512), torch.LongTensor([4, 5]), 2, "DE"),
395
+ (torch.zeros(1, 64, 512), torch.LongTensor([6, 7, 8, 9]), 4, "FGHI"),
396
+ ]
397
+ imgs, tgts, tlens, txts = collate_fn(fake)
398
+ print(f"\n collate_fn : images={imgs.shape} "
399
+ f"targets={tgts.shape} lengths={tlens.tolist()}")
400
+
401
+ print("\n βœ“ All checks passed.\n")
CRNN+CTC/field_extractor.py ADDED
@@ -0,0 +1,735 @@
1
+ """
2
+ Philippine Civil Registry — Field Extractor (Dynamic)
3
+ ======================================================
4
+ Automatically detects form borders on ANY scan/photo and aligns field
5
+ extraction to the detected boundary — no hardcoded pixel positions.
6
+
7
+ Field coordinates calibrated directly from official PDF renders at 200 DPI:
8
+ Form 102 (Birth): 1700 x 2800 px
9
+ Form 103 (Death): 1700 x 2878 px
10
+ Form 97 (Marriage): 1700 x 2600 px
11
+ Form 90 (License): 1700 x 2600 px
12
+
13
+ Usage:
14
+ python field_extractor.py --pdf FORM_102.pdf --form birth
15
+ python field_extractor.py --pdf FORM_97.pdf --form marriage --visualize
16
+ python field_extractor.py --pdf FORM_103.pdf --form death --output results.json
17
+ python field_extractor.py --image form102.png --form birth --visualize
18
+ python field_extractor.py --pdf FORM_102.pdf --form birth --checkpoint checkpoints/best_model_emnist.pth
19
+
20
+ .env file (project root) — each team member sets their own:
21
+ POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
22
+ """
23
+
24
+ import argparse
25
+ import os
26
+ import sys
27
+ import json
28
+ import cv2
29
+ import numpy as np
30
+ from pathlib import Path
31
+
32
+ import torch
33
+ from dotenv import load_dotenv
34
+
35
+ # Load .env from same folder as this script (works regardless of cwd)
36
+ _script_dir = Path(__file__).parent.resolve()
37
+ load_dotenv(dotenv_path=_script_dir / ".env")
38
+
39
+ # Poppler path — from .env or None (Linux/Mac auto-detects)
40
+ POPPLER_PATH = os.environ.get("POPPLER_PATH", None)
41
+ DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"
42
+
43
+
44
+ # ══════════════════════════════════════════════════════════════════════════════
45
+ # FIELD RATIO MAPS
46
+ # Format: field_name: (x1, y1, x2, y2) — ratios 0.0–1.0
47
+ # Coordinates are relative to the DETECTED FORM BOUNDARY (not full image).
48
+ # x = left→right, y = top→bottom
49
+ # ══════════════════════════════════════════════════════════════════════════════
50
+
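A worked example of how DynamicFieldExtractor.extract() turns these ratios into pixel boxes (the boundary numbers here are made up for illustration):

rx1, ry1, rx2, ry2 = 0.02, 0.068, 0.30, 0.088   # the 'province' box below
left, top, form_w, form_h = 40, 60, 1620, 2680  # hypothetical detected boundary
x1, y1 = int(left + rx1 * form_w), int(top + ry1 * form_h)  # (72, 242)
x2, y2 = int(left + rx2 * form_w), int(top + ry2 * form_h)  # (526, 295)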
51
+ # Form 102 → Certificate of Live Birth (Form 1A)
52
+ BIRTH_FIELDS = {
53
+ # Header
54
+ "province": (0.02, 0.068, 0.30, 0.088),
55
+ "registry_number": (0.66, 0.068, 0.99, 0.108),
56
+ "city_municipality": (0.02, 0.090, 0.65, 0.108),
57
+
58
+ # Item 1 — Child Name
59
+ "child_first_name": (0.03, 0.109, 0.40, 0.141),
60
+ "child_middle_name": (0.40, 0.109, 0.64, 0.141),
61
+ "child_last_name": (0.64, 0.109, 0.99, 0.141),
62
+
63
+ # Items 2-3 — Sex / Date of Birth
64
+ "sex": (0.03, 0.142, 0.30, 0.167),
65
+ "dob_day": (0.40, 0.142, 0.60, 0.167), # FIXED: x2 was 0.80, overlapping dob_month
66
+ "dob_month": (0.60, 0.142, 0.80, 0.167), # FIXED: x-range was inverted (0.80 to 0.60)
67
+ "dob_year": (0.80, 0.142, 0.99, 0.167),
68
+
69
+ # Item 4 — Place of Birth
70
+ "place_birth_hospital": (0.03, 0.169, 0.46, 0.197),
71
+ "place_birth_city": (0.47, 0.169, 0.70, 0.199),
72
+ "place_birth_province": (0.71, 0.169, 0.99, 0.199),
73
+
74
+
75
+
76
+ # Mother section
77
+ "mother_first_name": (0.03, 0.248, 0.40, 0.276),
78
+ "mother_middle_name": (0.40, 0.248, 0.64, 0.276),
79
+ "mother_last_name": (0.64, 0.248, 0.99, 0.276),
80
+ "mother_citizenship": (0.03, 0.277, 0.50, 0.305),
81
+
82
+
83
+ # Father section
84
+ "father_first_name": (0.03, 0.380, 0.40, 0.410),
85
+ "father_middle_name": (0.40, 0.380, 0.64, 0.410),
86
+ "father_last_name": (0.64, 0.380, 0.99, 0.410),
87
+ "father_citizenship": (0.03, 0.411, 0.28, 0.445),
88
+
89
+
90
+ # Item 20 — Marriage of Parents
91
+ "parents_marriage_month": (0.03, 0.496, 0.19, 0.526),
92
+ "parents_marriage_day": (0.19, 0.496, 0.27, 0.526),
93
+ "parents_marriage_year": (0.27, 0.496, 0.38, 0.526),
94
+
95
+ "parents_marriage_city": (0.41, 0.496, 0.68, 0.526),
96
+ "parents_marriage_province": (0.68, 0.496, 0.84, 0.526),
97
+
98
+
99
+ }
100
+
101
+ # Form 103 → Certificate of Death (Form 2A)
102
+ DEATH_FIELDS = {
103
+ # Header
104
+ "province": (0.04, 0.128, 0.40, 0.144),
105
+ "registry_number": (0.52, 0.128, 0.75, 0.144),
106
+ "city_municipality": (0.04, 0.145, 0.45, 0.160),
107
+
108
+ # Item 1 — Name
109
+ "deceased_first_name": (0.10, 0.162, 0.34, 0.178),
110
+ "deceased_middle_name": (0.34, 0.162, 0.56, 0.178),
111
+ "deceased_last_name": (0.56, 0.162, 0.75, 0.178),
112
+
113
+ # Items 2-4 — Sex / Religion / Age
114
+ "sex": (0.04, 0.182, 0.13, 0.220),
115
+ "age_years": (0.28, 0.182, 0.38, 0.202),
116
+
117
+ # Item 5 — Place of Death
118
+ "place_death_hospital": (0.13, 0.224, 0.42, 0.242),
119
+ "place_death_city": (0.42, 0.224, 0.58, 0.242),
120
+ "place_death_province": (0.58, 0.224, 0.75, 0.242),
121
+
122
+ # Items 6-7 — Date of Death / Citizenship
123
+ "dod_day": (0.10, 0.252, 0.22, 0.268),
124
+ "dod_month": (0.22, 0.252, 0.38, 0.268),
125
+ "dod_year": (0.38, 0.252, 0.52, 0.268),
126
+ "citizenship": (0.52, 0.252, 0.75, 0.268),
127
+
128
+ # Item 8 — Residence
129
+ "residence_house": (0.13, 0.278, 0.40, 0.294),
130
+ "residence_city": (0.40, 0.278, 0.56, 0.294),
131
+ "residence_province": (0.56, 0.278, 0.75, 0.294),
132
+
133
+ # Items 9-10 — Civil Status / Occupation
134
+ "civil_status": (0.04, 0.302, 0.38, 0.360),
135
+ "occupation": (0.44, 0.302, 0.75, 0.360),
136
+
137
+ # Item 17 — Causes of Death
138
+ "cause_immediate": (0.18, 0.402, 0.58, 0.418),
139
+ "cause_antecedent": (0.18, 0.424, 0.58, 0.440),
140
+ "cause_underlying": (0.18, 0.446, 0.58, 0.462),
141
+ "cause_other": (0.18, 0.468, 0.58, 0.484),
142
+
143
+ # Item 25 — Informant
144
+ "informant_name": (0.04, 0.808, 0.35, 0.822),
145
+ "informant_address": (0.04, 0.822, 0.35, 0.836),
146
+ "informant_date": (0.35, 0.836, 0.58, 0.850),
147
+ }
148
+
149
+ # Form 97 → Certificate of Marriage (Form 3A)
150
+ # Only the fields that flow through bridge.py → spaCy NER → SpouseOutput/Form3A.
151
+ # Removed: province, city_municipality, dob_day/month/year (×2),
152
+ # place_birth_city/prov/country (×2), sex (×2), residence (×2),
153
+ # religion (×2), civil_status (×2).
154
+ MARRIAGE_FIELDS = {
155
+ # ── Header ───────────────────────────────────────────────────────────────
156
+ "registry_number": (0.62, 0.088, 0.97, 0.104), # β†’ Form3A.registry_number
157
+
160
+ # ── Item 1 — Name (HUSBAND left / WIFE right) ────────────────────────────
161
+ "husband_first_name": (0.23, 0.121, 0.56, 0.139),
162
+ "husband_middle_name": (0.23, 0.141, 0.56, 0.159),
163
+ "husband_last_name": (0.23, 0.160, 0.56, 0.178),
164
+ "wife_first_name": (0.65, 0.121, 0.98, 0.139),
165
+ "wife_middle_name": (0.65, 0.141, 0.98, 0.159),
166
+ "wife_last_name": (0.65, 0.160, 0.98, 0.178),
167
+
168
+ # "husband_first_name": (0.14, 0.138, 0.47, 0.156),
169
+ # "husband_middle_name": (0.14, 0.156, 0.47, 0.174),
170
+ # "husband_last_name": (0.14, 0.174, 0.47, 0.192),
171
+ # "wife_first_name": (0.53, 0.138, 0.86, 0.156),
172
+ # "wife_middle_name": (0.53, 0.156, 0.86, 0.174),
173
+ # "wife_last_name": (0.53, 0.174, 0.86, 0.192),
174
+
175
+ # ── Item 2b — Age ────────────────────────────────────────────────────────
176
+ "husband_age": (0.40, 0.198, 0.47, 0.216), # → husband.age
177
+ "wife_age": (0.78, 0.198, 0.86, 0.216), # → wife.age
178
+
179
+ # ── Item 4b — Citizenship ────────────────────────────────────────────────
180
+ "husband_citizenship": (0.22, 0.252, 0.47, 0.270), # → husband.nationality
181
+ "wife_citizenship": (0.62, 0.252, 0.86, 0.270), # → wife.nationality
182
+
183
+ # ── Item 8 — Name of Father ──────────────────────────────────────────────
184
+ "husband_father_first": (0.14, 0.396, 0.24, 0.414),
185
+ "husband_father_middle": (0.24, 0.396, 0.34, 0.414),
186
+ "husband_father_last": (0.34, 0.396, 0.47, 0.414),
187
+ "wife_father_first": (0.53, 0.396, 0.63, 0.414),
188
+ "wife_father_middle": (0.63, 0.396, 0.73, 0.414),
189
+ "wife_father_last": (0.73, 0.396, 0.86, 0.414),
190
+
191
+ # ── Item 9 — Citizenship of Father ──────────────────────────────────────
192
+ "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436), # → husband.nationality_of_father
193
+ "wife_father_citizenship": (0.53, 0.420, 0.86, 0.436), # → wife.nationality_of_father
194
+
195
+ # ── Item 10 — Name of Mother ─────────────────────────────────────────────
196
+ "husband_mother_first": (0.14, 0.444, 0.24, 0.462),
197
+ "husband_mother_middle": (0.24, 0.444, 0.34, 0.462),
198
+ "husband_mother_last": (0.34, 0.444, 0.47, 0.462),
199
+ "wife_mother_first": (0.53, 0.444, 0.63, 0.462),
200
+ "wife_mother_middle": (0.63, 0.444, 0.73, 0.462),
201
+ "wife_mother_last": (0.73, 0.444, 0.86, 0.462),
202
+
203
+ # ── Item 11 — Citizenship of Mother ─────────────────────────────────────
204
+ "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484), # → husband.nationality_of_mother
205
+ "wife_mother_citizenship": (0.53, 0.468, 0.86, 0.484), # → wife.nationality_of_mother
206
+
207
+ # ── Items 15–16 — Place / Date of Marriage ───────────────────────────────
208
+ "place_marriage_office": (0.14, 0.596, 0.44, 0.614),
209
+ "place_marriage_city": (0.44, 0.596, 0.68, 0.614),
210
+ "place_marriage_province": (0.68, 0.596, 0.88, 0.614),
211
+ "date_marriage_day": (0.14, 0.630, 0.24, 0.648),
212
+ "date_marriage_month": (0.24, 0.630, 0.38, 0.648),
213
+ "date_marriage_year": (0.38, 0.630, 0.48, 0.648),
214
+ }
215
+
216
+ # Form 90 → Application for Marriage License
217
+ MARRIAGE_LICENSE_FIELDS = {
218
+ # Header
219
+ "province": (0.12, 0.092, 0.48, 0.108),
220
+ "registry_number": (0.56, 0.092, 0.97, 0.108),
221
+ "city_municipality": (0.12, 0.108, 0.48, 0.124),
222
+ "received_by": (0.12, 0.124, 0.48, 0.140),
223
+ "date_of_receipt": (0.12, 0.140, 0.48, 0.156),
224
+ "marriage_license_number": (0.56, 0.124, 0.97, 0.140),
225
+ "date_of_issuance": (0.56, 0.140, 0.97, 0.156),
226
+
227
+ # Item 1 — Name of Applicant (GROOM left / BRIDE right)
228
+ "groom_first_name": (0.02, 0.278, 0.46, 0.294),
229
+ "bride_first_name": (0.54, 0.278, 0.97, 0.294),
230
+ "groom_middle_name": (0.02, 0.296, 0.46, 0.312),
231
+ "bride_middle_name": (0.54, 0.296, 0.97, 0.312),
232
+ "groom_last_name": (0.02, 0.314, 0.46, 0.330),
233
+ "bride_last_name": (0.54, 0.314, 0.97, 0.330),
234
+
235
+ # Item 2 — Date of Birth / Age
236
+ "groom_dob_day": (0.02, 0.334, 0.12, 0.350),
237
+ "groom_dob_month": (0.12, 0.334, 0.24, 0.350),
238
+ "groom_dob_year": (0.24, 0.334, 0.34, 0.350),
239
+ "groom_age": (0.34, 0.334, 0.46, 0.350),
240
+ "bride_dob_day": (0.54, 0.334, 0.62, 0.350),
241
+ "bride_dob_month": (0.62, 0.334, 0.74, 0.350),
242
+ "bride_dob_year": (0.74, 0.334, 0.84, 0.350),
243
+ "bride_age": (0.84, 0.334, 0.97, 0.350),
244
+
245
+ # Item 3 — Place of Birth
246
+ "groom_place_birth_city": (0.02, 0.354, 0.18, 0.370),
247
+ "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
248
+ "groom_place_birth_country": (0.32, 0.354, 0.46, 0.370),
249
+ "bride_place_birth_city": (0.54, 0.354, 0.70, 0.370),
250
+ "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
251
+ "bride_place_birth_country": (0.84, 0.354, 0.97, 0.370),
252
+
253
+ # Item 4 — Sex / Citizenship
254
+ "groom_sex": (0.02, 0.374, 0.16, 0.390),
255
+ "groom_citizenship": (0.16, 0.374, 0.46, 0.390),
256
+ "bride_sex": (0.54, 0.374, 0.68, 0.390),
257
+ "bride_citizenship": (0.68, 0.374, 0.97, 0.390),
258
+
259
+ # Item 5 — Residence
260
+ "groom_residence": (0.02, 0.394, 0.46, 0.412),
261
+ "bride_residence": (0.54, 0.394, 0.97, 0.412),
262
+
263
+ # Item 6 — Religion
264
+ "groom_religion": (0.02, 0.424, 0.46, 0.440),
265
+ "bride_religion": (0.54, 0.424, 0.97, 0.440),
266
+
267
+ # Item 7 — Civil Status
268
+ "groom_civil_status": (0.02, 0.452, 0.46, 0.468),
269
+ "bride_civil_status": (0.54, 0.452, 0.97, 0.468),
270
+
271
+ # Item 9 — Place where dissolved
272
+ "groom_dissolution_city": (0.02, 0.496, 0.16, 0.512),
273
+ "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
274
+ "groom_dissolution_country": (0.30, 0.496, 0.46, 0.512),
275
+ "bride_dissolution_city": (0.54, 0.496, 0.68, 0.512),
276
+ "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
277
+ "bride_dissolution_country": (0.82, 0.496, 0.97, 0.512),
278
+
279
+ # Item 10 — Date when dissolved
280
+ "groom_dissolution_day": (0.02, 0.520, 0.12, 0.536),
281
+ "groom_dissolution_month": (0.12, 0.520, 0.24, 0.536),
282
+ "groom_dissolution_year": (0.24, 0.520, 0.34, 0.536),
283
+ "bride_dissolution_day": (0.54, 0.520, 0.62, 0.536),
284
+ "bride_dissolution_month": (0.62, 0.520, 0.74, 0.536),
285
+ "bride_dissolution_year": (0.74, 0.520, 0.84, 0.536),
286
+
287
+ # Item 12 — Father Name
288
+ "groom_father_first": (0.02, 0.594, 0.16, 0.610),
289
+ "groom_father_middle": (0.16, 0.594, 0.28, 0.610),
290
+ "groom_father_last": (0.28, 0.594, 0.46, 0.610),
291
+ "bride_father_first": (0.54, 0.594, 0.66, 0.610),
292
+ "bride_father_middle": (0.66, 0.594, 0.78, 0.610),
293
+ "bride_father_last": (0.78, 0.594, 0.97, 0.610),
294
+
295
+ # Item 13 — Father Citizenship
296
+ "groom_father_citizenship": (0.02, 0.620, 0.46, 0.636),
297
+ "bride_father_citizenship": (0.54, 0.620, 0.97, 0.636),
298
+
299
+ # Item 14 — Father Residence
300
+ "groom_father_residence": (0.02, 0.644, 0.46, 0.660),
301
+ "bride_father_residence": (0.54, 0.644, 0.97, 0.660),
302
+
303
+ # Item 15 — Mother Name
304
+ "groom_mother_first": (0.02, 0.674, 0.16, 0.690),
305
+ "groom_mother_middle": (0.16, 0.674, 0.28, 0.690),
306
+ "groom_mother_last": (0.28, 0.674, 0.46, 0.690),
307
+ "bride_mother_first": (0.54, 0.674, 0.66, 0.690),
308
+ "bride_mother_middle": (0.66, 0.674, 0.78, 0.690),
309
+ "bride_mother_last": (0.78, 0.674, 0.97, 0.690),
310
+
311
+ # Item 16 — Mother Citizenship
312
+ "groom_mother_citizenship": (0.02, 0.696, 0.46, 0.712),
313
+ "bride_mother_citizenship": (0.54, 0.696, 0.97, 0.712),
314
+
315
+ # Item 17 — Mother Residence
316
+ "groom_mother_residence": (0.02, 0.720, 0.46, 0.736),
317
+ "bride_mother_residence": (0.54, 0.720, 0.97, 0.736),
318
+ }
319
+
320
+ FORM_FIELDS = {
321
+ "birth": BIRTH_FIELDS,
322
+ "death": DEATH_FIELDS,
323
+ "marriage": MARRIAGE_FIELDS,
324
+ "marriage_license": MARRIAGE_LICENSE_FIELDS,
325
+ }
326
+
327
+ COLOURS = [
328
+ (0,200,0),(0,150,255),(200,0,200),(0,200,200),(200,200,0),(220,20,60),
329
+ (255,140,0),(150,50,200),(0,160,80),(30,144,255),(255,20,147),(100,200,100),
330
+ ]
331
+
332
+
333
+ # ══════════════════════════════════════════════════════════════════════════════
334
+ # FORM BOUNDS DETECTOR
335
+ # Finds the outer border of a civil registry form using line detection.
336
+ # Falls back to full image if detection fails.
337
+ # ══════════════════════════════════════════════════════════════════════════════
338
+
339
+ class FormBoundsDetector:
340
+ def __init__(self, verbose=False):
341
+ self.verbose = verbose
342
+
343
+ def detect(self, image_bgr):
344
+ h, w = image_bgr.shape[:2]
345
+ gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
346
+ bounds = self._detect_by_lines(gray, w, h)
347
+ if bounds is None:
348
+ if self.verbose:
349
+ print(" [Bounds] Line detection failed β€” using full image")
350
+ return (0, 0, w, h)
351
+ if self.verbose:
352
+ print(f" [Bounds] Detected: {bounds}")
353
+ return bounds
354
+
355
+ def _detect_by_lines(self, gray, w, h):
356
+ try:
357
+ thresh = cv2.adaptiveThreshold(
358
+ gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
359
+ cv2.THRESH_BINARY_INV, 11, 2)
360
+ hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
361
+ h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
362
+ h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
363
+ vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
364
+ v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
365
+ v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
366
+ if len(h_rows) == 0 or len(v_cols) == 0:
367
+ return None
368
+ top, bottom = int(h_rows.min()), int(h_rows.max())
369
+ left, right = int(v_cols.min()), int(v_cols.max())
370
+ if (right - left) < w * 0.4 or (bottom - top) < h * 0.4:
371
+ return None
372
+ return (left, top, right, bottom)
373
+ except Exception as e:
374
+ if self.verbose:
375
+ print(f" [Bounds error] {e}")
376
+ return None
377
+
378
+
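Typical standalone use of the detector (the scan path below is a placeholder, not a file in this repo):

import cv2
from field_extractor import FormBoundsDetector

page = cv2.imread('scans/form102_sample.png')             # placeholder path
left, top, right, bottom = FormBoundsDetector(verbose=True).detect(page)
form_only = page[top:bottom, left:right]                  # crop to printed border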
379
+ # ══════════════════════════════════════════════════════════════════════════════
380
+ # DYNAMIC FIELD EXTRACTOR
381
+ # Crops each field region relative to the detected form boundary.
382
+ # Works on any image size, DPI, scan margin, or slight rotation.
383
+ # ══════════════════════════════════════════════════════════════════════════════
384
+
385
+ class DynamicFieldExtractor:
386
+ def __init__(self, form_type="birth", verbose=False):
387
+ self.form_type = form_type.lower()
388
+ self.field_map = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
389
+ self.detector = FormBoundsDetector(verbose=verbose)
390
+ self.verbose = verbose
391
+ self._last_bounds = None
392
+
393
+ def _to_bgr(self, image):
394
+ try:
395
+ from PIL import Image as PILImage
396
+ if isinstance(image, PILImage.Image):
397
+ arr = np.array(image.convert("RGB"))
398
+ return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
399
+ except ImportError:
400
+ pass
401
+ if isinstance(image, np.ndarray):
402
+ if len(image.shape) == 2:
403
+ return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
404
+ if image.shape[2] == 4:
405
+ return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
406
+ return image
407
+ raise TypeError(f"Unsupported image type: {type(image)}")
408
+
409
+ def extract(self, image):
410
+ """Returns {field_name: BGR numpy array}."""
411
+ image = self._to_bgr(image)
412
+ h, w = image.shape[:2]
413
+ left, top, right, bottom = self.detector.detect(image)
414
+ self._last_bounds = (left, top, right, bottom)
415
+ form_w = right - left
416
+ form_h = bottom - top
417
+ if self.verbose:
418
+ print(f" [Extract] Image={w}x{h} "
419
+ f" Form={form_w}x{form_h} @ ({left},{top})-({right},{bottom})")
420
+ crops = {}
421
+ for name, (rx1, ry1, rx2, ry2) in self.field_map.items():
422
+ x1 = max(0, min(int(left + rx1 * form_w), w - 1))
423
+ y1 = max(0, min(int(top + ry1 * form_h), h - 1))
424
+ x2 = max(0, min(int(left + rx2 * form_w), w - 1))
425
+ y2 = max(0, min(int(top + ry2 * form_h), h - 1))
426
+ if x2 > x1 and y2 > y1:
427
+ crops[name] = image[y1:y2, x1:x2]
428
+ return crops
429
+
430
+ def visualize(self, image, output_path=None):
431
+ """Draw detected boundary + field boxes. Returns annotated BGR image."""
432
+ image = self._to_bgr(image)
433
+ vis = image.copy()
434
+ h, w = vis.shape[:2]
435
+ self.extract(image)
436
+ left, top, right, bottom = self._last_bounds
437
+ form_w = right - left
438
+ form_h = bottom - top
439
+ cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 3)
440
+ cv2.putText(vis, "DETECTED FORM BOUNDARY",
441
+ (left, max(0, top - 8)),
442
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 140, 255), 1)
443
+ for idx, (name, (rx1, ry1, rx2, ry2)) in enumerate(self.field_map.items()):
444
+ x1 = max(0, min(int(left + rx1 * form_w), w - 1))
445
+ y1 = max(0, min(int(top + ry1 * form_h), h - 1))
446
+ x2 = max(0, min(int(left + rx2 * form_w), w - 1))
447
+ y2 = max(0, min(int(top + ry2 * form_h), h - 1))
448
+ c = COLOURS[idx % len(COLOURS)]
449
+ cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
450
+ cv2.putText(vis, name[:22], (x1 + 2, max(0, y1 - 2)),
451
+ cv2.FONT_HERSHEY_SIMPLEX, 0.28, c, 1)
452
+ if output_path:
453
+ cv2.imwrite(str(output_path), vis)
454
+ print(f" Field map saved -> {output_path}")
455
+ return vis
456
+
457
+
458
+ # ══════════════════════════════════════════════════════════════════════════════
459
+ # FIELD NORMALIZER — prepares a BGR crop for CRNN inference
460
+ # ══════════════════════════════════════════════════════════════════════════════
461
+
462
+ class FieldNormalizer:
463
+ def __init__(self, target_height=64, target_width=512):
464
+ self.H = target_height
465
+ self.W = target_width
466
+
467
+ def _crop_to_text(self, gray):
468
+ inv = cv2.bitwise_not(gray)
469
+ _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
470
+ coords = np.column_stack(np.where(thresh > 0))
471
+ if len(coords) == 0:
472
+ return gray
473
+ y_min, x_min = coords.min(axis=0)
474
+ y_max, x_max = coords.max(axis=0)
475
+ pad = max(4, int((y_max - y_min) * 0.15))
476
+ y_min = max(0, y_min - pad)
477
+ x_min = max(0, x_min - pad)
478
+ y_max = min(gray.shape[0] - 1, y_max + pad)
479
+ x_max = min(gray.shape[1] - 1, x_max + pad)
480
+ return gray[y_min:y_max + 1, x_min:x_max + 1]
481
+
482
+ def _smart_resize(self, gray):
483
+ h, w = gray.shape
484
+ if h == 0 or w == 0:
485
+ return np.ones((self.H, self.W), dtype=np.uint8) * 255
486
+ scale = self.H / h
487
+ new_w = int(w * scale)
488
+ new_h = self.H
489
+ if new_w > self.W:
490
+ scale = self.W / w
491
+ new_h = int(h * scale)
492
+ new_w = self.W
493
+ resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
494
+ canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
495
+ y_off = (self.H - new_h) // 2
496
+ x_off = (self.W - new_w) // 2
497
+ canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
498
+ return canvas
499
+
500
+ def _binarize(self, img):
501
+ _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
502
+ white_ratio = np.mean(otsu == 255)
503
+ if white_ratio < 0.30 or white_ratio > 0.97:
504
+ return cv2.adaptiveThreshold(
505
+ img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
506
+ cv2.THRESH_BINARY, 11, 2)
507
+ return otsu
508
+
509
+ def normalize(self, crop) -> np.ndarray:
510
+ """Accept BGR numpy array or PIL image, return normalized binary array."""
511
+ try:
512
+ from PIL import Image as PILImage
513
+ if isinstance(crop, PILImage.Image):
514
+ crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
515
+ except ImportError:
516
+ pass
517
+ gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
518
+ gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
519
+ gray = self._crop_to_text(gray)
520
+ gray = self._smart_resize(gray)
521
+ return self._binarize(gray)
522
+
523
+ def to_tensor(self, img: np.ndarray) -> torch.Tensor:
524
+ return torch.FloatTensor(
525
+ img.astype(np.float32) / 255.0
526
+ ).unsqueeze(0).unsqueeze(0)
527
+
528
+
529
+ # ══════════════════════════════════════════════════════════════════════════════
530
+ # CRNN MODEL LOADER
531
+ # ══════════════════════════════════════════════════════════════════════════════
532
+
533
+ def load_crnn_model(checkpoint_path: str, device: torch.device):
534
+ sys.path.insert(0, str(Path(__file__).parent))
535
+ from crnn_model import get_crnn_model
536
+
537
+ print(f" Loading CRNN model from: {checkpoint_path}")
538
+ c = torch.load(checkpoint_path, map_location=device, weights_only=False)
539
+ config = c.get("config", {})
540
+ idx_to_char = c["idx_to_char"]
541
+ num_chars = c["model_state_dict"]["fc.weight"].shape[0]
542
+
543
+ model = get_crnn_model(
544
+ model_type=config.get("model_type", "standard"),
545
+ img_height=config.get("img_height", 64),
546
+ num_chars=num_chars,
547
+ hidden_size=config.get("hidden_size", 128),
548
+ num_lstm_layers=config.get("num_lstm_layers", 1),
549
+ ).to(device)
550
+ model.load_state_dict(c["model_state_dict"])
551
+ model.eval()
552
+
553
+ val_cer = c.get("val_cer", None)
554
+ val_loss = c.get("val_loss", None)
555
+ metric = f"val_cer={val_cer:.2f}%" if val_cer else \
556
+ f"val_loss={val_loss:.4f}" if val_loss else "no metric"
557
+ print(f" Model loaded | {metric} | chars={num_chars}")
558
+ return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)
559
+
560
+
561
+ # ══════════════════════════════════════════════════════════════════════════════
562
+ # GREEDY CTC DECODE
563
+ # ══════════════════════════════════════════════════════════════════════════════
564
+
565
+ def greedy_decode(outputs: torch.Tensor, idx_to_char: dict) -> str:
566
+ pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
567
+ chars, prev = [], -1
568
+ for idx in pred_indices[0]:
569
+ idx = idx.item()
570
+ if idx != 0 and idx != prev and idx in idx_to_char:
571
+ chars.append(idx_to_char[idx])
572
+ prev = idx
573
+ return "".join(chars)
574
+
575
+
576
+ # ══════════════════════════════════════════════════════════════════════════════
577
+ # PDF β†’ PIL IMAGE
578
+ # ══════════════════════════════════════════════════════════════════════════════
579
+
580
+ def pdf_to_image(pdf_path: str, dpi: int = 200):
581
+ from pdf2image import convert_from_path
582
+ # Resolve to absolute path — fixes "Unable to get page count" on Windows
583
+ pdf_path = str(Path(pdf_path).resolve())
584
+ kwargs = {"dpi": dpi, "first_page": 1, "last_page": 1}
585
+ if POPPLER_PATH:
586
+ kwargs["poppler_path"] = str(Path(POPPLER_PATH).resolve())
587
+ return convert_from_path(pdf_path, **kwargs)[0]
588
+
589
+
590
+ # ══════════════════════════════════════════════════════════════════════════════
591
+ # CRNN OCR — runs on extracted field crops
592
+ # ══════════════════════════════════════════════════════════════════════════════
593
+
594
+ def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
595
+ img_h: int, img_w: int, device: torch.device) -> dict:
596
+ normalizer = FieldNormalizer(target_height=img_h, target_width=img_w)
597
+ results = {}
598
+ with torch.no_grad():
599
+ for name, crop in crops.items():
600
+ try:
601
+ norm = normalizer.normalize(crop)
602
+ tensor = normalizer.to_tensor(norm).to(device)
603
+ text = greedy_decode(model(tensor).cpu(), idx_to_char)
604
+ results[name] = text
605
+ except Exception as e:
606
+ results[name] = f"[ERROR: {e}]"
607
+ return results
608
+
609
+
610
+ # ══════════════════════════════════════════════════════════════════════════════
611
+ # CONVENIENCE WRAPPER — for other scripts that import this module
612
+ # ══════════════════════════════════════════════════════════════════════════════
613
+
614
+ def extract_field_images(image, form_type="birth", verbose=False):
615
+ """Extract field crops using dynamic boundary detection.
616
+
617
+ Parameters
618
+ ----------
619
+ image : PIL Image or BGR numpy array
620
+ form_type : str 'birth' | 'death' | 'marriage' | 'marriage_license'
621
+ verbose : bool
622
+
623
+ Returns
624
+ -------
625
+ dict {field_name: BGR numpy array}
626
+ """
627
+ return DynamicFieldExtractor(form_type=form_type, verbose=verbose).extract(image)
628
+
629
+
630
+ # Keep old name as alias so any existing code doesn't break
631
+ extract_field_images_dynamic = extract_field_images
632
+
633
+
634
+ # ══════════════════════════════════════════════════════════════════════════════
635
+ # MAIN
636
+ # ══════════════════════════════════════════════════════════════════════════════
637
+
638
+ def main():
639
+ parser = argparse.ArgumentParser(
640
+ description="PH Civil Registry Field Extractor β€” Dynamic CRNN OCR")
641
+ group = parser.add_mutually_exclusive_group(required=True)
642
+ group.add_argument("--pdf", help="Path to scanned PDF")
643
+ group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
644
+ parser.add_argument("--form", required=True,
645
+ choices=["birth", "death", "marriage", "marriage_license"])
646
+ parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
647
+ parser.add_argument("--visualize", action="store_true",
648
+ help="Save annotated field-map image")
649
+ parser.add_argument("--output", default=None,
650
+ help="Save extracted fields to JSON")
651
+ parser.add_argument("--poppler", default=None,
652
+ help="Override Poppler bin path (overrides .env)")
653
+ parser.add_argument("--dpi", type=int, default=200)
654
+ parser.add_argument("--verbose", action="store_true")
655
+ args = parser.parse_args()
656
+
657
+ global POPPLER_PATH
658
+ if args.poppler:
659
+ POPPLER_PATH = args.poppler
660
+
661
+ form_labels = {
662
+ "birth": "Form 102 β€” Certificate of Live Birth",
663
+ "death": "Form 103 β€” Certificate of Death",
664
+ "marriage": "Form 97 β€” Certificate of Marriage",
665
+ "marriage_license": "Form 90 β€” Application for Marriage License",
666
+ }
667
+ input_file = args.pdf or args.image
668
+
669
+ print("\nPhilippine Civil Registry OCR β€” Dynamic Field Extractor")
670
+ print("=" * 65)
671
+ print(f" Form : {form_labels[args.form]}")
672
+ print(f" File : {input_file}")
673
+ print(f" Checkpoint : {args.checkpoint}")
674
+
675
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
676
+ print(f" Device : {device}\n")
677
+
678
+ if not os.path.exists(args.checkpoint):
679
+ print(f"ERROR: Checkpoint not found: {args.checkpoint}")
680
+ sys.exit(1)
681
+
682
+ model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)
683
+
684
+ # Load image
685
+ if args.pdf:
686
+ print(f" Converting PDF to image at {args.dpi} DPI...")
687
+ try:
688
+ pil_img = pdf_to_image(args.pdf, dpi=args.dpi)
689
+ page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
690
+ except Exception as e:
691
+ print(f"\nERROR converting PDF: {e}")
692
+ print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
693
+ sys.exit(1)
694
+ else:
695
+ page_image = cv2.imread(args.image)
696
+ if page_image is None:
697
+ print(f"ERROR: Could not load image: {args.image}")
698
+ sys.exit(1)
699
+
700
+ h, w = page_image.shape[:2]
701
+ print(f" Page size : {w} x {h} px")
702
+
703
+ extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)
704
+
705
+ if args.visualize:
706
+ stem = Path(input_file).stem
707
+ out_path = stem + "_field_map.jpg"
708
+ extractor.visualize(page_image, output_path=out_path)
709
+ print(f" Field map saved -> {out_path}")
710
+
711
+ print(f"\n Detecting form boundary and extracting fields...")
712
+ crops = extractor.extract(page_image)
713
+ print(f" {len(crops)} field crops extracted")
714
+
715
+ print(f"\n Running CRNN OCR on {len(crops)} fields...")
716
+ results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)
717
+
718
+ print(f"\n{'─'*65}")
719
+ print(f" {'FIELD':<42} TEXT")
720
+ print(f"{'─'*65}")
721
+ for name, text in results.items():
722
+ print(f" {name:<42} {text if text.strip() else '(empty)'}")
723
+ print(f"{'─'*65}")
724
+ print(f"\n Fields recognized : {sum(1 for t in results.values() if t.strip())} / {len(results)}")
725
+
726
+ if args.output:
727
+ with open(args.output, "w", encoding="utf-8") as f:
728
+ json.dump({"form": form_labels[args.form], "file": input_file,
729
+ "fields": results}, f, ensure_ascii=False, indent=2)
730
+ print(f"\n Results saved -> {args.output}")
731
+ print()
732
+
733
+
734
+ if __name__ == "__main__":
735
+ main()
CRNN+CTC/finetune.py ADDED
@@ -0,0 +1,202 @@
+"""
+finetune.py
+===========
+Fine-tune CRNN+CTC on generated civil registry form crops.
+
+Loads best_model_final.pth (pretrained), then continues training on
+actual_annotations.json + train_annotations.json.
+
+Usage:
+    python finetune.py
+
+Output:
+    checkpoints/best_model_v2.pth
+"""
+
+import os
+import sys
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, ConcatDataset
+
+sys.path.append('.')
+from crnn_model import get_crnn_model
+from dataset import CivilRegistryDataset, collate_fn
+
+# ── Config ────────────────────────────────────────────────────
+CHECKPOINT_IN  = "checkpoints/best_model_final.pth"
+CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"
+
+ACTUAL_ANN = "data/actual_annotations.json"   # real scanned forms
+SYNTH_ANN  = "data/train_annotations.json"    # synthetic / train split
+VAL_ANN    = "data/val_annotations.json"      # validation set
+
+IMG_HEIGHT = 64
+IMG_WIDTH  = 512
+BATCH_SIZE = 32
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# ── Phase settings ────────────────────────────────────────────
+PHASES = [
+    # (name, epochs, lr, freeze_cnn, patience)
+    ("Phase 1 - CNN frozen, adapt to form crops", 20, 1e-4, True, 5),
+    ("Phase 2 - Full model, low LR polish",       15, 1e-5, False, 4),
+]
+
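+# A further polishing pass can be appended in the same tuple format; the
+# values below are illustrative, not tuned:
+#   ("Phase 3 - optional extra polish", 10, 5e-6, False, 3),
+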
+# ── Main ──────────────────────────────────────────────────────
+def main():
+    print("=" * 60)
+    print(" Fine-tuning CRNN+CTC on civil registry form crops")
+    print("=" * 60)
+    print(f" Device     : {DEVICE}")
+    print(f" Checkpoint : {CHECKPOINT_IN}")
+
+    # ── Check required files ──────────────────────────────────
+    for f in [CHECKPOINT_IN, VAL_ANN]:
+        if not os.path.exists(f):
+            print(f"ERROR: {f} not found.")
+            sys.exit(1)
+
+    # ── Datasets ──────────────────────────────────────────────
+    datasets_to_merge = []
+
+    # 1. Actual scanned forms (highest priority - real data)
+    if os.path.exists(ACTUAL_ANN):
+        actual_dataset = CivilRegistryDataset(
+            data_dir=".", annotations_file=ACTUAL_ANN,
+            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+        )
+        datasets_to_merge.append(actual_dataset)
+        print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
+    else:
+        print(f" [!] {ACTUAL_ANN} not found - run extract_actual_data.py first")
+
+    # 2. Fully synthetic - kept so the model doesn't forget basic characters
+    if os.path.exists(SYNTH_ANN):
+        synth_dataset = CivilRegistryDataset(
+            data_dir="data/train", annotations_file=SYNTH_ANN,
+            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+        )
+        datasets_to_merge.append(synth_dataset)
+        print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
+
+    if not datasets_to_merge:
+        print("ERROR: No training data found. Run extract_actual_data.py first.")
+        sys.exit(1)
+
+    val_dataset = CivilRegistryDataset(
+        data_dir="data/val", annotations_file=VAL_ANN,
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
+    )
+
+    train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
+    print(f" Total train : {len(train_dataset)}")
+    print(f" Val         : {len(val_dataset)}")
+
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
+                              shuffle=True, num_workers=0, collate_fn=collate_fn)
+    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
+                            shuffle=False, num_workers=0, collate_fn=collate_fn)
+
+    # ── Load checkpoint ───────────────────────────────────────
+    print(f"\n Loading {CHECKPOINT_IN}...")
+    ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
+    config = ckpt.get('config', {})
+
+    ref_dataset = datasets_to_merge[0]
+    model = get_crnn_model(
+        model_type      = config.get('model_type', 'standard'),
+        img_height      = config.get('img_height', 64),
+        num_chars       = ref_dataset.num_chars,
+        hidden_size     = config.get('hidden_size', 128),
+        num_lstm_layers = config.get('num_lstm_layers', 1),
+    ).to(DEVICE)
+
+    missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+    if missing:
+        print(f" Note: {len(missing)} layers re-initialized (expected if vocab size changed)")
+    print(f" Loaded epoch {ckpt.get('epoch','?')} "
+          f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")
+
+    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+    os.makedirs("checkpoints", exist_ok=True)
+
+    # ── Train/val loop ────────────────────────────────────────
+    def run_epoch(loader, training, optimizer=None):
+        model.train() if training else model.eval()
+        total, n = 0, 0
+        ctx = torch.enable_grad() if training else torch.no_grad()
+        with ctx:
+            for images, targets, target_lengths, _ in loader:
+                images = images.to(DEVICE)
+                batch_size = images.size(0)
+                if training:
+                    optimizer.zero_grad()
+                outputs = F.log_softmax(model(images), dim=2)
+                seq_len = outputs.size(0)
+                input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+                loss = criterion(outputs, targets, input_lengths, target_lengths)
+                if not torch.isnan(loss) and not torch.isinf(loss):
+                    if training:
+                        loss.backward()
+                        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                        optimizer.step()
+                    total += loss.item()
+                    n += 1
+        return total / max(n, 1)
+
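+    # Design note: one closure drives both training and validation - pass an
+    # optimizer with training=True to update weights, or training=False (the
+    # optimizer argument is then unused) for a no-grad evaluation pass.
+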
+    best_overall = float('inf')
+
+    for phase_name, epochs, lr, freeze_cnn, patience in PHASES:
+        print(f"\n{'='*60}")
+        print(f" {phase_name}  LR={lr}")
+        print(f"{'='*60}")
+
+        for name, param in model.named_parameters():
+            param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f" Trainable params : {trainable:,}")
+
+        opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+        sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, factor=0.5)
+        best = float('inf')
+        wait = 0
+
+        for epoch in range(1, epochs + 1):
+            tr = run_epoch(train_loader, True, opt)
+            vl = run_epoch(val_loader, False, None)
+            sched.step(vl)
+
+            if vl < best:
+                best = vl
+                wait = 0
+                saved_tag = ""
+                if vl < best_overall:
+                    best_overall = vl
+                    torch.save({
+                        'model_state_dict': model.state_dict(),
+                        'config': config,
+                        'char_to_idx': ref_dataset.char_to_idx,
+                        'idx_to_char': ref_dataset.idx_to_char,
+                        'epoch': epoch,
+                        'val_loss': vl,
+                    }, CHECKPOINT_OUT)
+                    saved_tag = "  <- saved"
+                # FIXED: only flag "<- saved" when the file was actually written
+                print(f" Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}{saved_tag}")
+            else:
+                wait += 1
+                print(f" Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}  (patience {wait}/{patience})")
+                if wait >= patience:
+                    print(" Early stopping.")
+                    break
+
+    print(f"\n{'='*60}")
+    print(" Fine-tuning complete!")
+    print(f" Best val loss : {best_overall:.4f}")
+    print(f" Saved         : {CHECKPOINT_OUT}")
+    print(f"{'='*60}")
+
+
+if __name__ == '__main__':
+    main()
CRNN+CTC/generate_ph_names.py ADDED
@@ -0,0 +1,350 @@
+"""
+generate_ph_names.py
+====================
+Run this file ONCE to extract Filipino names from the
+names-dataset library and save them to data/ph_names.json.
+
+Install first:
+    pip install names-dataset
+
+Usage:
+    python generate_ph_names.py
+
+Output:
+    data/ph_names.json   <-- used by fix_data.py every run
+"""
+
+import json
+import os
+
+print("=" * 60)
+print(" Filipino Name Extractor | names-dataset (PyPI)")
+print("=" * 60)
+
+# ── Step 1: Load NameDataset ──────────────────────────────────
+print("\n[1/5] Loading NameDataset...")
+print("      (This takes 30-60 seconds and needs ~3.2 GB RAM)")
+
+try:
+    from names_dataset import NameDataset
+    nd = NameDataset()
+    print(" OK - Dataset loaded!")
+except ImportError:
+    print("\n ERROR: names-dataset is not installed.")
+    print(" Fix: pip install names-dataset")
+    exit(1)
+except MemoryError:
+    print("\n ERROR: Not enough RAM. Need ~3.2 GB free.")
+    exit(1)
+
+# ── Step 2: Extract Filipino FIRST names ─────────────────────
+print("\n[2/5] Extracting Filipino first names (Male + Female)...")
+
+ph_male   = nd.get_top_names(n=300, gender='Male', country_alpha2='PH')
+ph_female = nd.get_top_names(n=300, gender='Female', country_alpha2='PH')
+
+# API returns: { 'PH': { 'M': [...] } }
+male_first   = ph_male.get('PH', {}).get('M', [])
+female_first = ph_female.get('PH', {}).get('F', [])
+all_first    = male_first + female_first
+
+print(f" Male first names   : {len(male_first)}")
+print(f" Female first names : {len(female_first)}")
+print(f" Total first names  : {len(all_first)}")
+print(f" Sample (male)      : {male_first[:5]}")
+print(f" Sample (female)    : {female_first[:5]}")
+
+# ── Step 3: Extract Filipino LAST names ──────────────────────
+print("\n[3/5] Extracting Filipino last names...")
+
+ph_last_raw = nd.get_top_names(n=300, country_alpha2='PH', use_first_names=False)
+print(f" Raw last name API type : {type(ph_last_raw)}")
+
+ph_last_ph = ph_last_raw.get('PH', {})
+print(f" PH entry type          : {type(ph_last_ph)}")
+
+raw_last = []
+
+if isinstance(ph_last_ph, list):
+    raw_last = ph_last_ph
+elif isinstance(ph_last_ph, dict):
+    first_val = next(iter(ph_last_ph.values()), None)
+    if isinstance(first_val, list):
+        for lst in ph_last_ph.values():
+            raw_last.extend(lst)
+    elif isinstance(first_val, dict):
+        raw_last = list(ph_last_ph.keys())
+    else:
+        raw_last = list(ph_last_ph.keys())
+
+# Deduplicate while preserving order
+seen = set()
+all_last = []
+for name in raw_last:
+    if isinstance(name, str) and name not in seen:
+        seen.add(name)
+        all_last.append(name)
+
+print(f" Total last names : {len(all_last)}")
+print(f" Sample           : {all_last[:5]}")
+
+if len(all_last) == 0:
+    print("\n WARNING: Could not extract last names from API.")
+    print(" Using common Filipino last names as fallback...")
+    all_last = [
+        'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo',
+        'Garcia', 'Mendoza', 'Torres', 'Flores', 'Aquino',
+        'Dela Cruz', 'Del Rosario', 'San Jose', 'De Guzman',
+        'Villanueva', 'Gonzales', 'Ramos', 'Diaz', 'Castro',
+        'Morales', 'Ortega', 'Gutierrez', 'Lopez', 'Ramirez',
+        'Navarro', 'Aguilar', 'Espinosa', 'Mercado', 'Tolentino',
+        'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong', 'Co',
+        'Macaraeg', 'Macapagal', 'Magsaysay', 'Magno',
+        'Pascual', 'Buenaventura', 'Concepcion', 'Resurreccion',
+        'Ilagan', 'Manalo', 'Soriano', 'Evangelista', 'Salazar',
+    ]
+    print(f" Fallback last names: {len(all_last)}")
+
+# ── Step 4: Build MIDDLE names pool ──────────────────────────
+# Middle names in the Filipino naming convention are the mother's
+# maiden surname. We build a large pool by combining:
+#   A) The last names pool already extracted (primary source)
+#   B) A curated extended list of common Filipino surnames
+#      used specifically as middle names
+print("\n[4/5] Building middle names pool...")
+
+EXTENDED_MIDDLE_NAMES = [
+    # Common Filipino surnames used as middle names
+    'Abad', 'Abaya', 'Abella', 'Ablaza', 'Abrera',
+    'Acosta', 'Adriano', 'Afable', 'Africa', 'Agcaoili',
+    'Agno', 'Agpalo', 'Aguinaldo', 'Agustin', 'Ahorro',
+    'Alano', 'Alba', 'Albano', 'Alberto', 'Alcantara',
+    'Alcazar', 'Alcon', 'Aldana', 'Alegre', 'Alejandro',
+    'Aligaen', 'Alim', 'Alinea', 'Alipio', 'Almario',
+    'Almeda', 'Almendras', 'Alminiana', 'Almodiel', 'Alonto',
+    'Alvarado', 'Alvarez', 'Amante', 'Amaro', 'Ambrocio',
+    'Amor', 'Amores', 'Amparo', 'Anastacio', 'Andal',
+    'Andaya', 'Angeles', 'Angsioco', 'Antiporda', 'Antonio',
+    'Apalisok', 'Apolinario', 'Apostol', 'Aquino', 'Araneta',
+    'Aranas', 'Aranda', 'Arceo', 'Arenas', 'Arias',
+    'Ariate', 'Arillo', 'Arimado', 'Arjona', 'Arlante',
+    'Arnaldo', 'Arnaiz', 'Arnoco', 'Arocena', 'Arroyo',
+    'Asejo', 'Asuncion', 'Austria', 'Avecilla', 'Avena',
+    'Avila', 'Avinante', 'Ayala', 'Azucena', 'Azul',
+    'Bacani', 'Bacunawa', 'Baguio', 'Bagunu', 'Balagtas',
+    'Balangue', 'Balbin', 'Balde', 'Baldeo', 'Balgos',
+    'Balili', 'Balinas', 'Balitaan', 'Balladares', 'Ballesteros',
+    'Balmeo', 'Balmores', 'Banaag', 'Bandola',
+    'Bangayan', 'Bansil', 'Bansode', 'Bantigue', 'Bantug',
+    'Barbin', 'Barcenas', 'Bareng', 'Barrion', 'Barroga',
+    'Bartolome', 'Bases', 'Batac', 'Bataller', 'Batanes',
+    'Batungbakal', 'Bautista', 'Bayani', 'Bayot', 'Baysic',
+    'Belarmino', 'Beldia', 'Belen', 'Belgica', 'Bello',
+    'Benavides', 'Bendaña', 'Benedicto', 'Benigno', 'Benitez',
+    'Bernardino', 'Bernardo', 'Bernarte', 'Besares', 'Billones',
+    'Binay', 'Binayas', 'Biscocho', 'Blanco', 'Bondoc',
+    'Borja', 'Borromeo', 'Bravo', 'Buenaobra', 'Buenaflor',
+    'Buenafe', 'Buenaseda', 'Buenconsejo', 'Buendia', 'Bugarin',
+    'Bulalacao', 'Bulatao', 'Bumanlag', 'Bunag',
+    'Caballero', 'Cabigting', 'Cabral', 'Cabreros', 'Cacal',
+    'Cagampan', 'Cagas', 'Caguioa', 'Cahilig', 'Cajucom',
+    'Calagos', 'Calamba', 'Calasanz', 'Calatrava', 'Calderon',
+    'Calimag', 'Calimutan', 'Calinawan', 'Calleja', 'Callejo',
+    'Caluag', 'Calugay', 'Camacho', 'Camino', 'Campaner',
+    'Camposano', 'Candelario', 'Canete', 'Caning', 'Canlas',
+    'Caoile', 'Capili', 'Carandang', 'Carbonell', 'Cariaga',
+    'Carino', 'Carunungan', 'Casaje', 'Casas', 'Casidsid',
+    'Castañeda', 'Castillo', 'Catalan', 'Catapang',
+    'Cayabyab', 'Cayco', 'Celdran', 'Cerillo', 'Cervantes',
+    'Chico', 'Chikiamco', 'Chiongbian', 'Cipriano', 'Clarin',
+    'Claudio', 'Clavecillas', 'Climaco', 'Cobankiat', 'Colambo',
+    'Collado', 'Comafay', 'Comia', 'Concepcion', 'Condino',
+    'Consing', 'Contraras', 'Coquia', 'Cordero', 'Corotan',
+    'Corpus', 'Cosico', 'Costales', 'Crisostomo', 'Cristobal',
+    'Cueto', 'Culala', 'Cunanan', 'Cunanon', 'Curato',
+    'Dadivas', 'Daep', 'Daez', 'Daguplo', 'Dalida',
+    'Dalisay', 'Dalmacion', 'Dalusong', 'Damasco', 'Damo',
+    'Danao', 'Dancel', 'Dandan', 'Danila', 'Daquigan',
+    'Dario', 'Datoc', 'Datumanong', 'David', 'Dayao',
+    'Dayrit', 'De Borja', 'De Castro', 'De Jesus', 'De Jose',
+    'De La Cruz', 'De La Pena', 'De La Rosa', 'De Leon', 'De Lima',
+    'De Los Angeles', 'De Los Reyes', 'De Los Santos', 'De Luna', 'De Mesa',
+    'De Ocampo', 'De Paz', 'De Vera', 'De Villa', 'Delos Reyes',
+    'Demaisip', 'Delos Santos', 'Demillo', 'Demonteverde', 'Denosta',
+    'Derequito', 'Deri', 'Detablan', 'Deveraturda', 'Diaz',
+    'Dichoso', 'Diego', 'Diesto', 'Dimaano', 'Dimabuyu',
+    'Dimagiba', 'Dimaguila', 'Dimaio', 'Dimanlig', 'Dimayuga',
+    'Dingal', 'Dinglasan', 'Dionisio', 'Dioquino', 'Ditan',
+    'Diwata', 'Domingo', 'Dominguez', 'Donato', 'Dorado',
+    'Doria', 'Duallo', 'Duenas', 'Duerme', 'Dulay',
+    'Dumalaog', 'Dumpit', 'Duque', 'Duran', 'Durante',
+    'Ebdane', 'Echavez', 'Echevarria', 'Edralin', 'Ejercito',
+    'Elago', 'Elazegui', 'Elises', 'Elumba', 'Enage',
+    'Encarnacion', 'Enriquez', 'Escobar', 'Escueta', 'Escutin',
+    'Esguerra', 'Eslit', 'Espejo', 'Espeleta', 'Espinas',
+    'Espino', 'Espiritu', 'Estepa', 'Esteves', 'Estrada',
+    'Estrellas', 'Evangelista', 'Evasco', 'Evidente', 'Eyas',
+    'Fabella', 'Fabros', 'Faelnar', 'Fajardo', 'Fajutag',
+    'Famadico', 'Famador', 'Faustino', 'Favila', 'Feliciano',
+    'Felipe', 'Fermin', 'Fernandez', 'Fernando', 'Ferrer',
+    'Figueras', 'Fider', 'Florendo', 'Florentino', 'Floreta',
+    'Flores', 'Florido', 'Floriza', 'Foja', 'Fonacier',
+    'Fontanilla', 'Formoso', 'Fornier', 'Fortich', 'Fortuna',
+    'Francisco', 'Frano', 'Frasco', 'Frias', 'Fuentes',
+    'Gaabucayan', 'Gabutero', 'Gaerlan', 'Gaffud', 'Galapon',
+    'Galera', 'Galicia', 'Galindez', 'Gallardo', 'Gallo',
+    'Galvez', 'Gamalinda', 'Gamboa', 'Gammad', 'Gandionco',
+    'Ganzon', 'Garado', 'Garayblas', 'Garcia', 'Garduce',
+    'Garrido', 'Gatdula', 'Gatmaitan', 'Gatus', 'Gawat',
+    'Gelera', 'Gelua', 'Gemora', 'Genato', 'Generoso',
+    'Gequillana', 'Gerona', 'Gerundio', 'Gianan', 'Gimenez',
+    'Gloria', 'Glorioso', 'Glova', 'Golez', 'Gomez',
+    'Gonzaga', 'Gonzales', 'Gordoncillo', 'Gorre', 'Grafilo',
+    'Gregorio', 'Griño', 'Guanzon', 'Guerrero', 'Guevara',
+    'Guiao', 'Guillen', 'Guinto', 'Guison', 'Gullas',
+    'Gutierrez', 'Guzman', 'Hernandez', 'Herrera', 'Hizon',
+    'Honasan', 'Hontiveros', 'Horca', 'Hufana', 'Humilde',
+    'Ibañez', 'Ignacio', 'Ilustre', 'Imbong', 'Imperial',
+    'Infante', 'Inion', 'Inocentes', 'Inso', 'Iringan',
+    'Jacinto', 'Javier', 'Jimenez', 'Jose', 'Joson',
+    'Juan', 'Juico', 'Jurado', 'Kabigting', 'Kalaw',
+    'Kho', 'Lacaba', 'Lacadin', 'Lacson', 'Ladesma',
+    'Laderas', 'Lagman', 'Lagua', 'Laguna', 'Lainez',
+    'Lajarca', 'Lamayo', 'Lambino', 'Lapid', 'Lapuz',
+    'Lara', 'Largo', 'Lariza', 'Larizal', 'Laserna',
+    'Latorre', 'Laurel', 'Laurente', 'Lazaro', 'Leano',
+    'Legarda', 'Leonor', 'Leynes', 'Libunao', 'Licup',
+    'Lim', 'Limkaichong', 'Limpag', 'Liwanag', 'Llanes',
+    'Llamado', 'Llaneta', 'Locsin', 'Logarta', 'Lopez',
+    'Lorenzo', 'Lorilla', 'Lozada', 'Lucero', 'Luistro',
+    'Luna', 'Luneta', 'Luzon', 'Macalintal', 'Macam',
+    'Maceda', 'Madera', 'Madrazo', 'Magtanggol', 'Malabanan',
+    'Malacaman', 'Malajacan', 'Malanyaon', 'Malaya', 'Malbas',
+    'Malcampo', 'Maldia', 'Maligalig', 'Malinao', 'Malonzo',
+    'Mangahas', 'Mangubat', 'Manigbas', 'Manila', 'Manlangit',
+    'Manlapaz', 'Manlongat', 'Manrique', 'Mansalay', 'Mante',
+    'Manuel', 'Manzano', 'Marcelo', 'Marcos', 'Mariano',
+    'Maristela', 'Marquez', 'Maravilla', 'Masangkay', 'Masapol',
+    'Mateo', 'Matienzo', 'Matining', 'Matugas', 'Maula',
+    'Maulion', 'Mayuga', 'Medina', 'Mejia', 'Melchor',
+    'Melo', 'Menor', 'Mercado', 'Mesina', 'Miguel',
+    'Miralles', 'Miranda', 'Molano', 'Molina', 'Mondejar',
+    'Monreal', 'Montano', 'Montenegro', 'Montero', 'Montes',
+    'Montesa', 'Montoya', 'Moraga', 'Moraleda', 'Moreno',
+    'Morial', 'Muncal', 'Muñoz', 'Murillo', 'Musni',
+    'Nacion', 'Nadal', 'Nagrampa', 'Nalzaro', 'Napeñas',
+    'Narciso', 'Natividad', 'Navales', 'Navarro', 'Neri',
+    'Nicolas', 'Nisperos', 'Nolasco', 'Noynay', 'Nuñez',
+    'Oaminal', 'Ocampo', 'Ocfemia', 'Ochoa', 'Olaguera',
+    'Olano', 'Oliva', 'Olivares', 'Oliveros', 'Olpindo',
+    'Omadto', 'Ombion', 'Onate', 'Ong', 'Orbeta',
+    'Orbita', 'Ordoño', 'Orendain', 'Orense', 'Orobia',
+    'Orozco', 'Ortega', 'Osmeña', 'Osorio', 'Ostrea',
+    'Ouano', 'Pabiton', 'Pableo', 'Pabriaga', 'Pacanan',
+    'Padayao', 'Padilla', 'Padua', 'Paguio', 'Pagulayan',
+    'Palad', 'Palacios', 'Palafox', 'Palaganas', 'Palattao',
+    'Palencia', 'Palma', 'Palo', 'Paloma', 'Palomares',
+    'Pamaran', 'Pamintuan', 'Panaligan', 'Panganiban', 'Pangilinan',
+    'Panopio', 'Papa', 'Paqueo', 'Paras', 'Paredes',
+    'Parreño', 'Pascua', 'Pascual', 'Pastor', 'Paterno',
+    'Patron', 'Pavia', 'Pecaña', 'Pecho', 'Pedrosa',
+    'Pelayo', 'Peña', 'Peñaflor', 'Peñaranda', 'Penarroyo',
+    'Peralta', 'Perez', 'Perlas', 'Pernia', 'Pesquera',
+    'Pestano', 'Piccio', 'Picardal', 'Pineda', 'Pimentel',
+    'Pilapil', 'Pili', 'Piliin', 'Pillar', 'Pilorin',
+    'Poblete', 'Poliquit', 'Ponce', 'Ponferrada', 'Porras',
+    'Prado', 'Prieto', 'Prodigalidad', 'Prudente', 'Punsalan',
+    'Quezon', 'Quiambao', 'Quiaoit', 'Quijano', 'Quimpo',
+    'Quinit', 'Quinones', 'Quiogue', 'Quirino', 'Quisao',
+    'Racelis', 'Rada', 'Ramirez', 'Ramon', 'Ramos',
+    'Ravalo', 'Rayala', 'Razon', 'Recinto', 'Recometa',
+    'Reforma', 'Regalado', 'Reganit', 'Regio', 'Regidor',
+    'Regis', 'Reodica', 'Respicio', 'Revilla', 'Reyes',
+    'Ricafort', 'Ricalde', 'Ridad', 'Rillo', 'Rivera',
+    'Rivero', 'Rizal', 'Robles', 'Roca', 'Rocamora',
+    'Rocero', 'Rodriguez', 'Rojas', 'Romero', 'Ronquillo',
+    'Rosales', 'Rosario', 'Rosete', 'Rotor', 'Roxas',
+    'Rubio', 'Rufino', 'Ruiz', 'Sabal', 'Sabando',
+    'Sabido', 'Sabijon', 'Sabio', 'Saceda', 'Saclolo',
+    'Sagum', 'Salceda', 'Salcedo', 'Salgado', 'Salinas',
+    'Saludar', 'Saluta', 'Salvador', 'Sambrano', 'Samson',
+    'Sanchez', 'Sandoval', 'Sangalang', 'Santiago', 'Santillan',
+    'Sanz', 'Sarino', 'Sarmiento', 'Sarona', 'Savellano',
+    'Sebastian', 'Segovia', 'Sendin', 'Seneres', 'Serafica',
+    'Sereno', 'Senga', 'Serrano', 'Sierra', 'Sigua',
+    'Silva', 'Silvestre', 'Simon', 'Sinco', 'Singson',
+    'Siy', 'Sobejana', 'Soberano', 'Socrates', 'Soliman',
+    'Solis', 'Soliven', 'Solomon', 'Sotto', 'Suansing',
+    'Suarez', 'Subido', 'Sulit', 'Sultan', 'Sumagaysay',
+    'Sunga', 'Tabamo', 'Tabinas', 'Tabuena', 'Tagle',
+    'Taguba', 'Tajonera', 'Talabong', 'Talavera', 'Talento',
+    'Taleon', 'Talosig', 'Tamano', 'Tambalo', 'Tanada',
+    'Tandoc', 'Tañada', 'Tarriela', 'Tating', 'Tautho',
+    'Tayag', 'Tayco', 'Tecson', 'Tejano', 'Tejero',
+    'Teodoro', 'Tibay', 'Tigas', 'Tiglao', 'Timbol',
+    'Tingzon', 'Tiongco', 'Tiongson', 'Tirol', 'Tobias',
+    'Toledo', 'Tolentino', 'Tomelden', 'Tomas', 'Tomaro',
+    'Tomaroy', 'Torino', 'Torralba', 'Torrente', 'Torno',
+    'Trea', 'Trinidad', 'Tuazon', 'Tubig', 'Tubigan',
+    'Tugade', 'Tumbocon', 'Tupas', 'Tuquero', 'Turla',
+    'Umagat', 'Umali', 'Usman', 'Uson', 'Uy',
+    'Valdez', 'Valencia', 'Valenciano', 'Valentin', 'Valera',
+    'Valiao', 'Varela', 'Vargas', 'Vasquez', 'Velarde',
+    'Velasco', 'Velasquez', 'Velez', 'Vera', 'Vergara',
+    'Vibandor', 'Vicente', 'Victorino', 'Vidal', 'Viernes',
+    'Villacorta', 'Villaflor', 'Villafranca', 'Villagomez', 'Villagonzalo',
+    'Villanueva', 'Villar', 'Villareal', 'Villaruel', 'Villaverde',
+    'Villena', 'Virata', 'Vista', 'Vivar', 'Vizconde',
+    'Yabes', 'Yap', 'Yasay', 'Yatco', 'Ylagan',
+    'Yñiguez', 'Yorac', 'Yulo', 'Zabala', 'Zaldivar',
+    'Zamora', 'Zapanta', 'Zaragoza', 'Zosa', 'Zulueta',
+]
+
+# Combine last names pool + extended middle names, deduplicated
+middle_seen = set()
+all_middle = []
+for name in (all_last + EXTENDED_MIDDLE_NAMES):
+    if isinstance(name, str) and name not in middle_seen:
+        middle_seen.add(name)
+        all_middle.append(name)
+
+print(f" Total middle names : {len(all_middle)}")
+print(f" Sample             : {all_middle[:5]}")
+
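+# Hedged sketch of how a consumer such as fix_data.py might assemble one
+# 3-part name from these pools (fix_data.py itself is not shown here):
+#   import random
+#   full_name = f"{random.choice(all_first)} {random.choice(all_middle)} {random.choice(all_last)}"
+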
+# ── Step 5: Save to JSON ──────────────────────────────────────
+print("\n[5/5] Saving to data/ph_names.json ...")
+
+os.makedirs('data', exist_ok=True)
+
+output = {
+    "first_names": {
+        "male": male_first,
+        "female": female_first,
+        "all": all_first
+    },
+    "last_names": all_last,
+    "middle_names": all_middle,
+    "metadata": {
+        "source": "names-dataset (PyPI) -- country_alpha2='PH'",
+        "total_first": len(all_first),
+        "total_last": len(all_last),
+        "total_middle": len(all_middle),
+        "total_name_combos": len(all_first) * len(all_middle) * len(all_last),
+    }
+}
+
+with open('data/ph_names.json', 'w', encoding='utf-8') as f:
+    json.dump(output, f, indent=2, ensure_ascii=False)
+
+# ── Summary ───────────────────────────────────────────────────
+print("\n" + "=" * 60)
+print(" DONE!")
+print("=" * 60)
+print(f" Male first names   : {len(male_first)}")
+print(f" Female first names : {len(female_first)}")
+print(f" Last names         : {len(all_last)}")
+print(f" Middle names       : {len(all_middle)}")
+print(f" Possible 3-part name combos : {len(all_first) * len(all_middle) * len(all_last):,}")
+print("\n Saved to: data/ph_names.json")
+print("\n Next step: python fix_data.py")
+print("=" * 60)
CRNN+CTC/inference.py ADDED
@@ -0,0 +1,395 @@
+"""
+Inference Script for CRNN+CTC Civil Registry OCR
+
+TWO NORMALIZERS:
+    SimpleNormalizer   - for PIL-rendered synthetic images (matches training exactly)
+    AdaptiveNormalizer - for physical/scanned images (any zoom, any size)
+
+AUTO-DETECT MODE: automatically decides which pipeline to use based on
+text density in the image - zoomed-in images get adaptive treatment,
+clean synthetic images get simple treatment.
+"""
+
+import torch
+import cv2
+import numpy as np
+from pathlib import Path
+from typing import Dict, List
+
+from crnn_model import get_crnn_model
+from utils import decode_ctc_predictions, extract_form_fields
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# HELPERS
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _to_gray(img: np.ndarray) -> np.ndarray:
+    if len(img.shape) == 3:
+        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    return img.copy()
+
+
+def _binarize(gray: np.ndarray) -> np.ndarray:
+    """Otsu thresholding; falls back to adaptive for uneven backgrounds."""
+    _, otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    white_ratio = np.mean(otsu == 255)
+    if white_ratio < 0.30 or white_ratio > 0.97:
+        return cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 11, 2)
+    return otsu
+
+
+def _crop_to_text(gray: np.ndarray, pad_ratio=0.15) -> np.ndarray:
+    """Crop tightly around dark pixels (the text)."""
+    inv = cv2.bitwise_not(gray)
+    _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+    coords = np.column_stack(np.where(thresh > 0))
+    if len(coords) == 0:
+        return gray
+    y_min, x_min = coords.min(axis=0)
+    y_max, x_max = coords.max(axis=0)
+    pad = max(4, int((y_max - y_min) * pad_ratio))
+    y_min = max(0, y_min - pad)
+    x_min = max(0, x_min - pad)
+    y_max = min(gray.shape[0] - 1, y_max + pad)
+    x_max = min(gray.shape[1] - 1, x_max + pad)
+    return gray[y_min:y_max+1, x_min:x_max+1]
+
+
+def _aspect_resize(gray: np.ndarray, H: int, W: int) -> np.ndarray:
+    """Resize preserving aspect ratio, pad with white to fill the canvas."""
+    h, w = gray.shape
+    if h == 0 or w == 0:
+        return np.ones((H, W), dtype=np.uint8) * 255
+    scale = H / h
+    new_w = int(w * scale)
+    new_h = H
+    if new_w > W:
+        scale = W / w
+        new_h = int(h * scale)
+        new_w = W
+    resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    canvas = np.ones((H, W), dtype=np.uint8) * 255
+    y_off = (H - new_h) // 2
+    x_off = (W - new_w) // 2
+    canvas[y_off:y_off+new_h, x_off:x_off+new_w] = resized
+    return canvas
+
+
+def _detect_mode(gray: np.ndarray) -> str:
+    """
+    Auto-detect whether an image needs adaptive or simple normalization.
+
+    Logic:
+      - If >25% of pixels are dark, the text is very large/zoomed → adaptive.
+      - If the image size is far from the training size (512x64) → adaptive.
+      - Otherwise → simple (matches the training pipeline).
+    """
+    h, w = gray.shape
+    _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
+    dark_px = np.mean(bw == 0)
+
+    # Text fills too much of the image → zoomed in (like shane.jpg)
+    if dark_px > 0.25:
+        return 'adaptive'
+
+    # Image is far from the expected training size (allow 50% tolerance)
+    if not (256 <= w <= 1024 and 32 <= h <= 128):
+        return 'adaptive'
+
+    return 'simple'
+
+
+def _to_tensor(img: np.ndarray) -> torch.Tensor:
+    return torch.FloatTensor(
+        img.astype(np.float32) / 255.0
+    ).unsqueeze(0).unsqueeze(0)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# SIMPLE NORMALIZER  ← for PIL-rendered / training-matched images
+# ─────────────────────────────────────────────────────────────────────────────
+
+class SimpleNormalizer:
+    """
+    Matches the fix_data.py training pipeline exactly:
+        grayscale → resize → binarize
+    Best for test images created by create_test_images.py.
+    """
+    def __init__(self, H=64, W=512):
+        self.H, self.W = H, W
+
+    def normalize(self, img: np.ndarray) -> np.ndarray:
+        gray = _to_gray(img)
+        resized = cv2.resize(gray, (self.W, self.H), interpolation=cv2.INTER_LANCZOS4)
+        return _binarize(resized)
+
+    def normalize_from_path(self, path: str) -> np.ndarray:
+        img = cv2.imread(str(path))
+        if img is None:
+            raise ValueError(f"Cannot load: {path}")
+        return self.normalize(img)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# ADAPTIVE NORMALIZER  ← for real / physical / scanned images
+# ─────────────────────────────────────────────────────────────────────────────
+
+class AdaptiveNormalizer:
+    """
+    For physical documents or images with non-standard zoom/size:
+        grayscale → denoise → crop text → aspect-ratio resize → binarize
+
+    Crops to the actual text first, so a zoomed-in image like shane.jpg
+    gets scaled down to training size instead of being squeezed/stretched.
+    """
+    def __init__(self, H=64, W=512):
+        self.H, self.W = H, W
+
+    def normalize(self, img: np.ndarray) -> np.ndarray:
+        gray = _to_gray(img)
+        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+        gray = _crop_to_text(gray)
+        canvas = _aspect_resize(gray, self.H, self.W)
+        return _binarize(canvas)
+
+    def normalize_from_path(self, path: str) -> np.ndarray:
+        img = cv2.imread(str(path))
+        if img is None:
+            raise ValueError(f"Cannot load: {path}")
+        return self.normalize(img)
+
+
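+# Hedged usage sketch - both normalizers expose the same interface, so they
+# can be swapped freely (the path below is a placeholder):
+#   norm = AdaptiveNormalizer()
+#   canvas = norm.normalize_from_path("scans/field_crop.png")    # (64, 512) uint8
+#   tensor = _to_tensor(canvas)                                  # shape [1, 1, 64, 512]
+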
+# ─────────────────────────────────────────────────────────────────────────────
+# AUTO NORMALIZER  ← detects which pipeline to use per image automatically
+# ─────────────────────────────────────────────────────────────────────────────
+
+class AutoNormalizer:
+    """
+    Automatically picks Simple or Adaptive based on image characteristics.
+
+    Examples:
+        demo.jpg  (clean 512x64 PIL)  → Simple (matches training)
+        name1.jpg (clean 512x64 PIL)  → Simple
+        shane.jpg (huge zoomed text)  → Adaptive (crop then resize)
+        real scan (any size/zoom)     → Adaptive
+    """
+    def __init__(self, H=64, W=512, verbose=False):
+        self.H, self.W = H, W
+        self.verbose = verbose
+        self._simple = SimpleNormalizer(H, W)
+        self._adaptive = AdaptiveNormalizer(H, W)
+
+    def normalize(self, img: np.ndarray) -> np.ndarray:
+        gray = _to_gray(img)
+        mode = _detect_mode(gray)
+        if self.verbose:
+            print(f"   auto → {mode}")
+        return self._simple.normalize(img) if mode == 'simple' \
+            else self._adaptive.normalize(img)
+
+    def normalize_from_path(self, path: str) -> np.ndarray:
+        img = cv2.imread(str(path))
+        if img is None:
+            raise ValueError(f"Cannot load: {path}")
+        gray = _to_gray(img)
+        mode = _detect_mode(gray)
+        if self.verbose:
+            print(f"   [{Path(path).name}] → {mode}")
+        return self._simple.normalize(img) if mode == 'simple' \
+            else self._adaptive.normalize(img)
+
+    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
+        return _to_tensor(img)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# MAIN OCR CLASS
+# ─────────────────────────────────────────────────────────────────────────────
+
+class CivilRegistryOCR:
+
+    def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
+        """
+        Args:
+            checkpoint_path : path to best_model_v6.pth
+            device          : 'cuda' or 'cpu'
+            mode            : 'auto'     → auto-detect per image (recommended)
+                              'simple'   → always use the simple pipeline
+                              'adaptive' → always use the adaptive pipeline
+            verbose         : print which mode was chosen per image
+        """
+        if device == 'cuda' and not torch.cuda.is_available():
+            device = 'cpu'
+
+        self.device = torch.device(device)
+        self.verbose = verbose
+        print(f"Loading model from {checkpoint_path}...")
+
+        checkpoint = torch.load(checkpoint_path, map_location=self.device,
+                                weights_only=False)
+
+        self.char_to_idx = checkpoint['char_to_idx']
+        self.idx_to_char = checkpoint['idx_to_char']
+        self.config = checkpoint.get('config', {})
+
+        img_height = self.config.get('img_height', 64)
+        img_width = self.config.get('img_width', 512)
+
+        if mode == 'simple':
+            self.normalizer = SimpleNormalizer(img_height, img_width)
+        elif mode == 'adaptive':
+            self.normalizer = AdaptiveNormalizer(img_height, img_width)
+        else:
+            self.normalizer = AutoNormalizer(img_height, img_width, verbose=verbose)
+
+        self.model = get_crnn_model(
+            model_type=self.config.get('model_type', 'standard'),
+            img_height=img_height,
+            num_chars=checkpoint['model_state_dict']['fc.weight'].shape[0],
+            hidden_size=self.config.get('hidden_size', 128),
+            num_lstm_layers=self.config.get('num_lstm_layers', 1)
+        )
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.model = self.model.to(self.device)
+        self.model.eval()
+
+        print("Model loaded successfully")
+        # Support both key names: val_loss (fine-tuned) and val_cer (synthetic baseline).
+        # FIXED Bug 5: removed the incorrect `val_cer < 10` heuristic that mislabelled
+        # the metric. The key name alone is the reliable indicator.
+        val_loss = checkpoint.get('val_loss', None)
+        val_cer = checkpoint.get('val_cer', None)
+        if val_loss is not None and val_cer is not None:
+            print(f"  Val Loss : {val_loss:.4f} | Val CER: {val_cer:.2f}%")
+        elif val_loss is not None:
+            print(f"  Val Loss : {val_loss:.4f} (fine-tuned checkpoint - run compare_live_cer.py for true CER)")
+        elif val_cer is not None:
+            print(f"  Val CER  : {val_cer:.2f}%")
+        else:
+            print("  Val CER  : N/A (run check_cer.py for true CER)")
+        print(f"  Device   : {self.device}")
+        print(f"  Mode     : {mode} ({img_height}x{img_width})")
+
+    def _preprocess(self, image_path) -> torch.Tensor:
+        normalized = self.normalizer.normalize_from_path(str(image_path))
+        return _to_tensor(normalized)
+
+    def predict(self, image_path, decode_method='greedy') -> str:
+        img = self._preprocess(image_path).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(img)
+        decoded = decode_ctc_predictions(
+            outputs.cpu(), self.idx_to_char, method=decode_method)
+        return decoded[0]
+
+    def predict_batch(self, image_paths, decode_method='greedy') -> List[Dict]:
+        results = []
+        for image_path in image_paths:
+            try:
+                text = self.predict(image_path, decode_method)
+                results.append({'image_path': str(image_path),
+                                'text': text, 'success': True})
+            except Exception as e:
+                results.append({'image_path': str(image_path),
+                                'error': str(e), 'success': False})
+        return results
+
+    def process_form(self, form_image_path, form_type) -> Dict:
+        text = self.predict(form_image_path)
+        fields = extract_form_fields(text, form_type)
+        fields['raw_text'] = text
+        return fields
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# FORM FIELD EXTRACTOR
+# ─────────────────────────────────────────────────────────────────────────────
+
+class FormFieldExtractor:
+    def __init__(self, ocr_model: CivilRegistryOCR):
+        self.ocr = ocr_model
+
+    def extract_form1a_fields(self, path):
+        text = self.ocr.predict(path)
+        return {'form_type': 'Form 1A - Birth Certificate', 'raw_text': text}
+
+    def extract_form2a_fields(self, path):
+        text = self.ocr.predict(path)
+        return {'form_type': 'Form 2A - Death Certificate', 'raw_text': text}
+
+    def extract_form3a_fields(self, path):
+        text = self.ocr.predict(path)
+        return {'form_type': 'Form 3A - Marriage Certificate', 'raw_text': text}
+
+    def extract_form90_fields(self, path):
+        text = self.ocr.predict(path)
+        return {'form_type': 'Form 90 - Marriage License Application',
+                'raw_text': text}
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DEMO
+# ─────────────────────────────────────────────────────────────────────────────
+
+def demo_inference():
+    print("=" * 70)
+    print("Civil Registry OCR (auto-adaptive normalizer)")
+    print("=" * 70)
+
+    ocr = CivilRegistryOCR(
+        checkpoint_path='checkpoints/best_model_v6.pth',
+        device='cuda',
+        mode='auto',
+        verbose=True  # shows which mode each image triggers
+    )
+
+    print("\n1. Single Prediction:")
+    try:
+        result = ocr.predict('test_images/date1.jpg')
+        print(f"   Recognized text: {result}")
+    except Exception as e:
+        print(f"   Error: {e}")
+
+    print("\n2. Batch Prediction (example currently disabled):")
+    '''batch_results = ocr.predict_batch([
+        'test_images/name1.jpg',
+        'test_images/shane.jpg',
+        'test_images/date1.jpg',
+        'test_images/place1.jpg',
+    ])
+    for r in batch_results:
+        status = r['text'] if r['success'] else f"ERROR - {r['error']}"
+        print(f"   {r['image_path']}: {status}")'''
+
+    print("\n3. Form Processing:")
+    try:
+        form_data = ocr.process_form('test_images/form1a_sample.jpg', 'form1a')
+        print("   Form Type: Form 1A - Birth Certificate")
+        print(f"   Raw Text: {form_data['raw_text']}")
+    except Exception as e:
+        print(f"   Error: {e}")
+
+
+def create_inference_api():
+    class OCR_API:
+        def __init__(self, checkpoint_path, mode='auto'):
+            self.ocr = CivilRegistryOCR(checkpoint_path, mode=mode)
+            self.extractor = FormFieldExtractor(self.ocr)
+        def recognize_text(self, p):
+            return {'text': self.ocr.predict(p), 'success': True}
+        def process_birth_certificate(self, p):
+            return self.extractor.extract_form1a_fields(p)
+        def process_death_certificate(self, p):
+            return self.extractor.extract_form2a_fields(p)
+        def process_marriage_certificate(self, p):
+            return self.extractor.extract_form3a_fields(p)
+        def process_marriage_license(self, p):
+            return self.extractor.extract_form90_fields(p)
+    return OCR_API
+
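+# Usage sketch for the factory above (checkpoint path is the project default;
+# any other checkpoint works the same way):
+#   API = create_inference_api()
+#   api = API('checkpoints/best_model_v6.pth', mode='auto')
+#   print(api.recognize_text('test_images/date1.jpg'))
+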
+
+if __name__ == "__main__":
+    demo_inference()
CRNN+CTC/prepare_emnist.py ADDED
@@ -0,0 +1,97 @@
+import torchvision
+import torchvision.transforms as transforms
+from PIL import Image
+import numpy as np
+import os
+import json
+
+print("Preparing EMNIST data for CRNN training...")
+print("Using 'balanced' split (47 classes - digits, uppercase, selected lowercase)")
+
+# MAX_SAMPLES: how many EMNIST images to use out of the 112,800 available.
+# 50,000 chosen deliberately:
+#   - ~1,064 images per class (47 classes) - enough for solid character recognition
+#   - Keeps a healthy ~3:1 ratio vs synthetic data (16,000) in mixed training
+#   - Going higher (e.g. the full 112,800) would drown out synthetic Filipino-specific
+#     patterns, since EMNIST would then be 88% of the mixed dataset
+#   - IAM fine-tuning and physical scans handle the remaining handwriting gaps
+MAX_SAMPLES = 50000
+VAL_RATIO = 0.10  # 90% train, 10% val - proper percentage split
+
+train_data = torchvision.datasets.EMNIST(
+    root='datasets/emnist',
+    split='balanced',   # balanced split - already downloaded
+    train=True,
+    download=False,     # files already exist, skip download
+    transform=transforms.ToTensor()
+)
+
+# The balanced split has 47 classes:
+# 0-9 digits, A-Z uppercase, and selected lowercase.
+# The mapping follows the EMNIST balanced label order.
+LABELS = [
+    '0','1','2','3','4','5','6','7','8','9',
+    'A','B','C','D','E','F','G','H','I','J','K','L','M',
+    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
+    'a','b','d','e','f','g','h','n','q','r','t',
+]  # 47 classes exactly matching balanced split label indices
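+# Sanity check (added as an illustrative guard): the list above must line up
+# 1:1 with the 47 balanced-split label indices, or characters get mislabelled.
+assert len(LABELS) == 47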
+
+os.makedirs('data/train/emnist', exist_ok=True)
+os.makedirs('data/val/emnist', exist_ok=True)
+
+annotations_train = []
+annotations_val = []
+
+val_cutoff = int(MAX_SAMPLES * (1 - VAL_RATIO))  # 45,000 train / 5,000 val
+
+print(f"Dataset size : {len(train_data)} images available")
+print(f"Using        : {MAX_SAMPLES} ({MAX_SAMPLES/len(train_data)*100:.1f}% of full dataset)")
+print(f"Train / Val  : {val_cutoff} / {MAX_SAMPLES - val_cutoff} (90/10 split)")
+print("Saving images...")
+
+saved = 0  # count of successfully saved images (skips bad label indices)
+for i, (img_tensor, label_idx) in enumerate(train_data):
+    if saved >= MAX_SAMPLES:
+        break
+
+    # Safety check - skip if the label index is out of range for our LABELS list
+    if label_idx >= len(LABELS):
+        continue
+
+    char = LABELS[label_idx]
+    img = img_tensor.squeeze().numpy()
+    img = (img * 255).astype(np.uint8)
+
+    # EMNIST images are transposed - rotate and flip to correct the orientation
+    img = np.rot90(img, k=3)
+    img = np.fliplr(img)
+
+    pil_img = Image.fromarray(img).convert('RGB')
+    pil_img = pil_img.resize((512, 64))  # must match IMG_WIDTH=512
+
+    fname = f'emnist_{saved:05d}.jpg'  # sequential filenames based on saved count
+
+    # FIXED: proper percentage-based split (was hardcoded `if i < 5000`)
+    if saved < val_cutoff:
+        pil_img.save(f'data/train/emnist/{fname}')
+        annotations_train.append({'image_path': f'emnist/{fname}', 'text': char})
+    else:
+        pil_img.save(f'data/val/emnist/{fname}')
+        annotations_val.append({'image_path': f'emnist/{fname}', 'text': char})
+
+    saved += 1
+    if saved % 5000 == 0:
+        print(f"  Processed {saved}/{MAX_SAMPLES} images...")
+
+with open('data/emnist_train_annotations.json', 'w') as f:
+    json.dump(annotations_train, f, indent=2)
+with open('data/emnist_val_annotations.json', 'w') as f:
+    json.dump(annotations_val, f, indent=2)
+
+print("\nDone!")
+print(f"  Train : {len(annotations_train)} images (~{len(annotations_train)//47} per class)")
+print(f"  Val   : {len(annotations_val)} images")
+print(f"  Total : {len(annotations_train) + len(annotations_val)} / {len(train_data)} used")
+print(f"  Labels: {sorted(set(a['text'] for a in annotations_train))}")
+print(f"\nClass coverage: {len(set(a['text'] for a in annotations_train))}/47 classes in train")
+print("\nNext step: python train_with_emnist.py")
CRNN+CTC/requirements.txt ADDED
@@ -0,0 +1,61 @@
+# Core Deep Learning
+torch>=2.0.0
+torchvision>=0.15.0
+
+# Image Processing
+opencv-python>=4.8.0
+Pillow>=10.0.0
+albumentations>=1.3.0
+pdf2image>=1.17.0
+pytesseract>=0.3.13
+
+# Data Processing
+numpy>=1.24.0
+pandas>=2.0.0
+
+# Metrics
+editdistance>=0.6.2
+
+# Progress Bars
+tqdm>=4.65.0
+
+# Web Framework (for deployment)
+flask>=3.0.0
+flask-cors>=4.0.0
+fastapi>=0.104.0
+uvicorn>=0.24.0
+python-multipart>=0.0.6
+
+# Database
+pymysql>=1.1.0
+sqlalchemy>=2.0.0
+
+# NLP for Named Entity Recognition
+spacy>=3.7.0
+# Download model: python -m spacy download en_core_web_sm
+
+# Document Classification
+scikit-learn>=1.3.0
+
+# Visualization
+matplotlib>=3.7.0
+seaborn>=0.12.0
+
+# Configuration
+pyyaml>=6.0
+
+# Utilities
+python-dotenv>=1.0.0
+requests>=2.31.0
+
+# Document Processing
+python-docx>=1.1.0
+
+# Optional: For production deployment
+gunicorn>=21.2.0
+celery>=5.3.0
+redis>=5.0.0
+
+# Testing
+pytest>=7.4.0
+pytest-cov>=4.1.0
CRNN+CTC/train.py ADDED
@@ -0,0 +1,438 @@
+"""
+Training script for CRNN+CTC civil registry OCR.
+Includes CTC loss, learning-rate scheduling, and model checkpointing.
+"""
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+import os
+from tqdm import tqdm
+import numpy as np
+from pathlib import Path
+import json
+
+from crnn_model import get_crnn_model, initialize_weights
+from dataset import CivilRegistryDataset, collate_fn
+from utils import (
+    decode_ctc_predictions,
+    calculate_cer,
+    calculate_wer,
+    EarlyStopping
+)
+
+
+class CRNNTrainer:
+    """
+    Trainer class for the CRNN+CTC model.
+    """
+
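+    # Illustrative config (keys are the ones __init__ reads below; the values
+    # here are placeholders, not the project's tuned settings):
+    #   config = {
+    #       'checkpoint_dir': 'checkpoints', 'log_dir': 'logs',
+    #       'train_data_dir': 'data/train', 'train_annotations': 'data/train_annotations.json',
+    #       'val_data_dir': 'data/val', 'val_annotations': 'data/val_annotations.json',
+    #       'img_height': 64, 'img_width': 512, 'batch_size': 32, 'num_workers': 0,
+    #       'hidden_size': 128, 'num_lstm_layers': 1, 'learning_rate': 1e-4,
+    #       'epochs': 100,
+    #   }
+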
28
+ def __init__(self, config):
29
+ self.config = config
30
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
31
+
32
+ # Create directories
33
+ self.checkpoint_dir = Path(config['checkpoint_dir'])
34
+ self.log_dir = Path(config['log_dir'])
35
+ self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
36
+ self.log_dir.mkdir(parents=True, exist_ok=True)
37
+
38
+ # Initialize datasets
39
+ print("Loading datasets...")
40
+ self.train_dataset = CivilRegistryDataset(
41
+ data_dir=config['train_data_dir'],
42
+ annotations_file=config['train_annotations'],
43
+ img_height=config['img_height'],
44
+ img_width=config['img_width'],
45
+ augment=True,
46
+ form_type=config.get('form_type', 'all')
47
+ )
48
+
49
+ self.val_dataset = CivilRegistryDataset(
50
+ data_dir=config['val_data_dir'],
51
+ annotations_file=config['val_annotations'],
52
+ img_height=config['img_height'],
53
+ img_width=config['img_width'],
54
+ augment=False,
55
+ form_type=config.get('form_type', 'all')
56
+ )
57
+
58
+ # Create data loaders
59
+ self.train_loader = DataLoader(
60
+ self.train_dataset,
61
+ batch_size=config['batch_size'],
62
+ shuffle=True,
63
+ num_workers=config['num_workers'],
64
+ collate_fn=collate_fn,
65
+ pin_memory=False
66
+ )
67
+
68
+ self.val_loader = DataLoader(
69
+ self.val_dataset,
70
+ batch_size=config['batch_size'],
71
+ shuffle=False,
72
+ num_workers=config['num_workers'],
73
+ collate_fn=collate_fn,
74
+ pin_memory=False
75
+ )
76
+
77
+ # Initialize model
78
+ print(f"Initializing model on {self.device}...")
79
+ self.model = get_crnn_model(
80
+ model_type=config.get('model_type', 'standard'),
81
+ img_height=config['img_height'],
82
+ num_chars=self.train_dataset.num_chars,
83
+ hidden_size=config['hidden_size'],
84
+ num_lstm_layers=config['num_lstm_layers']
85
+ )
86
+
87
+ self.model = self.model.to(self.device)
88
+
89
+ # Loss function - CTC Loss
90
+ self.criterion = nn.CTCLoss(blank=0, zero_infinity=True)
91
+
92
+ # Optimizer β€” lower LR prevents CTC collapse on epoch 1
93
+ self.optimizer = optim.Adam(
94
+ self.model.parameters(),
95
+ lr=config['learning_rate'],
96
+ weight_decay=config.get('weight_decay', 1e-4) # FIXED: fallback was 1e-5
97
+ )
98
+
99
+ # Warmup scheduler: ramp LR from near-zero to target over first N epochs,
100
+ # then hand off to ReduceLROnPlateau.
101
+ # This is the single most effective fix for CTC blank collapse.
102
+ warmup_epochs = config.get('warmup_epochs', 5)
103
+
104
+ def warmup_lambda(epoch):
105
+ if epoch < warmup_epochs:
106
+ return (epoch + 1) / warmup_epochs # gradual: 0.2β†’0.4β†’0.6β†’0.8β†’1.0
107
+ return 1.0
108
+
109
+ self.warmup_scheduler = optim.lr_scheduler.LambdaLR(
110
+ self.optimizer, lr_lambda=warmup_lambda)
111
+
112
+ # ReduceLROnPlateau kicks in after warmup
113
+ self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
114
+ self.optimizer,
115
+ mode='min',
116
+ factor=0.5,
117
+ patience=config.get('lr_patience', 5),
118
+ min_lr=1e-6
119
+ )
120
+ self._warmup_epochs = warmup_epochs
121
+
122
+ # Early stopping
123
+ self.early_stopping = EarlyStopping(
124
+ patience=config.get('early_stopping_patience', 10),
125
+ min_delta=config.get('min_delta', 0.001)
126
+ )
127
+
128
+ # Training history
129
+ self.history = {
130
+ 'train_loss': [],
131
+ 'val_loss': [],
132
+ 'val_cer': [],
133
+ 'val_wer': [],
134
+ 'learning_rates': []
135
+ }
136
+
137
+ # ── Resume from checkpoint if available ──────────────
138
+ self.start_epoch = 1
139
+ self.best_val_loss = float('inf')
140
+ resume_path = self.checkpoint_dir / 'latest_checkpoint.pth'
141
+
142
+ if resume_path.exists():
143
+ print(f"\n Found checkpoint: {resume_path}")
144
+ print(f" Resuming training from last saved epoch...")
145
+ ckpt = torch.load(resume_path, map_location=self.device, weights_only=False)
146
+ self.model.load_state_dict(ckpt['model_state_dict'])
147
+ self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
148
+ self.scheduler.load_state_dict(ckpt['scheduler_state_dict'])
149
+ if 'warmup_scheduler_state_dict' in ckpt:
150
+ self.warmup_scheduler.load_state_dict(ckpt['warmup_scheduler_state_dict'])
151
+ self.start_epoch = ckpt['epoch'] + 1
152
+ self.best_val_loss = ckpt.get('val_loss', float('inf'))
153
+ self.history = ckpt.get('history', self.history)
154
+ print(f" βœ“ Resumed from Epoch {ckpt['epoch']} "
155
+ f"(Val Loss: {ckpt['val_loss']:.4f}, CER: {ckpt['val_cer']:.2f}%)")
156
+ else:
157
+ print(f" No checkpoint found β€” starting fresh.")
158
+ initialize_weights(self.model)
159
+
160
+ print(f"βœ“ Model ready with {sum(p.numel() for p in self.model.parameters()):,} parameters")
161
+
162
+ def train_epoch(self, epoch):
163
+ """Train for one epoch"""
164
+ self.model.train()
165
+ total_loss = 0
166
+
167
+ pbar = tqdm(self.train_loader, desc=f"Epoch {epoch}/{self.config['epochs']}")
168
+
169
+ nan_count = 0
170
+ for batch_idx, (images, targets, target_lengths, _) in enumerate(pbar):
171
+ images = images.to(self.device)
172
+ targets = targets.to(self.device)
173
+
174
+ # FIXED: zero_grad before forward pass (was incorrectly placed after loss)
175
+ self.optimizer.zero_grad()
176
+
177
+ # Forward pass
178
+ outputs = self.model(images) # [seq_len, batch, num_chars]
179
+
180
+ # Apply log_softmax for CTC
181
+ log_probs = nn.functional.log_softmax(outputs, dim=2)
182
+
183
+ # Calculate sequence lengths
184
+ batch_size = images.size(0)
185
+ input_lengths = torch.full(
186
+ size=(batch_size,),
187
+ fill_value=outputs.size(0),
188
+ dtype=torch.long
189
+ ).to(self.device)
190
+
191
+ # CTC loss
192
+ loss = self.criterion(
193
+ log_probs,
194
+ targets,
195
+ input_lengths,
196
+ target_lengths
197
+ )
198
+
199
+ # FIXED: skip NaN/Inf batches β€” accumulating them corrupts gradients
200
+ if torch.isnan(loss) or torch.isinf(loss):
201
+ nan_count += 1
202
+ continue
203
+
204
+ # Backward pass
205
+ loss.backward()
206
+
207
+ # Gradient clipping to prevent exploding gradients
208
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
209
+
210
+ self.optimizer.step()
211
+
212
+ total_loss += loss.item()
213
+
214
+ # Update progress bar
215
+ pbar.set_postfix({
216
+ 'loss': f'{loss.item():.4f}',
217
+ 'avg_loss': f'{total_loss / (batch_idx + 1):.4f}'
218
+ })
219
+ if nan_count > 0:
220
+ print(f" [WARNING] {nan_count} NaN/Inf batches skipped this epoch.")
221
+
222
+ avg_loss = total_loss / len(self.train_loader)
223
+ return avg_loss
224
+
225
+ def validate(self):
226
+ """Validate the model"""
227
+ self.model.eval()
228
+ total_loss = 0
229
+ all_predictions = []
230
+ all_ground_truths = []
231
+
232
+ with torch.no_grad():
233
+ for images, targets, target_lengths, texts in tqdm(self.val_loader, desc="Validating"):
234
+ images = images.to(self.device)
235
+ targets_gpu = targets.to(self.device)
236
+
237
+ # Forward pass
238
+ outputs = self.model(images)
239
+ log_probs = nn.functional.log_softmax(outputs, dim=2)
240
+
241
+ # CTC loss
242
+ batch_size = images.size(0)
243
+ input_lengths = torch.full(
244
+ size=(batch_size,),
245
+ fill_value=outputs.size(0),
246
+ dtype=torch.long
247
+ ).to(self.device)
248
+
249
+ loss = self.criterion(log_probs, targets_gpu, input_lengths, target_lengths)
250
+ total_loss += loss.item()
251
+
252
+ # Decode predictions
253
+ predictions = decode_ctc_predictions(
254
+ outputs.cpu(),
255
+ self.train_dataset.idx_to_char
256
+ )
257
+
258
+ all_predictions.extend(predictions)
259
+ all_ground_truths.extend(texts)
260
+
261
+ avg_loss = total_loss / len(self.val_loader)
262
+
263
+ # Calculate metrics
264
+ cer = calculate_cer(all_predictions, all_ground_truths)
265
+ wer = calculate_wer(all_predictions, all_ground_truths)
266
+
267
+ return avg_loss, cer, wer, all_predictions, all_ground_truths
268
+
269
+ def train(self):
270
+ """Main training loop"""
271
+ print("\n" + "=" * 70)
272
+ print("Starting Training")
273
+ print("=" * 70)
274
+
275
+ best_val_loss = self.best_val_loss
276
+
277
+ for epoch in range(self.start_epoch, self.config['epochs'] + 1):
278
+ print(f"\nEpoch {epoch}/{self.config['epochs']}")
279
+ print("-" * 70)
280
+
281
+ # Train
282
+ train_loss = self.train_epoch(epoch)
283
+
284
+ # Validate
285
+ val_loss, val_cer, val_wer, predictions, ground_truths = self.validate()
286
+
287
+ # Learning rate scheduling
288
+ # Use warmup for first N epochs, then ReduceLROnPlateau
289
+ if epoch <= self._warmup_epochs:
290
+ self.warmup_scheduler.step()
291
+ else:
292
+ self.scheduler.step(val_loss)
293
+ current_lr = self.optimizer.param_groups[0]['lr']
294
+
295
+ # Update history
296
+ self.history['train_loss'].append(train_loss)
297
+ self.history['val_loss'].append(val_loss)
298
+ self.history['val_cer'].append(val_cer)
299
+ self.history['val_wer'].append(val_wer)
300
+ self.history['learning_rates'].append(current_lr)
301
+
302
+ # Print metrics
303
+ print(f"\nMetrics:")
304
+ print(f" Train Loss: {train_loss:.4f}")
305
+ print(f" Val Loss: {val_loss:.4f}")
306
+ print(f" Val CER: {val_cer:.2f}%")
307
+ print(f" Val WER: {val_wer:.2f}%")
308
+ print(f" LR: {current_lr:.6f}")
309
+
310
+ # Print sample predictions
311
+ print(f"\nSample Predictions:")
312
+ for i in range(min(3, len(predictions))):
313
+ print(f" GT: {ground_truths[i]}")
314
+ print(f" Pred: {predictions[i]}")
315
+ print()
316
+
317
+ # show raw model output
318
+ with torch.no_grad():
319
+ sample_img = self.val_dataset[0][0].unsqueeze(0).to(self.device)
320
+ raw_out = self.model(sample_img)
321
+ probs = torch.softmax(raw_out, dim=2)
322
+ best_idx = probs[:, 0, :].argmax(dim=1)
323
+ best_prob = probs[:, 0, :].max(dim=1).values
324
+ blank_pct = (best_idx == 0).float().mean().item() * 100
325
+ avg_conf = best_prob.mean().item()
326
+ non_blank = [self.train_dataset.idx_to_char.get(i.item(), '?')
327
+ for i in best_idx if i.item() != 0]
328
+ print(f" blank={blank_pct:.0f}% conf={avg_conf:.3f} "
329
+ f"chars={''.join(non_blank[:20])!r}")
330
+
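# Note: a blank share near 100% with high confidence is the classic CTC
# "blank collapse" failure mode, where the decoded string comes out empty even
# though the loss looks plausible; this probe surfaces it early.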
331
+
332
+ # Save checkpoint
333
+ is_best = val_loss < best_val_loss
334
+ if is_best:
335
+ best_val_loss = val_loss
336
+
337
+ self.save_checkpoint(epoch, val_loss, val_cer, is_best)
338
+
339
+ # Early stopping
340
+ if self.early_stopping(val_loss):
341
+ print(f"\nEarly stopping triggered at epoch {epoch}")
342
+ break
343
+
344
+ print("\n" + "=" * 70)
345
+ print("Training Complete!")
346
+ print(f"Best validation loss: {best_val_loss:.4f}")
347
+ print("=" * 70)
348
+
349
+ # Save final training history
350
+ self.save_history()
351
+
352
+ def save_checkpoint(self, epoch, val_loss, val_cer, is_best=False):
353
+ """Save model checkpoint"""
354
+ checkpoint = {
355
+ 'epoch': epoch,
356
+ 'model_state_dict': self.model.state_dict(),
357
+ 'optimizer_state_dict': self.optimizer.state_dict(),
358
+ 'scheduler_state_dict': self.scheduler.state_dict(),
359
+ 'warmup_scheduler_state_dict': self.warmup_scheduler.state_dict(),
360
+ 'val_loss': val_loss,
361
+ 'val_cer': val_cer,
362
+ 'char_to_idx': self.train_dataset.char_to_idx,
363
+ 'idx_to_char': self.train_dataset.idx_to_char,
364
+ 'config': self.config,
365
+ 'history': self.history
366
+ }
367
+
368
+ # Save latest checkpoint
369
+ checkpoint_path = self.checkpoint_dir / 'latest_checkpoint.pth'
370
+ torch.save(checkpoint, checkpoint_path)
371
+
372
+ # Save best checkpoint
373
+ if is_best:
374
+ best_path = self.checkpoint_dir / 'best_model.pth'
375
+ torch.save(checkpoint, best_path)
376
+ print(f" βœ“ Best model saved (Val Loss: {val_loss:.4f}, CER: {val_cer:.2f}%)")
377
+
378
+ # Save epoch checkpoint (history omitted to save disk space β€” it's in latest_checkpoint.pth)
379
+ if epoch % self.config.get('save_freq', 10) == 0:
380
+ epoch_path = self.checkpoint_dir / f'checkpoint_epoch_{epoch}.pth'
381
+ epoch_ckpt = {k: v for k, v in checkpoint.items() if k != 'history'}
382
+ torch.save(epoch_ckpt, epoch_path)
383
+
384
+ def save_history(self):
385
+ """Save training history"""
386
+ history_path = self.log_dir / 'training_history.json'
387
+ with open(history_path, 'w') as f:
388
+ json.dump(self.history, f, indent=2)
389
+ print(f"\nβœ“ Training history saved to {history_path}")
390
+
391
+
392
+ def main():
393
+ """Main training function"""
394
+
395
+ # Configuration
396
+ config = {
397
+ # Data
398
+ 'train_data_dir': 'data/train',
399
+ 'train_annotations': 'data/train_annotations.json',
400
+ 'val_data_dir': 'data/val',
401
+ 'val_annotations': 'data/val_annotations.json',
402
+ 'form_type': 'all', # 'all', 'form1a', 'form2a', 'form3a', 'form90'
403
+
404
+ # Model
405
+ 'model_type': 'standard', # 'standard', 'ensemble', 'lightweight'
406
+ 'img_height': 64,
407
+ 'img_width': 512,
408
+ 'hidden_size': 128,
409
+ 'num_lstm_layers': 1,
410
+
411
+ # Training
412
+ 'batch_size': 32,
413
+ 'epochs': 100,
414
+ 'learning_rate': 0.0001,
415
+ 'weight_decay': 1e-4, # FIXED: was 1e-5 β€” stronger L2 regularisation to reduce overfitting
416
+ 'num_workers': 0,
417
+ 'warmup_epochs': 5, # Ramp LR gradually for first 5 epochs
418
+
419
+ # Scheduling & Early Stopping
420
+ 'lr_patience': 5, # FIXED: was 3 β€” give model more time before halving LR
421
+ 'early_stopping_patience': 20, # FIXED: was 10 β€” more patience during zoom training
422
+ 'min_delta': 0.001,
423
+
424
+ # Saving
425
+ 'checkpoint_dir': 'checkpoints',
426
+ 'log_dir': 'logs',
427
+ 'save_freq': 10,
428
+ }
429
+
430
+ # Initialize trainer
431
+ trainer = CRNNTrainer(config)
432
+
433
+ # Start training
434
+ trainer.train()
435
+
436
+
437
+ if __name__ == "__main__":
438
+ main()
CRNN+CTC/train_emnist.py ADDED
@@ -0,0 +1,15 @@
1
+ import torchvision
2
+ import torchvision.transforms as transforms
3
+
4
+ print("Loading EMNIST dataset...")
5
+
6
+ train_data = torchvision.datasets.EMNIST(
7
+ root='datasets/emnist',
8
+ split='byclass',
9
+ train=True,
10
+ download=False,  # set True on the first run to fetch the dataset
11
+ transform=transforms.ToTensor()
12
+ )
13
+
14
+ print(f"Training samples: {len(train_data)}")
15
+ print("EMNIST loaded successfully!")
CRNN+CTC/train_mnist.py ADDED
@@ -0,0 +1,42 @@
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ from tensorflow.keras import layers, models
4
+
5
+ # Load MNIST dataset
6
+ (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
7
+
8
+ # Normalize pixel values to 0-1
9
+ x_train = x_train / 255.0
10
+ x_test = x_test / 255.0
11
+
12
+ # Add channel dimension (28, 28) -> (28, 28, 1)
13
+ x_train = x_train[..., tf.newaxis]
14
+ x_test = x_test[..., tf.newaxis]
15
+
16
+ # Build simple CNN model
17
+ model = models.Sequential([
18
+ layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
19
+ layers.MaxPooling2D(2,2),
20
+ layers.Conv2D(64, (3,3), activation='relu'),
21
+ layers.MaxPooling2D(2,2),
22
+ layers.Flatten(),
23
+ layers.Dense(128, activation='relu'),
24
+ layers.Dense(10, activation='softmax') # 10 digits (0-9)
25
+ ])
26
+
27
+ model.compile(optimizer='adam',
28
+ loss='sparse_categorical_crossentropy',
29
+ metrics=['accuracy'])
30
+
31
+ model.summary()
32
+
33
+ # Train
34
+ model.fit(x_train, y_train, epochs=5, validation_split=0.1)
35
+
36
+ # Evaluate
37
+ test_loss, test_acc = model.evaluate(x_test, y_test)
38
+ print(f"\nTest accuracy: {test_acc:.4f}")
39
+
40
+ # Save model
41
+ model.save("mnist_model.h5")
42
+ print("Model saved as mnist_model.h5")
CRNN+CTC/train_with_emnist.py ADDED
@@ -0,0 +1,169 @@
1
+ """
2
+ train_with_emnist.py
3
+ ====================
4
+ Fine-tune the CRNN model with EMNIST character data.
5
+
6
+ FIXES vs old version:
7
+ - Phase 1: CNN FROZEN β€” only RNN+FC trained (prevents catastrophic forgetting)
8
+ - Phase 2: Full model at 10x lower LR for final polish
9
+ - log_softmax applied before CTCLoss (was missing β€” caused garbage loss)
10
+ - Loads from best_model.pth (synthetic, 0.12% CER baseline)
11
+ - Saves best_model_emnist.pth only when val improves
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torch.optim as optim
19
+ from torch.utils.data import DataLoader, ConcatDataset
20
+
21
+ sys.path.append('.')
22
+ from crnn_model import get_crnn_model
23
+ from dataset import CivilRegistryDataset, collate_fn
24
+
25
+ print("=" * 55)
26
+ print("Fine-tuning CRNN with EMNIST dataset")
27
+ print("=" * 55)
28
+
29
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
30
+ print(f"Device: {DEVICE}")
31
+
32
+ emnist_dataset = CivilRegistryDataset(
33
+ data_dir='data/train',
34
+ annotations_file='data/emnist_train_annotations.json',
35
+ img_height=64, img_width=512, augment=True
36
+ )
37
+ # FIXED: mix synthetic data in so the model never forgets multi-word sequences
38
+ synth_dataset = CivilRegistryDataset(
39
+ data_dir='data/train',
40
+ annotations_file='data/train_annotations.json',
41
+ img_height=64, img_width=512, augment=True
42
+ )
43
+ train_dataset = emnist_dataset # keep reference for char_to_idx / num_chars
44
+ mixed_train = ConcatDataset([emnist_dataset, synth_dataset])
45
+ val_dataset = CivilRegistryDataset(
46
+ data_dir='data/val',
47
+ annotations_file='data/val_annotations.json', # FIXED: was emnist_val β€” must match real task
48
+ img_height=64, img_width=512, augment=False
49
+ )
50
+ print(f"EMNIST train : {len(emnist_dataset)}")
51
+ print(f"Synthetic train: {len(synth_dataset)}")
52
+ print(f"Mixed train : {len(mixed_train)}")
53
+ print(f"Val : {len(val_dataset)}")
54
+
55
+ train_loader = DataLoader(mixed_train, batch_size=32, shuffle=True,
56
+ num_workers=0, collate_fn=collate_fn)
57
+ val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
58
+ num_workers=0, collate_fn=collate_fn)
59
+
60
+ # ── Load best synthetic checkpoint ───────────────────────────
61
+ BASE = 'checkpoints/best_model.pth'
62
+ if not os.path.exists(BASE):
63
+ print(f"ERROR: {BASE} not found. Run: python train.py")
64
+ sys.exit(1)
65
+
66
+ ckpt = torch.load(BASE, map_location=DEVICE, weights_only=False)
67
+ config = ckpt.get('config', {})
68
+
69
+ model = get_crnn_model(
70
+ model_type = config.get('model_type', 'standard'),
71
+ img_height = config.get('img_height', 64),
72
+ num_chars = train_dataset.num_chars,
73
+ hidden_size = config.get('hidden_size', 128),
74
+ num_lstm_layers = config.get('num_lstm_layers', 1),
75
+ ).to(DEVICE)
76
+
77
+ missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
78
+ if missing:
79
+ print(f" Note: {len(missing)} layers re-initialized (expected for fc layer)")
80
+ print(f" Loaded epoch {ckpt.get('epoch')} "
81
+ f"(val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f})")
82
+
83
+ criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
84
+
85
+
86
+ def run_epoch(loader, training, optimizer=None):
87
+ model.train() if training else model.eval()
88
+ total, n = 0, 0
89
+ ctx = torch.enable_grad() if training else torch.no_grad()
90
+ with ctx:
91
+ for images, targets, target_lengths, _ in loader:
92
+ images = images.to(DEVICE)
93
+ batch_size = images.size(0)
94
+ if training:
95
+ optimizer.zero_grad()
96
+ # CRITICAL: log_softmax before CTCLoss
97
+ outputs = F.log_softmax(model(images), dim=2)
98
+ seq_len = outputs.size(0)
99
+ input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
100
+ loss = criterion(outputs, targets, input_lengths, target_lengths)
101
+ if not torch.isnan(loss) and not torch.isinf(loss):
102
+ if training:
103
+ loss.backward()
104
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
105
+ optimizer.step()
106
+ total += loss.item()
107
+ n += 1
108
+ return total / max(n, 1)
109
+
110
+
111
+ def run_phase(num, epochs, lr, freeze_cnn, patience):
112
+ print(f"\n{'='*55}")
113
+ print(f" PHASE {num} β€” "
114
+ f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
115
+ f" LR={lr}")
116
+ print(f"{'='*55}")
117
+
118
+ # Freeze or unfreeze CNN
119
+ for name, param in model.named_parameters():
120
+ param.requires_grad = not (freeze_cnn and 'cnn' in name)
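# Note: this assumes the convolutional backbone's parameter names contain
# "cnn" (i.e. crnn_model.py exposes a module attribute named cnn); if the
# backbone were ever renamed, Phase 1 would silently train every layer.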
121
+
122
+ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
123
+ print(f" Trainable params : {trainable:,}")
124
+
125
+ opt = optim.Adam(
126
+ filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
127
+ sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
128
+ best = float('inf')
129
+ counter = 0
130
+
131
+ for epoch in range(1, epochs + 1):
132
+ tr = run_epoch(train_loader, True, opt)
133
+ vl = run_epoch(val_loader, False, None)
134
+ sched.step(vl)
135
+
136
+ if vl < best:
137
+ best = vl
138
+ counter = 0
139
+ torch.save({
140
+ 'model_state_dict': model.state_dict(),
141
+ 'config': config,
142
+ 'char_to_idx': train_dataset.char_to_idx,
143
+ 'idx_to_char': train_dataset.idx_to_char,
144
+ 'epoch': epoch,
145
+ 'val_loss': vl, # FIXED: renamed from val_cer β€” this is val loss, not CER%
146
+ }, 'checkpoints/best_model_emnist.pth')
147
+ print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f} <- saved")
148
+ else:
149
+ counter += 1
150
+ print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f}"
151
+ f" (patience {counter}/{patience})")
152
+ if counter >= patience:
153
+ print(f" Early stopping at epoch {epoch}.")
154
+ break
155
+ return best
156
+
157
+
158
+ # ── Phase 1: Freeze CNN β€” teach RNN+FC to handle EMNIST chars ─
159
+ p1_best = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
160
+
161
+ # ── Phase 2: Unfreeze all β€” gentle full-model polish ──────────
162
+ p2_best = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
163
+
164
+ print(f"\n{'='*55}")
165
+ print(f"EMNIST fine-tuning complete!")
166
+ print(f" Phase 1 best val loss : {p1_best:.4f}")
167
+ print(f" Phase 2 best val loss : {p2_best:.4f}")
168
+ print(f" Saved : checkpoints/best_model_emnist.pth")
169
+ print(f"\nNext step: python IAM_train.py --prepare --train")
CRNN+CTC/utils.py ADDED
@@ -0,0 +1,397 @@
1
+ """
2
+ Utility Functions for CRNN+CTC Civil Registry OCR
3
+ Includes CTC decoding, metrics calculation, and helper functions
4
+ """
5
+
6
+ import torch
7
+ import numpy as np
8
+ from typing import List, Dict, Tuple
9
+ def _editdistance(a, b):
10
+ """Pure-Python Levenshtein distance — replaces the editdistance C extension."""
11
+ m, n = len(a), len(b)
12
+ dp = list(range(n + 1))
13
+ for i in range(1, m + 1):
14
+ prev, dp[0] = dp[0], i
15
+ for j in range(1, n + 1):
16
+ prev, dp[j] = dp[j], prev if a[i-1] == b[j-1] else 1 + min(prev, dp[j], dp[j-1])
17
+ return dp[n]
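# Sanity check, the textbook case: _editdistance("kitten", "sitting") == 3
# (substitute k->s, substitute e->i, insert g).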
18
+
19
+
20
+ def decode_ctc_predictions(outputs, idx_to_char, method='greedy'):
21
+ """
22
+ Decode CTC predictions to text
23
+
24
+ Args:
25
+ outputs: Model outputs [seq_len, batch, num_chars]
26
+ idx_to_char: Dictionary mapping indices to characters
27
+ method: 'greedy' or 'beam_search'
28
+
29
+ Returns:
30
+ List of decoded strings
31
+ """
32
+ if method == 'greedy':
33
+ return greedy_decode(outputs, idx_to_char)
34
+ elif method == 'beam_search':
35
+ return beam_search_decode(outputs, idx_to_char)
36
+ else:
37
+ raise ValueError(f"Unknown decoding method: {method}")
38
+
39
+
40
+ def greedy_decode(outputs, idx_to_char):
41
+ """
42
+ Greedy CTC decoding - fast but less accurate
43
+ """
44
+ # Get most probable characters
45
+ pred_indices = torch.argmax(outputs, dim=2) # [seq_len, batch]
46
+ pred_indices = pred_indices.permute(1, 0) # [batch, seq_len]
47
+
48
+ decoded_texts = []
49
+
50
+ for sequence in pred_indices:
51
+ chars = []
52
+ prev_idx = -1
53
+
54
+ for idx in sequence:
55
+ idx = idx.item()
56
+ # Skip blank (0) and consecutive duplicates
57
+ if idx != 0 and idx != prev_idx:
58
+ if idx in idx_to_char:
59
+ chars.append(idx_to_char[idx])
60
+ prev_idx = idx
61
+
62
+ decoded_texts.append(''.join(chars))
63
+
64
+ return decoded_texts
65
+
66
+
67
+ def beam_search_decode(outputs, idx_to_char, beam_width=10):
68
+ """
69
+ Beam search CTC decoding - slower but more accurate.
70
+
71
+ FIXED Bug 6: previous code mixed list-of-chars and string representations.
72
+ After sorting new_beams (a dict keyed by strings), it did `list(seq)` on the
73
+ string key β€” which splits a string like "AB" into ['A','B'] accidentally works
74
+ for ASCII but is fragile and confusing. Rewritten to use strings throughout:
75
+ beams are now List[Tuple[str, float]] with the sequence always kept as a plain
76
+ string, eliminating the list/string ambiguity entirely.
77
+ """
78
+ outputs = torch.nn.functional.softmax(outputs, dim=2)
79
+ outputs = outputs.permute(1, 0, 2).cpu().numpy() # [batch, seq_len, num_chars]
80
+
81
+ decoded_texts = []
82
+
83
+ for output in outputs:
84
+ # Each beam is (sequence_string, cumulative_probability)
85
+ beams: list = [('', 1.0)]
86
+
87
+ for timestep in output:
88
+ new_beams: dict = {}
89
+
90
+ for sequence, prob in beams:
91
+ for idx, char_prob in enumerate(timestep):
92
+ if idx == 0: # blank token β€” sequence unchanged
93
+ new_seq = sequence
94
+ elif idx in idx_to_char:
95
+ char = idx_to_char[idx]
96
+ # CTC rule: merge consecutive duplicate characters
97
+ if sequence and sequence[-1] == char:
98
+ new_seq = sequence # duplicate β€” stay the same
99
+ else:
100
+ new_seq = sequence + char # append directly to string
101
+ else:
102
+ continue
103
+
104
+ new_prob = prob * char_prob
105
+ # Merge beams that produce the same string
106
+ if new_seq in new_beams:
107
+ new_beams[new_seq] = max(new_beams[new_seq], new_prob)
108
+ else:
109
+ new_beams[new_seq] = new_prob
110
+
111
+ # Keep top-k beams; keys are already strings β€” no list() conversion needed
112
+ beams = sorted(new_beams.items(), key=lambda x: x[1], reverse=True)[:beam_width]
113
+
114
+ # Best sequence is the string with highest probability
115
+ best_sequence = max(beams, key=lambda x: x[1])[0]
116
+ decoded_texts.append(best_sequence)
117
+
118
+ return decoded_texts
119
+
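# Note: merging duplicate beams with max() is an approximation. A full CTC
# prefix beam search sums the probabilities of all alignments and tracks
# blank/non-blank endings per prefix, which is also what lets doubled letters
# ("aa") survive; this variant can never emit them.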
120
+
121
+ def calculate_cer(predictions: List[str], ground_truths: List[str]) -> float:
122
+ """
123
+ Calculate Character Error Rate (CER)
124
+
125
+ CER = (Substitutions + Deletions + Insertions) / Total Characters
126
+ """
127
+ if len(predictions) != len(ground_truths):
128
+ raise ValueError("Predictions and ground truths must have same length")
129
+
130
+ total_distance = 0
131
+ total_length = 0
132
+
133
+ for pred, gt in zip(predictions, ground_truths):
134
+ distance = _editdistance(pred, gt)
135
+ total_distance += distance
136
+ total_length += len(gt)
137
+
138
+ cer = (total_distance / total_length * 100) if total_length > 0 else 0
139
+ return cer
140
+
141
+
142
+ def calculate_wer(predictions: List[str], ground_truths: List[str]) -> float:
143
+ """
144
+ Calculate Word Error Rate (WER)
145
+
146
+ WER = (Substitutions + Deletions + Insertions) / Total Words
147
+ """
148
+ if len(predictions) != len(ground_truths):
149
+ raise ValueError("Predictions and ground truths must have same length")
150
+
151
+ total_distance = 0
152
+ total_length = 0
153
+
154
+ for pred, gt in zip(predictions, ground_truths):
155
+ pred_words = pred.split()
156
+ gt_words = gt.split()
157
+
158
+ distance = _editdistance(pred_words, gt_words)
159
+ total_distance += distance
160
+ total_length += len(gt_words)
161
+
162
+ wer = (total_distance / total_length * 100) if total_length > 0 else 0
163
+ return wer
164
+
165
+
166
+ def calculate_accuracy(predictions: List[str], ground_truths: List[str]) -> float:
167
+ """
168
+ Calculate exact match accuracy
169
+ """
170
+ if len(predictions) != len(ground_truths):
171
+ raise ValueError("Predictions and ground truths must have same length")
172
+
173
+ correct = sum(1 for pred, gt in zip(predictions, ground_truths) if pred == gt)
174
+ accuracy = (correct / len(predictions) * 100) if len(predictions) > 0 else 0
175
+
176
+ return accuracy
177
+
178
+
179
+ class EarlyStopping:
180
+ """
181
+ Early stopping to stop training when validation loss stops improving
182
+ """
183
+
184
+ def __init__(self, patience=10, min_delta=0.001):
185
+ self.patience = patience
186
+ self.min_delta = min_delta
187
+ self.counter = 0
188
+ self.best_loss = None
189
+ self.early_stop = False
190
+
191
+ def __call__(self, val_loss):
192
+ if self.best_loss is None:
193
+ self.best_loss = val_loss
194
+ elif val_loss > self.best_loss - self.min_delta:
195
+ self.counter += 1
196
+ if self.counter >= self.patience:
197
+ self.early_stop = True
198
+ else:
199
+ self.best_loss = val_loss
200
+ self.counter = 0
201
+
202
+ return self.early_stop
203
+
204
+
205
+ class AverageMeter:
206
+ """
207
+ Computes and stores the average and current value
208
+ """
209
+
210
+ def __init__(self):
211
+ self.reset()
212
+
213
+ def reset(self):
214
+ self.val = 0
215
+ self.avg = 0
216
+ self.sum = 0
217
+ self.count = 0
218
+
219
+ def update(self, val, n=1):
220
+ self.val = val
221
+ self.sum += val * n
222
+ self.count += n
223
+ self.avg = self.sum / self.count
224
+
225
+
226
+ def calculate_confusion_matrix(predictions: List[str], ground_truths: List[str], char_set: List[str]) -> np.ndarray:
227
+ """
228
+ Calculate character-level confusion matrix
229
+
230
+ Args:
231
+ predictions: List of predicted strings
232
+ ground_truths: List of ground truth strings
233
+ char_set: List of all possible characters
234
+
235
+ Returns:
236
+ Confusion matrix [num_chars, num_chars]
237
+ """
238
+ char_to_idx = {char: idx for idx, char in enumerate(char_set)}
239
+ n_chars = len(char_set)
240
+
241
+ confusion = np.zeros((n_chars, n_chars), dtype=np.int64)
242
+
243
+ for pred, gt in zip(predictions, ground_truths):
244
+ # Align sequences (simple alignment)
245
+ max_len = max(len(pred), len(gt))
246
+ pred_padded = pred + ' ' * (max_len - len(pred))
247
+ gt_padded = gt + ' ' * (max_len - len(gt))
248
+
249
+ for p_char, g_char in zip(pred_padded, gt_padded):
250
+ if p_char in char_to_idx and g_char in char_to_idx:
251
+ confusion[char_to_idx[g_char], char_to_idx[p_char]] += 1
252
+
253
+ return confusion
254
+
255
+
256
+ def extract_form_fields(text: str, form_type: str) -> Dict[str, str]:
257
+ """
258
+ Extract specific fields from recognized text based on form type
259
+
260
+ Args:
261
+ text: Recognized text
262
+ form_type: 'form1a', 'form2a', 'form3a', 'form90'
263
+
264
+ Returns:
265
+ Dictionary of extracted fields
266
+ """
267
+ fields = {}
268
+
269
+ if form_type == 'form1a': # Birth Certificate
270
+ # Extract common fields (simplified)
271
+ # In practice, use NER or regex patterns
272
+ fields['type'] = 'Birth Certificate'
273
+ # Add more field extraction logic
274
+
275
+ elif form_type == 'form2a': # Death Certificate
276
+ fields['type'] = 'Death Certificate'
277
+
278
+ elif form_type == 'form3a': # Marriage Certificate
279
+ fields['type'] = 'Marriage Certificate'
280
+
281
+ elif form_type == 'form90': # Marriage License Application
282
+ fields['type'] = 'Marriage License Application'
283
+
284
+ return fields
285
+
286
+
287
+ def validate_extracted_data(data: Dict[str, str], form_type: str) -> Tuple[bool, List[str]]:
288
+ """
289
+ Validate extracted data for completeness and format
290
+
291
+ Args:
292
+ data: Extracted data dictionary
293
+ form_type: Form type
294
+
295
+ Returns:
296
+ (is_valid, list_of_errors)
297
+ """
298
+ errors = []
299
+
300
+ # Define required fields per form type
301
+ required_fields = {
302
+ 'form1a': ['name', 'date_of_birth', 'place_of_birth'],
303
+ 'form2a': ['name', 'date_of_death', 'place_of_death'],
304
+ 'form3a': ['husband_name', 'wife_name', 'date_of_marriage'],
305
+ 'form90': ['husband_name', 'wife_name', 'date_of_application']
306
+ }
307
+
308
+ # Check required fields
309
+ for field in required_fields.get(form_type, []):
310
+ if field not in data or not data[field]:
311
+ errors.append(f"Missing required field: {field}")
312
+
313
+ # Additional validation can be added here
314
+ # - Date format validation
315
+ # - Name format validation
316
+ # - etc.
317
+
318
+ is_valid = len(errors) == 0
319
+ return is_valid, errors
320
+
321
+
322
+ def load_checkpoint(checkpoint_path, model, optimizer=None, device='cpu'):
323
+ """
324
+ Load model checkpoint
325
+
326
+ Args:
327
+ checkpoint_path: Path to checkpoint file
328
+ model: Model instance
329
+ optimizer: Optimizer instance (optional)
330
+ device: Device to load to
331
+
332
+ Returns:
333
+ (model, optimizer, checkpoint_dict)
334
+ """
335
+ checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
336
+
337
+ model.load_state_dict(checkpoint['model_state_dict'])
338
+
339
+ if optimizer is not None and 'optimizer_state_dict' in checkpoint:
340
+ optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
341
+
342
+ print(f"βœ“ Loaded checkpoint from {checkpoint_path}")
343
+ print(f" Epoch: {checkpoint.get('epoch', 'N/A')}")
344
+ if 'val_cer' in checkpoint:
345
+ print(f" Val CER : {checkpoint['val_cer']:.4f}%")
346
+ elif 'val_loss' in checkpoint:
347
+ print(f" Val Loss : {checkpoint['val_loss']:.4f} (run compare_live_cer.py for true CER)")
348
+ else:
349
+ print(f" Val CER : N/A (run compare_live_cer.py for true CER)")
350
+
351
+ return model, optimizer, checkpoint
352
+
353
+
354
+ def save_predictions_to_file(predictions: List[str], ground_truths: List[str], output_file: str):
355
+ """
356
+ Save predictions and ground truths to file for analysis
357
+ """
358
+ with open(output_file, 'w', encoding='utf-8') as f:
359
+ f.write("Ground Truth\tPrediction\tMatch\n")
360
+ f.write("=" * 80 + "\n")
361
+
362
+ for gt, pred in zip(ground_truths, predictions):
363
+ match = "βœ“" if gt == pred else "βœ—"
364
+ f.write(f"{gt}\t{pred}\t{match}\n")
365
+
366
+ print(f"βœ“ Predictions saved to {output_file}")
367
+
368
+
369
+ if __name__ == "__main__":
370
+ # Test utility functions
371
+ print("=" * 60)
372
+ print("Testing Utility Functions")
373
+ print("=" * 60)
374
+
375
+ # Test CER calculation
376
+ predictions = ["Hello World", "Test", "Sample Text"]
377
+ ground_truths = ["Hello World", "Tset", "Sample Txt"]
378
+
379
+ cer = calculate_cer(predictions, ground_truths)
380
+ wer = calculate_wer(predictions, ground_truths)
381
+ accuracy = calculate_accuracy(predictions, ground_truths)
382
+
383
+ print(f"\nMetrics:")
384
+ print(f" CER: {cer:.2f}%")
385
+ print(f" WER: {wer:.2f}%")
386
+ print(f" Accuracy: {accuracy:.2f}%")
387
+
388
+ # Test early stopping
389
+ print("\nTesting Early Stopping:")
390
+ early_stopping = EarlyStopping(patience=3, min_delta=0.001)
391
+
392
+ val_losses = [1.0, 0.9, 0.85, 0.84, 0.84, 0.84, 0.84]
393
+ for epoch, loss in enumerate(val_losses, 1):
394
+ should_stop = early_stopping(loss)
395
+ print(f" Epoch {epoch}: Loss = {loss:.2f}, Stop = {should_stop}")
396
+ if should_stop:
397
+ break
MNB/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # mnb/__init__.py
2
+ from .classifier import MNBClassifier
3
+
4
+ __all__ = ["MNBClassifier"]
MNB/classifier.py ADDED
@@ -0,0 +1,292 @@
1
+ # mnb/classifier.py
2
+ # ============================================================
3
+ # MNB CLASSIFIER β€” wraps the trained DocumentClassifier
4
+ #
5
+ # TWO SEPARATE CONCERNS:
6
+ #
7
+ # PATH A β€” Certifications Page
8
+ # User uploads a certification scan.
9
+ # MNB identifies which form it is:
10
+ # form102 β†’ Form 102 (Certificate of Live Birth)
11
+ # form103 β†’ Form 103 (Certificate of Death)
12
+ # form97 β†’ Form 97 (Certificate of Marriage)
13
+ #
14
+ # PATH B β€” Application for Marriage License Page (Form 90)
15
+ # User uploads TWO birth certificates:
16
+ # - Groom's Birth Cert (PSA/NSO sealed)
17
+ # - Bride's Birth Cert (PSA/NSO sealed)
18
+ # MNB is NOT used for form type here β€” the upload page
19
+ # already tells us it's a birth cert.
20
+ # classify_sex() reads the SEX field β†’ GROOM (Male) or BRIDE (Female)
21
+ # and routes each cert to the correct Form 90 slot.
22
+ #
23
+ # Files needed:
24
+ # form_classifier.py ← training + DocumentClassifier
25
+ # models/mnb_classifier.pkl
26
+ # models/tfidf_vectorizer.pkl
27
+ # models/mnb_metadata.json
28
+ # ============================================================
29
+
30
+ import sys
31
+ import os
32
+
33
+ _mnb_dir = os.path.dirname(os.path.abspath(__file__))
34
+ if _mnb_dir not in sys.path:
35
+ sys.path.insert(0, _mnb_dir)
36
+
37
+ _root_dir = os.path.dirname(_mnb_dir)
38
+ if _root_dir not in sys.path:
39
+ sys.path.insert(0, _root_dir)
40
+
41
+ try:
42
+ from form_classifier import DocumentClassifier
43
+ _HAVE_DOC_CLASSIFIER = True
44
+ except ImportError:
45
+ _HAVE_DOC_CLASSIFIER = False
46
+
47
+
48
+ # ── Keyword fallback (used if .pkl files not found) ────────
49
+ # Uses exact Philippine civil registry form headers
50
+ _FORM_KEYWORDS = {
51
+ "form102": [
52
+ "Municipal Form No. 102",
53
+ "Municipal Form No.102",
54
+ "Certificate of Live Birth",
55
+ "live birth",
56
+ "name of child",
57
+ "date of birth",
58
+ "place of birth",
59
+ "birth certificate",
60
+ "mother", "father",
61
+ "infant", "newborn",
62
+ "attendant at birth",
63
+ ],
64
+ "form103": [
65
+ "Municipal Form No. 103",
66
+ "Municipal Form No.103",
67
+ "Certificate of Death",
68
+ "death certificate",
69
+ "name of deceased",
70
+ "date of death",
71
+ "place of death",
72
+ "cause of death",
73
+ "burial", "deceased",
74
+ "immediate cause",
75
+ ],
76
+ "form97": [
77
+ "Municipal Form No. 97",
78
+ "Municipal Form No.97",
79
+ "Certificate of Marriage",
80
+ "marriage certificate",
81
+ "name of husband",
82
+ "name of wife",
83
+ "date of marriage",
84
+ "place of marriage",
85
+ "solemnizing officer",
86
+ "contracting parties",
87
+ "witnesses",
88
+ ],
89
+ }
90
+
91
+ # Sex keywords for Form 90 routing (Groom/Bride)
92
+ _SEX_KEYWORDS = {
93
+ "GROOM": [
94
+ "sex: male",
95
+ "sex male",
96
+ "2. sex: male",
97
+ " male",
98
+ "sex m",
99
+ ],
100
+ "BRIDE": [
101
+ "sex: female",
102
+ "sex female",
103
+ "2. sex: female",
104
+ " female",
105
+ "sex f",
106
+ ],
107
+ }
108
+
109
+ def _keyword_classify_form(text: str) -> str:
110
+ """Keyword fallback for Certifications page classification."""
111
+ t = text.lower()
112
+ scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _FORM_KEYWORDS.items()}
113
+ return max(scores, key=scores.get)
114
+
115
+ def _keyword_classify_sex(text: str) -> str:
116
+ """Keyword-based sex classifier for Form 90 routing."""
117
+ t = text.lower()
118
+ scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _SEX_KEYWORDS.items()}
119
+ return max(scores, key=scores.get)
120
+
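# Note: on a tie (e.g. badly garbled OCR that matches zero keywords) max()
# returns the first key in insertion order, i.e. "form102" and "GROOM"
# respectively, so these fallbacks default silently; callers wanting stricter
# behavior could require a minimum score before trusting the result.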
121
+
122
+ # ── Form code β†’ NER hint map ──────────────────────────────
123
+ _FORM_CODE_TO_HINT = {
124
+ "form102": "birth",
125
+ "form103": "death",
126
+ "form97": "marriage",
127
+ # Form 90 is handled by classify_sex() β€” not this map
128
+ }
129
+
130
+
131
+ class MNBClassifier:
132
+ """
133
+ MNB Classifier for the Civil Registry Digitization System.
134
+
135
+ PATH A β€” Certifications Page:
136
+ mnb = MNBClassifier()
137
+ form_code = mnb.classify_form_type(ocr_text)
138
+ # β†’ 'form102' | 'form103' | 'form97'
139
+
140
+ hint = mnb.get_ner_hint(ocr_text)
141
+ # β†’ 'birth' | 'death' | 'marriage'
142
+
143
+ result = mnb.classify_full(ocr_text)
144
+ # β†’ {'label': 'Form 102 - Certificate of Live Birth',
145
+ # 'form_code': 'form102', 'confidence': 0.97, 'probabilities': {...}}
146
+
147
+ PATH B β€” Application for Marriage License Page (Form 90):
148
+ sex_role = mnb.classify_sex(ocr_text)
149
+ # β†’ 'GROOM' (Male birth cert) | 'BRIDE' (Female birth cert)
150
+ """
151
+
152
+ def __init__(self, model_dir: str = "models"):
153
+ self._doc_clf = None
154
+ if _HAVE_DOC_CLASSIFIER:
155
+ try:
156
+ self._doc_clf = DocumentClassifier(model_dir=model_dir)
157
+ print(f" [MNB] Loaded DocumentClassifier from {model_dir}/")
158
+ except FileNotFoundError as e:
159
+ print(f" [MNB] {e}")
160
+ print(" [MNB] Using keyword fallback β€” run: python mnb/form_classifier.py")
161
+ else:
162
+ print(" [MNB] form_classifier.py not found β€” using keyword fallback")
163
+
164
+ # ── PATH A: Certifications Page ────────────────────────
165
+
166
+ def classify_form_type(self, ocr_text: str) -> str:
167
+ """
168
+ Certifications page: identify which form was uploaded.
169
+ Returns: 'form102' | 'form103' | 'form97'
170
+ """
171
+ if self._doc_clf is not None:
172
+ return self._doc_clf.predict(ocr_text)["form_code"]
173
+ return _keyword_classify_form(ocr_text)
174
+
175
+ def classify_full(self, ocr_text: str) -> dict:
176
+ """
177
+ Certifications page: full result with confidence scores.
178
+ Returns:
179
+ {
180
+ 'label': 'Form 102 - Certificate of Live Birth',
181
+ 'form_code': 'form102',
182
+ 'confidence': 0.97,
183
+ 'probabilities': { ... }
184
+ }
185
+ """
186
+ if self._doc_clf is not None:
187
+ return self._doc_clf.predict(ocr_text)
188
+ winner = _keyword_classify_form(ocr_text)
189
+ return {
190
+ "label": winner,
191
+ "form_code": winner,
192
+ "confidence": 1.0,
193
+ "probabilities": {k: (1.0 if k == winner else 0.0) for k in _FORM_KEYWORDS},
194
+ }
195
+
196
+ def get_ner_hint(self, ocr_text: str) -> str:
197
+ """
198
+ Returns NER hint string for bridge.py:
199
+ 'birth' | 'death' | 'marriage'
200
+ """
201
+ code = self.classify_form_type(ocr_text)
202
+ return _FORM_CODE_TO_HINT.get(code, "birth")
203
+
204
+ # ── PATH B: Marriage License Page (Form 90) ────────────
205
+
206
+ def classify_sex(self, ocr_text: str) -> str:
207
+ """
208
+ Form 90 upload page only.
209
+ Reads the SEX field on a PSA/NSO birth certificate.
210
+ Returns: 'GROOM' (Male) | 'BRIDE' (Female)
211
+ """
212
+ return _keyword_classify_sex(ocr_text)
213
+
214
+ def classify_sex_proba(self, ocr_text: str) -> dict:
215
+ """
216
+ Returns confidence scores for sex classification.
217
+ Returns: {'GROOM': 0.9, 'BRIDE': 0.1}
218
+ """
219
+ winner = _keyword_classify_sex(ocr_text)
220
+ return {k: (1.0 if k == winner else 0.0) for k in _SEX_KEYWORDS}
221
+
222
+
223
+ # ── Quick test ──────────────────────────────────────────────
224
+ if __name__ == "__main__":
225
+ mnb = MNBClassifier()
226
+
227
+ print("\n ── PATH A: Certifications Page Tests ──")
228
+ cert_tests = [
229
+ (
230
+ "Municipal Form No. 102 Certificate of Live Birth "
231
+ "Name of child Maria Santos Date of birth 01/15/1990 "
232
+ "Place of birth Brgy. San Jose Tarlac City "
233
+ "Name of mother Lani Santos Name of father Jose Santos "
234
+ "Sex Female birth certificate infant",
235
+ "form102"
236
+ ),
237
+ (
238
+ "Municipal Form No.102 Certificate of Live Birth "
239
+ "PSA Child Juan Dela Cruz born 03/22/1985 Capas Tarlac "
240
+ "mother Rosa father Pedro Sex Male",
241
+ "form102"
242
+ ),
243
+ (
244
+ "Municipal Form No. 103 Certificate of Death "
245
+ "Name of deceased Pedro Reyes Date of death 03/22/2020 "
246
+ "Cause of death Cardiac Arrest death certificate burial",
247
+ "form103"
248
+ ),
249
+ (
250
+ "Municipal Form No.103 Certificate of Death "
251
+ "Deceased Ana Torres died 07/04/2000 Pneumonia burial permit",
252
+ "form103"
253
+ ),
254
+ (
255
+ "Municipal Form No. 97 Certificate of Marriage "
256
+ "Name of husband Carlos Bautista Name of wife Ana Torres "
257
+ "Date of marriage 07/04/2005 solemnizing officer witnesses",
258
+ "form97"
259
+ ),
260
+ (
261
+ "Municipal Form No.97 Certificate of Marriage "
262
+ "Husband Jose Santos wife Maria Reyes married 11/30/1995 "
263
+ "contracting parties",
264
+ "form97"
265
+ ),
266
+ ]
267
+
268
+ for text, expected in cert_tests:
269
+ result = mnb.classify_full(text)
270
+ mark = "βœ…" if result["form_code"] == expected else "❌"
271
+ print(f" {mark} Expected={expected:<8} Got={result['form_code']:<8} "
272
+ f"Confidence={result['confidence']:.1%} ({result['label']})")
273
+
274
+ print("\n ── PATH B: Form 90 Marriage License β€” Sex Routing Tests ──")
275
+ sex_tests = [
276
+ (
277
+ "Municipal Form No.102 Certificate of Live Birth PSA "
278
+ "CHILD (First): Juan Dela Cruz SEX: Male "
279
+ "Date of Birth March 15 1990 Mother Maria Dela Cruz",
280
+ "GROOM"
281
+ ),
282
+ (
283
+ "Municipal Form No.102 Certificate of Live Birth NSO "
284
+ "CHILD (First): Ana Santos SEX: Female "
285
+ "Date of Birth August 21 1995 Mother Gloria Santos",
286
+ "BRIDE"
287
+ ),
288
+ ]
289
+ for text, expected in sex_tests:
290
+ pred = mnb.classify_sex(text)
291
+ mark = "βœ…" if pred == expected else "❌"
292
+ print(f" {mark} Expected={expected} Got={pred}")
MNB/form_classifier.py ADDED
@@ -0,0 +1,466 @@
1
+ """
2
+ form_classifier.py
3
+ =======================
4
+ Multinomial Naive Bayes (MNB) Document Classifier
5
+ for Local Civil Registry Document Digitization System
6
+
7
+ Classifies extracted OCR text into:
8
+ - Form 102 (Certificate of Live Birth) ← Certifications page
9
+ - Form 103 (Certificate of Death) ← Certifications page
10
+ - Form 97 (Certificate of Marriage) ← Certifications page
11
+
12
+ NOTE: Form 90 (Application for Marriage License) is NOT classified here.
13
+ Form 90 has its OWN upload page where the user uploads:
14
+ - Groom's Birth Certificate (PSA/NSO sealed)
15
+ - Bride's Birth Certificate (PSA/NSO sealed)
16
+ The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
17
+ See classify_sex() in classifier.py for that routing.
18
+
19
+ Usage:
20
+ python form_classifier.py # trains and saves model
21
+ python form_classifier.py --test # runs test predictions
22
+ """
23
+
24
+ import os
25
+ import json
26
+ import random
27
+ import argparse
28
+ import pickle
29
+ import numpy as np
30
+ from sklearn.naive_bayes import MultinomialNB
31
+ from sklearn.feature_extraction.text import TfidfVectorizer
32
+ from sklearn.model_selection import train_test_split
33
+ from sklearn.metrics import (
34
+ accuracy_score, classification_report, confusion_matrix
35
+ )
36
+
37
+ # ─────────────────────────────────────────────────────────────
38
+ # 1. LABEL MAP (Certifications page only β€” NO Form 90 here)
39
+ # ─────────────────────────────────────────────────────────────
40
+ LABEL_MAP = {
41
+ 0: 'Form 102 - Certificate of Live Birth',
42
+ 1: 'Form 103 - Certificate of Death',
43
+ 2: 'Form 97 - Certificate of Marriage',
44
+ }
45
+ LABEL_NAMES = list(LABEL_MAP.values())
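# Note: json.dump() stringifies these int keys ("0", "1", "2") when LABEL_MAP
# is written into mnb_metadata.json, so anything re-reading the metadata
# should int() the keys before indexing.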
46
+
47
+ # ─────────────────────────────────────────────────────────────
48
+ # 2. VOCABULARY POOLS (Filipino civil registry)
49
+ # ─────────────────────────────────────────────────────────────
50
+ FIRST_NAMES = [
51
+ 'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani',
52
+ 'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo',
53
+ 'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael',
54
+ 'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary'
55
+ ]
56
+ LAST_NAMES = [
57
+ 'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores',
58
+ 'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim',
59
+ 'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez',
60
+ 'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual'
61
+ ]
62
+ MUNICIPALITIES = [
63
+ 'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling',
64
+ 'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia',
65
+ 'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao'
66
+ ]
67
+ PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales']
68
+ BARANGAYS = [
69
+ 'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo',
70
+ 'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas',
71
+ 'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan'
72
+ ]
73
+ DATES = [
74
+ '01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995',
75
+ '05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980',
76
+ '12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970',
77
+ ]
78
+
79
+ def _name():
80
+ return f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}"
81
+
82
+ def _date():
83
+ return random.choice(DATES)
84
+
85
+ def _place():
86
+ return f"{random.choice(BARANGAYS)}, {random.choice(MUNICIPALITIES)}, {random.choice(PROVINCES)}"
87
+
88
+
89
+ # ─────────────────────────────────────────────────────────────
90
+ # 3. SAMPLE GENERATORS
91
+ # Each generator uses the EXACT Philippine form header
92
+ # so MNB learns the real keywords from actual documents.
93
+ # ─────────────────────────────────────────────────────────────
94
+
95
+ def generate_form102():
96
+ """
97
+ Form 102 β€” Certificate of Live Birth
98
+ Header keywords: 'Municipal Form No. 102', 'Certificate of Live Birth'
99
+ """
100
+ templates = [
101
+ # Template A: Exact header present
102
+ f"Municipal Form No. 102 Certificate of Live Birth "
103
+ f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} "
104
+ f"Name of mother {_name()} Name of father {_name()} "
105
+ f"Sex {random.choice(['Male', 'Female'])} "
106
+ f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} "
107
+ f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} "
108
+ f"birth certificate registry birth registration infant newborn child",
109
+
110
+ # Template B: No. without space
111
+ f"Municipal Form No.102 Certificate of Live Birth "
112
+ f"Child {_name()} born {_date()} at {_place()} "
113
+ f"mother {_name()} father {_name()} "
114
+ f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg "
115
+ f"birth order {random.choice(['First', 'Second', 'Third'])} "
116
+ f"birth certificate Form 102",
117
+
118
+ # Template C: Registry number format
119
+ f"Municipal Form No. 102 Certificate of Live Birth "
120
+ f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} "
121
+ f"name of child {_name()} date of birth {_date()} "
122
+ f"place of birth {_place()} birth certificate municipal civil registrar",
123
+
124
+ # Template D: PSA/NSO sealed copy (used when filing Form 90)
125
+ f"Municipal Form No. 102 Certificate of Live Birth "
126
+ f"PSA {_name()} born on {_date()} "
127
+ f"place of birth {_place()} "
128
+ f"mother maiden name {_name()} father {_name()} "
129
+ f"type of birth {random.choice(['Single', 'Twin'])} infant newborn",
130
+
131
+ # Template E: NSO variation
132
+ f"Municipal Form No.102 Certificate of Live Birth "
133
+ f"NSO birth registration {_name()} "
134
+ f"birth date {_date()} birthplace {_place()} "
135
+ f"parents mother {_name()} father {_name()} "
136
+ f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} "
137
+ f"sex {random.choice(['male', 'female'])}",
138
+ ]
139
+ return random.choice(templates)
140
+
141
+
142
+ def generate_form103():
143
+ """
144
+ Form 103 β€” Certificate of Death
145
+ Header keywords: 'Municipal Form No. 103', 'Certificate of Death'
146
+ """
147
+ causes = [
148
+ 'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
149
+ 'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
150
+ 'Septicemia', 'Renal Failure'
151
+ ]
152
+ templates = [
153
+ # Template A: Exact header
154
+ f"Municipal Form No. 103 Certificate of Death "
155
+ f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} "
156
+ f"Cause of death {random.choice(causes)} Age at death {random.randint(1, 95)} "
157
+ f"Sex {random.choice(['Male', 'Female'])} "
158
+ f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} "
159
+ f"death certificate deceased burial interment",
160
+
161
+ # Template B: No space
162
+ f"Municipal Form No.103 Certificate of Death "
163
+ f"Deceased {_name()} died on {_date()} at {_place()} "
164
+ f"cause {random.choice(causes)} corpse informant {_name()} "
165
+ f"death certificate Form 103 municipal civil registrar",
166
+
167
+ # Template C: Registry format
168
+ f"Municipal Form No. 103 Certificate of Death "
169
+ f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} "
170
+ f"name of deceased {_name()} date of death {_date()} "
171
+ f"place of death {_place()} cause of death {random.choice(causes)} "
172
+ f"death certificate burial permit",
173
+
174
+ # Template D: Clinical format
175
+ f"Municipal Form No.103 Certificate of Death "
176
+ f"{_name()} died {_date()} "
177
+ f"place {_place()} cause of death {random.choice(causes)} "
178
+ f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} "
179
+ f"death deceased cadaver",
180
+
181
+ # Template E: Full form
182
+ f"Municipal Form No. 103 Certificate of Death "
183
+ f"Form 103 death registration {_name()} "
184
+ f"date of death {_date()} place of death {_place()} "
185
+ f"immediate cause {random.choice(causes)} "
186
+ f"attending physician {_name()} certificate of death",
187
+ ]
188
+ return random.choice(templates)
189
+
190
+
191
+ def generate_form97():
192
+ """
193
+ Form 97 β€” Certificate of Marriage
194
+ Header keywords: 'Municipal Form No. 97', 'Certificate of Marriage'
195
+ """
196
+ officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.']
197
+ licenses = [f"{random.randint(10000, 99999)}", f"ML-{random.randint(1000, 9999)}"]
198
+ templates = [
199
+ # Template A: Exact header
200
+ f"Municipal Form No. 97 Certificate of Marriage "
201
+ f"Name of husband {_name()} Name of wife {_name()} "
202
+ f"Date of marriage {_date()} Place of marriage {_place()} "
203
+ f"Solemnizing officer {random.choice(officers)} {_name()} "
204
+ f"Marriage license number {random.choice(licenses)} witnesses {_name()} {_name()} "
205
+ f"marriage certificate contracting parties wedding",
206
+
207
+ # Template B: No space
208
+ f"Municipal Form No.97 Certificate of Marriage "
209
+ f"Husband {_name()} wife {_name()} "
210
+ f"married on {_date()} at {_place()} "
211
+ f"officiated by {random.choice(officers)} {_name()} "
212
+ f"marriage certificate Form 97 solemnizing officer",
213
+
214
+ # Template C: Registry format
215
+ f"Municipal Form No. 97 Certificate of Marriage "
216
+ f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} "
217
+ f"husband {_name()} wife {_name()} "
218
+ f"date of marriage {_date()} place {_place()} "
219
+ f"marriage license {random.choice(licenses)} issued at {_place()} "
220
+ f"marriage certificate civil registrar",
221
+
222
+ # Template D: Ceremony format
223
+ f"Municipal Form No.97 Certificate of Marriage "
224
+ f"{_name()} and {_name()} "
225
+ f"solemnized {_date()} at {_place()} "
226
+ f"solemnizing officer {random.choice(officers)} {_name()} "
227
+ f"witnesses {_name()} {_name()} "
228
+ f"marriage contracting parties husband wife ceremony",
229
+
230
+ # Template E: Full form
231
+ f"Municipal Form No. 97 Certificate of Marriage "
232
+ f"Form 97 marriage registration husband {_name()} "
233
+ f"wife {_name()} date of marriage {_date()} "
234
+ f"place of marriage {_place()} "
235
+ f"license number {random.choice(licenses)} marriage nuptial wed",
236
+ ]
237
+ return random.choice(templates)
238
+
239
+
240
+ # ─────────────────────────────────────────────────────────────
241
+ # 4. DATASET GENERATOR (3 classes only β€” no Form 90)
242
+ # ─────────────────────────────────────────────────────────────
243
+ def generate_dataset(samples_per_class=150):
244
+ generators = [generate_form102, generate_form103, generate_form97]
245
+ labels_map = [0, 1, 2] # 0=Form102, 1=Form103, 2=Form97
246
+
247
+ texts, labels = [], []
248
+ for gen, label in zip(generators, labels_map):
249
+ for _ in range(samples_per_class):
250
+ texts.append(gen())
251
+ labels.append(label)
252
+
253
+ combined = list(zip(texts, labels))
254
+ random.shuffle(combined)
255
+ texts, labels = zip(*combined)
256
+ return list(texts), list(labels)
257
+
258
+
259
+ # ─────────────────────────────────────────────────────────────
260
+ # 5. TRAIN & SAVE
261
+ # ─────────────────────────────────────────────────────────────
262
+ def train(samples_per_class=150, save_dir='models'):
263
+ os.makedirs(save_dir, exist_ok=True)
264
+
265
+ print("=" * 60)
266
+ print(" MNB Document Classifier | Filipino Civil Registry")
267
+ print(" Certifications Page: Form 102 / 103 / 97 ONLY")
268
+ print(" (Form 90 routing is handled separately via SEX field)")
269
+ print("=" * 60)
270
+
271
+ print(f"\n Generating dataset ({samples_per_class} samples Γ— 3 forms = {samples_per_class * 3} total)...")
272
+ texts, labels = generate_dataset(samples_per_class)
273
+
274
+ X_train, X_test, y_train, y_test = train_test_split(
275
+ texts, labels, test_size=0.2, random_state=42, stratify=labels
276
+ )
277
+ print(f" Train: {len(X_train)} | Test: {len(X_test)}")
278
+
279
+ # TF-IDF vectorizer
280
+ vectorizer = TfidfVectorizer(
281
+ ngram_range=(1, 2),
282
+ max_features=5000,
283
+ sublinear_tf=True,
284
+ min_df=1,
285
+ )
286
+ X_train_vec = vectorizer.fit_transform(X_train)
287
+ X_test_vec = vectorizer.transform(X_test)
288
+
289
+ # Train MNB
290
+ clf = MultinomialNB(alpha=0.1)
291
+ clf.fit(X_train_vec, y_train)
292
+
293
+ # Evaluate
294
+ y_pred = clf.predict(X_test_vec)
295
+ acc = accuracy_score(y_test, y_pred)
296
+
297
+ print(f"\n Accuracy : {acc * 100:.2f}%")
298
+ print("\n Classification Report:")
299
+ print(classification_report(y_test, y_pred, target_names=LABEL_NAMES))
300
+
301
+ print(" Confusion Matrix:")
302
+ cm = confusion_matrix(y_test, y_pred)
303
+ headers = ['Form102', 'Form103', 'Form97']
304
+ print(f" {'':30s} " + " ".join(headers))
305
+ for i, row in enumerate(cm):
306
+ print(f" Actual {headers[i]}: {str(row)}")
307
+
308
+ # Save
309
+ model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
310
+ vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
311
+ with open(model_path, 'wb') as f:
312
+ pickle.dump(clf, f)
313
+ with open(vec_path, 'wb') as f:
314
+ pickle.dump(vectorizer, f)
315
+
316
+ meta = {
317
+ 'accuracy': round(acc * 100, 2),
318
+ 'samples_per_class': samples_per_class,
319
+ 'total_samples': samples_per_class * 3,
320
+ 'labels': LABEL_MAP,
321
+ 'note': 'Form 90 routing is handled by classify_sex() β€” not this model',
322
+ 'model_path': model_path,
323
+ 'vectorizer_path': vec_path,
324
+ }
325
+ with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w') as f:
326
+ json.dump(meta, f, indent=2)
327
+
328
+ print(f"\n Model saved : {model_path}")
329
+ print(f" Vectorizer saved: {vec_path}")
330
+ print(f"\n Target accuracy : >90%")
331
+ print(f" Achieved : {acc * 100:.2f}% {'βœ“' if acc >= 0.90 else 'βœ— (try increasing samples_per_class)'}")
332
+ print("=" * 60)
333
+
334
+ return clf, vectorizer, acc
335
+
336
+
337
+ # ─────────────────────────────────────────────────────────────
338
+ # 6. DOCUMENT CLASSIFIER CLASS
339
+ # ─────────────────────────────────────────────────────────────
340
+ class DocumentClassifier:
341
+ """Load trained MNB model and classify OCR text from Certifications page."""
342
+
343
+ def __init__(self, model_dir='models'):
344
+ model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
345
+ vec_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl')
346
+
347
+ if not os.path.exists(model_path):
348
+ raise FileNotFoundError(
349
+ f"Model not found at {model_path}. Run: python form_classifier.py"
350
+ )
351
+
352
+ with open(model_path, 'rb') as f:
353
+ self.clf = pickle.load(f)
354
+ with open(vec_path, 'rb') as f:
355
+ self.vectorizer = pickle.load(f)
356
+
357
+ def predict(self, text: str) -> dict:
358
+ """
359
+ Classify OCR text from Certifications page.
360
+
361
+ Returns:
362
+ {
363
+ 'label': 'Form 102 - Certificate of Live Birth',
364
+ 'form_code': 'form102',
365
+ 'confidence': 0.95,
366
+ 'probabilities': { ... }
367
+ }
368
+ """
369
+ vec = self.vectorizer.transform([text])
370
+ probs = self.clf.predict_proba(vec)[0]
371
+ idx = int(np.argmax(probs))
372
+
373
+ form_codes = ['form102', 'form103', 'form97']
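# Note: positional indexing is safe here because the training labels are the
# ints 0/1/2 and sklearn sorts classes_, so predict_proba columns line up
# with this list's order.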
374
+ return {
375
+ 'label': LABEL_MAP[idx],
376
+ 'form_code': form_codes[idx],
377
+ 'confidence': round(float(probs[idx]), 4),
378
+ 'probabilities': {
379
+ LABEL_MAP[i]: round(float(p), 4)
380
+ for i, p in enumerate(probs)
381
+ }
382
+ }
383
+
384
+
385
+ # ─────────────────────────────────────────────────────────────
386
+ # 7. TEST DEMO
387
+ # ─────────────────────────────────────────────────────────────
388
+ def run_test():
389
+ print("\n" + "=" * 60)
390
+ print(" Testing DocumentClassifier β€” Certifications Page")
391
+ print("=" * 60)
392
+
393
+ classifier = DocumentClassifier()
394
+
395
+ test_cases = [
396
+ (
397
+ "Municipal Form No. 102 Certificate of Live Birth "
398
+ "Name of child Maria Santos Date of birth 01/15/1990 "
399
+ "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
400
+ "Name of mother Lani Santos Name of father Jose Santos "
401
+ "Sex Female birth certificate infant",
402
+ "Form 102 - Certificate of Live Birth"
403
+ ),
404
+ (
405
+ "Municipal Form No.102 Certificate of Live Birth "
406
+ "PSA Child Juan Dela Cruz born 03/22/1985 "
407
+ "Place of birth Capas Tarlac mother Rosa Dela Cruz "
408
+ "father Pedro Dela Cruz Sex Male",
409
+ "Form 102 - Certificate of Live Birth"
410
+ ),
411
+ (
412
+ "Municipal Form No. 103 Certificate of Death "
413
+ "Name of deceased Pedro Reyes Date of death 03/22/2020 "
414
+ "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
415
+ "Age at death 75 death certificate deceased burial",
416
+ "Form 103 - Certificate of Death"
417
+ ),
418
+ (
419
+ "Municipal Form No.103 Certificate of Death "
420
+ "Deceased Ana Torres died 07/04/2000 "
421
+ "cause Pneumonia burial permit interment",
422
+ "Form 103 - Certificate of Death"
423
+ ),
424
+ (
425
+ "Municipal Form No. 97 Certificate of Marriage "
426
+ "Name of husband Carlos Bautista Name of wife Ana Torres "
427
+ "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
428
+ "Solemnizing officer Rev. Santos witnesses marriage certificate",
429
+ "Form 97 - Certificate of Marriage"
430
+ ),
431
+ (
432
+ "Municipal Form No.97 Certificate of Marriage "
433
+ "Husband Jose Santos wife Maria Reyes "
434
+ "married 11/30/1995 contracting parties solemnizing officer",
435
+ "Form 97 - Certificate of Marriage"
436
+ ),
437
+ ]
438
+
439
+ correct = 0
440
+ for text, expected in test_cases:
441
+ result = classifier.predict(text)
442
+ status = 'βœ“' if expected in result['label'] else 'βœ—'
443
+ if expected in result['label']:
444
+ correct += 1
445
+ print(f"\n {status} Expected : {expected}")
446
+ print(f" Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)")
447
+
448
+ print(f"\n Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)")
449
+ print("=" * 60)
450
+
451
+
452
+ # ─────────────────────────────────────────────────────────────
453
+ # 8. MAIN
454
+ # ─────────────────────────────────────────────────────────────
455
+ if __name__ == '__main__':
456
+ parser = argparse.ArgumentParser()
457
+ parser.add_argument('--test', action='store_true', help='Run test predictions only')
458
+ parser.add_argument('--samples', type=int, default=150, help='Samples per class (default: 150)')
459
+ args = parser.parse_args()
460
+
461
+ if args.test:
462
+ run_test()
463
+ else:
464
+ train(samples_per_class=args.samples)
465
+ print("\nTo test predictions, run:")
466
+ print(" python form_classifier.py --test")
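
Not part of the committed file, but for reference: a minimal usage sketch for the DocumentClassifier defined above, assuming the trained artifacts sit in models/ and the module imports as form_classifier:

    from form_classifier import DocumentClassifier

    clf = DocumentClassifier(model_dir='models')
    result = clf.predict(
        "Municipal Form No. 103 Certificate of Death "
        "Name of deceased Pedro Reyes Cause of death Cardiac Arrest"
    )
    print(result['form_code'])   # 'form103' for this text
    print(result['confidence'])  # posterior probability of the top class
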
MNB/keywords.py ADDED
@@ -0,0 +1,127 @@
+ # mnb/keywords.py
+ # ============================================================
+ # Keyword fallback lists used by classifier.py when the
+ # trained .pkl models are not available.
+ #
+ # Uses EXACT Philippine civil registry form headers:
+ #   Form 102 → "Municipal Form No. 102 / Certificate of Live Birth"
+ #   Form 103 → "Municipal Form No. 103 / Certificate of Death"
+ #   Form 97  → "Municipal Form No. 97 / Certificate of Marriage"
+ #
+ # NOTE: Form 90 is NOT classified here.
+ # Form 90 has its own upload page (Application for Marriage License).
+ # The SEX field on the uploaded birth cert determines routing:
+ #   Male   → GROOM slot in Form 90
+ #   Female → BRIDE slot in Form 90
+ # ============================================================
+
+ # ── PATH A: Certifications Page ──────────────────────────────
+ FORM_KEYWORDS = {
+
+     "form102": [
+         # Exact header variants
+         "Municipal Form No. 102",
+         "Municipal Form No.102",
+         "Certificate of Live Birth",
+         # Field-level keywords
+         "name of child",
+         "date of birth",
+         "place of birth",
+         "birth certificate",
+         "name of mother",
+         "name of father",
+         "attendant at birth",
+         "type of birth",
+         "birth order",
+         "legitimacy",
+         "infant",
+         "newborn",
+         # PSA/NSO sealed copy keywords
+         "PSA",
+         "NSO",
+         "bc registry",
+     ],
+
+     "form103": [
+         # Exact header variants
+         "Municipal Form No. 103",
+         "Municipal Form No.103",
+         "Certificate of Death",
+         # Field-level keywords
+         "name of deceased",
+         "date of death",
+         "place of death",
+         "cause of death",
+         "death certificate",
+         "immediate cause",
+         "antecedent cause",
+         "underlying cause",
+         "burial",
+         "deceased",
+         "died",
+         "burial permit",
+         "interment",
+     ],
+
+     "form97": [
+         # Exact header variants
+         "Municipal Form No. 97",
+         "Municipal Form No.97",
+         "Certificate of Marriage",
+         # Field-level keywords
+         "name of husband",
+         "name of wife",
+         "date of marriage",
+         "place of marriage",
+         "marriage certificate",
+         "solemnizing officer",
+         "contracting parties",
+         "witnesses",
+         "marriage license number",
+         "mc registry",
+         "nuptial",
+         "wed",
+     ],
+
+     "form90": [
+         # Exact header variants
+         "Municipal Form 90",
+         "Municipal Form No. 90",
+         "Municipal Form No.90",
+         "Application for Marriage License",
+         "APPLICATION FOR MARRIAGE LICENSE",
+         "Form No. 2",
+         # Field-level keywords
+         "name of applicant",
+         "marriage license no",
+         "marriage license number",
+         "date of issuance",
+         "date of issuance of marriage license",
+         "groom",
+         "bride",
+         "may i apply for a license",
+         "accountable form no. 54",
+     ],
+ }
+
+ # ── PATH B: Form 90 Marriage License Page ────────────────────
+ # Used ONLY on the Marriage License upload page.
+ # Reads the SEX field from the uploaded PSA/NSO birth certificate.
+ #   Male   → GROOM (routed to Groom slot in Form 90)
+ #   Female → BRIDE (routed to Bride slot in Form 90)
+ SEX_KEYWORDS = {
+     "GROOM": [
+         "sex: male",
+         "sex male",
+         "2. sex: male",
+         " male",
+         "sex m",
+     ],
+     "BRIDE": [
+         "sex: female",
+         "sex female",
+         "2. sex: female",
+         " female",
+         "sex f",
+     ],
+ }
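
Illustrative only (the committed MNB/classifier.py may implement this differently): one way a fallback could consume these lists is to count case-insensitive keyword hits per form and take the top scorer, with SEX_KEYWORDS consulted only on the Form 90 page:

    from typing import Optional

    from keywords import FORM_KEYWORDS, SEX_KEYWORDS  # assumed import path

    def keyword_fallback(text: str) -> str:
        """Return the form code whose keyword list matches the OCR text most often."""
        t = text.lower()
        scores = {
            form: sum(kw.lower() in t for kw in kws)
            for form, kws in FORM_KEYWORDS.items()
        }
        return max(scores, key=scores.get)

    def classify_sex_fallback(text: str) -> Optional[str]:
        """Route a birth certificate to the GROOM or BRIDE slot via its SEX field."""
        t = text.lower()
        for slot, kws in SEX_KEYWORDS.items():  # GROOM is checked before BRIDE
            if any(kw in t for kw in kws):
                return slot
        return None

Plain substring matching is deliberately crude: short entries such as "wed" or " male" can over-match, which is why the trained MNB model is preferred whenever the .pkl files are present.
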
MNB/mnb_metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "accuracy": 100.0,
+   "samples_per_class": 150,
+   "total_samples": 450,
+   "labels": {
+     "0": "Form 102 - Certificate of Live Birth",
+     "1": "Form 103 - Certificate of Death",
+     "2": "Form 97 - Certificate of Marriage"
+   },
+   "note": "Form 90 routing is handled separately by classify_sex() using the SEX field on uploaded PSA/NSO birth certificates. Male = GROOM, Female = BRIDE.",
+   "pages": {
+     "certifications": "Classifies Form 102 / 103 / 97 from uploaded certification scan",
+     "marriage_license": "classify_sex() routes birth cert to GROOM or BRIDE slot in Form 90"
+   },
+   "model_path": "models/mnb_classifier.pkl",
+   "vectorizer_path": "models/tfidf_vectorizer.pkl"
+ }
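
Since this metadata is plain JSON, downstream code can recover the label map without touching the pickles; note that json round-trips the integer label keys as strings, so they need a cast back to int:

    import json

    with open('MNB/mnb_metadata.json') as f:
        meta = json.load(f)

    labels = {int(k): v for k, v in meta['labels'].items()}
    print(labels[0])           # Form 102 - Certificate of Live Birth
    print(meta['model_path'])  # models/mnb_classifier.pkl
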
MNB/models/mnb_classifier.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d62d9cbdd7d76b60d17787b93bcc22f51c5602934ac60117e15279c3a22c519
+ size 200089
MNB/models/mnb_metadata.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "accuracy": 100.0,
+   "samples_per_class": 150,
+   "total_samples": 600,
+   "labels": {
+     "0": "Form 1A - Birth Certificate",
+     "1": "Form 2A - Death Certificate",
+     "2": "Form 3A - Marriage Certificate",
+     "3": "Form 90 - Application for Marriage License"
+   },
+   "model_path": "models\\mnb_classifier.pkl",
+   "vectorizer_path": "models\\tfidf_vectorizer.pkl"
+ }
MNB/models/tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:217cd506c7d9d7bfcfef73fc107273c129d4d55ab7dfddc1190e2863ee381ec4
+ size 129497
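
Both .pkl entries above are stored as Git LFS pointers, so a clone without "git lfs pull" holds the three-line pointer text instead of the pickles. A quick check (a sketch, using the path from this commit) before unpickling:

    from pathlib import Path

    def is_lfs_pointer(path: str) -> bool:
        """True while the file still contains LFS pointer text rather than binary data."""
        head = Path(path).read_bytes()[:48]
        return head.startswith(b'version https://git-lfs.github.com/spec/v1')

    print(is_lfs_pointer('MNB/models/mnb_classifier.pkl'))
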
references/12 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d9e1c47ea7a15f7ff1e14a3b34db3f2eb690c15c45c2a5b8174d964633d0f6f
+ size 1924369
references/321 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc0159e24fa6735aeed7153ecf0092ba6d7bec510c57c8ec52a28328083d2e61
+ size 957650
references/321321 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3e4cf4da9290a262997067fda2640da298979a5f3dc069b88177104d1a629ce
+ size 3225794
references/old.jpg ADDED
Git LFS Details
  • SHA256: 451d3062ee9fe0c3941c5cb8997109bc24ca2a363c06f31bf0eea7a26a144cee
  • Pointer size: 131 Bytes
  • Size of remote file: 633 kB
references/reference-102.png ADDED
Git LFS Details
  • SHA256: d0228edcc4baa444f78f2ff908dc9df82dafdebbd933bbd7a49ec52afdbd7352
  • Pointer size: 132 Bytes
  • Size of remote file: 4.07 MB
references/reference-103.png ADDED
Git LFS Details
  • SHA256: d2cb21bd62b12f08cd02593d97db1e49664d4ec3763a255a1f5653b0171ab92c
  • Pointer size: 132 Bytes
  • Size of remote file: 5.27 MB