Hanz Pillerva Claude Sonnet 4.6 commited on
Commit Β·
d748584
1
Parent(s): 56161f2
Replace TrOCR/anchor system with CRNN+CTC absolute-coordinate OCR
Browse files- template_matcher: removed anchor detection, TrOCR, pytesseract; now uses CRNN+CTC model with ECC/ORB alignment + absolute coordinate crops
- app.py: updated preload to use _get_crnn instead of _get_trocr
- calibrate_fields: updated to match latest changes
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- CRNN+CTC/calibrate_fields.py +10 -10
- app.py +5 -5
- template_matcher.py +396 -972
CRNN+CTC/calibrate_fields.py
CHANGED
|
@@ -37,18 +37,17 @@ COLOURS = [
|
|
| 37 |
|
| 38 |
def draw_boxes(img, bounds):
|
| 39 |
left, top, right, bottom = bounds
|
| 40 |
-
|
| 41 |
-
fh = bottom - top
|
| 42 |
|
| 43 |
vis = img.copy()
|
| 44 |
# form boundary
|
| 45 |
cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
|
| 46 |
|
| 47 |
for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
|
| 48 |
-
x1 = int(
|
| 49 |
-
y1 = int(
|
| 50 |
-
x2 = int(
|
| 51 |
-
y2 = int(
|
| 52 |
c = COLOURS[idx % len(COLOURS)]
|
| 53 |
cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
|
| 54 |
cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
|
|
@@ -160,10 +159,11 @@ def main():
|
|
| 160 |
elif event == cv2.EVENT_LBUTTONUP:
|
| 161 |
drawing = False
|
| 162 |
ex, ey = x, y
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
| 167 |
x1r, y1r = max(0.0, x1r), max(0.0, y1r)
|
| 168 |
x2r, y2r = min(1.0, x2r), min(1.0, y2r)
|
| 169 |
if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
|
|
|
|
| 37 |
|
| 38 |
def draw_boxes(img, bounds):
|
| 39 |
left, top, right, bottom = bounds
|
| 40 |
+
h, w = img.shape[:2]
|
|
|
|
| 41 |
|
| 42 |
vis = img.copy()
|
| 43 |
# form boundary
|
| 44 |
cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
|
| 45 |
|
| 46 |
for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
|
| 47 |
+
x1 = int(rx1 * w)
|
| 48 |
+
y1 = int(ry1 * h)
|
| 49 |
+
x2 = int(rx2 * w)
|
| 50 |
+
y2 = int(ry2 * h)
|
| 51 |
c = COLOURS[idx % len(COLOURS)]
|
| 52 |
cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
|
| 53 |
cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
|
|
|
|
| 159 |
elif event == cv2.EVENT_LBUTTONUP:
|
| 160 |
drawing = False
|
| 161 |
ex, ey = x, y
|
| 162 |
+
ih, iw = img_orig.shape[:2]
|
| 163 |
+
x1r = min(ix, ex) / iw
|
| 164 |
+
y1r = min(iy, ey) / ih
|
| 165 |
+
x2r = max(ix, ex) / iw
|
| 166 |
+
y2r = max(iy, ey) / ih
|
| 167 |
x1r, y1r = max(0.0, x1r), max(0.0, y1r)
|
| 168 |
x2r, y2r = min(1.0, x2r), min(1.0, y2r)
|
| 169 |
if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
|
app.py
CHANGED
|
@@ -45,13 +45,13 @@ PIPELINE_REPO_PATH = r"C:\xampp\htdocs\python"
|
|
| 45 |
|
| 46 |
# ββ Load template matcher βββββββββββββββββββββββββββββββββββββ
|
| 47 |
try:
|
| 48 |
-
from template_matcher import extract_fields, pdf_to_image, detect_form_type,
|
| 49 |
_template_matcher_ok = True
|
| 50 |
print("[app.py] Template matcher loaded")
|
| 51 |
-
# Preload
|
| 52 |
-
print("[app.py] Preloading
|
| 53 |
-
|
| 54 |
-
print("[app.py]
|
| 55 |
except Exception as _tm_err:
|
| 56 |
_template_matcher_ok = False
|
| 57 |
print(f"[app.py] Template matcher unavailable: {_tm_err}")
|
|
|
|
| 45 |
|
| 46 |
# ββ Load template matcher βββββββββββββββββββββββββββββββββββββ
|
| 47 |
try:
|
| 48 |
+
from template_matcher import extract_fields, pdf_to_image, detect_form_type, _get_crnn
|
| 49 |
_template_matcher_ok = True
|
| 50 |
print("[app.py] Template matcher loaded")
|
| 51 |
+
# Preload CRNN+CTC at startup so the first request isn't slow
|
| 52 |
+
print("[app.py] Preloading CRNN+CTC model...")
|
| 53 |
+
_get_crnn()
|
| 54 |
+
print("[app.py] CRNN+CTC preloaded.")
|
| 55 |
except Exception as _tm_err:
|
| 56 |
_template_matcher_ok = False
|
| 57 |
print(f"[app.py] Template matcher unavailable: {_tm_err}")
|
template_matcher.py
CHANGED
|
@@ -1,29 +1,22 @@
|
|
| 1 |
"""
|
| 2 |
-
template_matcher.py
|
| 3 |
-
====================
|
| 4 |
-
Extracts field values from civil registry scanned forms
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
Coordinates are stored as relative fractions (0.0-1.0) of the image
|
| 16 |
-
width/height so they work at any scan resolution.
|
| 17 |
-
|
| 18 |
-
CALIBRATION
|
| 19 |
-
-----------
|
| 20 |
-
If OCR picks up the wrong area, adjust the (x1, y1, x2, y2) values
|
| 21 |
-
for that field in the TEMPLATES dict below.
|
| 22 |
-
Run: python template_matcher.py <image_path> <form_type>
|
| 23 |
-
to see a debug image with all boxes drawn.
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
import os
|
|
|
|
|
|
|
|
|
|
| 27 |
import numpy as np
|
| 28 |
from PIL import Image
|
| 29 |
|
|
@@ -33,741 +26,364 @@ try:
|
|
| 33 |
except ImportError:
|
| 34 |
_CV2_OK = False
|
| 35 |
|
| 36 |
-
# ββ Reference images
|
| 37 |
-
# Place one clean blank/lightly-filled scan for each form type in:
|
| 38 |
-
# python/references/reference_102.png
|
| 39 |
-
# python/references/reference_103.png
|
| 40 |
-
# python/references/reference_90.png
|
| 41 |
-
# python/references/reference_97.png
|
| 42 |
_REF_DIR = os.path.join(os.path.dirname(__file__), 'references')
|
| 43 |
REFERENCE_IMAGES = {
|
| 44 |
-
'102': os.path.join(_REF_DIR, 'reference_102.
|
| 45 |
'103': os.path.join(_REF_DIR, 'reference_103.png'),
|
| 46 |
-
'90': os.path.join(_REF_DIR, '
|
| 47 |
-
'97': os.path.join(_REF_DIR, 'reference_97.
|
| 48 |
}
|
| 49 |
|
| 50 |
-
# ββ
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
| 57 |
try:
|
| 58 |
-
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 59 |
import torch
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
except Exception as e:
|
| 68 |
-
print(f'[template_matcher]
|
| 69 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
# ββ CRNN+CTC (kept for future use β swap back when model is trained) ββ
|
| 72 |
-
# _crnn_ocr = None
|
| 73 |
-
# _CRNN_CHECKPOINT = os.path.join(
|
| 74 |
-
# os.path.dirname(__file__), 'CRNN+CTC', 'checkpoints', 'best_model_final.pth'
|
| 75 |
-
# )
|
| 76 |
-
# def _get_crnn(): ... (see git history)
|
| 77 |
|
| 78 |
-
# Hint constants kept for template dict compatibility
|
| 79 |
_LINE = 'line'
|
| 80 |
_BLOCK = 'block'
|
| 81 |
_WORD = 'word'
|
| 82 |
|
| 83 |
-
# ββ
|
| 84 |
-
import re as _re
|
| 85 |
-
|
| 86 |
-
# Maps abbreviated sex readings β canonical value
|
| 87 |
_SEX_KEYWORDS = {
|
| 88 |
'female': 'FEMALE', 'fem': 'FEMALE', 'f': 'FEMALE',
|
| 89 |
-
'male': 'MALE', 'm':
|
| 90 |
}
|
| 91 |
-
|
| 92 |
-
# Maps field name β normalization rule
|
| 93 |
_FIELD_TYPE = {
|
| 94 |
-
# Sex
|
| 95 |
'sex': 'sex', 'groom_sex': 'sex', 'bride_sex': 'sex',
|
| 96 |
'husband_sex': 'sex', 'wife_sex': 'sex',
|
| 97 |
-
# Year (4-digit)
|
| 98 |
'dob_year': 'year',
|
| 99 |
-
# Pure digits
|
| 100 |
'age': 'digits', 'groom_age': 'digits', 'bride_age': 'digits',
|
| 101 |
'husband_age': 'digits', 'wife_age': 'digits', 'dob_day': 'digits',
|
| 102 |
-
# Dates β keep digits, spaces, common separators
|
| 103 |
'registration_date': 'date', 'marriage_date': 'date',
|
| 104 |
-
'date_of_marriage':
|
| 105 |
-
'date_of_birth':
|
| 106 |
'groom_dob': 'date', 'bride_dob': 'date',
|
| 107 |
'husband_dob': 'date', 'wife_dob': 'date',
|
| 108 |
-
# Registry / license numbers β alphanumeric + separators
|
| 109 |
'registry_no': 'registry', 'marriage_license_no': 'registry',
|
| 110 |
}
|
| 111 |
|
| 112 |
-
|
| 113 |
def _postprocess(text: str, field_name: str) -> str:
|
| 114 |
-
"""
|
| 115 |
-
Normalize and validate OCR output by field type.
|
| 116 |
-
|
| 117 |
-
sex β 'MALE' or 'FEMALE'
|
| 118 |
-
year β 4-digit year string, e.g. '1990'
|
| 119 |
-
digits β strip all non-digit characters
|
| 120 |
-
date β keep digits, spaces, '-', '/', '.', ','
|
| 121 |
-
registry β keep alphanumeric, spaces, hyphens, slashes
|
| 122 |
-
(default) β strip leading/trailing whitespace
|
| 123 |
-
"""
|
| 124 |
text = text.strip()
|
| 125 |
if not text:
|
| 126 |
return text
|
| 127 |
rule = _FIELD_TYPE.get(field_name)
|
| 128 |
-
|
| 129 |
if rule == 'sex':
|
| 130 |
tl = text.lower()
|
| 131 |
-
# Try longest keyword first to avoid 'f' matching inside 'female' twice
|
| 132 |
for kw in sorted(_SEX_KEYWORDS, key=len, reverse=True):
|
| 133 |
if kw in tl:
|
| 134 |
return _SEX_KEYWORDS[kw]
|
| 135 |
return text
|
| 136 |
-
|
| 137 |
if rule == 'year':
|
| 138 |
m = _re.search(r'(19|20)\d{2}', text)
|
| 139 |
if m:
|
| 140 |
return m.group(0)
|
| 141 |
digits = _re.sub(r'\D', '', text)
|
| 142 |
return digits[:4] if len(digits) >= 4 else text
|
| 143 |
-
|
| 144 |
if rule == 'digits':
|
| 145 |
d = _re.sub(r'\D', '', text)
|
| 146 |
return d if d else text
|
| 147 |
-
|
| 148 |
if rule == 'date':
|
| 149 |
return _re.sub(r'[^\w\s\-/,.]', '', text).strip()
|
| 150 |
-
|
| 151 |
if rule == 'registry':
|
| 152 |
return _re.sub(r'[^\w\s\-/]', '', text).strip()
|
| 153 |
-
|
| 154 |
return text
|
| 155 |
|
| 156 |
-
# ββ Field templates ββββββββββββββββββββββββββββββββββββββββββββ
|
| 157 |
-
# Each entry: 'field_name': (x1, y1, x2, y2, hint)
|
| 158 |
-
# Coordinates are relative fractions of image dimensions (0.0 β 1.0)
|
| 159 |
-
# hint is kept for compatibility but EasyOCR ignores it.
|
| 160 |
-
#
|
| 161 |
-
# TO CALIBRATE: run this file directly with a sample image, it will
|
| 162 |
-
# draw all boxes so you can see which regions need adjusting.
|
| 163 |
|
|
|
|
|
|
|
| 164 |
TEMPLATES = {
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 170 |
-
'102': {
|
| 171 |
-
'province': (0.173, 0.089, 0.655, 0.105, _LINE),
|
| 172 |
-
'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
|
| 173 |
-
'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
|
| 174 |
'name_first': (0.169, 0.161, 0.453, 0.181, _LINE),
|
| 175 |
'name_middle': (0.450, 0.161, 0.674, 0.181, _LINE),
|
| 176 |
-
'name_last': (0.
|
| 177 |
-
'sex': (0.
|
| 178 |
-
'dob_day': (0.
|
| 179 |
-
'dob_month': (0.
|
| 180 |
-
'dob_year': (0.
|
| 181 |
-
'place_of_birth': (0.
|
| 182 |
-
'type_of_birth': (0.
|
| 183 |
-
'birth_order': (0.
|
| 184 |
-
'weight_at_birth': (0.
|
| 185 |
-
'mother_name': (0.
|
| 186 |
-
'mother_citizenship': (0.
|
| 187 |
-
'mother_religion': (0.
|
| 188 |
-
'mother_occupation': (0.
|
| 189 |
-
'mother_age_at_birth': (0.
|
| 190 |
-
'mother_residence': (0.
|
| 191 |
-
'father_name': (0.
|
| 192 |
-
'father_citizenship': (0.
|
| 193 |
-
'father_religion': (0.
|
| 194 |
-
'father_occupation': (0.
|
| 195 |
-
'father_age_at_birth': (0.
|
| 196 |
-
'father_residence': (0.
|
| 197 |
-
'marriage_date': (0.
|
| 198 |
-
'marriage_place': (0.
|
| 199 |
-
'registration_date': (0.
|
| 200 |
-
},
|
| 201 |
-
|
| 202 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 203 |
-
# FORM 103 β Certificate of Death (blue border)
|
| 204 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 205 |
-
'103': {
|
| 206 |
-
'province': (0.173, 0.089, 0.655, 0.105, _LINE),
|
| 207 |
-
'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
|
| 208 |
-
'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
|
| 209 |
-
'deceased_name': (0.086, 0.147, 0.745, 0.181, _LINE),
|
| 210 |
-
'sex': (0.771, 0.155, 0.963, 0.173, _WORD),
|
| 211 |
-
'date_of_death': (0.100, 0.197, 0.293, 0.224, _LINE),
|
| 212 |
-
'date_of_birth': (0.320, 0.201, 0.568, 0.228, _LINE),
|
| 213 |
-
'age': (0.575, 0.215, 0.719, 0.231, _WORD),
|
| 214 |
-
'place_of_death': (0.089, 0.241, 0.720, 0.265, _LINE),
|
| 215 |
-
'civil_status': (0.723, 0.250, 0.970, 0.265, _WORD),
|
| 216 |
-
'religion': (0.078, 0.281, 0.310, 0.308, _LINE),
|
| 217 |
-
'citizenship': (0.328, 0.281, 0.526, 0.306, _LINE),
|
| 218 |
-
'residence': (0.540, 0.284, 0.957, 0.310, _LINE),
|
| 219 |
-
'occupation': (0.086, 0.318, 0.283, 0.343, _LINE),
|
| 220 |
-
'father_name': (0.641, 0.321, 0.957, 0.342, _LINE),
|
| 221 |
-
'mother_name': (0.296, 0.321, 0.627, 0.341, _LINE),
|
| 222 |
-
'cause_immediate': (0.298, 0.406, 0.947, 0.420, _LINE),
|
| 223 |
-
'cause_antecedent': (0.298, 0.422, 0.951, 0.441, _LINE),
|
| 224 |
-
'cause_underlying': (0.432, 0.438, 0.960, 0.456, _LINE),
|
| 225 |
-
'registration_date': (0.621, 0.685, 0.905, 0.704, _LINE),
|
| 226 |
-
},
|
| 227 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 228 |
-
# FORM 90 β Application for Marriage License (black border)
|
| 229 |
-
# 3-column: GROOM (left 38%) | LABEL (center 24%) | BRIDE (right 38%)
|
| 230 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 231 |
-
'90': {
|
| 232 |
-
# Header
|
| 233 |
-
'province': (0.173, 0.089, 0.655, 0.105, _LINE),
|
| 234 |
-
'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
|
| 235 |
-
'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
|
| 236 |
-
'marriage_license_no':(0.696, 0.138, 0.951, 0.156, _LINE),
|
| 237 |
-
'date_issued': (0.825, 0.151, 0.982, 0.169, _LINE),
|
| 238 |
-
# Groom name (first / middle / last β each on its own row)
|
| 239 |
-
'groom_name_first': (0.128, 0.310, 0.441, 0.325, _LINE),
|
| 240 |
-
'groom_name_middle': (0.137, 0.326, 0.446, 0.338, _LINE),
|
| 241 |
-
'groom_name_last': (0.127, 0.340, 0.439, 0.354, _LINE),
|
| 242 |
-
# Bride name (first / middle / last β each on its own row)
|
| 243 |
-
'bride_name_first': (0.629, 0.311, 0.944, 0.325, _LINE),
|
| 244 |
-
'bride_name_middle': (0.631, 0.326, 0.937, 0.339, _LINE),
|
| 245 |
-
'bride_name_last': (0.633, 0.339, 0.941, 0.354, _LINE),
|
| 246 |
-
# Groom DOB / age
|
| 247 |
-
'groom_dob': (0.085, 0.372, 0.369, 0.393, _LINE),
|
| 248 |
-
'groom_age': (0.379, 0.373, 0.456, 0.391, _WORD),
|
| 249 |
-
# Bride DOB / age
|
| 250 |
-
'bride_dob': (0.584, 0.373, 0.879, 0.393, _LINE),
|
| 251 |
-
'bride_age': (0.881, 0.374, 0.965, 0.392, _WORD),
|
| 252 |
-
# Place of birth
|
| 253 |
-
'groom_place_of_birth':(0.080, 0.403, 0.462, 0.426, _LINE),
|
| 254 |
-
'bride_place_of_birth':(0.586, 0.405, 0.960, 0.425, _LINE),
|
| 255 |
-
# Sex / Citizenship
|
| 256 |
-
'groom_sex': (0.085, 0.435, 0.217, 0.452, _WORD),
|
| 257 |
-
'groom_citizenship': (0.219, 0.435, 0.460, 0.454, _LINE),
|
| 258 |
-
'bride_sex': (0.582, 0.436, 0.716, 0.452, _WORD),
|
| 259 |
-
'bride_citizenship': (0.725, 0.436, 0.961, 0.451, _LINE),
|
| 260 |
-
# Residence
|
| 261 |
-
'groom_residence': (0.076, 0.465, 0.460, 0.490, _LINE),
|
| 262 |
-
'bride_residence': (0.586, 0.465, 0.964, 0.489, _LINE),
|
| 263 |
-
# Religion
|
| 264 |
-
'groom_religion': (0.079, 0.493, 0.462, 0.522, _LINE),
|
| 265 |
-
'bride_religion': (0.585, 0.492, 0.962, 0.520, _LINE),
|
| 266 |
-
# Civil Status
|
| 267 |
-
'groom_civil_status': (0.080, 0.520, 0.463, 0.552, _WORD),
|
| 268 |
-
'bride_civil_status': (0.586, 0.522, 0.961, 0.546, _WORD),
|
| 269 |
-
# Father
|
| 270 |
-
'groom_father_name': (0.082, 0.695, 0.459, 0.711, _LINE),
|
| 271 |
-
'groom_father_citizenship':(0.082, 0.713, 0.459, 0.736, _LINE),
|
| 272 |
-
'bride_father_name': (0.581, 0.695, 0.961, 0.715, _LINE),
|
| 273 |
-
'bride_father_citizenship':(0.577, 0.715, 0.963, 0.737, _LINE),
|
| 274 |
-
# Mother
|
| 275 |
-
'groom_mother_name': (0.080, 0.784, 0.456, 0.809, _LINE),
|
| 276 |
-
'groom_mother_citizenship':(0.081, 0.811, 0.459, 0.830, _LINE),
|
| 277 |
-
'bride_mother_name': (0.583, 0.785, 0.962, 0.808, _LINE),
|
| 278 |
-
'bride_mother_citizenship':(0.580, 0.811, 0.963, 0.833, _LINE),
|
| 279 |
-
},
|
| 280 |
-
|
| 281 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 282 |
-
# FORM 97 β Certificate of Marriage (pink/magenta border)
|
| 283 |
-
# Layout: ITEM col (20%) | HUSBAND col (40%) | WIFE col (40%)
|
| 284 |
-
# x-ranges: HUSBAND = 0.22β0.59 | WIFE = 0.62β0.97
|
| 285 |
-
#
|
| 286 |
-
# Form 97 β y-coords calibrated from actual ORB-aligned scan.
|
| 287 |
-
# ORB alignment introduces ~40% vertical stretch by bottom of form;
|
| 288 |
-
# all y values are empirically measured from crop images, NOT from
|
| 289 |
-
# the reference image directly.
|
| 290 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 291 |
-
'97': {
|
| 292 |
-
'province': (0.173, 0.089, 0.655, 0.105, _LINE),
|
| 293 |
-
'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
|
| 294 |
-
'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
|
| 295 |
-
'husband_name_first': (0.255, 0.140, 0.570, 0.155, _LINE),
|
| 296 |
-
'husband_name_middle': (0.258, 0.154, 0.569, 0.166, _LINE),
|
| 297 |
-
'husband_name_last': (0.259, 0.167, 0.581, 0.182, _LINE),
|
| 298 |
-
'wife_name_first': (0.650, 0.142, 0.954, 0.155, _LINE),
|
| 299 |
-
'wife_name_middle': (0.639, 0.155, 0.940, 0.170, _LINE),
|
| 300 |
-
'wife_name_last': (0.634, 0.169, 0.951, 0.181, _LINE),
|
| 301 |
-
'husband_dob': (0.219, 0.196, 0.507, 0.213, _LINE),
|
| 302 |
-
'husband_age': (0.523, 0.196, 0.580, 0.212, _WORD),
|
| 303 |
-
'wife_dob': (0.606, 0.198, 0.892, 0.209, _LINE),
|
| 304 |
-
'wife_age': (0.910, 0.199, 0.970, 0.213, _WORD),
|
| 305 |
-
'husband_place_of_birth': (0.203, 0.225, 0.583, 0.241, _LINE),
|
| 306 |
-
'wife_place_of_birth': (0.594, 0.229, 0.962, 0.245, _LINE),
|
| 307 |
-
'husband_sex': (0.219, 0.249, 0.307, 0.269, _WORD),
|
| 308 |
-
'wife_sex': (0.602, 0.249, 0.697, 0.269, _WORD),
|
| 309 |
-
'husband_citizenship': (0.344, 0.257, 0.588, 0.274, _LINE),
|
| 310 |
-
'wife_citizenship': (0.724, 0.255, 0.965, 0.272, _LINE),
|
| 311 |
-
'husband_residence': (0.219, 0.283, 0.579, 0.301, _LINE),
|
| 312 |
-
'wife_residence': (0.596, 0.285, 0.966, 0.307, _LINE),
|
| 313 |
-
'husband_religion': (0.204, 0.310, 0.581, 0.327, _LINE),
|
| 314 |
-
'wife_religion': (0.592, 0.311, 0.964, 0.327, _LINE),
|
| 315 |
-
'husband_civil_status': (0.196, 0.333, 0.579, 0.351, _WORD),
|
| 316 |
-
'wife_civil_status': (0.591, 0.335, 0.959, 0.351, _WORD),
|
| 317 |
-
'husband_father_name': (0.205, 0.367, 0.588, 0.384, _LINE),
|
| 318 |
-
'wife_father_name': (0.588, 0.369, 0.960, 0.386, _LINE),
|
| 319 |
-
'husband_father_citizenship': (0.195, 0.390, 0.580, 0.406, _LINE),
|
| 320 |
-
'wife_father_citizenship': (0.599, 0.388, 0.958, 0.404, _LINE),
|
| 321 |
-
'husband_mother_name': (0.196, 0.421, 0.583, 0.438, _LINE),
|
| 322 |
-
'wife_mother_name': (0.600, 0.419, 0.954, 0.436, _LINE),
|
| 323 |
-
'husband_mother_citizenship': (0.196, 0.443, 0.578, 0.459, _LINE),
|
| 324 |
-
'wife_mother_citizenship': (0.590, 0.447, 0.971, 0.463, _LINE),
|
| 325 |
-
'place_of_marriage': (0.219, 0.551, 0.981, 0.565, _LINE),
|
| 326 |
-
'date_of_marriage': (0.222, 0.582, 0.571, 0.596, _LINE),
|
| 327 |
-
'time_of_marriage': (0.730, 0.581, 0.916, 0.596, _LINE),
|
| 328 |
-
'registration_date': (0.621, 0.685, 0.905, 0.704, _LINE),
|
| 329 |
},
|
| 330 |
-
}
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
# ββ Anchor-based field templates βββββββββββββββββββββββββββββββ
|
| 334 |
-
# These complement TEMPLATES (absolute coords). For each field that has an
|
| 335 |
-
# anchor entry, extract_fields() will:
|
| 336 |
-
# 1. Run EasyOCR once on the full form (detail=1) to get all text + bboxes
|
| 337 |
-
# 2. Search for the printed label text inside 'search' region
|
| 338 |
-
# 3. If found, crop the data region from the anchor's edge
|
| 339 |
-
# 4. Fall back to absolute coords from TEMPLATES when anchor not found.
|
| 340 |
-
#
|
| 341 |
-
# Entry format:
|
| 342 |
-
# 'labels' : list of strings β tried in order, case-insensitive partial match
|
| 343 |
-
# 'search' : (x1,y1,x2,y2) fractions of form to search for the label
|
| 344 |
-
# 'side' : 'right' | 'below' β where the data field is vs. the label
|
| 345 |
-
# 'dx','dy' : offset from anchor edge to data start (fractions of form dims)
|
| 346 |
-
# 'dw','dh' : data region size (fractions of form dims); dh=0 β auto from anchor
|
| 347 |
-
|
| 348 |
-
ANCHOR_TEMPLATES = {
|
| 349 |
-
|
| 350 |
-
# ββ Form 102 βββββββββββββββββββββββββββββββββββββββββββββ
|
| 351 |
-
'102': {
|
| 352 |
-
'province': {
|
| 353 |
-
'labels': ['Province', 'PROVINCE'],
|
| 354 |
-
'search': (0.00, 0.09, 0.17, 0.14),
|
| 355 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.004,
|
| 356 |
-
'dw': 0.48, 'dh': 0.020,
|
| 357 |
-
},
|
| 358 |
-
'registry_no': {
|
| 359 |
-
'labels': ['Registry No', 'REGISTRY NO'],
|
| 360 |
-
'search': (0.56, 0.10, 0.72, 0.15),
|
| 361 |
-
'side': 'right', 'dx': 0.003, 'dy': 0.000,
|
| 362 |
-
'dw': 0.28, 'dh': 0.026,
|
| 363 |
-
},
|
| 364 |
-
'city_municipality': {
|
| 365 |
-
'labels': ['City', 'Municipality', 'City/Municipality'],
|
| 366 |
-
'search': (0.00, 0.12, 0.23, 0.16),
|
| 367 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 368 |
-
'dw': 0.43, 'dh': 0.020,
|
| 369 |
-
},
|
| 370 |
-
'mother_name': {
|
| 371 |
-
'labels': ['Maiden Name', 'MAIDEN NAME', "Mother's Name"],
|
| 372 |
-
'search': (0.05, 0.30, 0.22, 0.35),
|
| 373 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.005,
|
| 374 |
-
'dw': 0.77, 'dh': 0.022,
|
| 375 |
-
},
|
| 376 |
-
'father_name': {
|
| 377 |
-
'labels': ["Father's Name", "FATHER'S NAME", 'Father Name'],
|
| 378 |
-
'search': (0.05, 0.45, 0.22, 0.50),
|
| 379 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.005,
|
| 380 |
-
'dw': 0.77, 'dh': 0.025,
|
| 381 |
-
},
|
| 382 |
-
'marriage_date': {
|
| 383 |
-
'labels': ['Date Married', 'DATE MARRIED', 'Date and Place'],
|
| 384 |
-
'search': (0.00, 0.58, 0.18, 0.63),
|
| 385 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 386 |
-
'dw': 0.32, 'dh': 0.020,
|
| 387 |
-
},
|
| 388 |
-
'marriage_place': {
|
| 389 |
-
'labels': ['Place', 'PLACE'],
|
| 390 |
-
'search': (0.32, 0.58, 0.44, 0.63),
|
| 391 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 392 |
-
'dw': 0.52, 'dh': 0.020,
|
| 393 |
-
},
|
| 394 |
-
'registration_date': {
|
| 395 |
-
'labels': ['Date', 'DATE', 'Registration'],
|
| 396 |
-
'search': (0.45, 0.72, 0.65, 0.77),
|
| 397 |
-
'side': 'right', 'dx': 0.003, 'dy': 0.000,
|
| 398 |
-
'dw': 0.28, 'dh': 0.020,
|
| 399 |
-
},
|
| 400 |
-
},
|
| 401 |
-
|
| 402 |
-
# ββ Form 103 βββββββββββββββββββββββββββββββββββββββββββββ
|
| 403 |
'103': {
|
| 404 |
-
'province':
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
'
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
'
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
'
|
| 423 |
-
|
| 424 |
-
'search': (0.00, 0.13, 0.18, 0.20),
|
| 425 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.005,
|
| 426 |
-
'dw': 0.65, 'dh': 0.038,
|
| 427 |
-
},
|
| 428 |
-
'father_name': {
|
| 429 |
-
'labels': ["Father's Name", "FATHER'S NAME", "Father"],
|
| 430 |
-
'search': (0.20, 0.31, 0.35, 0.36),
|
| 431 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 432 |
-
'dw': 0.33, 'dh': 0.022,
|
| 433 |
-
},
|
| 434 |
-
'mother_name': {
|
| 435 |
-
'labels': ["Mother's Maiden", "MOTHER'S MAIDEN", "Mother"],
|
| 436 |
-
'search': (0.55, 0.31, 0.70, 0.36),
|
| 437 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 438 |
-
'dw': 0.33, 'dh': 0.022,
|
| 439 |
-
},
|
| 440 |
-
'cause_immediate': {
|
| 441 |
-
'labels': ['Immediate Cause', 'IMMEDIATE CAUSE', 'Immediate'],
|
| 442 |
-
'search': (0.05, 0.39, 0.32, 0.43),
|
| 443 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 444 |
-
'dw': 0.65, 'dh': 0.018,
|
| 445 |
-
},
|
| 446 |
-
'cause_antecedent': {
|
| 447 |
-
'labels': ['Antecedent', 'ANTECEDENT'],
|
| 448 |
-
'search': (0.05, 0.41, 0.32, 0.45),
|
| 449 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 450 |
-
'dw': 0.65, 'dh': 0.018,
|
| 451 |
-
},
|
| 452 |
-
'registration_date': {
|
| 453 |
-
'labels': ['Date', 'Registration Date'],
|
| 454 |
-
'search': (0.50, 0.67, 0.68, 0.72),
|
| 455 |
-
'side': 'right', 'dx': 0.003, 'dy': 0.000,
|
| 456 |
-
'dw': 0.28, 'dh': 0.020,
|
| 457 |
-
},
|
| 458 |
},
|
| 459 |
-
|
| 460 |
-
# ββ Form 90 ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 461 |
'90': {
|
| 462 |
-
'province':
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
'
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
'
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
'
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
'
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
'
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
},
|
| 498 |
-
'bride_father_name': {
|
| 499 |
-
'labels': ["Father's Name", 'Father', 'FATHER'],
|
| 500 |
-
'search': (0.46, 0.68, 0.60, 0.73),
|
| 501 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 502 |
-
'dw': 0.37, 'dh': 0.020,
|
| 503 |
-
},
|
| 504 |
-
'groom_mother_name': {
|
| 505 |
-
'labels': ["Mother's Name", "Mother's Maiden", 'Mother', 'MOTHER'],
|
| 506 |
-
'search': (0.00, 0.77, 0.14, 0.82),
|
| 507 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 508 |
-
'dw': 0.37, 'dh': 0.022,
|
| 509 |
-
},
|
| 510 |
-
'bride_mother_name': {
|
| 511 |
-
'labels': ["Mother's Name", "Mother's Maiden", 'Mother', 'MOTHER'],
|
| 512 |
-
'search': (0.46, 0.77, 0.60, 0.82),
|
| 513 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 514 |
-
'dw': 0.37, 'dh': 0.022,
|
| 515 |
-
},
|
| 516 |
},
|
| 517 |
-
|
| 518 |
-
# ββ Form 97 ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 519 |
'97': {
|
| 520 |
-
'province':
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
'
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
'
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
'
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
'
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
'
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
'
|
| 557 |
-
'labels': ["Mother's Name", "Mother's Maiden", 'Mother'],
|
| 558 |
-
'search': (0.50, 0.41, 0.65, 0.46),
|
| 559 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 560 |
-
'dw': 0.37, 'dh': 0.020,
|
| 561 |
-
},
|
| 562 |
-
'place_of_marriage': {
|
| 563 |
-
'labels': ['Place of Marriage', 'PLACE OF MARRIAGE', 'Place'],
|
| 564 |
-
'search': (0.05, 0.54, 0.30, 0.58),
|
| 565 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 566 |
-
'dw': 0.74, 'dh': 0.018,
|
| 567 |
-
},
|
| 568 |
-
'date_of_marriage': {
|
| 569 |
-
'labels': ['Date of Marriage', 'DATE OF MARRIAGE', 'Date'],
|
| 570 |
-
'search': (0.05, 0.57, 0.27, 0.62),
|
| 571 |
-
'side': 'right', 'dx': 0.003, 'dy': -0.003,
|
| 572 |
-
'dw': 0.37, 'dh': 0.018,
|
| 573 |
-
},
|
| 574 |
-
'registration_date': {
|
| 575 |
-
'labels': ['Date', 'Registration'],
|
| 576 |
-
'search': (0.55, 0.81, 0.72, 0.85),
|
| 577 |
-
'side': 'right', 'dx': 0.003, 'dy': 0.000,
|
| 578 |
-
'dw': 0.20, 'dh': 0.020,
|
| 579 |
-
},
|
| 580 |
},
|
| 581 |
}
|
| 582 |
|
| 583 |
|
| 584 |
-
# ββ
|
| 585 |
-
|
| 586 |
-
def _scan_form_text(_img: Image.Image):
|
| 587 |
-
"""
|
| 588 |
-
Anchor detection disabled β TrOCR has no built-in text detector.
|
| 589 |
-
extract_fields falls back to absolute coordinates for all fields.
|
| 590 |
-
"""
|
| 591 |
-
return []
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
def _find_anchor_bbox(detections, labels: list, search_box: tuple,
|
| 595 |
-
form_w: int, form_h: int):
|
| 596 |
-
"""
|
| 597 |
-
Find the first bounding box whose text matches any of `labels` (case-insensitive
|
| 598 |
-
partial match) and whose centre lies within `search_box` (fractions).
|
| 599 |
-
|
| 600 |
-
Returns [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] pixel coords, or None.
|
| 601 |
-
"""
|
| 602 |
-
sx1 = search_box[0] * form_w
|
| 603 |
-
sy1 = search_box[1] * form_h
|
| 604 |
-
sx2 = search_box[2] * form_w
|
| 605 |
-
sy2 = search_box[3] * form_h
|
| 606 |
-
|
| 607 |
-
best_bbox = None
|
| 608 |
-
best_score = 0.0
|
| 609 |
-
|
| 610 |
-
for (bbox, text, conf) in detections:
|
| 611 |
-
if conf < 0.25:
|
| 612 |
-
continue
|
| 613 |
-
pts = np.array(bbox, dtype=np.float32)
|
| 614 |
-
cx = pts[:, 0].mean()
|
| 615 |
-
cy = pts[:, 1].mean()
|
| 616 |
-
if not (sx1 <= cx <= sx2 and sy1 <= cy <= sy2):
|
| 617 |
-
continue
|
| 618 |
-
text_u = text.upper().strip()
|
| 619 |
-
for label in labels:
|
| 620 |
-
label_u = label.upper()
|
| 621 |
-
if label_u in text_u or text_u in label_u:
|
| 622 |
-
score = conf * len(label_u)
|
| 623 |
-
if score > best_score:
|
| 624 |
-
best_score = score
|
| 625 |
-
best_bbox = bbox
|
| 626 |
-
|
| 627 |
-
return best_bbox
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
def _crop_from_anchor(img: Image.Image, anchor_bbox,
|
| 631 |
-
side: str, dx: float, dy: float,
|
| 632 |
-
dw: float, dh: float) -> Image.Image:
|
| 633 |
-
"""
|
| 634 |
-
Compute data region relative to a found anchor bbox and return the crop.
|
| 635 |
-
|
| 636 |
-
anchor_bbox : [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] pixel coords
|
| 637 |
-
side : 'right' β data starts at anchor's right edge
|
| 638 |
-
'below' β data starts below anchor's bottom edge
|
| 639 |
-
dx, dy : offset fractions (of form width/height) from anchor edge
|
| 640 |
-
dw, dh : data region size fractions (of form width/height);
|
| 641 |
-
dh=0 means use anchor's own height
|
| 642 |
-
"""
|
| 643 |
-
form_w, form_h = img.size
|
| 644 |
-
pts = np.array(anchor_bbox, dtype=np.float32)
|
| 645 |
-
ax1 = int(pts[:, 0].min())
|
| 646 |
-
ay1 = int(pts[:, 1].min())
|
| 647 |
-
ax2 = int(pts[:, 0].max())
|
| 648 |
-
ay2 = int(pts[:, 1].max())
|
| 649 |
-
|
| 650 |
-
data_w = int(dw * form_w)
|
| 651 |
-
data_h = int(dh * form_h) if dh > 0 else (ay2 - ay1 + 4)
|
| 652 |
-
|
| 653 |
-
if side == 'right':
|
| 654 |
-
rx1 = ax2 + int(dx * form_w)
|
| 655 |
-
ry1 = ay1 + int(dy * form_h)
|
| 656 |
-
else: # 'below'
|
| 657 |
-
rx1 = ax1 + int(dx * form_w)
|
| 658 |
-
ry1 = ay2 + int(dy * form_h)
|
| 659 |
-
|
| 660 |
-
rx2 = min(form_w, rx1 + data_w)
|
| 661 |
-
ry2 = min(form_h, ry1 + data_h)
|
| 662 |
-
|
| 663 |
-
if rx2 <= rx1 or ry2 <= ry1:
|
| 664 |
-
return None
|
| 665 |
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
|
| 670 |
|
| 671 |
-
def
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
blur = _cv2.GaussianBlur(gray, (5, 5), 0)
|
| 678 |
_, thresh = _cv2.threshold(blur, 0, 255, _cv2.THRESH_BINARY + _cv2.THRESH_OTSU)
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
contours, _ = _cv2.findContours(thresh, _cv2.RETR_EXTERNAL, _cv2.CHAIN_APPROX_SIMPLE)
|
| 682 |
if not contours:
|
| 683 |
-
return
|
| 684 |
-
|
| 685 |
-
# Take the largest contour β should be the document page
|
| 686 |
-
c = max(contours, key=_cv2.contourArea)
|
| 687 |
area = _cv2.contourArea(c)
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
return None
|
| 693 |
-
|
| 694 |
-
# Approximate to a polygon
|
| 695 |
-
peri = _cv2.arcLength(c, True)
|
| 696 |
approx = _cv2.approxPolyDP(c, 0.02 * peri, True)
|
| 697 |
-
|
| 698 |
if len(approx) != 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
return None
|
| 700 |
|
| 701 |
-
pts = approx.reshape(4, 2).astype(np.float32)
|
| 702 |
-
|
| 703 |
-
# Order: TL, TR, BR, BL
|
| 704 |
-
s = pts.sum(axis=1)
|
| 705 |
-
d = np.diff(pts, axis=1)
|
| 706 |
-
ordered = np.array([
|
| 707 |
-
pts[np.argmin(s)], # TL β smallest sum
|
| 708 |
-
pts[np.argmin(d)], # TR β smallest diff
|
| 709 |
-
pts[np.argmax(s)], # BR β largest sum
|
| 710 |
-
pts[np.argmax(d)], # BL β largest diff
|
| 711 |
-
], dtype=np.float32)
|
| 712 |
-
return ordered
|
| 713 |
-
|
| 714 |
|
| 715 |
-
def _orb_align(scan_gray: np.ndarray, ref_gray: np.ndarray,
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
Returns (aligned_rgb, inlier_count) or (None, 0) on failure.
|
| 719 |
-
"""
|
| 720 |
-
h, w = scan_gray.shape
|
| 721 |
ref_resized = _cv2.resize(ref_gray, (w, h))
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
kp2, des2 = orb.detectAndCompute(ref_resized, None)
|
| 726 |
-
|
| 727 |
if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
|
| 728 |
return None, 0
|
| 729 |
-
|
| 730 |
matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
|
| 731 |
matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
|
| 732 |
-
|
| 733 |
-
good = matches[:max(10, len(matches) // 3)]
|
| 734 |
-
|
| 735 |
if len(good) < 10:
|
| 736 |
return None, 0
|
| 737 |
-
|
| 738 |
src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
|
| 739 |
dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
|
| 740 |
-
|
| 741 |
M, mask = _cv2.estimateAffinePartial2D(
|
| 742 |
-
src_pts, dst_pts, method=_cv2.RANSAC, ransacReprojThreshold=5.0
|
| 743 |
-
)
|
| 744 |
if M is None:
|
| 745 |
return None, 0
|
| 746 |
-
|
| 747 |
inliers = int(mask.sum()) if mask is not None else 0
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
flags=_cv2.INTER_LINEAR,
|
| 753 |
-
borderMode=_cv2.BORDER_REPLICATE
|
| 754 |
-
)
|
| 755 |
return aligned, inliers
|
| 756 |
|
| 757 |
|
| 758 |
def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
|
| 759 |
-
|
| 760 |
-
Count ORB RANSAC inliers between two grayscale images without warping.
|
| 761 |
-
Used by detect_form_type() to score form candidates.
|
| 762 |
-
"""
|
| 763 |
-
orb = _cv2.ORB_create(nfeatures=3000)
|
| 764 |
kp1, des1 = orb.detectAndCompute(scan_gray, None)
|
| 765 |
-
kp2, des2 = orb.detectAndCompute(ref_gray,
|
| 766 |
if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
|
| 767 |
return 0
|
| 768 |
matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
|
| 769 |
matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
|
| 770 |
-
good
|
| 771 |
if len(good) < 10:
|
| 772 |
return 0
|
| 773 |
src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
|
|
@@ -776,191 +392,72 @@ def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
|
|
| 776 |
return int(mask.sum()) if mask is not None else 0
|
| 777 |
|
| 778 |
|
| 779 |
-
def
|
| 780 |
"""
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
h, w = ref_gray.shape
|
| 788 |
-
# Downscale to 500px for speed; scale translation back afterward
|
| 789 |
-
scale = min(1.0, 500.0 / max(h, w))
|
| 790 |
-
sh, sw = max(1, int(h * scale)), max(1, int(w * scale))
|
| 791 |
-
ref_s = _cv2.resize(ref_gray, (sw, sh))
|
| 792 |
-
scn_s = _cv2.resize(_cv2.resize(scan_gray, (w, h)), (sw, sh))
|
| 793 |
-
|
| 794 |
-
warp = np.eye(2, 3, dtype=np.float32)
|
| 795 |
-
criteria = (_cv2.TERM_CRITERIA_EPS | _cv2.TERM_CRITERIA_COUNT, 100, 1e-4)
|
| 796 |
-
cc, warp = _cv2.findTransformECC(ref_s, scn_s, warp, _cv2.MOTION_EUCLIDEAN, criteria)
|
| 797 |
-
|
| 798 |
-
# Clamp rotation to Β±3Β° to prevent over-tilting
|
| 799 |
-
angle_rad = np.arctan2(warp[1, 0], warp[0, 0])
|
| 800 |
-
angle_deg = np.degrees(angle_rad)
|
| 801 |
-
MAX_ANGLE = 1.0
|
| 802 |
-
if abs(angle_deg) > MAX_ANGLE:
|
| 803 |
-
clamped = np.radians(np.clip(angle_deg, -MAX_ANGLE, MAX_ANGLE))
|
| 804 |
-
warp[0, 0] = np.cos(clamped)
|
| 805 |
-
warp[0, 1] = -np.sin(clamped)
|
| 806 |
-
warp[1, 0] = np.sin(clamped)
|
| 807 |
-
warp[1, 1] = np.cos(clamped)
|
| 808 |
-
print(f'[align] ECC rotation clamped {angle_deg:.2f}Β° -> {np.degrees(clamped):.2f}Β°')
|
| 809 |
-
|
| 810 |
-
# Scale translation to full resolution
|
| 811 |
-
warp[0, 2] /= scale
|
| 812 |
-
warp[1, 2] /= scale
|
| 813 |
-
|
| 814 |
-
scan_full = _cv2.resize(scan_rgb, (w, h))
|
| 815 |
-
aligned = _cv2.warpAffine(
|
| 816 |
-
scan_full, warp, (w, h),
|
| 817 |
-
flags=_cv2.INTER_LINEAR,
|
| 818 |
-
borderMode=_cv2.BORDER_REPLICATE
|
| 819 |
-
)
|
| 820 |
-
print(f'[align] ECC applied (cc={cc:.4f} angle={angle_deg:.2f}Β° tx={warp[0,2]:.1f} ty={warp[1,2]:.1f})')
|
| 821 |
-
return aligned
|
| 822 |
-
except Exception as e:
|
| 823 |
-
print(f'[align] ECC failed: {e}')
|
| 824 |
-
return None
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
def align_to_reference(img: Image.Image, form_type: str):
|
| 828 |
-
"""
|
| 829 |
-
Align a scanned form to its clean reference using a three-stage cascade:
|
| 830 |
-
|
| 831 |
-
Stage 1 β ORB feature matching + RANSAC homography (primary).
|
| 832 |
-
Matches structural features (printed lines, boxes, text layout).
|
| 833 |
-
Most accurate when the scan has reasonable contrast/sharpness.
|
| 834 |
-
Returns high confidence (inlier count) used to decide if anchor
|
| 835 |
-
scan is needed in extract_fields().
|
| 836 |
-
|
| 837 |
-
Stage 2 β ECC (Enhanced Correlation Coefficient) EUCLIDEAN.
|
| 838 |
-
Good for blurry / low-texture / handwriting-heavy scans where ORB
|
| 839 |
-
finds too few keypoints. Corrects translation + rotation only.
|
| 840 |
-
|
| 841 |
-
Stage 3 β Corner perspective correction (fallback).
|
| 842 |
-
Finds document corners via contour detection. Only works when the
|
| 843 |
-
page is visible against a background.
|
| 844 |
-
|
| 845 |
-
Stage 4 β Resize only (last resort).
|
| 846 |
-
|
| 847 |
-
Returns (aligned_image, orb_inliers) where orb_inliers=0 means ORB
|
| 848 |
-
did not succeed (ECC/corner/resize was used instead).
|
| 849 |
"""
|
| 850 |
if not _CV2_OK:
|
| 851 |
return img, 0
|
| 852 |
-
|
| 853 |
ref_path = REFERENCE_IMAGES.get(form_type)
|
| 854 |
if not ref_path or not os.path.exists(ref_path):
|
| 855 |
-
print(f'[align] No reference for form {form_type}
|
| 856 |
return img, 0
|
| 857 |
-
|
| 858 |
ref_gray = _cv2.imread(ref_path, _cv2.IMREAD_GRAYSCALE)
|
| 859 |
if ref_gray is None:
|
| 860 |
return img, 0
|
| 861 |
-
|
| 862 |
-
scan_rgb = np.array(img.convert('RGB'))
|
| 863 |
-
scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
|
| 864 |
ref_h, ref_w = ref_gray.shape
|
|
|
|
| 865 |
|
| 866 |
-
|
| 867 |
-
scan_rgb_rs = _cv2.resize(scan_rgb,
|
|
|
|
| 868 |
|
| 869 |
-
# ββ Stage 1: ECC (translation + rotation only β no distortion) ββββ
|
| 870 |
print(f'[align] Form {form_type}: trying ECC...')
|
| 871 |
aligned = _ecc_align(scan_gray_rs, ref_gray, scan_rgb_rs)
|
| 872 |
if aligned is not None:
|
| 873 |
-
|
| 874 |
-
return Image.fromarray(aligned), 25 # return 25 so anchor scan is skipped
|
| 875 |
|
| 876 |
-
# ββ Stage 2: ORB (fallback if ECC fails) ββββββββββββββββββ
|
| 877 |
print(f'[align] Form {form_type}: ECC failed, trying ORB...')
|
| 878 |
aligned, inliers = _orb_align(scan_gray_rs, ref_gray, scan_rgb_rs)
|
| 879 |
if aligned is not None:
|
| 880 |
-
print(f'[align] Form {form_type}: ORB applied ({inliers} inliers)')
|
| 881 |
return Image.fromarray(aligned), inliers
|
| 882 |
|
| 883 |
-
|
| 884 |
-
print(f'[align] Form {form_type}: ECC failed, trying corner detection...')
|
| 885 |
-
corners = _find_document_corners(scan_gray)
|
| 886 |
-
if corners is not None:
|
| 887 |
-
dst_corners = np.array([
|
| 888 |
-
[0, 0 ],
|
| 889 |
-
[ref_w, 0 ],
|
| 890 |
-
[ref_w, ref_h],
|
| 891 |
-
[0, ref_h],
|
| 892 |
-
], dtype=np.float32)
|
| 893 |
-
M = _cv2.getPerspectiveTransform(corners, dst_corners)
|
| 894 |
-
warped = _cv2.warpPerspective(
|
| 895 |
-
scan_rgb, M, (ref_w, ref_h),
|
| 896 |
-
flags=_cv2.INTER_LINEAR,
|
| 897 |
-
borderMode=_cv2.BORDER_REPLICATE
|
| 898 |
-
)
|
| 899 |
-
print(f'[align] Form {form_type}: perspective correction applied')
|
| 900 |
-
return Image.fromarray(warped), 0
|
| 901 |
-
|
| 902 |
-
# ββ Stage 4: resize only ββββββββββββββββββββββββββββββββββ
|
| 903 |
-
print(f'[align] Form {form_type}: all alignment methods failed, resizing only')
|
| 904 |
resized = _cv2.resize(scan_rgb, (ref_w, ref_h))
|
| 905 |
return Image.fromarray(resized), 0
|
| 906 |
|
| 907 |
|
|
|
|
|
|
|
| 908 |
def _deskew(gray: np.ndarray) -> np.ndarray:
|
| 909 |
-
"""Correct slight rotation using Hough line detection."""
|
| 910 |
if not _CV2_OK:
|
| 911 |
return gray
|
| 912 |
edges = _cv2.Canny(gray, 50, 150, apertureSize=3)
|
| 913 |
-
lines = _cv2.HoughLinesP(edges, 1, np.pi
|
| 914 |
minLineLength=100, maxLineGap=10)
|
| 915 |
-
if lines is None
|
| 916 |
return gray
|
| 917 |
-
angles = [
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
if -15 < angle < 15:
|
| 921 |
-
angles.append(angle)
|
| 922 |
if not angles:
|
| 923 |
return gray
|
| 924 |
-
|
| 925 |
-
if abs(
|
| 926 |
return gray
|
| 927 |
h, w = gray.shape
|
| 928 |
-
M = _cv2.getRotationMatrix2D((w
|
| 929 |
return _cv2.warpAffine(gray, M, (w, h),
|
| 930 |
flags=_cv2.INTER_CUBIC,
|
| 931 |
borderMode=_cv2.BORDER_REPLICATE)
|
| 932 |
|
| 933 |
|
| 934 |
-
def _enhance_for_ocr(gray: np.ndarray) -> np.ndarray:
|
| 935 |
-
"""CLAHE contrast enhancement + gentle denoising."""
|
| 936 |
-
if not _CV2_OK:
|
| 937 |
-
return gray
|
| 938 |
-
clahe = _cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 939 |
-
enhanced = clahe.apply(gray)
|
| 940 |
-
denoised = _cv2.fastNlMeansDenoising(enhanced, h=10,
|
| 941 |
-
templateWindowSize=7,
|
| 942 |
-
searchWindowSize=21)
|
| 943 |
-
return denoised
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
def _binarize(gray: np.ndarray) -> np.ndarray:
|
| 947 |
-
"""Adaptive threshold β cleaner black-on-white for OCR."""
|
| 948 |
-
if not _CV2_OK:
|
| 949 |
-
return gray
|
| 950 |
-
return _cv2.adaptiveThreshold(gray, 255,
|
| 951 |
-
_cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 952 |
-
_cv2.THRESH_BINARY, 11, 2)
|
| 953 |
-
|
| 954 |
-
|
| 955 |
def _preprocess(img: Image.Image) -> Image.Image:
|
| 956 |
-
"""
|
| 957 |
-
Prepare the full form image for field cropping:
|
| 958 |
-
- Convert to grayscale
|
| 959 |
-
- Deskew (correct residual rotation after ORB alignment)
|
| 960 |
-
|
| 961 |
-
CLAHE and denoising are applied later per-crop in _ocr(), where
|
| 962 |
-
they are more effective and don't risk blurring the whole form.
|
| 963 |
-
"""
|
| 964 |
if not _CV2_OK:
|
| 965 |
return img.convert('L')
|
| 966 |
gray = np.array(img.convert('L'))
|
|
@@ -969,63 +466,29 @@ def _preprocess(img: Image.Image) -> Image.Image:
|
|
| 969 |
|
| 970 |
|
| 971 |
def _crop_field(img: Image.Image, x1r, y1r, x2r, y2r) -> Image.Image:
|
| 972 |
-
"""Crop a field region using relative coordinates."""
|
| 973 |
w, h = img.size
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
pad = 4
|
| 978 |
-
x1 = max(0, x1 - pad); y1 = max(0, y1 - pad)
|
| 979 |
-
x2 = min(w, x2 + pad); y2 = min(h, y2 + pad)
|
| 980 |
return img.crop((x1, y1, x2, y2))
|
| 981 |
|
| 982 |
|
|
|
|
|
|
|
|
|
|
| 983 |
|
| 984 |
-
def _ocr(crop: Image.Image, config: str = '') -> str:
|
| 985 |
-
"""Run TrOCR large-handwritten on a cropped field image."""
|
| 986 |
-
processor, model = _get_trocr()
|
| 987 |
-
if processor is None or model is None:
|
| 988 |
-
return ''
|
| 989 |
-
try:
|
| 990 |
-
import torch
|
| 991 |
-
rgb = crop.convert('RGB')
|
| 992 |
-
pixel_values = processor(rgb, return_tensors='pt').pixel_values
|
| 993 |
-
with torch.no_grad():
|
| 994 |
-
generated_ids = model.generate(pixel_values)
|
| 995 |
-
return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
|
| 996 |
-
except Exception as e:
|
| 997 |
-
print(f'[template_matcher] OCR error: {e}')
|
| 998 |
-
return ''
|
| 999 |
|
|
|
|
| 1000 |
|
| 1001 |
def detect_form_type(image_path: str) -> str:
|
| 1002 |
-
"""
|
| 1003 |
-
Auto-detect form type from a scanned image.
|
| 1004 |
-
|
| 1005 |
-
Primary β ORB inlier count:
|
| 1006 |
-
Resize the scan to each reference's dimensions, run ORB feature
|
| 1007 |
-
matching against all 4 reference images, and pick the form type
|
| 1008 |
-
with the most RANSAC inliers. Robust to rotation, brightness
|
| 1009 |
-
differences, and partial fills because it matches structural
|
| 1010 |
-
features (printed lines, boxes, column layout) rather than title
|
| 1011 |
-
text. Works at ~800px for speed.
|
| 1012 |
-
|
| 1013 |
-
Fallback β OCR title:
|
| 1014 |
-
Used when no reference images exist or cv2 is unavailable.
|
| 1015 |
-
Less reliable for rotated / faint / cropped scans.
|
| 1016 |
-
|
| 1017 |
-
Returns '102', '103', '90', or '97'.
|
| 1018 |
-
"""
|
| 1019 |
if _CV2_OK:
|
| 1020 |
try:
|
| 1021 |
-
img
|
| 1022 |
scan_rgb = np.array(img)
|
| 1023 |
scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
best_inliers = 0
|
| 1027 |
-
DET_W = 800 # detection width β fast enough, enough detail
|
| 1028 |
-
|
| 1029 |
for ft, ref_path in REFERENCE_IMAGES.items():
|
| 1030 |
if not os.path.exists(ref_path):
|
| 1031 |
continue
|
|
@@ -1033,56 +496,50 @@ def detect_form_type(image_path: str) -> str:
|
|
| 1033 |
if ref_gray is None:
|
| 1034 |
continue
|
| 1035 |
ref_h, ref_w = ref_gray.shape
|
| 1036 |
-
# Resize scan to reference aspect, then both to DET_W
|
| 1037 |
sc = min(1.0, DET_W / ref_w)
|
| 1038 |
-
dw, dh
|
| 1039 |
-
ref_ds
|
| 1040 |
-
scan_ds
|
| 1041 |
-
count
|
| 1042 |
print(f'[detect] Form {ft}: {count} ORB inliers')
|
| 1043 |
if count > best_inliers:
|
| 1044 |
-
best_inliers = count
|
| 1045 |
-
best_type = ft
|
| 1046 |
-
|
| 1047 |
if best_type and best_inliers >= 15:
|
| 1048 |
-
print(f'[detect] Best
|
| 1049 |
return best_type
|
| 1050 |
-
|
| 1051 |
-
print(f'[detect] ORB inconclusive (best={best_inliers}), falling back to OCR title')
|
| 1052 |
-
|
| 1053 |
except Exception as e:
|
| 1054 |
print(f'[template_matcher] detect_form_type ORB error: {e}')
|
| 1055 |
-
|
| 1056 |
-
# ββ OCR title fallback ββββββββββββββββββββββββββββββββββββ
|
| 1057 |
try:
|
| 1058 |
img_l = Image.open(image_path).convert('L')
|
| 1059 |
w, h = img_l.size
|
| 1060 |
-
|
| 1061 |
-
title = _ocr(title_crop).upper()
|
| 1062 |
if title:
|
| 1063 |
if 'LIVE BIRTH' in title or ('BIRTH' in title
|
| 1064 |
and 'DEATH' not in title and 'MARRIAGE' not in title):
|
| 1065 |
return '102'
|
| 1066 |
-
|
| 1067 |
return '103'
|
| 1068 |
-
|
| 1069 |
return '90'
|
| 1070 |
-
|
| 1071 |
return '97'
|
| 1072 |
-
print(
|
| 1073 |
-
f'Title: {title[:80] if title else "(empty)"}')
|
| 1074 |
except Exception as e:
|
| 1075 |
print(f'[template_matcher] detect_form_type OCR error: {e}')
|
| 1076 |
return '102'
|
| 1077 |
|
| 1078 |
|
|
|
|
|
|
|
| 1079 |
def extract_fields(image_path: str, form_type: str) -> dict:
|
| 1080 |
"""
|
| 1081 |
-
|
| 1082 |
|
| 1083 |
Args:
|
| 1084 |
-
image_path:
|
| 1085 |
-
form_type
|
| 1086 |
|
| 1087 |
Returns:
|
| 1088 |
dict of { field_name: extracted_text }
|
|
@@ -1091,168 +548,135 @@ def extract_fields(image_path: str, form_type: str) -> dict:
|
|
| 1091 |
if template is None:
|
| 1092 |
print(f'[template_matcher] No template for form type: {form_type}')
|
| 1093 |
return {}
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
print('[template_matcher] TrOCR not available β returning empty fields')
|
| 1097 |
return {}
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
# Load and preprocess
|
| 1101 |
try:
|
| 1102 |
img = Image.open(image_path).convert('RGB')
|
| 1103 |
except Exception as e:
|
| 1104 |
print(f'[template_matcher] Cannot open image: {e}')
|
| 1105 |
return {}
|
| 1106 |
|
| 1107 |
-
# Align to reference before cropping (fixes scan offset/rotation)
|
| 1108 |
-
# orb_inliers > 0 means ORB succeeded β absolute coords are reliable.
|
| 1109 |
img, orb_inliers = align_to_reference(img, form_type)
|
| 1110 |
-
processed
|
| 1111 |
-
|
| 1112 |
-
# ββ One-time full-form scan for anchor detection ββββββββββ
|
| 1113 |
-
# When ORB aligned with high confidence (inliers >= 25), absolute
|
| 1114 |
-
# coordinates are accurate and the expensive full-page OCR scan can
|
| 1115 |
-
# be skipped. Below that threshold, anchors improve robustness.
|
| 1116 |
-
anchor_defs = ANCHOR_TEMPLATES.get(form_type, {})
|
| 1117 |
-
detections = []
|
| 1118 |
-
if anchor_defs and orb_inliers < 25:
|
| 1119 |
-
print(f'[template_matcher] ORB inliers={orb_inliers} β scanning form for anchors...')
|
| 1120 |
-
detections = _scan_form_text(img) # use colour/original for label scan
|
| 1121 |
-
print(f'[template_matcher] Found {len(detections)} text regions in form')
|
| 1122 |
-
elif anchor_defs:
|
| 1123 |
-
print(f'[template_matcher] ORB inliers={orb_inliers} >= 25 β skipping anchor scan')
|
| 1124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1125 |
form_w, form_h = img.size
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
# ββ Collect all crops first, then batch-infer in one pass β
|
| 1129 |
-
field_names = []
|
| 1130 |
-
crops = []
|
| 1131 |
|
| 1132 |
for field_name, coords in template.items():
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
adef = anchor_defs.get(field_name)
|
| 1137 |
-
if adef and detections:
|
| 1138 |
-
bbox = _find_anchor_bbox(
|
| 1139 |
-
detections, adef['labels'], adef['search'], form_w, form_h
|
| 1140 |
-
)
|
| 1141 |
-
if bbox is not None:
|
| 1142 |
-
crop = _crop_from_anchor(
|
| 1143 |
-
processed, bbox,
|
| 1144 |
-
adef['side'], adef['dx'], adef['dy'],
|
| 1145 |
-
adef['dw'], adef['dh']
|
| 1146 |
-
)
|
| 1147 |
-
if crop is not None:
|
| 1148 |
-
anchor_hits += 1
|
| 1149 |
-
|
| 1150 |
-
# ββ Fallback: absolute coordinate crop ββββββββββββββββ
|
| 1151 |
-
if crop is None:
|
| 1152 |
-
x1r, y1r, x2r, y2r, cfg = coords
|
| 1153 |
-
crop = _crop_field(processed, x1r, y1r, x2r, y2r)
|
| 1154 |
-
|
| 1155 |
field_names.append(field_name)
|
| 1156 |
crops.append(crop)
|
|
|
|
| 1157 |
|
| 1158 |
-
fields
|
| 1159 |
-
|
|
|
|
| 1160 |
text = _postprocess(_ocr(crop), field_name)
|
| 1161 |
if text:
|
| 1162 |
fields[field_name] = text
|
| 1163 |
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
print(f'[template_matcher] Extracted {len(fields)}/{len(template)} fields from form {form_type}')
|
| 1167 |
return fields
|
| 1168 |
|
| 1169 |
|
| 1170 |
-
|
| 1171 |
-
"""
|
| 1172 |
-
Convert a PDF page to a PNG image for processing.
|
| 1173 |
-
Returns path to the saved PNG, or None on failure.
|
| 1174 |
-
Requires: pip install pdf2image + poppler installed
|
| 1175 |
-
"""
|
| 1176 |
-
try:
|
| 1177 |
-
from pdf2image import convert_from_path
|
| 1178 |
-
pages = convert_from_path(pdf_path, dpi=150)
|
| 1179 |
-
if not pages:
|
| 1180 |
-
return None
|
| 1181 |
-
out_path = pdf_path.replace('.pdf', f'_page{page}.png')
|
| 1182 |
-
pages[page].save(out_path, 'PNG')
|
| 1183 |
-
return out_path
|
| 1184 |
-
except ImportError:
|
| 1185 |
-
print('[template_matcher] pdf2image not installed. Run: pip install pdf2image')
|
| 1186 |
-
return None
|
| 1187 |
-
except Exception as e:
|
| 1188 |
-
print(f'[template_matcher] PDF conversion failed: {e}')
|
| 1189 |
-
return None
|
| 1190 |
-
|
| 1191 |
|
| 1192 |
-
def debug_draw_boxes(image_path: str, form_type: str, out_path: str = None):
|
| 1193 |
"""
|
| 1194 |
-
Draw all field
|
| 1195 |
-
Uses the same alignment (ORB β corner β resize) as extract_fields(),
|
| 1196 |
-
so the boxes reflect where coordinates actually land during extraction.
|
| 1197 |
|
| 1198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1199 |
"""
|
| 1200 |
from PIL import ImageDraw, ImageFont
|
| 1201 |
|
| 1202 |
template = TEMPLATES.get(form_type)
|
| 1203 |
if not template:
|
| 1204 |
print(f'No template for {form_type}')
|
| 1205 |
-
return
|
| 1206 |
|
| 1207 |
-
img
|
| 1208 |
-
|
| 1209 |
-
draw = ImageDraw.Draw(img)
|
| 1210 |
w, h = img.size
|
| 1211 |
|
| 1212 |
try:
|
| 1213 |
-
font = ImageFont.truetype('
|
| 1214 |
-
except:
|
| 1215 |
-
|
|
|
|
|
|
|
|
|
|
| 1216 |
|
| 1217 |
-
|
| 1218 |
-
for idx, (field_name, coords) in enumerate(template.items()):
|
| 1219 |
x1r, y1r, x2r, y2r, _ = coords
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
draw.
|
| 1224 |
|
| 1225 |
base, ext = os.path.splitext(image_path)
|
| 1226 |
out = out_path or f'{base}_debug_{form_type}{ext}'
|
| 1227 |
img.save(out)
|
| 1228 |
print(f'[template_matcher] Debug image saved: {out}')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1229 |
return out
|
| 1230 |
|
| 1231 |
|
| 1232 |
-
# ββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1233 |
if __name__ == '__main__':
|
| 1234 |
if len(sys.argv) < 3:
|
| 1235 |
-
print('Usage: python template_matcher.py <image_path> <form_type>')
|
| 1236 |
print(' form_type: 102 | 103 | 90 | 97')
|
| 1237 |
-
print('Example: python template_matcher.py form102_sample1.png 102')
|
| 1238 |
sys.exit(1)
|
| 1239 |
|
| 1240 |
img_path = sys.argv[1]
|
| 1241 |
form_type = sys.argv[2]
|
|
|
|
| 1242 |
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
print(
|
| 1246 |
|
| 1247 |
-
# Extract and print fields
|
| 1248 |
result = extract_fields(img_path, form_type)
|
| 1249 |
-
print(f'
|
| 1250 |
for k, v in result.items():
|
| 1251 |
-
print(f' {k:<
|
| 1252 |
-
|
| 1253 |
template = TEMPLATES.get(form_type, {})
|
| 1254 |
-
missing
|
| 1255 |
if missing:
|
| 1256 |
print(f'\nEmpty fields ({len(missing)}):')
|
| 1257 |
for k in missing:
|
| 1258 |
-
print(f' {k}')
|
|
|
|
| 1 |
"""
|
| 2 |
+
template_matcher.py (v3 β pytesseract removed)
|
| 3 |
+
================================================
|
| 4 |
+
Extracts field values from Philippine civil registry scanned forms.
|
| 5 |
+
|
| 6 |
+
WHAT CHANGED FROM v2
|
| 7 |
+
---------------------
|
| 8 |
+
1. pytesseract removed entirely.
|
| 9 |
+
2. _scan_form_text() now uses CV2 contour/MSER detection to find
|
| 10 |
+
candidate text regions, then reads each region with TrOCR
|
| 11 |
+
(the same model already loaded for field OCR).
|
| 12 |
+
3. Anchor label matching (fuzzy SequenceMatcher) unchanged.
|
| 13 |
+
4. No new dependencies β everything already required by the project.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
import os
|
| 17 |
+
import sys
|
| 18 |
+
import re as _re
|
| 19 |
+
|
| 20 |
import numpy as np
|
| 21 |
from PIL import Image
|
| 22 |
|
|
|
|
| 26 |
except ImportError:
|
| 27 |
_CV2_OK = False
|
| 28 |
|
| 29 |
+
# ββ Reference images βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
_REF_DIR = os.path.join(os.path.dirname(__file__), 'references')
|
| 31 |
REFERENCE_IMAGES = {
|
| 32 |
+
'102': os.path.join(_REF_DIR, 'reference_102.jpg'),
|
| 33 |
'103': os.path.join(_REF_DIR, 'reference_103.png'),
|
| 34 |
+
'90': os.path.join(_REF_DIR, 'reference_90.png'),
|
| 35 |
+
'97': os.path.join(_REF_DIR, 'reference_97.jpg'),
|
| 36 |
}
|
| 37 |
|
| 38 |
+
# ββ CRNN+CTC engine ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
import sys as _sys
|
| 40 |
+
_CRNN_DIR = os.path.join(os.path.dirname(__file__), 'CRNN+CTC')
|
| 41 |
+
if _CRNN_DIR not in _sys.path:
|
| 42 |
+
_sys.path.insert(0, _CRNN_DIR)
|
| 43 |
+
|
| 44 |
+
_CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'best_model.pth')
|
| 45 |
+
_crnn_ocr = None
|
| 46 |
+
_crnn_decode = None # reference to decode_ctc_predictions
|
| 47 |
|
| 48 |
+
|
| 49 |
+
def _get_crnn():
|
| 50 |
+
global _crnn_ocr, _crnn_decode
|
| 51 |
+
if _crnn_ocr is None:
|
| 52 |
try:
|
|
|
|
| 53 |
import torch
|
| 54 |
+
from inference import CivilRegistryOCR
|
| 55 |
+
from utils import decode_ctc_predictions as _dcp
|
| 56 |
+
print('[template_matcher] Loading CRNN+CTC model...')
|
| 57 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 58 |
+
_crnn_ocr = CivilRegistryOCR(
|
| 59 |
+
checkpoint_path=_CRNN_CHECKPOINT,
|
| 60 |
+
device=device,
|
| 61 |
+
mode='adaptive',
|
| 62 |
+
)
|
| 63 |
+
_crnn_decode = _dcp
|
| 64 |
+
print('[template_matcher] CRNN+CTC ready.')
|
| 65 |
except Exception as e:
|
| 66 |
+
print(f'[template_matcher] CRNN+CTC load error: {e}')
|
| 67 |
+
return _crnn_ocr
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _crnn_read(crop_img: Image.Image) -> str:
|
| 71 |
+
"""Run CRNN+CTC on a PIL Image crop and return the decoded string."""
|
| 72 |
+
ocr = _get_crnn()
|
| 73 |
+
if ocr is None or _crnn_decode is None:
|
| 74 |
+
return ''
|
| 75 |
+
try:
|
| 76 |
+
import torch
|
| 77 |
+
rgb = np.array(crop_img.convert('RGB'))
|
| 78 |
+
bgr = rgb[:, :, ::-1].copy()
|
| 79 |
+
normalized = ocr.normalizer.normalize(bgr)
|
| 80 |
+
tensor = torch.FloatTensor(
|
| 81 |
+
normalized.astype(np.float32) / 255.0
|
| 82 |
+
).unsqueeze(0).unsqueeze(0).to(ocr.device)
|
| 83 |
+
with torch.no_grad():
|
| 84 |
+
outputs = ocr.model(tensor)
|
| 85 |
+
decoded = _crnn_decode(outputs.cpu(), ocr.idx_to_char, method='greedy')
|
| 86 |
+
return decoded[0].strip()
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print(f'[template_matcher] CRNN+CTC read error: {e}')
|
| 89 |
+
return ''
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
# Hint constants (kept for template dict compatibility)
|
| 93 |
_LINE = 'line'
|
| 94 |
_BLOCK = 'block'
|
| 95 |
_WORD = 'word'
|
| 96 |
|
| 97 |
+
# ββ Post-processing βββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
| 98 |
_SEX_KEYWORDS = {
|
| 99 |
'female': 'FEMALE', 'fem': 'FEMALE', 'f': 'FEMALE',
|
| 100 |
+
'male': 'MALE', 'm': 'MALE',
|
| 101 |
}
|
|
|
|
|
|
|
| 102 |
_FIELD_TYPE = {
|
|
|
|
| 103 |
'sex': 'sex', 'groom_sex': 'sex', 'bride_sex': 'sex',
|
| 104 |
'husband_sex': 'sex', 'wife_sex': 'sex',
|
|
|
|
| 105 |
'dob_year': 'year',
|
|
|
|
| 106 |
'age': 'digits', 'groom_age': 'digits', 'bride_age': 'digits',
|
| 107 |
'husband_age': 'digits', 'wife_age': 'digits', 'dob_day': 'digits',
|
|
|
|
| 108 |
'registration_date': 'date', 'marriage_date': 'date',
|
| 109 |
+
'date_of_marriage': 'date', 'date_of_death': 'date',
|
| 110 |
+
'date_of_birth': 'date', 'date_issued': 'date',
|
| 111 |
'groom_dob': 'date', 'bride_dob': 'date',
|
| 112 |
'husband_dob': 'date', 'wife_dob': 'date',
|
|
|
|
| 113 |
'registry_no': 'registry', 'marriage_license_no': 'registry',
|
| 114 |
}
|
| 115 |
|
|
|
|
| 116 |
def _postprocess(text: str, field_name: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
text = text.strip()
|
| 118 |
if not text:
|
| 119 |
return text
|
| 120 |
rule = _FIELD_TYPE.get(field_name)
|
|
|
|
| 121 |
if rule == 'sex':
|
| 122 |
tl = text.lower()
|
|
|
|
| 123 |
for kw in sorted(_SEX_KEYWORDS, key=len, reverse=True):
|
| 124 |
if kw in tl:
|
| 125 |
return _SEX_KEYWORDS[kw]
|
| 126 |
return text
|
|
|
|
| 127 |
if rule == 'year':
|
| 128 |
m = _re.search(r'(19|20)\d{2}', text)
|
| 129 |
if m:
|
| 130 |
return m.group(0)
|
| 131 |
digits = _re.sub(r'\D', '', text)
|
| 132 |
return digits[:4] if len(digits) >= 4 else text
|
|
|
|
| 133 |
if rule == 'digits':
|
| 134 |
d = _re.sub(r'\D', '', text)
|
| 135 |
return d if d else text
|
|
|
|
| 136 |
if rule == 'date':
|
| 137 |
return _re.sub(r'[^\w\s\-/,.]', '', text).strip()
|
|
|
|
| 138 |
if rule == 'registry':
|
| 139 |
return _re.sub(r'[^\w\s\-/]', '', text).strip()
|
|
|
|
| 140 |
return text
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
+
# ββ Absolute-coordinate templates βββββββββββββββββββββββββββββββββ
|
| 144 |
+
# (x1, y1, x2, y2, hint) β all values are fractions 0.0β1.0
|
| 145 |
TEMPLATES = {
|
| 146 |
+
'102': {
|
| 147 |
+
'province': (0.183, 0.110, 0.582, 0.128, _LINE),
|
| 148 |
+
'registry_no': (0.617, 0.121, 0.900, 0.149, _LINE),
|
| 149 |
+
'city_municipality': (0.224, 0.134, 0.631, 0.150, _LINE),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
'name_first': (0.169, 0.161, 0.453, 0.181, _LINE),
|
| 151 |
'name_middle': (0.450, 0.161, 0.674, 0.181, _LINE),
|
| 152 |
+
'name_last': (0.674, 0.162, 0.935, 0.182, _LINE),
|
| 153 |
+
'sex': (0.126, 0.195, 0.335, 0.210, _WORD),
|
| 154 |
+
'dob_day': (0.445, 0.193, 0.562, 0.210, _WORD),
|
| 155 |
+
'dob_month': (0.560, 0.193, 0.731, 0.211, _LINE),
|
| 156 |
+
'dob_year': (0.735, 0.197, 0.883, 0.213, _WORD),
|
| 157 |
+
'place_of_birth': (0.383, 0.227, 0.890, 0.245, _LINE),
|
| 158 |
+
'type_of_birth': (0.124, 0.263, 0.328, 0.282, _WORD),
|
| 159 |
+
'birth_order': (0.542, 0.272, 0.742, 0.285, _WORD),
|
| 160 |
+
'weight_at_birth': (0.757, 0.258, 0.839, 0.287, _WORD),
|
| 161 |
+
'mother_name': (0.217, 0.299, 0.894, 0.320, _LINE),
|
| 162 |
+
'mother_citizenship': (0.125, 0.329, 0.506, 0.351, _LINE),
|
| 163 |
+
'mother_religion': (0.508, 0.332, 0.901, 0.351, _LINE),
|
| 164 |
+
'mother_occupation': (0.511, 0.363, 0.750, 0.385, _LINE),
|
| 165 |
+
'mother_age_at_birth': (0.758, 0.371, 0.888, 0.390, _WORD),
|
| 166 |
+
'mother_residence': (0.211, 0.405, 0.936, 0.425, _LINE),
|
| 167 |
+
'father_name': (0.200, 0.436, 0.894, 0.456, _LINE),
|
| 168 |
+
'father_citizenship': (0.128, 0.465, 0.318, 0.487, _LINE),
|
| 169 |
+
'father_religion': (0.328, 0.467, 0.550, 0.490, _LINE),
|
| 170 |
+
'father_occupation': (0.543, 0.466, 0.754, 0.496, _LINE),
|
| 171 |
+
'father_age_at_birth': (0.752, 0.476, 0.902, 0.496, _WORD),
|
| 172 |
+
'father_residence': (0.216, 0.508, 0.949, 0.527, _LINE),
|
| 173 |
+
'marriage_date': (0.092, 0.556, 0.413, 0.573, _LINE),
|
| 174 |
+
'marriage_place': (0.400, 0.554, 0.922, 0.571, _LINE),
|
| 175 |
+
'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
'103': {
|
| 178 |
+
'province': (0.182, 0.076, 0.581, 0.094, _LINE),
|
| 179 |
+
'registry_no': (0.649, 0.088, 0.937, 0.123, _LINE),
|
| 180 |
+
'city_municipality': (0.222, 0.097, 0.629, 0.113, _LINE),
|
| 181 |
+
'deceased_name': (0.105, 0.139, 0.739, 0.173, _LINE),
|
| 182 |
+
'sex': (0.735, 0.137, 0.931, 0.170, _WORD),
|
| 183 |
+
'date_of_death': (0.123, 0.189, 0.316, 0.216, _LINE),
|
| 184 |
+
'date_of_birth': (0.319, 0.187, 0.567, 0.214, _LINE),
|
| 185 |
+
'age': (0.573, 0.198, 0.717, 0.214, _WORD),
|
| 186 |
+
'place_of_death': (0.096, 0.227, 0.727, 0.251, _LINE),
|
| 187 |
+
'civil_status': (0.709, 0.233, 0.935, 0.257, _WORD),
|
| 188 |
+
'religion': (0.092, 0.268, 0.324, 0.295, _LINE),
|
| 189 |
+
'citizenship': (0.324, 0.270, 0.522, 0.295, _LINE),
|
| 190 |
+
'residence': (0.519, 0.271, 0.936, 0.297, _LINE),
|
| 191 |
+
'occupation': (0.095, 0.311, 0.292, 0.330, _LINE),
|
| 192 |
+
'father_name': (0.295, 0.306, 0.614, 0.334, _LINE),
|
| 193 |
+
'mother_name': (0.615, 0.312, 0.938, 0.332, _LINE),
|
| 194 |
+
'cause_immediate': (0.312, 0.372, 0.961, 0.384, _LINE),
|
| 195 |
+
'cause_antecedent': (0.320, 0.383, 0.973, 0.402, _LINE),
|
| 196 |
+
'cause_underlying': (0.311, 0.406, 0.839, 0.424, _LINE),
|
| 197 |
+
'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
},
|
|
|
|
|
|
|
| 199 |
'90': {
|
| 200 |
+
'province': (0.208, 0.099, 0.607, 0.117, _LINE),
|
| 201 |
+
'registry_no': (0.641, 0.104, 0.924, 0.132, _LINE),
|
| 202 |
+
'city_municipality': (0.231, 0.113, 0.638, 0.129, _LINE),
|
| 203 |
+
'marriage_license_no': (0.673, 0.132, 0.928, 0.150, _LINE),
|
| 204 |
+
'date_issued': (0.775, 0.150, 0.932, 0.168, _LINE),
|
| 205 |
+
'groom_name_first': (0.170, 0.294, 0.483, 0.309, _LINE),
|
| 206 |
+
'groom_name_middle': (0.176, 0.308, 0.485, 0.320, _LINE),
|
| 207 |
+
'groom_name_last': (0.174, 0.319, 0.486, 0.333, _LINE),
|
| 208 |
+
'bride_name_first': (0.622, 0.292, 0.937, 0.306, _LINE),
|
| 209 |
+
'bride_name_middle': (0.622, 0.306, 0.928, 0.319, _LINE),
|
| 210 |
+
'bride_name_last': (0.621, 0.319, 0.929, 0.334, _LINE),
|
| 211 |
+
'groom_dob': (0.152, 0.348, 0.394, 0.369, _LINE),
|
| 212 |
+
'groom_age': (0.400, 0.345, 0.474, 0.371, _WORD),
|
| 213 |
+
'bride_dob': (0.576, 0.345, 0.853, 0.365, _LINE),
|
| 214 |
+
'bride_age': (0.851, 0.346, 0.932, 0.369, _WORD),
|
| 215 |
+
'groom_place_of_birth': (0.136, 0.371, 0.472, 0.400, _LINE),
|
| 216 |
+
'bride_place_of_birth': (0.585, 0.377, 0.921, 0.400, _LINE),
|
| 217 |
+
'groom_sex': (0.135, 0.408, 0.267, 0.425, _WORD),
|
| 218 |
+
'groom_citizenship': (0.268, 0.407, 0.477, 0.425, _LINE),
|
| 219 |
+
'bride_sex': (0.574, 0.408, 0.708, 0.424, _WORD),
|
| 220 |
+
'bride_citizenship': (0.720, 0.408, 0.917, 0.427, _LINE),
|
| 221 |
+
'groom_residence': (0.140, 0.436, 0.472, 0.463, _LINE),
|
| 222 |
+
'bride_residence': (0.577, 0.434, 0.922, 0.463, _LINE),
|
| 223 |
+
'groom_religion': (0.135, 0.465, 0.472, 0.494, _LINE),
|
| 224 |
+
'bride_religion': (0.584, 0.463, 0.920, 0.486, _LINE),
|
| 225 |
+
'groom_civil_status': (0.135, 0.492, 0.471, 0.517, _WORD),
|
| 226 |
+
'bride_civil_status': (0.585, 0.491, 0.924, 0.513, _WORD),
|
| 227 |
+
'groom_father_name': (0.133, 0.647, 0.477, 0.672, _LINE),
|
| 228 |
+
'groom_father_citizenship':(0.141, 0.669, 0.475, 0.695, _LINE),
|
| 229 |
+
'bride_father_name': (0.580, 0.646, 0.923, 0.666, _LINE),
|
| 230 |
+
'bride_father_citizenship':(0.578, 0.667, 0.916, 0.689, _LINE),
|
| 231 |
+
'groom_mother_name': (0.139, 0.733, 0.474, 0.762, _LINE),
|
| 232 |
+
'groom_mother_citizenship':(0.135, 0.763, 0.480, 0.779, _LINE),
|
| 233 |
+
'bride_mother_name': (0.584, 0.736, 0.914, 0.758, _LINE),
|
| 234 |
+
'bride_mother_citizenship':(0.579, 0.758, 0.924, 0.780, _LINE),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
},
|
|
|
|
|
|
|
| 236 |
'97': {
|
| 237 |
+
'province': (0.196, 0.093, 0.595, 0.111, _LINE),
|
| 238 |
+
'registry_no': (0.771, 0.095, 0.969, 0.130, _LINE),
|
| 239 |
+
'city_municipality': (0.197, 0.119, 0.604, 0.135, _LINE),
|
| 240 |
+
'husband_name_first': (0.257, 0.158, 0.572, 0.173, _LINE),
|
| 241 |
+
'husband_name_middle': (0.251, 0.180, 0.562, 0.192, _LINE),
|
| 242 |
+
'husband_name_last': (0.254, 0.201, 0.576, 0.216, _LINE),
|
| 243 |
+
'wife_name_first': (0.649, 0.158, 0.953, 0.171, _LINE),
|
| 244 |
+
'wife_name_middle': (0.649, 0.180, 0.950, 0.195, _LINE),
|
| 245 |
+
'wife_name_last': (0.651, 0.202, 0.968, 0.214, _LINE),
|
| 246 |
+
'husband_dob': (0.205, 0.231, 0.493, 0.248, _LINE),
|
| 247 |
+
'husband_age': (0.500, 0.233, 0.557, 0.249, _WORD),
|
| 248 |
+
'wife_dob': (0.603, 0.234, 0.889, 0.245, _LINE),
|
| 249 |
+
'wife_age': (0.901, 0.233, 0.961, 0.247, _WORD),
|
| 250 |
+
'husband_place_of_birth': (0.193, 0.262, 0.573, 0.278, _LINE),
|
| 251 |
+
'wife_place_of_birth': (0.595, 0.263, 0.963, 0.279, _LINE),
|
| 252 |
+
'husband_sex': (0.221, 0.288, 0.309, 0.308, _WORD),
|
| 253 |
+
'wife_sex': (0.616, 0.285, 0.711, 0.305, _WORD),
|
| 254 |
+
'husband_citizenship': (0.323, 0.295, 0.567, 0.312, _LINE),
|
| 255 |
+
'wife_citizenship': (0.722, 0.296, 0.963, 0.313, _LINE),
|
| 256 |
+
'husband_residence': (0.190, 0.325, 0.563, 0.362, _LINE),
|
| 257 |
+
'wife_residence': (0.590, 0.326, 0.961, 0.361, _LINE),
|
| 258 |
+
'husband_religion': (0.190, 0.366, 0.567, 0.383, _LINE),
|
| 259 |
+
'wife_religion': (0.582, 0.362, 0.959, 0.383, _LINE),
|
| 260 |
+
'husband_civil_status': (0.189, 0.397, 0.572, 0.415, _WORD),
|
| 261 |
+
'wife_civil_status': (0.588, 0.398, 0.956, 0.414, _WORD),
|
| 262 |
+
'husband_father_name': (0.191, 0.428, 0.574, 0.445, _LINE),
|
| 263 |
+
'wife_father_name': (0.586, 0.429, 0.958, 0.446, _LINE),
|
| 264 |
+
'husband_father_citizenship': (0.184, 0.451, 0.569, 0.467, _LINE),
|
| 265 |
+
'wife_father_citizenship': (0.588, 0.449, 0.947, 0.465, _LINE),
|
| 266 |
+
'husband_mother_name': (0.176, 0.481, 0.563, 0.498, _LINE),
|
| 267 |
+
'wife_mother_name': (0.586, 0.480, 0.940, 0.497, _LINE),
|
| 268 |
+
'husband_mother_citizenship': (0.191, 0.501, 0.573, 0.517, _LINE),
|
| 269 |
+
'wife_mother_citizenship': (0.590, 0.501, 0.971, 0.517, _LINE),
|
| 270 |
+
'place_of_marriage': (0.196, 0.650, 0.958, 0.664, _LINE),
|
| 271 |
+
'date_of_marriage': (0.199, 0.678, 0.548, 0.692, _LINE),
|
| 272 |
+
'time_of_marriage': (0.765, 0.680, 0.917, 0.696, _LINE),
|
| 273 |
+
'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
},
|
| 275 |
}
|
| 276 |
|
| 277 |
|
| 278 |
+
# ── Alignment helpers ─────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
+
def _order_corners(pts: np.ndarray) -> np.ndarray:
|
| 281 |
+
s = pts.sum(axis=1)
|
| 282 |
+
d = np.diff(pts, axis=1).flatten()
|
| 283 |
+
return np.array([
|
| 284 |
+
pts[np.argmin(s)],
|
| 285 |
+
pts[np.argmin(d)],
|
| 286 |
+
pts[np.argmax(s)],
|
| 287 |
+
pts[np.argmax(d)],
|
| 288 |
+
], dtype=np.float32)
|
| 289 |
|
| 290 |
|
| 291 |
+
def _correct_perspective(scan_rgb: np.ndarray, ref_w: int, ref_h: int) -> np.ndarray:
    """Detect the form sheet as a quadrilateral and warp it to reference size.

    Returns *scan_rgb* untouched whenever a confident four-corner page
    contour cannot be found.
    """
    if not _CV2_OK:
        return scan_rgb

    # Binarise (Otsu) and dilate so the page merges into one large blob.
    gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
    kernel = _cv2.getStructuringElement(_cv2.MORPH_RECT, (5, 5))
    blur = _cv2.GaussianBlur(gray, (7, 7), 0)
    _, thresh = _cv2.threshold(blur, 0, 255, _cv2.THRESH_BINARY + _cv2.THRESH_OTSU)
    dilated = _cv2.dilate(thresh, kernel, iterations=2)

    contours, _ = _cv2.findContours(dilated, _cv2.RETR_EXTERNAL, _cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return scan_rgb

    page = max(contours, key=_cv2.contourArea)
    # The sheet must dominate the frame (>=30% of pixels) or we likely
    # grabbed noise instead of the page.
    if _cv2.contourArea(page) < 0.30 * gray.shape[0] * gray.shape[1]:
        print('[align] perspective: contour too small, skipping')
        return scan_rgb

    perimeter = _cv2.arcLength(page, True)
    outline = _cv2.approxPolyDP(page, 0.02 * perimeter, True)
    if len(outline) != 4:
        print(f'[align] perspective: {len(outline)} corners (need 4), skipping')
        return scan_rgb

    src = _order_corners(outline.reshape(4, 2).astype(np.float32))
    dst = np.array([[0, 0], [ref_w, 0], [ref_w, ref_h], [0, ref_h]], np.float32)
    transform = _cv2.getPerspectiveTransform(src, dst)
    warped = _cv2.warpPerspective(
        scan_rgb, transform, (ref_w, ref_h),
        flags=_cv2.INTER_LINEAR, borderMode=_cv2.BORDER_REPLICATE)
    print('[align] perspective correction applied')
    return warped
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def _ecc_align(scan_gray: np.ndarray, ref_gray: np.ndarray,
               scan_rgb: np.ndarray) -> np.ndarray | None:
    """Estimate a Euclidean warp via ECC on downscaled images and apply it.

    Returns the warped full-resolution RGB scan, or None when ECC fails to
    converge (any exception is treated as failure).
    """
    try:
        h, w = ref_gray.shape
        # Run ECC at <=500 px on the long side for speed.
        scale = min(1.0, 500.0 / max(h, w))
        small_h, small_w = max(1, int(h * scale)), max(1, int(w * scale))
        ref_small = _cv2.resize(ref_gray, (small_w, small_h))
        scan_small = _cv2.resize(_cv2.resize(scan_gray, (w, h)), (small_w, small_h))

        warp = np.eye(2, 3, dtype=np.float32)
        criteria = (_cv2.TERM_CRITERIA_EPS | _cv2.TERM_CRITERIA_COUNT, 100, 1e-4)
        cc, warp = _cv2.findTransformECC(ref_small, scan_small, warp,
                                         _cv2.MOTION_EUCLIDEAN, criteria)

        # Clamp implausibly large rotations to +/-1 degree; scans should be
        # nearly upright after perspective correction.
        angle = np.degrees(np.arctan2(warp[1, 0], warp[0, 0]))
        if abs(angle) > 1.0:
            clamped = np.radians(np.clip(angle, -1.0, 1.0))
            cos_c, sin_c = np.cos(clamped), np.sin(clamped)
            warp[0, 0], warp[0, 1] = cos_c, -sin_c
            warp[1, 0], warp[1, 1] = sin_c, cos_c

        # Translation was estimated at the reduced scale; rescale it to
        # full resolution.
        warp[0, 2] /= scale
        warp[1, 2] /= scale

        scan_full = _cv2.resize(scan_rgb, (w, h))
        aligned = _cv2.warpAffine(scan_full, warp, (w, h),
                                  flags=_cv2.INTER_LINEAR,
                                  borderMode=_cv2.BORDER_REPLICATE)
        print(f'[align] ECC applied (cc={cc:.4f} angle={angle:.2f}°)')
        return aligned
    except Exception as e:
        print(f'[align] ECC failed: {e}')
        return None
|
| 348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
+
def _orb_align(scan_gray: np.ndarray, ref_gray: np.ndarray,
               scan_rgb: np.ndarray) -> tuple[np.ndarray | None, int]:
    """Feature-based fallback alignment: ORB keypoints + RANSAC affine.

    Returns (aligned_rgb, inlier_count), or (None, 0) when there are too
    few keypoints/matches or RANSAC cannot fit a transform.
    """
    h, w = scan_gray.shape
    reference = _cv2.resize(ref_gray, (w, h))

    detector = _cv2.ORB_create(nfeatures=5000)
    kp_scan, des_scan = detector.detectAndCompute(scan_gray, None)
    kp_ref, des_ref = detector.detectAndCompute(reference, None)
    if des_scan is None or des_ref is None or len(kp_scan) < 10 or len(kp_ref) < 10:
        return None, 0

    matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
    matches = sorted(matcher.match(des_scan, des_ref), key=lambda m: m.distance)
    good = matches[:max(10, len(matches) // 3)]  # keep the best third
    if len(good) < 10:
        return None, 0

    src_pts = np.float32([kp_scan[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp_ref[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    M, mask = _cv2.estimateAffinePartial2D(
        src_pts, dst_pts, method=_cv2.RANSAC, ransacReprojThreshold=5.0)
    if M is None:
        return None, 0

    inliers = int(mask.sum()) if mask is not None else 0
    aligned = _cv2.warpAffine(scan_rgb, M, (w, h),
                              flags=_cv2.INTER_LINEAR,
                              borderMode=_cv2.BORDER_REPLICATE)
    print(f'[align] ORB applied ({inliers} inliers)')
    return aligned, inliers
|
| 376 |
|
| 377 |
|
| 378 |
def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
|
| 379 |
+
orb = _cv2.ORB_create(nfeatures=3000)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
kp1, des1 = orb.detectAndCompute(scan_gray, None)
|
| 381 |
+
kp2, des2 = orb.detectAndCompute(ref_gray, None)
|
| 382 |
if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
|
| 383 |
return 0
|
| 384 |
matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
|
| 385 |
matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
|
| 386 |
+
good = matches[:max(10, len(matches) // 3)]
|
| 387 |
if len(good) < 10:
|
| 388 |
return 0
|
| 389 |
src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
|
|
|
|
| 392 |
return int(mask.sum()) if mask is not None else 0
|
| 393 |
|
| 394 |
|
| 395 |
+
def align_to_reference(img: Image.Image, form_type: str) -> tuple[Image.Image, int]:
    """Align a scanned form to its reference template image.

    Cascade (each stage falls through to the next on failure):
      0. Perspective correction of the detected page quadrilateral
      1. ECC Euclidean registration
      2. ORB keypoints + RANSAC affine
      3. Plain resize to the reference dimensions

    Returns:
        (aligned_image, orb_inlier_count) — the count is a fixed 25 when
        ECC succeeds, the real RANSAC inlier count for ORB, else 0.
    """
    if not _CV2_OK:
        return img, 0

    ref_path = REFERENCE_IMAGES.get(form_type)
    if not ref_path or not os.path.exists(ref_path):
        print(f'[align] No reference for form {form_type}')
        return img, 0
    ref_gray = _cv2.imread(ref_path, _cv2.IMREAD_GRAYSCALE)
    if ref_gray is None:
        return img, 0

    ref_h, ref_w = ref_gray.shape
    scan_rgb = np.array(img.convert('RGB'))

    # Stage 0: undo camera perspective before dense registration.
    scan_rgb = _correct_perspective(scan_rgb, ref_w, ref_h)
    scan_rgb_rs = _cv2.resize(scan_rgb, (ref_w, ref_h))
    scan_gray_rs = _cv2.cvtColor(scan_rgb_rs, _cv2.COLOR_RGB2GRAY)

    # Stage 1: ECC dense registration.
    print(f'[align] Form {form_type}: trying ECC...')
    aligned = _ecc_align(scan_gray_rs, ref_gray, scan_rgb_rs)
    if aligned is not None:
        return Image.fromarray(aligned), 25

    # Stage 2: sparse feature alignment.
    print(f'[align] Form {form_type}: ECC failed, trying ORB...')
    aligned, inliers = _orb_align(scan_gray_rs, ref_gray, scan_rgb_rs)
    if aligned is not None:
        return Image.fromarray(aligned), inliers

    # Stage 3: give up and only match dimensions.
    print(f'[align] Form {form_type}: all alignment failed, resizing only')
    resized = _cv2.resize(scan_rgb, (ref_w, ref_h))
    return Image.fromarray(resized), 0
|
| 433 |
|
| 434 |
|
| 435 |
+
# ── Image preprocessing ───────────────────────────────────────────
|
| 436 |
+
|
| 437 |
def _deskew(gray: np.ndarray) -> np.ndarray:
    """Correct small page rotation using the median angle of near-horizontal
    Hough line segments.

    Returns *gray* unchanged when OpenCV is unavailable, no line segments
    are found, none are near-horizontal, or the skew is below 0.3 degrees.
    """
    if not _CV2_OK:
        return gray
    edges = _cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = _cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100,
                             minLineLength=100, maxLineGap=10)
    if lines is None:
        return gray

    # Vectorised angle computation (the previous comprehension evaluated
    # np.degrees(np.arctan2(...)) twice per segment).
    segs = lines[:, 0]
    all_angles = np.degrees(np.arctan2(segs[:, 3] - segs[:, 1],
                                       segs[:, 2] - segs[:, 0]))
    # Only near-horizontal segments (within +/-15 deg) vote for the skew.
    angles = all_angles[(all_angles > -15) & (all_angles < 15)]
    if angles.size == 0:
        return gray

    angle = float(np.median(angles))
    if abs(angle) < 0.3:  # below noise floor; rotating would only blur
        return gray

    h, w = gray.shape
    M = _cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
    return _cv2.warpAffine(gray, M, (w, h),
                           flags=_cv2.INTER_CUBIC,
                           borderMode=_cv2.BORDER_REPLICATE)
|
| 458 |
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
def _preprocess(img: Image.Image) -> Image.Image:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
if not _CV2_OK:
|
| 462 |
return img.convert('L')
|
| 463 |
gray = np.array(img.convert('L'))
|
|
|
|
| 466 |
|
| 467 |
|
| 468 |
def _crop_field(img: Image.Image, x1r, y1r, x2r, y2r) -> Image.Image:
    """Crop a field region given relative (0..1) coordinates.

    A fixed 4-pixel margin is added on every side, clamped to the image
    bounds, so descenders/ascenders at the box edge are not cut off.
    """
    w, h = img.size
    pad = 4
    box = (
        max(0, int(x1r * w) - pad),
        max(0, int(y1r * h) - pad),
        min(w, int(x2r * w) + pad),
        min(h, int(y2r * h) + pad),
    )
    return img.crop(box)
|
| 474 |
|
| 475 |
|
| 476 |
+
def _ocr(crop: Image.Image) -> str:
    """Recognise the text in one cropped field image via the CRNN+CTC model."""
    return _crnn_read(crop)
|
| 479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
|
| 481 |
+
# ── Form type detection ───────────────────────────────────────────
|
| 482 |
|
| 483 |
def detect_form_type(image_path: str) -> str:
|
| 484 |
+
"""Auto-detect form type using ORB inlier scoring, falling back to OCR title."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
if _CV2_OK:
|
| 486 |
try:
|
| 487 |
+
img = Image.open(image_path).convert('RGB')
|
| 488 |
scan_rgb = np.array(img)
|
| 489 |
scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
|
| 490 |
+
best_type, best_inliers = None, 0
|
| 491 |
+
DET_W = 800
|
|
|
|
|
|
|
|
|
|
| 492 |
for ft, ref_path in REFERENCE_IMAGES.items():
|
| 493 |
if not os.path.exists(ref_path):
|
| 494 |
continue
|
|
|
|
| 496 |
if ref_gray is None:
|
| 497 |
continue
|
| 498 |
ref_h, ref_w = ref_gray.shape
|
|
|
|
| 499 |
sc = min(1.0, DET_W / ref_w)
|
| 500 |
+
dw, dh = max(1, int(ref_w*sc)), max(1, int(ref_h*sc))
|
| 501 |
+
ref_ds = _cv2.resize(ref_gray, (dw, dh))
|
| 502 |
+
scan_ds = _cv2.resize(_cv2.resize(scan_gray, (ref_w, ref_h)), (dw, dh))
|
| 503 |
+
count = _orb_inliers(scan_ds, ref_ds)
|
| 504 |
print(f'[detect] Form {ft}: {count} ORB inliers')
|
| 505 |
if count > best_inliers:
|
| 506 |
+
best_inliers, best_type = count, ft
|
|
|
|
|
|
|
| 507 |
if best_type and best_inliers >= 15:
|
| 508 |
+
print(f'[detect] Best: Form {best_type} ({best_inliers} inliers)')
|
| 509 |
return best_type
|
| 510 |
+
print(f'[detect] ORB inconclusive ({best_inliers}), trying OCR title')
|
|
|
|
|
|
|
| 511 |
except Exception as e:
|
| 512 |
print(f'[template_matcher] detect_form_type ORB error: {e}')
|
| 513 |
+
# CRNN+CTC title fallback
|
|
|
|
| 514 |
try:
|
| 515 |
img_l = Image.open(image_path).convert('L')
|
| 516 |
w, h = img_l.size
|
| 517 |
+
title = _crnn_read(img_l.crop((0, int(h*0.04), w, int(h*0.15)))).upper()
|
|
|
|
| 518 |
if title:
|
| 519 |
if 'LIVE BIRTH' in title or ('BIRTH' in title
|
| 520 |
and 'DEATH' not in title and 'MARRIAGE' not in title):
|
| 521 |
return '102'
|
| 522 |
+
if 'DEATH' in title:
|
| 523 |
return '103'
|
| 524 |
+
if 'MARRIAGE' in title and 'LICENSE' in title:
|
| 525 |
return '90'
|
| 526 |
+
if 'MARRIAGE' in title:
|
| 527 |
return '97'
|
| 528 |
+
print('[detect] Could not detect form type; defaulting to 102.')
|
|
|
|
| 529 |
except Exception as e:
|
| 530 |
print(f'[template_matcher] detect_form_type OCR error: {e}')
|
| 531 |
return '102'
|
| 532 |
|
| 533 |
|
| 534 |
+
# ── Main extraction ───────────────────────────────────────────────
|
| 535 |
+
|
| 536 |
def extract_fields(image_path: str, form_type: str) -> dict:
|
| 537 |
"""
|
| 538 |
+
Extract handwritten field values from a civil registry form scan.
|
| 539 |
|
| 540 |
Args:
|
| 541 |
+
image_path : path to uploaded form image (PNG / JPG / PDF page)
|
| 542 |
+
form_type : '102' | '103' | '90' | '97'
|
| 543 |
|
| 544 |
Returns:
|
| 545 |
dict of { field_name: extracted_text }
|
|
|
|
| 548 |
if template is None:
|
| 549 |
print(f'[template_matcher] No template for form type: {form_type}')
|
| 550 |
return {}
|
| 551 |
+
if _get_crnn() is None:
|
| 552 |
+
print('[template_matcher] CRNN+CTC not available')
|
|
|
|
| 553 |
return {}
|
|
|
|
|
|
|
|
|
|
| 554 |
try:
|
| 555 |
img = Image.open(image_path).convert('RGB')
|
| 556 |
except Exception as e:
|
| 557 |
print(f'[template_matcher] Cannot open image: {e}')
|
| 558 |
return {}
|
| 559 |
|
|
|
|
|
|
|
| 560 |
img, orb_inliers = align_to_reference(img, form_type)
|
| 561 |
+
processed = _preprocess(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
|
| 563 |
+
# Anchor detection disabled: CRNN+CTC is trained on handwritten text and
|
| 564 |
+
# reads printed labels inconsistently, causing fields to jump between
|
| 565 |
+
# anchor-relative and absolute positions across runs.
|
| 566 |
+
# After ECC/ORB alignment the absolute coordinates are stable and sufficient.
|
| 567 |
form_w, form_h = img.size
|
| 568 |
+
field_names, crops, methods = [], [], []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
for field_name, coords in template.items():
|
| 571 |
+
x1r, y1r, x2r, y2r, _ = coords
|
| 572 |
+
crop = _crop_field(processed, x1r, y1r, x2r, y2r)
|
| 573 |
+
method = 'absolute'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
field_names.append(field_name)
|
| 575 |
crops.append(crop)
|
| 576 |
+
methods.append(method)
|
| 577 |
|
| 578 |
+
fields = {}
|
| 579 |
+
anchor_hits = 0
|
| 580 |
+
for field_name, crop, method in zip(field_names, crops, methods):
|
| 581 |
text = _postprocess(_ocr(crop), field_name)
|
| 582 |
if text:
|
| 583 |
fields[field_name] = text
|
| 584 |
|
| 585 |
+
print(f'[template_matcher] Anchor hits: {anchor_hits}/{len(anchor_defs)} | '
|
| 586 |
+
f'Extracted: {len(fields)}/{len(template)} fields')
|
|
|
|
| 587 |
return fields
|
| 588 |
|
| 589 |
|
| 590 |
+
# ── Debug visualisation ───────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
|
| 592 |
+
def debug_draw_boxes(image_path: str, form_type: str, out_path: str = None) -> str:
    """Draw every field box of *form_type*'s template on the aligned image
    and save the result.

    All boxes use absolute template coordinates (anchor detection has been
    removed from this module), so every rectangle is drawn in the same
    blue colour.  The old GREEN/RED/ORANGE anchor legend no longer applies.

    Args:
        image_path : path to the scanned form image
        form_type  : '102' | '103' | '90' | '97'
        out_path   : optional explicit output path; defaults to
                     '<image>_debug_<form_type><ext>'

    Returns:
        Path of the saved debug image, or None when no template exists
        for *form_type*.
    """
    from PIL import ImageDraw, ImageFont

    template = TEMPLATES.get(form_type)
    if not template:
        print(f'No template for {form_type}')
        return None

    img, _ = align_to_reference(Image.open(image_path).convert('RGB'), form_type)
    draw = ImageDraw.Draw(img)
    w, h = img.size

    # Best-effort font lookup: DejaVu (Linux), Arial (Windows), else bitmap default.
    try:
        font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 11)
    except Exception:
        try:
            font = ImageFont.truetype('C:/Windows/Fonts/arial.ttf', 11)
        except Exception:
            font = ImageFont.load_default()

    for field_name, coords in template.items():
        x1r, y1r, x2r, y2r, _ = coords
        bx1, by1 = int(x1r * w), int(y1r * h)
        bx2, by2 = int(x2r * w), int(y2r * h)
        draw.rectangle([bx1, by1, bx2, by2], outline='#1a6fd4', width=1)
        draw.text((bx1 + 2, by1 + 2), field_name, fill='#1a6fd4', font=font)

    base, ext = os.path.splitext(image_path)
    out = out_path or f'{base}_debug_{form_type}{ext}'
    img.save(out)
    print(f'[template_matcher] Debug image saved: {out}')
    print(' BLUE = absolute template coordinates (anchor system removed)')
    return out
|
| 637 |
|
| 638 |
|
| 639 |
+
# ── PDF helper ────────────────────────────────────────────────────
|
| 640 |
+
|
| 641 |
+
def pdf_to_image(pdf_path: str, page: int = 0) -> str:
    """Render one page of a PDF to a PNG saved next to the source file.

    Returns the PNG path, or None when pdf2image is not installed or the
    conversion fails for any reason.
    """
    try:
        from pdf2image import convert_from_path
        rendered = convert_from_path(pdf_path, dpi=150)
        png_path = pdf_path.replace('.pdf', f'_page{page}.png')
        rendered[page].save(png_path, 'PNG')
        return png_path
    except ImportError:
        print('[template_matcher] pdf2image not installed.')
        return None
    except Exception as e:
        print(f'[template_matcher] PDF conversion failed: {e}')
        return None
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
# ── CLI ───────────────────────────────────────────────────────────
|
| 657 |
+
|
| 658 |
if __name__ == '__main__':
    # CLI: render the template-box debug overlay, then run a full
    # extraction and report filled vs. empty fields.
    if len(sys.argv) < 3:
        print('Usage: python template_matcher.py <image_path> <form_type> [out_path]')
        print('  form_type: 102 | 103 | 90 | 97')
        sys.exit(1)

    img_path = sys.argv[1]
    form_type = sys.argv[2]
    out_path = sys.argv[3] if len(sys.argv) > 3 else None

    out = debug_draw_boxes(img_path, form_type, out_path)
    print(f'\nDebug image: {out}')
    # NOTE: the anchor system was removed; all boxes are absolute coords.
    print(' BLUE = absolute template coordinates\n')

    result = extract_fields(img_path, form_type)
    print(f'Extracted fields ({len(result)}):')
    for k, v in result.items():
        print(f'  {k:<40} = {v}')

    template = TEMPLATES.get(form_type, {})
    missing = [k for k in template if k not in result]
    if missing:
        print(f'\nEmpty fields ({len(missing)}):')
        for k in missing:
            print(f'  {k}')
|