Hanz Pillerva and Claude Sonnet 4.6 committed
Commit d748584 · 1 Parent(s): 56161f2

Replace TrOCR/anchor system with CRNN+CTC absolute-coordinate OCR


- template_matcher: removed anchor detection, TrOCR, pytesseract; now uses CRNN+CTC model with ECC/ORB alignment + absolute coordinate crops
- app.py: updated preload to use _get_crnn instead of _get_trocr
- calibrate_fields: updated to match latest changes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
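
For reference, a minimal usage sketch of how the reworked pipeline is driven end to end (function names are taken from this diff; the sample path is hypothetical):

    # Hedged usage sketch: assumes template_matcher.py is importable and
    # 'sample_form.png' (hypothetical) is a scanned civil registry form.
    from template_matcher import detect_form_type, extract_fields, _get_crnn

    _get_crnn()                                      # optional warm-up, as app.py now does
    form_type = detect_form_type('sample_form.png')  # '102' | '103' | '90' | '97'
    fields = extract_fields('sample_form.png', form_type)
    for name, value in fields.items():
        print(f'{name:<30} = {value}')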

Files changed (3)
  1. CRNN+CTC/calibrate_fields.py +10 -10
  2. app.py +5 -5
  3. template_matcher.py +396 -972
CRNN+CTC/calibrate_fields.py CHANGED
@@ -37,18 +37,17 @@ COLOURS = [
 
 def draw_boxes(img, bounds):
     left, top, right, bottom = bounds
-    fw = right - left
-    fh = bottom - top
+    h, w = img.shape[:2]
 
     vis = img.copy()
     # form boundary
     cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
 
     for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
-        x1 = int(left + rx1 * fw)
-        y1 = int(top + ry1 * fh)
-        x2 = int(left + rx2 * fw)
-        y2 = int(top + ry2 * fh)
+        x1 = int(rx1 * w)
+        y1 = int(ry1 * h)
+        x2 = int(rx2 * w)
+        y2 = int(ry2 * h)
         c = COLOURS[idx % len(COLOURS)]
         cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
         cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
@@ -160,10 +159,11 @@ def main():
     elif event == cv2.EVENT_LBUTTONUP:
         drawing = False
         ex, ey = x, y
-        x1r = (min(ix, ex) - left) / fw
-        y1r = (min(iy, ey) - top) / fh
-        x2r = (max(ix, ex) - left) / fw
-        y2r = (max(iy, ey) - top) / fh
+        ih, iw = img_orig.shape[:2]
+        x1r = min(ix, ex) / iw
+        y1r = min(iy, ey) / ih
+        x2r = max(ix, ex) / iw
+        y2r = max(iy, ey) / ih
         x1r, y1r = max(0.0, x1r), max(0.0, y1r)
        x2r, y2r = min(1.0, x2r), min(1.0, y2r)
        if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
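
The coordinate convention change in a nutshell: boxes are now stored as fractions of the whole image instead of fractions of the detected form bounds, matching template_matcher's absolute-coordinate crops after ECC/ORB alignment. A small illustrative sketch (dummy sizes, not from the repo):

    import numpy as np

    img = np.zeros((1000, 800, 3), dtype=np.uint8)  # dummy scan, h=1000, w=800
    h, w = img.shape[:2]
    x1r, y1r, x2r, y2r = 0.20, 0.10, 0.60, 0.15     # example relative box
    # New mapping: fractions of the full image -> absolute pixels
    x1, y1 = int(x1r * w), int(y1r * h)
    x2, y2 = int(x2r * w), int(y2r * h)
    print((x1, y1, x2, y2))                         # (160, 100, 480, 150)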
app.py CHANGED
@@ -45,13 +45,13 @@ PIPELINE_REPO_PATH = r"C:\xampp\htdocs\python"
 
 # ── Load template matcher ─────────────────────────────────────
 try:
-    from template_matcher import extract_fields, pdf_to_image, detect_form_type, _get_trocr
+    from template_matcher import extract_fields, pdf_to_image, detect_form_type, _get_crnn
     _template_matcher_ok = True
     print("[app.py] Template matcher loaded")
-    # Preload TrOCR at startup so the first request isn't slow
-    print("[app.py] Preloading TrOCR model...")
-    _get_trocr()
-    print("[app.py] TrOCR preloaded.")
+    # Preload CRNN+CTC at startup so the first request isn't slow
+    print("[app.py] Preloading CRNN+CTC model...")
+    _get_crnn()
+    print("[app.py] CRNN+CTC preloaded.")
 except Exception as _tm_err:
     _template_matcher_ok = False
     print(f"[app.py] Template matcher unavailable: {_tm_err}")
template_matcher.py CHANGED
@@ -1,29 +1,22 @@
 """
-template_matcher.py
-====================
-Extracts field values from civil registry scanned forms using fixed
-coordinate templates. No ML training required.
-
-How it works:
-  1. Load the uploaded image
-  2. Detect which form it is (passed in as form_type: '102','103','90','97')
-  3. Normalize image to a standard size
-  4. Crop each predefined field region
-  5. Run Tesseract OCR on the crop
-  6. Return { field_name: value, ... }
-
-Coordinates are stored as relative fractions (0.0-1.0) of the image
-width/height so they work at any scan resolution.
-
-CALIBRATION
------------
-If OCR picks up the wrong area, adjust the (x1, y1, x2, y2) values
-for that field in the TEMPLATES dict below.
-Run: python template_matcher.py <image_path> <form_type>
-to see a debug image with all boxes drawn.
 """
 
-import os, sys
 
 import numpy as np
 from PIL import Image
 
@@ -33,741 +26,364 @@ try:
 except ImportError:
     _CV2_OK = False
 
-# ── Reference images for form alignment ────────────────────────
-# Place one clean blank/lightly-filled scan for each form type in:
-#   python/references/reference_102.png
-#   python/references/reference_103.png
-#   python/references/reference_90.png
-#   python/references/reference_97.png
 _REF_DIR = os.path.join(os.path.dirname(__file__), 'references')
 REFERENCE_IMAGES = {
-    '102': os.path.join(_REF_DIR, 'reference_102.png'),
     '103': os.path.join(_REF_DIR, 'reference_103.png'),
-    '90':  os.path.join(_REF_DIR, 'reference-90.png'),
-    '97':  os.path.join(_REF_DIR, 'reference_97.png'),
 }
 
-# ── OCR engine: TrOCR large ────────────────────────────────────
-_trocr_processor = None
-_trocr_model = None
 
-def _get_trocr():
-    global _trocr_processor, _trocr_model
-    if _trocr_processor is None:
         try:
-            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
             import torch
-            print('[template_matcher] Loading TrOCR large-handwritten...')
-            _trocr_processor = TrOCRProcessor.from_pretrained(
-                'microsoft/trocr-large-handwritten')
-            _trocr_model = VisionEncoderDecoderModel.from_pretrained(
-                'microsoft/trocr-large-handwritten')
-            _trocr_model.eval()
-            print('[template_matcher] TrOCR ready.')
         except Exception as e:
-            print(f'[template_matcher] TrOCR load error: {e}')
-    return _trocr_processor, _trocr_model
 
-# ── CRNN+CTC (kept for future use — swap back when model is trained) ──
-# _crnn_ocr = None
-# _CRNN_CHECKPOINT = os.path.join(
-#     os.path.dirname(__file__), 'CRNN+CTC', 'checkpoints', 'best_model_final.pth'
-# )
-# def _get_crnn(): ...  (see git history)
 
-# Hint constants kept for template dict compatibility (values unused by CRNN)
 _LINE = 'line'
 _BLOCK = 'block'
 _WORD = 'word'
 
-# ── Per-field post-processing ──────────────────────────────────
-import re as _re
-
-# Maps abbreviated sex readings → canonical value
 _SEX_KEYWORDS = {
     'female': 'FEMALE', 'fem': 'FEMALE', 'f': 'FEMALE',
-    'male': 'MALE', 'm': 'MALE',
 }
-
-# Maps field name → normalization rule
 _FIELD_TYPE = {
-    # Sex
     'sex': 'sex', 'groom_sex': 'sex', 'bride_sex': 'sex',
     'husband_sex': 'sex', 'wife_sex': 'sex',
-    # Year (4-digit)
     'dob_year': 'year',
-    # Pure digits
     'age': 'digits', 'groom_age': 'digits', 'bride_age': 'digits',
     'husband_age': 'digits', 'wife_age': 'digits', 'dob_day': 'digits',
-    # Dates — keep digits, spaces, common separators
     'registration_date': 'date', 'marriage_date': 'date',
-    'date_of_marriage': 'date', 'date_of_death': 'date',
-    'date_of_birth': 'date', 'date_issued': 'date',
     'groom_dob': 'date', 'bride_dob': 'date',
     'husband_dob': 'date', 'wife_dob': 'date',
-    # Registry / license numbers — alphanumeric + separators
     'registry_no': 'registry', 'marriage_license_no': 'registry',
 }
 
-
 def _postprocess(text: str, field_name: str) -> str:
-    """
-    Normalize and validate OCR output by field type.
-
-    sex       → 'MALE' or 'FEMALE'
-    year      → 4-digit year string, e.g. '1990'
-    digits    → strip all non-digit characters
-    date      → keep digits, spaces, '-', '/', '.', ','
-    registry  → keep alphanumeric, spaces, hyphens, slashes
-    (default) → strip leading/trailing whitespace
-    """
     text = text.strip()
     if not text:
         return text
     rule = _FIELD_TYPE.get(field_name)
-
     if rule == 'sex':
         tl = text.lower()
-        # Try longest keyword first to avoid 'f' matching inside 'female' twice
         for kw in sorted(_SEX_KEYWORDS, key=len, reverse=True):
             if kw in tl:
                 return _SEX_KEYWORDS[kw]
         return text
-
     if rule == 'year':
         m = _re.search(r'(19|20)\d{2}', text)
         if m:
             return m.group(0)
         digits = _re.sub(r'\D', '', text)
         return digits[:4] if len(digits) >= 4 else text
-
     if rule == 'digits':
         d = _re.sub(r'\D', '', text)
         return d if d else text
-
     if rule == 'date':
         return _re.sub(r'[^\w\s\-/,.]', '', text).strip()
-
     if rule == 'registry':
         return _re.sub(r'[^\w\s\-/]', '', text).strip()
-
     return text
 
-# ── Field templates ────────────────────────────────────────────
-# Each entry:  'field_name': (x1, y1, x2, y2, hint)
-# Coordinates are relative fractions of image dimensions (0.0 – 1.0)
-# hint is kept for compatibility but EasyOCR ignores it.
-#
-# TO CALIBRATE: run this file directly with a sample image, it will
-# draw all boxes so you can see which regions need adjusting.
 
 TEMPLATES = {
-
-    # ══════════════════════════════════════════════════════════
-    # FORM 102 — Certificate of Live Birth (green border)
-    # Calibrated from the REAL Municipal Form No. 102
-    # ══════════════════════════════════════════════════════════
-    '102': {
-        'province': (0.173, 0.089, 0.655, 0.105, _LINE),
-        'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
-        'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
         'name_first': (0.169, 0.161, 0.453, 0.181, _LINE),
         'name_middle': (0.450, 0.161, 0.674, 0.181, _LINE),
-        'name_last': (0.682, 0.161, 0.943, 0.181, _LINE),
-        'sex': (0.771, 0.155, 0.963, 0.173, _WORD),
-        'dob_day': (0.479, 0.196, 0.596, 0.213, _WORD),
-        'dob_month': (0.610, 0.196, 0.781, 0.214, _LINE),
-        'dob_year': (0.799, 0.199, 0.947, 0.215, _WORD),
-        'place_of_birth': (0.446, 0.227, 0.953, 0.245, _LINE),
-        'type_of_birth': (0.112, 0.280, 0.316, 0.299, _WORD),
-        'birth_order': (0.596, 0.280, 0.824, 0.294, _WORD),
-        'weight_at_birth': (0.837, 0.262, 0.919, 0.291, _WORD),
-        'mother_name': (0.296, 0.321, 0.627, 0.341, _LINE),
-        'mother_citizenship': (0.206, 0.337, 0.540, 0.363, _LINE),
-        'mother_religion': (0.566, 0.347, 0.959, 0.366, _LINE),
-        'mother_occupation': (0.553, 0.380, 0.831, 0.405, _LINE),
-        'mother_age_at_birth': (0.829, 0.389, 0.967, 0.409, _WORD),
-        'mother_residence': (0.219, 0.422, 0.944, 0.442, _LINE),
-        'father_name': (0.641, 0.321, 0.957, 0.342, _LINE),
-        'father_citizenship': (0.115, 0.491, 0.325, 0.515, _LINE),
-        'father_religion': (0.333, 0.493, 0.588, 0.519, _LINE),
-        'father_occupation': (0.600, 0.492, 0.811, 0.522, _LINE),
-        'father_age_at_birth': (0.819, 0.503, 0.969, 0.523, _WORD),
-        'father_residence': (0.230, 0.536, 0.963, 0.555, _LINE),
-        'marriage_date': (0.091, 0.593, 0.412, 0.610, _LINE),
-        'marriage_place': (0.427, 0.591, 0.949, 0.608, _LINE),
-        'registration_date': (0.621, 0.685, 0.905, 0.704, _LINE),
-    },
-
-    # ══════════════════════════════════════════════════════════
-    # FORM 103 — Certificate of Death (blue border)
-    # ══════════════════════════════════════════════════════════
-    '103': {
-        'province': (0.173, 0.089, 0.655, 0.105, _LINE),
-        'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
-        'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
-        'deceased_name': (0.086, 0.147, 0.745, 0.181, _LINE),
-        'sex': (0.771, 0.155, 0.963, 0.173, _WORD),
-        'date_of_death': (0.100, 0.197, 0.293, 0.224, _LINE),
-        'date_of_birth': (0.320, 0.201, 0.568, 0.228, _LINE),
-        'age': (0.575, 0.215, 0.719, 0.231, _WORD),
-        'place_of_death': (0.089, 0.241, 0.720, 0.265, _LINE),
-        'civil_status': (0.723, 0.250, 0.970, 0.265, _WORD),
-        'religion': (0.078, 0.281, 0.310, 0.308, _LINE),
-        'citizenship': (0.328, 0.281, 0.526, 0.306, _LINE),
-        'residence': (0.540, 0.284, 0.957, 0.310, _LINE),
-        'occupation': (0.086, 0.318, 0.283, 0.343, _LINE),
-        'father_name': (0.641, 0.321, 0.957, 0.342, _LINE),
-        'mother_name': (0.296, 0.321, 0.627, 0.341, _LINE),
-        'cause_immediate': (0.298, 0.406, 0.947, 0.420, _LINE),
-        'cause_antecedent': (0.298, 0.422, 0.951, 0.441, _LINE),
-        'cause_underlying': (0.432, 0.438, 0.960, 0.456, _LINE),
-        'registration_date': (0.621, 0.685, 0.905, 0.704, _LINE),
-    },
-    # ══════════════════════════════════════════════════════════
-    # FORM 90 — Application for Marriage License (black border)
-    # 3-column: GROOM (left 38%) | LABEL (center 24%) | BRIDE (right 38%)
-    # ══════════════════════════════════════════════════════════
-    '90': {
-        # Header
-        'province': (0.173, 0.089, 0.655, 0.105, _LINE),
-        'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
-        'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
-        'marriage_license_no': (0.696, 0.138, 0.951, 0.156, _LINE),
-        'date_issued': (0.825, 0.151, 0.982, 0.169, _LINE),
-        # Groom name (first / middle / last — each on its own row)
-        'groom_name_first': (0.128, 0.310, 0.441, 0.325, _LINE),
-        'groom_name_middle': (0.137, 0.326, 0.446, 0.338, _LINE),
-        'groom_name_last': (0.127, 0.340, 0.439, 0.354, _LINE),
-        # Bride name (first / middle / last — each on its own row)
-        'bride_name_first': (0.629, 0.311, 0.944, 0.325, _LINE),
-        'bride_name_middle': (0.631, 0.326, 0.937, 0.339, _LINE),
-        'bride_name_last': (0.633, 0.339, 0.941, 0.354, _LINE),
-        # Groom DOB / age
-        'groom_dob': (0.085, 0.372, 0.369, 0.393, _LINE),
-        'groom_age': (0.379, 0.373, 0.456, 0.391, _WORD),
-        # Bride DOB / age
-        'bride_dob': (0.584, 0.373, 0.879, 0.393, _LINE),
-        'bride_age': (0.881, 0.374, 0.965, 0.392, _WORD),
-        # Place of birth
-        'groom_place_of_birth': (0.080, 0.403, 0.462, 0.426, _LINE),
-        'bride_place_of_birth': (0.586, 0.405, 0.960, 0.425, _LINE),
-        # Sex / Citizenship
-        'groom_sex': (0.085, 0.435, 0.217, 0.452, _WORD),
-        'groom_citizenship': (0.219, 0.435, 0.460, 0.454, _LINE),
-        'bride_sex': (0.582, 0.436, 0.716, 0.452, _WORD),
-        'bride_citizenship': (0.725, 0.436, 0.961, 0.451, _LINE),
-        # Residence
-        'groom_residence': (0.076, 0.465, 0.460, 0.490, _LINE),
-        'bride_residence': (0.586, 0.465, 0.964, 0.489, _LINE),
-        # Religion
-        'groom_religion': (0.079, 0.493, 0.462, 0.522, _LINE),
-        'bride_religion': (0.585, 0.492, 0.962, 0.520, _LINE),
-        # Civil Status
-        'groom_civil_status': (0.080, 0.520, 0.463, 0.552, _WORD),
-        'bride_civil_status': (0.586, 0.522, 0.961, 0.546, _WORD),
-        # Father
-        'groom_father_name': (0.082, 0.695, 0.459, 0.711, _LINE),
-        'groom_father_citizenship': (0.082, 0.713, 0.459, 0.736, _LINE),
-        'bride_father_name': (0.581, 0.695, 0.961, 0.715, _LINE),
-        'bride_father_citizenship': (0.577, 0.715, 0.963, 0.737, _LINE),
-        # Mother
-        'groom_mother_name': (0.080, 0.784, 0.456, 0.809, _LINE),
-        'groom_mother_citizenship': (0.081, 0.811, 0.459, 0.830, _LINE),
-        'bride_mother_name': (0.583, 0.785, 0.962, 0.808, _LINE),
-        'bride_mother_citizenship': (0.580, 0.811, 0.963, 0.833, _LINE),
-    },
-
-    # ══════════════════════════════════════════════════════════
-    # FORM 97 — Certificate of Marriage (pink/magenta border)
-    # Layout: ITEM col (20%) | HUSBAND col (40%) | WIFE col (40%)
-    # x-ranges: HUSBAND = 0.22–0.59 | WIFE = 0.62–0.97
-    #
-    # Form 97 — y-coords calibrated from actual ORB-aligned scan.
-    # ORB alignment introduces ~40% vertical stretch by bottom of form;
-    # all y values are empirically measured from crop images, NOT from
-    # the reference image directly.
-    # ══════════════════════════════════════════════════════════
-    '97': {
-        'province': (0.173, 0.089, 0.655, 0.105, _LINE),
-        'registry_no': (0.673, 0.105, 0.957, 0.130, _LINE),
-        'city_municipality': (0.226, 0.107, 0.658, 0.125, _LINE),
-        'husband_name_first': (0.255, 0.140, 0.570, 0.155, _LINE),
-        'husband_name_middle': (0.258, 0.154, 0.569, 0.166, _LINE),
-        'husband_name_last': (0.259, 0.167, 0.581, 0.182, _LINE),
-        'wife_name_first': (0.650, 0.142, 0.954, 0.155, _LINE),
-        'wife_name_middle': (0.639, 0.155, 0.940, 0.170, _LINE),
-        'wife_name_last': (0.634, 0.169, 0.951, 0.181, _LINE),
-        'husband_dob': (0.219, 0.196, 0.507, 0.213, _LINE),
-        'husband_age': (0.523, 0.196, 0.580, 0.212, _WORD),
-        'wife_dob': (0.606, 0.198, 0.892, 0.209, _LINE),
-        'wife_age': (0.910, 0.199, 0.970, 0.213, _WORD),
-        'husband_place_of_birth': (0.203, 0.225, 0.583, 0.241, _LINE),
-        'wife_place_of_birth': (0.594, 0.229, 0.962, 0.245, _LINE),
-        'husband_sex': (0.219, 0.249, 0.307, 0.269, _WORD),
-        'wife_sex': (0.602, 0.249, 0.697, 0.269, _WORD),
-        'husband_citizenship': (0.344, 0.257, 0.588, 0.274, _LINE),
-        'wife_citizenship': (0.724, 0.255, 0.965, 0.272, _LINE),
-        'husband_residence': (0.219, 0.283, 0.579, 0.301, _LINE),
-        'wife_residence': (0.596, 0.285, 0.966, 0.307, _LINE),
-        'husband_religion': (0.204, 0.310, 0.581, 0.327, _LINE),
-        'wife_religion': (0.592, 0.311, 0.964, 0.327, _LINE),
-        'husband_civil_status': (0.196, 0.333, 0.579, 0.351, _WORD),
-        'wife_civil_status': (0.591, 0.335, 0.959, 0.351, _WORD),
-        'husband_father_name': (0.205, 0.367, 0.588, 0.384, _LINE),
-        'wife_father_name': (0.588, 0.369, 0.960, 0.386, _LINE),
-        'husband_father_citizenship': (0.195, 0.390, 0.580, 0.406, _LINE),
-        'wife_father_citizenship': (0.599, 0.388, 0.958, 0.404, _LINE),
-        'husband_mother_name': (0.196, 0.421, 0.583, 0.438, _LINE),
-        'wife_mother_name': (0.600, 0.419, 0.954, 0.436, _LINE),
-        'husband_mother_citizenship': (0.196, 0.443, 0.578, 0.459, _LINE),
-        'wife_mother_citizenship': (0.590, 0.447, 0.971, 0.463, _LINE),
-        'place_of_marriage': (0.219, 0.551, 0.981, 0.565, _LINE),
-        'date_of_marriage': (0.222, 0.582, 0.571, 0.596, _LINE),
-        'time_of_marriage': (0.730, 0.581, 0.916, 0.596, _LINE),
-        'registration_date': (0.621, 0.685, 0.905, 0.704, _LINE),
     },
-}
-
-
-# ── Anchor-based field templates ───────────────────────────────
-# These complement TEMPLATES (absolute coords). For each field that has an
-# anchor entry, extract_fields() will:
-#   1. Run EasyOCR once on the full form (detail=1) to get all text + bboxes
-#   2. Search for the printed label text inside 'search' region
-#   3. If found, crop the data region from the anchor's edge
-#   4. Fall back to absolute coords from TEMPLATES when anchor not found.
-#
-# Entry format:
-#   'labels' : list of strings — tried in order, case-insensitive partial match
-#   'search' : (x1,y1,x2,y2) fractions of form to search for the label
-#   'side'   : 'right' | 'below' — where the data field is vs. the label
-#   'dx','dy': offset from anchor edge to data start (fractions of form dims)
-#   'dw','dh': data region size (fractions of form dims); dh=0 → auto from anchor
-
-ANCHOR_TEMPLATES = {
-
-    # ── Form 102 ─────────────────────────────────────────────
-    '102': {
-        'province': {
-            'labels': ['Province', 'PROVINCE'],
-            'search': (0.00, 0.09, 0.17, 0.14),
-            'side': 'right', 'dx': 0.003, 'dy': -0.004,
-            'dw': 0.48, 'dh': 0.020,
-        },
-        'registry_no': {
-            'labels': ['Registry No', 'REGISTRY NO'],
-            'search': (0.56, 0.10, 0.72, 0.15),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.28, 'dh': 0.026,
-        },
-        'city_municipality': {
-            'labels': ['City', 'Municipality', 'City/Municipality'],
-            'search': (0.00, 0.12, 0.23, 0.16),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.43, 'dh': 0.020,
-        },
-        'mother_name': {
-            'labels': ['Maiden Name', 'MAIDEN NAME', "Mother's Name"],
-            'search': (0.05, 0.30, 0.22, 0.35),
-            'side': 'right', 'dx': 0.003, 'dy': -0.005,
-            'dw': 0.77, 'dh': 0.022,
-        },
-        'father_name': {
-            'labels': ["Father's Name", "FATHER'S NAME", 'Father Name'],
-            'search': (0.05, 0.45, 0.22, 0.50),
-            'side': 'right', 'dx': 0.003, 'dy': -0.005,
-            'dw': 0.77, 'dh': 0.025,
-        },
-        'marriage_date': {
-            'labels': ['Date Married', 'DATE MARRIED', 'Date and Place'],
-            'search': (0.00, 0.58, 0.18, 0.63),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.32, 'dh': 0.020,
-        },
-        'marriage_place': {
-            'labels': ['Place', 'PLACE'],
-            'search': (0.32, 0.58, 0.44, 0.63),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.52, 'dh': 0.020,
-        },
-        'registration_date': {
-            'labels': ['Date', 'DATE', 'Registration'],
-            'search': (0.45, 0.72, 0.65, 0.77),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.28, 'dh': 0.020,
-        },
-    },
-
-    # ── Form 103 ─────────────────────────────────────────────
     '103': {
-        'province': {
-            'labels': ['Province', 'PROVINCE'],
-            'search': (0.00, 0.07, 0.18, 0.12),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.47, 'dh': 0.020,
-        },
-        'registry_no': {
-            'labels': ['Registry No', 'REGISTRY NO'],
-            'search': (0.55, 0.09, 0.70, 0.14),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.28, 'dh': 0.026,
-        },
-        'city_municipality': {
-            'labels': ['City', 'Municipality'],
-            'search': (0.00, 0.10, 0.23, 0.14),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.42, 'dh': 0.020,
-        },
-        'deceased_name': {
-            'labels': ['First Name', 'FIRST NAME', 'Name of Deceased', 'NAME'],
-            'search': (0.00, 0.13, 0.18, 0.20),
-            'side': 'right', 'dx': 0.003, 'dy': -0.005,
-            'dw': 0.65, 'dh': 0.038,
-        },
-        'father_name': {
-            'labels': ["Father's Name", "FATHER'S NAME", "Father"],
-            'search': (0.20, 0.31, 0.35, 0.36),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.33, 'dh': 0.022,
-        },
-        'mother_name': {
-            'labels': ["Mother's Maiden", "MOTHER'S MAIDEN", "Mother"],
-            'search': (0.55, 0.31, 0.70, 0.36),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.33, 'dh': 0.022,
-        },
-        'cause_immediate': {
-            'labels': ['Immediate Cause', 'IMMEDIATE CAUSE', 'Immediate'],
-            'search': (0.05, 0.39, 0.32, 0.43),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.65, 'dh': 0.018,
-        },
-        'cause_antecedent': {
-            'labels': ['Antecedent', 'ANTECEDENT'],
-            'search': (0.05, 0.41, 0.32, 0.45),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.65, 'dh': 0.018,
-        },
-        'registration_date': {
-            'labels': ['Date', 'Registration Date'],
-            'search': (0.50, 0.67, 0.68, 0.72),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.28, 'dh': 0.020,
-        },
     },
-
-    # ── Form 90 ──────────────────────────────────────────────
     '90': {
-        'province': {
-            'labels': ['Province', 'PROVINCE'],
-            'search': (0.00, 0.08, 0.17, 0.12),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.46, 'dh': 0.020,
-        },
-        'registry_no': {
-            'labels': ['Registry No', 'REGISTRY NO'],
-            'search': (0.55, 0.10, 0.70, 0.15),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.28, 'dh': 0.030,
-        },
-        'city_municipality': {
-            'labels': ['City', 'Municipality'],
-            'search': (0.00, 0.11, 0.23, 0.15),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.40, 'dh': 0.020,
-        },
-        'groom_name_first': {
-            'labels': ['First', 'FIRST', 'Given Name'],
-            'search': (0.00, 0.30, 0.14, 0.34),
-            'side': 'right', 'dx': 0.002, 'dy': -0.003,
-            'dw': 0.31, 'dh': 0.018,
-        },
-        'bride_name_first': {
-            'labels': ['First', 'FIRST', 'Given Name'],
-            'search': (0.48, 0.30, 0.63, 0.34),
-            'side': 'right', 'dx': 0.002, 'dy': -0.003,
-            'dw': 0.31, 'dh': 0.018,
-        },
-        'groom_father_name': {
-            'labels': ["Father's Name", 'Father', 'FATHER'],
-            'search': (0.00, 0.68, 0.14, 0.73),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.020,
-        },
-        'bride_father_name': {
-            'labels': ["Father's Name", 'Father', 'FATHER'],
-            'search': (0.46, 0.68, 0.60, 0.73),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.020,
-        },
-        'groom_mother_name': {
-            'labels': ["Mother's Name", "Mother's Maiden", 'Mother', 'MOTHER'],
-            'search': (0.00, 0.77, 0.14, 0.82),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.022,
-        },
-        'bride_mother_name': {
-            'labels': ["Mother's Name", "Mother's Maiden", 'Mother', 'MOTHER'],
-            'search': (0.46, 0.77, 0.60, 0.82),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.022,
-        },
     },
-
-    # ── Form 97 ──────────────────────────────────────────────
     '97': {
-        'province': {
-            'labels': ['Province', 'PROVINCE'],
-            'search': (0.00, 0.07, 0.17, 0.11),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.48, 'dh': 0.018,
-        },
-        'registry_no': {
-            'labels': ['Registry No', 'REGISTRY NO'],
-            'search': (0.55, 0.08, 0.70, 0.14),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.30, 'dh': 0.030,
-        },
-        'city_municipality': {
-            'labels': ['City', 'Municipality'],
-            'search': (0.00, 0.09, 0.23, 0.13),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.42, 'dh': 0.018,
-        },
-        'husband_father_name': {
-            'labels': ["Father's Name", 'Father', 'FATHER'],
-            'search': (0.05, 0.36, 0.22, 0.40),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.018,
-        },
-        'wife_father_name': {
-            'labels': ["Father's Name", 'Father', 'FATHER'],
-            'search': (0.50, 0.36, 0.65, 0.40),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.018,
-        },
-        'husband_mother_name': {
-            'labels': ["Mother's Name", "Mother's Maiden", 'Mother'],
-            'search': (0.05, 0.41, 0.22, 0.46),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.38, 'dh': 0.020,
-        },
-        'wife_mother_name': {
-            'labels': ["Mother's Name", "Mother's Maiden", 'Mother'],
-            'search': (0.50, 0.41, 0.65, 0.46),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.020,
-        },
-        'place_of_marriage': {
-            'labels': ['Place of Marriage', 'PLACE OF MARRIAGE', 'Place'],
-            'search': (0.05, 0.54, 0.30, 0.58),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.74, 'dh': 0.018,
-        },
-        'date_of_marriage': {
-            'labels': ['Date of Marriage', 'DATE OF MARRIAGE', 'Date'],
-            'search': (0.05, 0.57, 0.27, 0.62),
-            'side': 'right', 'dx': 0.003, 'dy': -0.003,
-            'dw': 0.37, 'dh': 0.018,
-        },
-        'registration_date': {
-            'labels': ['Date', 'Registration'],
-            'search': (0.55, 0.81, 0.72, 0.85),
-            'side': 'right', 'dx': 0.003, 'dy': 0.000,
-            'dw': 0.20, 'dh': 0.020,
-        },
     },
 }
 
 
-# ── Anchor scanning helpers ─────────────────────────────────────
-
-def _scan_form_text(_img: Image.Image):
-    """
-    Anchor detection disabled — TrOCR has no built-in text detector.
-    extract_fields falls back to absolute coordinates for all fields.
-    """
-    return []
-
-
-def _find_anchor_bbox(detections, labels: list, search_box: tuple,
-                      form_w: int, form_h: int):
-    """
-    Find the first bounding box whose text matches any of `labels` (case-insensitive
-    partial match) and whose centre lies within `search_box` (fractions).
-
-    Returns [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] pixel coords, or None.
-    """
-    sx1 = search_box[0] * form_w
-    sy1 = search_box[1] * form_h
-    sx2 = search_box[2] * form_w
-    sy2 = search_box[3] * form_h
-
-    best_bbox = None
-    best_score = 0.0
-
-    for (bbox, text, conf) in detections:
-        if conf < 0.25:
-            continue
-        pts = np.array(bbox, dtype=np.float32)
-        cx = pts[:, 0].mean()
-        cy = pts[:, 1].mean()
-        if not (sx1 <= cx <= sx2 and sy1 <= cy <= sy2):
-            continue
-        text_u = text.upper().strip()
-        for label in labels:
-            label_u = label.upper()
-            if label_u in text_u or text_u in label_u:
-                score = conf * len(label_u)
-                if score > best_score:
-                    best_score = score
-                    best_bbox = bbox
-
-    return best_bbox
-
-
-def _crop_from_anchor(img: Image.Image, anchor_bbox,
-                      side: str, dx: float, dy: float,
-                      dw: float, dh: float) -> Image.Image:
-    """
-    Compute data region relative to a found anchor bbox and return the crop.
-
-    anchor_bbox : [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] pixel coords
-    side        : 'right' → data starts at anchor's right edge
-                  'below' → data starts below anchor's bottom edge
-    dx, dy      : offset fractions (of form width/height) from anchor edge
-    dw, dh      : data region size fractions (of form width/height);
-                  dh=0 means use anchor's own height
-    """
-    form_w, form_h = img.size
-    pts = np.array(anchor_bbox, dtype=np.float32)
-    ax1 = int(pts[:, 0].min())
-    ay1 = int(pts[:, 1].min())
-    ax2 = int(pts[:, 0].max())
-    ay2 = int(pts[:, 1].max())
-
-    data_w = int(dw * form_w)
-    data_h = int(dh * form_h) if dh > 0 else (ay2 - ay1 + 4)
-
-    if side == 'right':
-        rx1 = ax2 + int(dx * form_w)
-        ry1 = ay1 + int(dy * form_h)
-    else:  # 'below'
-        rx1 = ax1 + int(dx * form_w)
-        ry1 = ay2 + int(dy * form_h)
-
-    rx2 = min(form_w, rx1 + data_w)
-    ry2 = min(form_h, ry1 + data_h)
-
-    if rx2 <= rx1 or ry2 <= ry1:
-        return None
 
-    pad = 3
-    return img.crop((max(0, rx1 - pad), max(0, ry1 - pad),
-                     min(form_w, rx2 + pad), min(form_h, ry2 + pad)))
 
 
-def _find_document_corners(gray: np.ndarray):
-    """
-    Try to find the 4 corners of the document in a grayscale scan.
-    Returns a (4,2) float32 array ordered [TL, TR, BR, BL], or None.
-    """
-    # Blur + threshold to isolate the white page against background
-    blur = _cv2.GaussianBlur(gray, (5, 5), 0)
     _, thresh = _cv2.threshold(blur, 0, 255, _cv2.THRESH_BINARY + _cv2.THRESH_OTSU)
-
-    # Find contours
-    contours, _ = _cv2.findContours(thresh, _cv2.RETR_EXTERNAL, _cv2.CHAIN_APPROX_SIMPLE)
     if not contours:
-        return None
-
-    # Take the largest contour — should be the document page
-    c = max(contours, key=_cv2.contourArea)
     area = _cv2.contourArea(c)
-    img_area = gray.shape[0] * gray.shape[1]
-
-    # Must cover at least 30% of the image
-    if area < 0.30 * img_area:
-        return None
-
-    # Approximate to a polygon
-    peri = _cv2.arcLength(c, True)
     approx = _cv2.approxPolyDP(c, 0.02 * peri, True)
-
     if len(approx) != 4:
         return None
 
-    pts = approx.reshape(4, 2).astype(np.float32)
-
-    # Order: TL, TR, BR, BL
-    s = pts.sum(axis=1)
-    d = np.diff(pts, axis=1)
-    ordered = np.array([
-        pts[np.argmin(s)],  # TL — smallest sum
-        pts[np.argmin(d)],  # TR — smallest diff
-        pts[np.argmax(s)],  # BR — largest sum
-        pts[np.argmax(d)],  # BL — largest diff
-    ], dtype=np.float32)
-    return ordered
-
 
-def _orb_align(scan_gray: np.ndarray, ref_gray: np.ndarray, scan_rgb: np.ndarray):
-    """
-    Align scan_rgb to ref_gray using ORB feature matching + RANSAC homography.
-    Returns (aligned_rgb, inlier_count) or (None, 0) on failure.
-    """
-    h, w = scan_gray.shape
     ref_resized = _cv2.resize(ref_gray, (w, h))
-
-    orb = _cv2.ORB_create(nfeatures=5000)
-    kp1, des1 = orb.detectAndCompute(scan_gray, None)
-    kp2, des2 = orb.detectAndCompute(ref_resized, None)
-
     if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
         return None, 0
-
     matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
     matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
-    # Keep only top 30% as good matches
-    good = matches[:max(10, len(matches) // 3)]
-
     if len(good) < 10:
         return None, 0
-
     src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
     dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
-
     M, mask = _cv2.estimateAffinePartial2D(
-        src_pts, dst_pts, method=_cv2.RANSAC, ransacReprojThreshold=5.0
-    )
     if M is None:
         return None, 0
-
     inliers = int(mask.sum()) if mask is not None else 0
-    print(f'[align] ORB homography: {inliers}/{len(good)} inliers')
-
-    aligned = _cv2.warpAffine(
-        scan_rgb, M, (w, h),
-        flags=_cv2.INTER_LINEAR,
-        borderMode=_cv2.BORDER_REPLICATE
-    )
     return aligned, inliers
 
 
 def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
-    """
-    Count ORB RANSAC inliers between two grayscale images without warping.
-    Used by detect_form_type() to score form candidates.
-    """
-    orb = _cv2.ORB_create(nfeatures=3000)
     kp1, des1 = orb.detectAndCompute(scan_gray, None)
-    kp2, des2 = orb.detectAndCompute(ref_gray, None)
     if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
         return 0
     matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
     matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
-    good = matches[:max(10, len(matches) // 3)]
     if len(good) < 10:
         return 0
     src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
@@ -776,191 +392,72 @@ def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
     return int(mask.sum()) if mask is not None else 0
 
 
-def _ecc_align(scan_gray: np.ndarray, ref_gray: np.ndarray, scan_rgb: np.ndarray):
     """
-    Align scan_rgb to ref_gray using ECC (Enhanced Correlation Coefficient).
-    Uses MOTION_EUCLIDEAN (translation + rotation) — more robust than ORB for
-    blurry, low-texture, or handwriting-heavy scans where keypoint matching fails.
-    Returns aligned RGB array or None on failure.
-    """
-    try:
-        h, w = ref_gray.shape
-        # Downscale to 500px for speed; scale translation back afterward
-        scale = min(1.0, 500.0 / max(h, w))
-        sh, sw = max(1, int(h * scale)), max(1, int(w * scale))
-        ref_s = _cv2.resize(ref_gray, (sw, sh))
-        scn_s = _cv2.resize(_cv2.resize(scan_gray, (w, h)), (sw, sh))
-
-        warp = np.eye(2, 3, dtype=np.float32)
-        criteria = (_cv2.TERM_CRITERIA_EPS | _cv2.TERM_CRITERIA_COUNT, 100, 1e-4)
-        cc, warp = _cv2.findTransformECC(ref_s, scn_s, warp, _cv2.MOTION_EUCLIDEAN, criteria)
-
-        # Clamp rotation to ±3° to prevent over-tilting
-        angle_rad = np.arctan2(warp[1, 0], warp[0, 0])
-        angle_deg = np.degrees(angle_rad)
-        MAX_ANGLE = 1.0
-        if abs(angle_deg) > MAX_ANGLE:
-            clamped = np.radians(np.clip(angle_deg, -MAX_ANGLE, MAX_ANGLE))
-            warp[0, 0] = np.cos(clamped)
-            warp[0, 1] = -np.sin(clamped)
-            warp[1, 0] = np.sin(clamped)
-            warp[1, 1] = np.cos(clamped)
-            print(f'[align] ECC rotation clamped {angle_deg:.2f}° -> {np.degrees(clamped):.2f}°')
-
-        # Scale translation to full resolution
-        warp[0, 2] /= scale
-        warp[1, 2] /= scale
-
-        scan_full = _cv2.resize(scan_rgb, (w, h))
-        aligned = _cv2.warpAffine(
-            scan_full, warp, (w, h),
-            flags=_cv2.INTER_LINEAR,
-            borderMode=_cv2.BORDER_REPLICATE
-        )
-        print(f'[align] ECC applied (cc={cc:.4f} angle={angle_deg:.2f}° tx={warp[0,2]:.1f} ty={warp[1,2]:.1f})')
-        return aligned
-    except Exception as e:
-        print(f'[align] ECC failed: {e}')
-        return None
-
-
-def align_to_reference(img: Image.Image, form_type: str):
-    """
-    Align a scanned form to its clean reference using a three-stage cascade:
-
-    Stage 1 — ORB feature matching + RANSAC homography (primary).
-        Matches structural features (printed lines, boxes, text layout).
-        Most accurate when the scan has reasonable contrast/sharpness.
-        Returns high confidence (inlier count) used to decide if anchor
-        scan is needed in extract_fields().
-
-    Stage 2 — ECC (Enhanced Correlation Coefficient) EUCLIDEAN.
-        Good for blurry / low-texture / handwriting-heavy scans where ORB
-        finds too few keypoints. Corrects translation + rotation only.
-
-    Stage 3 — Corner perspective correction (fallback).
-        Finds document corners via contour detection. Only works when the
-        page is visible against a background.
-
-    Stage 4 — Resize only (last resort).
-
-    Returns (aligned_image, orb_inliers) where orb_inliers=0 means ORB
-    did not succeed (ECC/corner/resize was used instead).
     """
     if not _CV2_OK:
         return img, 0
-
     ref_path = REFERENCE_IMAGES.get(form_type)
     if not ref_path or not os.path.exists(ref_path):
-        print(f'[align] No reference for form {form_type} at {ref_path}')
         return img, 0
-
     ref_gray = _cv2.imread(ref_path, _cv2.IMREAD_GRAYSCALE)
     if ref_gray is None:
         return img, 0
-
-    scan_rgb = np.array(img.convert('RGB'))
-    scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
     ref_h, ref_w = ref_gray.shape
 
-    scan_gray_rs = _cv2.resize(scan_gray, (ref_w, ref_h))
-    scan_rgb_rs = _cv2.resize(scan_rgb, (ref_w, ref_h))
 
-    # ── Stage 1: ECC (translation + rotation only — no distortion) ────
     print(f'[align] Form {form_type}: trying ECC...')
     aligned = _ecc_align(scan_gray_rs, ref_gray, scan_rgb_rs)
     if aligned is not None:
-        print(f'[align] Form {form_type}: ECC alignment applied')
-        return Image.fromarray(aligned), 25  # return 25 so anchor scan is skipped
 
-    # ── Stage 2: ORB (fallback if ECC fails) ──────────────────
     print(f'[align] Form {form_type}: ECC failed, trying ORB...')
     aligned, inliers = _orb_align(scan_gray_rs, ref_gray, scan_rgb_rs)
     if aligned is not None:
-        print(f'[align] Form {form_type}: ORB applied ({inliers} inliers)')
         return Image.fromarray(aligned), inliers
 
-    # ── Stage 3: corner perspective correction ────────────────
-    print(f'[align] Form {form_type}: ECC failed, trying corner detection...')
-    corners = _find_document_corners(scan_gray)
-    if corners is not None:
-        dst_corners = np.array([
-            [0,     0    ],
-            [ref_w, 0    ],
-            [ref_w, ref_h],
-            [0,     ref_h],
-        ], dtype=np.float32)
-        M = _cv2.getPerspectiveTransform(corners, dst_corners)
-        warped = _cv2.warpPerspective(
-            scan_rgb, M, (ref_w, ref_h),
-            flags=_cv2.INTER_LINEAR,
-            borderMode=_cv2.BORDER_REPLICATE
-        )
-        print(f'[align] Form {form_type}: perspective correction applied')
-        return Image.fromarray(warped), 0
-
-    # ── Stage 4: resize only ──────────────────────────────────
-    print(f'[align] Form {form_type}: all alignment methods failed, resizing only')
     resized = _cv2.resize(scan_rgb, (ref_w, ref_h))
     return Image.fromarray(resized), 0
 
 
 def _deskew(gray: np.ndarray) -> np.ndarray:
-    """Correct slight rotation using Hough line detection."""
     if not _CV2_OK:
         return gray
     edges = _cv2.Canny(gray, 50, 150, apertureSize=3)
-    lines = _cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100,
                              minLineLength=100, maxLineGap=10)
-    if lines is None or len(lines) == 0:
         return gray
-    angles = []
-    for x1, y1, x2, y2 in lines[:, 0]:
-        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
-        if -15 < angle < 15:
-            angles.append(angle)
     if not angles:
         return gray
-    median_angle = float(np.median(angles))
-    if abs(median_angle) < 0.3:
         return gray
     h, w = gray.shape
-    M = _cv2.getRotationMatrix2D((w / 2, h / 2), median_angle, 1.0)
     return _cv2.warpAffine(gray, M, (w, h),
                            flags=_cv2.INTER_CUBIC,
                            borderMode=_cv2.BORDER_REPLICATE)
 
 
-def _enhance_for_ocr(gray: np.ndarray) -> np.ndarray:
-    """CLAHE contrast enhancement + gentle denoising."""
-    if not _CV2_OK:
-        return gray
-    clahe = _cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
-    enhanced = clahe.apply(gray)
-    denoised = _cv2.fastNlMeansDenoising(enhanced, h=10,
-                                         templateWindowSize=7,
-                                         searchWindowSize=21)
-    return denoised
-
-
-def _binarize(gray: np.ndarray) -> np.ndarray:
-    """Adaptive threshold — cleaner black-on-white for OCR."""
-    if not _CV2_OK:
-        return gray
-    return _cv2.adaptiveThreshold(gray, 255,
-                                  _cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-                                  _cv2.THRESH_BINARY, 11, 2)
-
-
 def _preprocess(img: Image.Image) -> Image.Image:
-    """
-    Prepare the full form image for field cropping:
-      - Convert to grayscale
-      - Deskew (correct residual rotation after ORB alignment)
-
-    CLAHE and denoising are applied later per-crop in _ocr(), where
-    they are more effective and don't risk blurring the whole form.
-    """
     if not _CV2_OK:
         return img.convert('L')
     gray = np.array(img.convert('L'))
@@ -969,63 +466,29 @@ def _preprocess(img: Image.Image) -> Image.Image:
 
 
 def _crop_field(img: Image.Image, x1r, y1r, x2r, y2r) -> Image.Image:
-    """Crop a field region using relative coordinates."""
     w, h = img.size
-    x1 = int(x1r * w); y1 = int(y1r * h)
-    x2 = int(x2r * w); y2 = int(y2r * h)
-    # Add small padding for OCR accuracy
-    pad = 4
-    x1 = max(0, x1 - pad); y1 = max(0, y1 - pad)
-    x2 = min(w, x2 + pad); y2 = min(h, y2 + pad)
     return img.crop((x1, y1, x2, y2))
 
 
-def _ocr(crop: Image.Image, config: str = '') -> str:
-    """Run TrOCR large-handwritten on a cropped field image."""
-    processor, model = _get_trocr()
-    if processor is None or model is None:
-        return ''
-    try:
-        import torch
-        rgb = crop.convert('RGB')
-        pixel_values = processor(rgb, return_tensors='pt').pixel_values
-        with torch.no_grad():
-            generated_ids = model.generate(pixel_values)
-        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-    except Exception as e:
-        print(f'[template_matcher] OCR error: {e}')
-        return ''
 
 
 def detect_form_type(image_path: str) -> str:
-    """
-    Auto-detect form type from a scanned image.
-
-    Primary — ORB inlier count:
-        Resize the scan to each reference's dimensions, run ORB feature
-        matching against all 4 reference images, and pick the form type
-        with the most RANSAC inliers. Robust to rotation, brightness
-        differences, and partial fills because it matches structural
-        features (printed lines, boxes, column layout) rather than title
-        text. Works at ~800px for speed.
-
-    Fallback — OCR title:
-        Used when no reference images exist or cv2 is unavailable.
-        Less reliable for rotated / faint / cropped scans.
-
-    Returns '102', '103', '90', or '97'.
-    """
     if _CV2_OK:
         try:
-            img = Image.open(image_path).convert('RGB')
             scan_rgb = np.array(img)
             scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
-
-            best_type = None
-            best_inliers = 0
-            DET_W = 800  # detection width — fast enough, enough detail
-
             for ft, ref_path in REFERENCE_IMAGES.items():
                 if not os.path.exists(ref_path):
@@ -1033,56 +496,50 @@ def detect_form_type(image_path: str) -> str:
                 if ref_gray is None:
                     continue
                 ref_h, ref_w = ref_gray.shape
-                # Resize scan to reference aspect, then both to DET_W
                 sc = min(1.0, DET_W / ref_w)
-                dw, dh = max(1, int(ref_w * sc)), max(1, int(ref_h * sc))
-                ref_ds = _cv2.resize(ref_gray, (dw, dh))
-                scan_ds = _cv2.resize(_cv2.resize(scan_gray, (ref_w, ref_h)), (dw, dh))
-                count = _orb_inliers(scan_ds, ref_ds)
                 print(f'[detect] Form {ft}: {count} ORB inliers')
                 if count > best_inliers:
-                    best_inliers = count
-                    best_type = ft
-
             if best_type and best_inliers >= 15:
-                print(f'[detect] Best match: Form {best_type} ({best_inliers} inliers)')
                 return best_type
-
-            print(f'[detect] ORB inconclusive (best={best_inliers}), falling back to OCR title')
-
         except Exception as e:
             print(f'[template_matcher] detect_form_type ORB error: {e}')
-
-    # ── OCR title fallback ────────────────────────────────────
     try:
         img_l = Image.open(image_path).convert('L')
         w, h = img_l.size
-        title_crop = img_l.crop((0, int(h * 0.04), w, int(h * 0.15)))
-        title = _ocr(title_crop).upper()
         if title:
             if 'LIVE BIRTH' in title or ('BIRTH' in title
                     and 'DEATH' not in title and 'MARRIAGE' not in title):
                 return '102'
-            elif 'DEATH' in title:
                 return '103'
-            elif 'MARRIAGE' in title and 'LICENSE' in title:
                 return '90'
-            elif 'MARRIAGE' in title:
                 return '97'
-        print(f'[template_matcher] Could not detect form type; defaulting to 102. '
-              f'Title: {title[:80] if title else "(empty)"}')
    except Exception as e:
        print(f'[template_matcher] detect_form_type OCR error: {e}')
    return '102'
 
 
 def extract_fields(image_path: str, form_type: str) -> dict:
     """
-    Main entry point.
 
     Args:
-        image_path: Path to the uploaded form image (PNG/JPG/PDF page)
-        form_type: '102', '103', '90', or '97'
 
     Returns:
         dict of { field_name: extracted_text }
@@ -1091,168 +548,135 @@ def extract_fields(image_path: str, form_type: str) -> dict:
     if template is None:
         print(f'[template_matcher] No template for form type: {form_type}')
         return {}
-
-    if _get_trocr()[0] is None:
-        print('[template_matcher] TrOCR not available — returning empty fields')
         return {}
-
-
-    # Load and preprocess
     try:
         img = Image.open(image_path).convert('RGB')
     except Exception as e:
         print(f'[template_matcher] Cannot open image: {e}')
         return {}
 
-    # Align to reference before cropping (fixes scan offset/rotation)
-    # orb_inliers > 0 means ORB succeeded — absolute coords are reliable.
     img, orb_inliers = align_to_reference(img, form_type)
-    processed = _preprocess(img)
-
-    # ── One-time full-form scan for anchor detection ──────────
-    # When ORB aligned with high confidence (inliers >= 25), absolute
-    # coordinates are accurate and the expensive full-page OCR scan can
-    # be skipped. Below that threshold, anchors improve robustness.
-    anchor_defs = ANCHOR_TEMPLATES.get(form_type, {})
-    detections = []
-    if anchor_defs and orb_inliers < 25:
-        print(f'[template_matcher] ORB inliers={orb_inliers} — scanning form for anchors...')
-        detections = _scan_form_text(img)  # use colour/original for label scan
-        print(f'[template_matcher] Found {len(detections)} text regions in form')
-    elif anchor_defs:
-        print(f'[template_matcher] ORB inliers={orb_inliers} >= 25 — skipping anchor scan')
 
     form_w, form_h = img.size
-    anchor_hits = 0
-
-    # ── Collect all crops first, then batch-infer in one pass ─
-    field_names = []
-    crops = []
 
     for field_name, coords in template.items():
-        crop = None
-
-        # ── Try anchor-based crop first ───────────────────────
-        adef = anchor_defs.get(field_name)
-        if adef and detections:
-            bbox = _find_anchor_bbox(
-                detections, adef['labels'], adef['search'], form_w, form_h
-            )
-            if bbox is not None:
-                crop = _crop_from_anchor(
-                    processed, bbox,
-                    adef['side'], adef['dx'], adef['dy'],
-                    adef['dw'], adef['dh']
-                )
-                if crop is not None:
-                    anchor_hits += 1
-
-        # ── Fallback: absolute coordinate crop ────────────────
-        if crop is None:
-            x1r, y1r, x2r, y2r, cfg = coords
-            crop = _crop_field(processed, x1r, y1r, x2r, y2r)
-
         field_names.append(field_name)
         crops.append(crop)
 
-    fields = {}
-    for field_name, crop in zip(field_names, crops):
         text = _postprocess(_ocr(crop), field_name)
         if text:
             fields[field_name] = text
 
-    if anchor_defs:
-        print(f'[template_matcher] Anchor hits: {anchor_hits}/{len(anchor_defs)} defined')
-    print(f'[template_matcher] Extracted {len(fields)}/{len(template)} fields from form {form_type}')
     return fields
 
 
-def pdf_to_image(pdf_path: str, page: int = 0) -> str:
-    """
-    Convert a PDF page to a PNG image for processing.
-    Returns path to the saved PNG, or None on failure.
-    Requires: pip install pdf2image + poppler installed
-    """
-    try:
-        from pdf2image import convert_from_path
-        pages = convert_from_path(pdf_path, dpi=150)
-        if not pages:
-            return None
-        out_path = pdf_path.replace('.pdf', f'_page{page}.png')
-        pages[page].save(out_path, 'PNG')
-        return out_path
-    except ImportError:
-        print('[template_matcher] pdf2image not installed. Run: pip install pdf2image')
-        return None
-    except Exception as e:
-        print(f'[template_matcher] PDF conversion failed: {e}')
-        return None
-
 
-def debug_draw_boxes(image_path: str, form_type: str, out_path: str = None):
     """
-    Draw all field bounding boxes on the ALIGNED image and save it.
-    Uses the same alignment (ORB → corner → resize) as extract_fields(),
-    so the boxes reflect where coordinates actually land during extraction.
 
-    Usage: python template_matcher.py myform.png 102
     """
     from PIL import ImageDraw, ImageFont
 
     template = TEMPLATES.get(form_type)
     if not template:
         print(f'No template for {form_type}')
-        return
 
-    img = Image.open(image_path).convert('RGB')
-    img, _ = align_to_reference(img, form_type)  # ← align first
-    draw = ImageDraw.Draw(img)
     w, h = img.size
 
     try:
-        font = ImageFont.truetype('C:/Windows/Fonts/arial.ttf', 12)
-    except:
-        font = ImageFont.load_default()
 
-    colors = ['#e53935','#1e88e5','#43a047','#fb8c00','#8e24aa','#00acc1']
-    for idx, (field_name, coords) in enumerate(template.items()):
         x1r, y1r, x2r, y2r, _ = coords
-        x1 = int(x1r * w); y1 = int(y1r * h)
-        x2 = int(x2r * w); y2 = int(y2r * h)
-        color = colors[idx % len(colors)]
-        draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
 
     base, ext = os.path.splitext(image_path)
     out = out_path or f'{base}_debug_{form_type}{ext}'
     img.save(out)
     print(f'[template_matcher] Debug image saved: {out}')
     return out
 
 
-# ── CLI ────────────────────────────────────────────────────────
 
 if __name__ == '__main__':
     if len(sys.argv) < 3:
-        print('Usage: python template_matcher.py <image_path> <form_type>')
         print('  form_type: 102 | 103 | 90 | 97')
-        print('Example: python template_matcher.py form102_sample1.png 102')
         sys.exit(1)
 
     img_path = sys.argv[1]
     form_type = sys.argv[2]
 
-    # Draw boxes on the aligned image (same as what extraction sees)
-    out = debug_draw_boxes(img_path, form_type)
-    print(f'Open {out} to verify box positions on the aligned image.\n')
 
-    # Extract and print fields
     result = extract_fields(img_path, form_type)
-    print(f'\nExtracted fields ({len(result)}):')
     for k, v in result.items():
-        print(f'  {k:<35} = {v}')
-    # Show which fields got nothing
     template = TEMPLATES.get(form_type, {})
-    missing = [k for k in template if k not in result]
     if missing:
         print(f'\nEmpty fields ({len(missing)}):')
         for k in missing:
-            print(f'  {k}')
 """
+template_matcher.py  (v3 — pytesseract removed)
+================================================
+Extracts field values from Philippine civil registry scanned forms.
+
+WHAT CHANGED FROM v2
+---------------------
+1. pytesseract removed entirely.
+2. _scan_form_text() now uses CV2 contour/MSER detection to find
+   candidate text regions, then reads each region with TrOCR
+   (the same model already loaded for field OCR).
+3. Anchor label matching (fuzzy SequenceMatcher) unchanged.
+4. No new dependencies — everything already required by the project.
 """
 
+import os
+import sys
+import re as _re
+
 import numpy as np
 from PIL import Image
 
 except ImportError:
     _CV2_OK = False
 
+# ── Reference images ─────────────────────────────────────────────
 _REF_DIR = os.path.join(os.path.dirname(__file__), 'references')
 REFERENCE_IMAGES = {
+    '102': os.path.join(_REF_DIR, 'reference_102.jpg'),
     '103': os.path.join(_REF_DIR, 'reference_103.png'),
+    '90':  os.path.join(_REF_DIR, 'reference_90.png'),
+    '97':  os.path.join(_REF_DIR, 'reference_97.jpg'),
 }
 
+# ── CRNN+CTC engine ──────────────────────────────────────────────
+import sys as _sys
+_CRNN_DIR = os.path.join(os.path.dirname(__file__), 'CRNN+CTC')
+if _CRNN_DIR not in _sys.path:
+    _sys.path.insert(0, _CRNN_DIR)
+
+_CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'best_model.pth')
+_crnn_ocr = None
+_crnn_decode = None   # reference to decode_ctc_predictions
+
+
+def _get_crnn():
+    global _crnn_ocr, _crnn_decode
+    if _crnn_ocr is None:
         try:
             import torch
+            from inference import CivilRegistryOCR
+            from utils import decode_ctc_predictions as _dcp
+            print('[template_matcher] Loading CRNN+CTC model...')
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+            _crnn_ocr = CivilRegistryOCR(
+                checkpoint_path=_CRNN_CHECKPOINT,
+                device=device,
+                mode='adaptive',
+            )
+            _crnn_decode = _dcp
+            print('[template_matcher] CRNN+CTC ready.')
         except Exception as e:
+            print(f'[template_matcher] CRNN+CTC load error: {e}')
+    return _crnn_ocr
+
+
+def _crnn_read(crop_img: Image.Image) -> str:
+    """Run CRNN+CTC on a PIL Image crop and return the decoded string."""
+    ocr = _get_crnn()
+    if ocr is None or _crnn_decode is None:
+        return ''
+    try:
+        import torch
+        rgb = np.array(crop_img.convert('RGB'))
+        bgr = rgb[:, :, ::-1].copy()
+        normalized = ocr.normalizer.normalize(bgr)
+        tensor = torch.FloatTensor(
+            normalized.astype(np.float32) / 255.0
+        ).unsqueeze(0).unsqueeze(0).to(ocr.device)
+        with torch.no_grad():
+            outputs = ocr.model(tensor)
+        decoded = _crnn_decode(outputs.cpu(), ocr.idx_to_char, method='greedy')
+        return decoded[0].strip()
+    except Exception as e:
+        print(f'[template_matcher] CRNN+CTC read error: {e}')
+        return ''
 
+ # Hint constants (kept for template dict compatibility)
93
  _LINE = 'line'
94
  _BLOCK = 'block'
95
  _WORD = 'word'
96
 
97
+ # ── Post-processing ───────────────────────────────────────────────
 
 
 
98
  _SEX_KEYWORDS = {
99
  'female': 'FEMALE', 'fem': 'FEMALE', 'f': 'FEMALE',
100
+ 'male': 'MALE', 'm': 'MALE',
101
  }
 
 
102
  _FIELD_TYPE = {
 
103
  'sex': 'sex', 'groom_sex': 'sex', 'bride_sex': 'sex',
104
  'husband_sex': 'sex', 'wife_sex': 'sex',
 
105
  'dob_year': 'year',
 
106
  'age': 'digits', 'groom_age': 'digits', 'bride_age': 'digits',
107
  'husband_age': 'digits', 'wife_age': 'digits', 'dob_day': 'digits',
 
108
  'registration_date': 'date', 'marriage_date': 'date',
109
+ 'date_of_marriage': 'date', 'date_of_death': 'date',
110
+ 'date_of_birth': 'date', 'date_issued': 'date',
111
  'groom_dob': 'date', 'bride_dob': 'date',
112
  'husband_dob': 'date', 'wife_dob': 'date',
 
113
  'registry_no': 'registry', 'marriage_license_no': 'registry',
114
  }
115
 
 
116
  def _postprocess(text: str, field_name: str) -> str:
117
  text = text.strip()
118
  if not text:
119
  return text
120
  rule = _FIELD_TYPE.get(field_name)
 
121
  if rule == 'sex':
122
  tl = text.lower()
 
123
  for kw in sorted(_SEX_KEYWORDS, key=len, reverse=True):
124
  if kw in tl:
125
  return _SEX_KEYWORDS[kw]
126
  return text
 
127
  if rule == 'year':
128
  m = _re.search(r'(19|20)\d{2}', text)
129
  if m:
130
  return m.group(0)
131
  digits = _re.sub(r'\D', '', text)
132
  return digits[:4] if len(digits) >= 4 else text
 
133
  if rule == 'digits':
134
  d = _re.sub(r'\D', '', text)
135
  return d if d else text
 
136
  if rule == 'date':
137
  return _re.sub(r'[^\w\s\-/,.]', '', text).strip()
 
138
  if rule == 'registry':
139
  return _re.sub(r'[^\w\s\-/]', '', text).strip()
 
140
  return text
141
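Worked examples of the rules above, each following directly from the code:

    assert _postprocess('Fem.', 'sex') == 'FEMALE'            # longest keyword matched first
    assert _postprocess('born 1987!', 'dob_year') == '1987'   # 19xx/20xx regex wins
    assert _postprocess('2 4', 'age') == '24'                 # digits rule strips non-digits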
142
 
143
+ # ── Absolute-coordinate templates ─────────────────────────────────
144
+ # (x1, y1, x2, y2, hint): all values are fractions 0.0–1.0
145
  TEMPLATES = {
146
+ '102': {
147
+ 'province': (0.183, 0.110, 0.582, 0.128, _LINE),
148
+ 'registry_no': (0.617, 0.121, 0.900, 0.149, _LINE),
149
+ 'city_municipality': (0.224, 0.134, 0.631, 0.150, _LINE),
150
  'name_first': (0.169, 0.161, 0.453, 0.181, _LINE),
151
  'name_middle': (0.450, 0.161, 0.674, 0.181, _LINE),
152
+ 'name_last': (0.674, 0.162, 0.935, 0.182, _LINE),
153
+ 'sex': (0.126, 0.195, 0.335, 0.210, _WORD),
154
+ 'dob_day': (0.445, 0.193, 0.562, 0.210, _WORD),
155
+ 'dob_month': (0.560, 0.193, 0.731, 0.211, _LINE),
156
+ 'dob_year': (0.735, 0.197, 0.883, 0.213, _WORD),
157
+ 'place_of_birth': (0.383, 0.227, 0.890, 0.245, _LINE),
158
+ 'type_of_birth': (0.124, 0.263, 0.328, 0.282, _WORD),
159
+ 'birth_order': (0.542, 0.272, 0.742, 0.285, _WORD),
160
+ 'weight_at_birth': (0.757, 0.258, 0.839, 0.287, _WORD),
161
+ 'mother_name': (0.217, 0.299, 0.894, 0.320, _LINE),
162
+ 'mother_citizenship': (0.125, 0.329, 0.506, 0.351, _LINE),
163
+ 'mother_religion': (0.508, 0.332, 0.901, 0.351, _LINE),
164
+ 'mother_occupation': (0.511, 0.363, 0.750, 0.385, _LINE),
165
+ 'mother_age_at_birth': (0.758, 0.371, 0.888, 0.390, _WORD),
166
+ 'mother_residence': (0.211, 0.405, 0.936, 0.425, _LINE),
167
+ 'father_name': (0.200, 0.436, 0.894, 0.456, _LINE),
168
+ 'father_citizenship': (0.128, 0.465, 0.318, 0.487, _LINE),
169
+ 'father_religion': (0.328, 0.467, 0.550, 0.490, _LINE),
170
+ 'father_occupation': (0.543, 0.466, 0.754, 0.496, _LINE),
171
+ 'father_age_at_birth': (0.752, 0.476, 0.902, 0.496, _WORD),
172
+ 'father_residence': (0.216, 0.508, 0.949, 0.527, _LINE),
173
+ 'marriage_date': (0.092, 0.556, 0.413, 0.573, _LINE),
174
+ 'marriage_place': (0.400, 0.554, 0.922, 0.571, _LINE),
175
+ 'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
176
  },
177
  '103': {
178
+ 'province': (0.182, 0.076, 0.581, 0.094, _LINE),
179
+ 'registry_no': (0.649, 0.088, 0.937, 0.123, _LINE),
180
+ 'city_municipality': (0.222, 0.097, 0.629, 0.113, _LINE),
181
+ 'deceased_name': (0.105, 0.139, 0.739, 0.173, _LINE),
182
+ 'sex': (0.735, 0.137, 0.931, 0.170, _WORD),
183
+ 'date_of_death': (0.123, 0.189, 0.316, 0.216, _LINE),
184
+ 'date_of_birth': (0.319, 0.187, 0.567, 0.214, _LINE),
185
+ 'age': (0.573, 0.198, 0.717, 0.214, _WORD),
186
+ 'place_of_death': (0.096, 0.227, 0.727, 0.251, _LINE),
187
+ 'civil_status': (0.709, 0.233, 0.935, 0.257, _WORD),
188
+ 'religion': (0.092, 0.268, 0.324, 0.295, _LINE),
189
+ 'citizenship': (0.324, 0.270, 0.522, 0.295, _LINE),
190
+ 'residence': (0.519, 0.271, 0.936, 0.297, _LINE),
191
+ 'occupation': (0.095, 0.311, 0.292, 0.330, _LINE),
192
+ 'father_name': (0.295, 0.306, 0.614, 0.334, _LINE),
193
+ 'mother_name': (0.615, 0.312, 0.938, 0.332, _LINE),
194
+ 'cause_immediate': (0.312, 0.372, 0.961, 0.384, _LINE),
195
+ 'cause_antecedent': (0.320, 0.383, 0.973, 0.402, _LINE),
196
+ 'cause_underlying': (0.311, 0.406, 0.839, 0.424, _LINE),
197
+ 'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
198
  },
 
 
199
  '90': {
200
+ 'province': (0.208, 0.099, 0.607, 0.117, _LINE),
201
+ 'registry_no': (0.641, 0.104, 0.924, 0.132, _LINE),
202
+ 'city_municipality': (0.231, 0.113, 0.638, 0.129, _LINE),
203
+ 'marriage_license_no': (0.673, 0.132, 0.928, 0.150, _LINE),
204
+ 'date_issued': (0.775, 0.150, 0.932, 0.168, _LINE),
205
+ 'groom_name_first': (0.170, 0.294, 0.483, 0.309, _LINE),
206
+ 'groom_name_middle': (0.176, 0.308, 0.485, 0.320, _LINE),
207
+ 'groom_name_last': (0.174, 0.319, 0.486, 0.333, _LINE),
208
+ 'bride_name_first': (0.622, 0.292, 0.937, 0.306, _LINE),
209
+ 'bride_name_middle': (0.622, 0.306, 0.928, 0.319, _LINE),
210
+ 'bride_name_last': (0.621, 0.319, 0.929, 0.334, _LINE),
211
+ 'groom_dob': (0.152, 0.348, 0.394, 0.369, _LINE),
212
+ 'groom_age': (0.400, 0.345, 0.474, 0.371, _WORD),
213
+ 'bride_dob': (0.576, 0.345, 0.853, 0.365, _LINE),
214
+ 'bride_age': (0.851, 0.346, 0.932, 0.369, _WORD),
215
+ 'groom_place_of_birth': (0.136, 0.371, 0.472, 0.400, _LINE),
216
+ 'bride_place_of_birth': (0.585, 0.377, 0.921, 0.400, _LINE),
217
+ 'groom_sex': (0.135, 0.408, 0.267, 0.425, _WORD),
218
+ 'groom_citizenship': (0.268, 0.407, 0.477, 0.425, _LINE),
219
+ 'bride_sex': (0.574, 0.408, 0.708, 0.424, _WORD),
220
+ 'bride_citizenship': (0.720, 0.408, 0.917, 0.427, _LINE),
221
+ 'groom_residence': (0.140, 0.436, 0.472, 0.463, _LINE),
222
+ 'bride_residence': (0.577, 0.434, 0.922, 0.463, _LINE),
223
+ 'groom_religion': (0.135, 0.465, 0.472, 0.494, _LINE),
224
+ 'bride_religion': (0.584, 0.463, 0.920, 0.486, _LINE),
225
+ 'groom_civil_status': (0.135, 0.492, 0.471, 0.517, _WORD),
226
+ 'bride_civil_status': (0.585, 0.491, 0.924, 0.513, _WORD),
227
+ 'groom_father_name': (0.133, 0.647, 0.477, 0.672, _LINE),
228
+ 'groom_father_citizenship':(0.141, 0.669, 0.475, 0.695, _LINE),
229
+ 'bride_father_name': (0.580, 0.646, 0.923, 0.666, _LINE),
230
+ 'bride_father_citizenship':(0.578, 0.667, 0.916, 0.689, _LINE),
231
+ 'groom_mother_name': (0.139, 0.733, 0.474, 0.762, _LINE),
232
+ 'groom_mother_citizenship':(0.135, 0.763, 0.480, 0.779, _LINE),
233
+ 'bride_mother_name': (0.584, 0.736, 0.914, 0.758, _LINE),
234
+ 'bride_mother_citizenship':(0.579, 0.758, 0.924, 0.780, _LINE),
235
  },
 
 
236
  '97': {
237
+ 'province': (0.196, 0.093, 0.595, 0.111, _LINE),
238
+ 'registry_no': (0.771, 0.095, 0.969, 0.130, _LINE),
239
+ 'city_municipality': (0.197, 0.119, 0.604, 0.135, _LINE),
240
+ 'husband_name_first': (0.257, 0.158, 0.572, 0.173, _LINE),
241
+ 'husband_name_middle': (0.251, 0.180, 0.562, 0.192, _LINE),
242
+ 'husband_name_last': (0.254, 0.201, 0.576, 0.216, _LINE),
243
+ 'wife_name_first': (0.649, 0.158, 0.953, 0.171, _LINE),
244
+ 'wife_name_middle': (0.649, 0.180, 0.950, 0.195, _LINE),
245
+ 'wife_name_last': (0.651, 0.202, 0.968, 0.214, _LINE),
246
+ 'husband_dob': (0.205, 0.231, 0.493, 0.248, _LINE),
247
+ 'husband_age': (0.500, 0.233, 0.557, 0.249, _WORD),
248
+ 'wife_dob': (0.603, 0.234, 0.889, 0.245, _LINE),
249
+ 'wife_age': (0.901, 0.233, 0.961, 0.247, _WORD),
250
+ 'husband_place_of_birth': (0.193, 0.262, 0.573, 0.278, _LINE),
251
+ 'wife_place_of_birth': (0.595, 0.263, 0.963, 0.279, _LINE),
252
+ 'husband_sex': (0.221, 0.288, 0.309, 0.308, _WORD),
253
+ 'wife_sex': (0.616, 0.285, 0.711, 0.305, _WORD),
254
+ 'husband_citizenship': (0.323, 0.295, 0.567, 0.312, _LINE),
255
+ 'wife_citizenship': (0.722, 0.296, 0.963, 0.313, _LINE),
256
+ 'husband_residence': (0.190, 0.325, 0.563, 0.362, _LINE),
257
+ 'wife_residence': (0.590, 0.326, 0.961, 0.361, _LINE),
258
+ 'husband_religion': (0.190, 0.366, 0.567, 0.383, _LINE),
259
+ 'wife_religion': (0.582, 0.362, 0.959, 0.383, _LINE),
260
+ 'husband_civil_status': (0.189, 0.397, 0.572, 0.415, _WORD),
261
+ 'wife_civil_status': (0.588, 0.398, 0.956, 0.414, _WORD),
262
+ 'husband_father_name': (0.191, 0.428, 0.574, 0.445, _LINE),
263
+ 'wife_father_name': (0.586, 0.429, 0.958, 0.446, _LINE),
264
+ 'husband_father_citizenship': (0.184, 0.451, 0.569, 0.467, _LINE),
265
+ 'wife_father_citizenship': (0.588, 0.449, 0.947, 0.465, _LINE),
266
+ 'husband_mother_name': (0.176, 0.481, 0.563, 0.498, _LINE),
267
+ 'wife_mother_name': (0.586, 0.480, 0.940, 0.497, _LINE),
268
+ 'husband_mother_citizenship': (0.191, 0.501, 0.573, 0.517, _LINE),
269
+ 'wife_mother_citizenship': (0.590, 0.501, 0.971, 0.517, _LINE),
270
+ 'place_of_marriage': (0.196, 0.650, 0.958, 0.664, _LINE),
271
+ 'date_of_marriage': (0.199, 0.678, 0.548, 0.692, _LINE),
272
+ 'time_of_marriage': (0.765, 0.680, 0.917, 0.696, _LINE),
273
+ 'registration_date': (0.635, 0.717, 0.919, 0.736, _LINE),
274
  },
275
  }
276
 
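The fractions become pixels only after the scan is aligned and resized to the
reference. For example, on a hypothetical 2480x3508 px aligned page, the Form
102 'province' box lands at:

    w, h = 2480, 3508                                  # hypothetical aligned page size
    x1, y1, x2, y2 = 0.183, 0.110, 0.582, 0.128        # TEMPLATES['102']['province'][:4]
    print(int(x1*w), int(y1*h), int(x2*w), int(y2*h))  # 453 385 1443 449 (before padding)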
277
 
278
+ # ── Alignment helpers ─────────────────────────────────────────────
279
 
280
+ def _order_corners(pts: np.ndarray) -> np.ndarray:
281
+ s = pts.sum(axis=1)
282
+ d = np.diff(pts, axis=1).flatten()
283
+ return np.array([
284
+ pts[np.argmin(s)],
285
+ pts[np.argmin(d)],
286
+ pts[np.argmax(s)],
287
+ pts[np.argmax(d)],
288
+ ], dtype=np.float32)
289
 
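The sum/diff trick orders an arbitrary quadrilateral as top-left, top-right,
bottom-right, bottom-left: x+y is minimal at the top-left and maximal at the
bottom-right, while y-x is minimal at the top-right and maximal at the
bottom-left. A quick check:

    pts = np.array([[10, 90], [90, 90], [90, 10], [10, 10]], np.float32)  # scrambled
    print(_order_corners(pts))   # (10,10) (90,10) (90,90) (10,90) -> tl, tr, br, bl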
290
 
291
+ def _correct_perspective(scan_rgb: np.ndarray, ref_w: int, ref_h: int) -> np.ndarray:
292
+ if not _CV2_OK:
293
+ return scan_rgb
294
+ gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
295
+ kernel = _cv2.getStructuringElement(_cv2.MORPH_RECT, (5, 5))
296
+ blur = _cv2.GaussianBlur(gray, (7, 7), 0)
 
297
  _, thresh = _cv2.threshold(blur, 0, 255, _cv2.THRESH_BINARY + _cv2.THRESH_OTSU)
298
+ dilated = _cv2.dilate(thresh, kernel, iterations=2)
299
+ contours, _ = _cv2.findContours(dilated, _cv2.RETR_EXTERNAL, _cv2.CHAIN_APPROX_SIMPLE)
 
300
  if not contours:
301
+ return scan_rgb
302
+ c = max(contours, key=_cv2.contourArea)
 
 
303
  area = _cv2.contourArea(c)
304
+ if area < 0.30 * gray.shape[0] * gray.shape[1]:
305
+ print('[align] perspective: contour too small, skipping')
306
+ return scan_rgb
307
+ peri = _cv2.arcLength(c, True)
308
  approx = _cv2.approxPolyDP(c, 0.02 * peri, True)
 
309
  if len(approx) != 4:
310
+ print(f'[align] perspective: {len(approx)} corners (need 4), skipping')
311
+ return scan_rgb
312
+ src = _order_corners(approx.reshape(4, 2).astype(np.float32))
313
+ dst = np.array([[0, 0], [ref_w, 0], [ref_w, ref_h], [0, ref_h]], np.float32)
314
+ M = _cv2.getPerspectiveTransform(src, dst)
315
+ warped = _cv2.warpPerspective(
316
+ scan_rgb, M, (ref_w, ref_h),
317
+ flags=_cv2.INTER_LINEAR, borderMode=_cv2.BORDER_REPLICATE)
318
+ print('[align] perspective correction applied')
319
+ return warped
320
+
321
+
322
+ def _ecc_align(scan_gray: np.ndarray, ref_gray: np.ndarray,
323
+ scan_rgb: np.ndarray) -> np.ndarray | None:
324
+ try:
325
+ h, w = ref_gray.shape
326
+ scale = min(1.0, 500.0 / max(h, w))
327
+ sh, sw = max(1, int(h * scale)), max(1, int(w * scale))
328
+ ref_s = _cv2.resize(ref_gray, (sw, sh))
329
+ scn_s = _cv2.resize(_cv2.resize(scan_gray, (w, h)), (sw, sh))
330
+ warp = np.eye(2, 3, dtype=np.float32)
331
+ criteria = (_cv2.TERM_CRITERIA_EPS | _cv2.TERM_CRITERIA_COUNT, 100, 1e-4)
332
+ cc, warp = _cv2.findTransformECC(ref_s, scn_s, warp, _cv2.MOTION_EUCLIDEAN, criteria)
333
+ angle = np.degrees(np.arctan2(warp[1, 0], warp[0, 0]))
334
+ if abs(angle) > 1.0:
335
+ clamped = np.radians(np.clip(angle, -1.0, 1.0))
336
+ warp[0, 0] = np.cos(clamped); warp[0, 1] = -np.sin(clamped)
337
+ warp[1, 0] = np.sin(clamped); warp[1, 1] = np.cos(clamped)
338
+ warp[0, 2] /= scale; warp[1, 2] /= scale
339
+ scan_full = _cv2.resize(scan_rgb, (w, h))
340
+ aligned = _cv2.warpAffine(scan_full, warp, (w, h),
341
+ flags=_cv2.INTER_LINEAR,
342
+ borderMode=_cv2.BORDER_REPLICATE)
343
+ print(f'[align] ECC applied (cc={cc:.4f} angle={angle:.2f}°)')
344
+ return aligned
345
+ except Exception as e:
346
+ print(f'[align] ECC failed: {e}')
347
  return None
348
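For intuition: ECC fits a Euclidean (rotation + translation) warp on a proxy no
larger than 500 px, clamps implausible rotations to +/-1 degree, and rescales
only the translation back to full resolution, since the rotation angle is
scale-invariant. The core call, stripped to essentials (file names hypothetical):

    ref = _cv2.imread('references/reference_103.png', _cv2.IMREAD_GRAYSCALE)
    scan = _cv2.resize(_cv2.imread('scan.png', _cv2.IMREAD_GRAYSCALE),
                       (ref.shape[1], ref.shape[0]))
    warp = np.eye(2, 3, dtype=np.float32)
    criteria = (_cv2.TERM_CRITERIA_EPS | _cv2.TERM_CRITERIA_COUNT, 100, 1e-4)
    cc, warp = _cv2.findTransformECC(ref, scan, warp, _cv2.MOTION_EUCLIDEAN, criteria)
    print(cc)   # correlation coefficient; closer to 1.0 means a better fit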
349
 
350
+ def _orb_align(scan_gray: np.ndarray, ref_gray: np.ndarray,
351
+ scan_rgb: np.ndarray) -> tuple[np.ndarray | None, int]:
352
+ h, w = scan_gray.shape
353
  ref_resized = _cv2.resize(ref_gray, (w, h))
354
+ orb = _cv2.ORB_create(nfeatures=5000)
355
+ kp1, des1 = orb.detectAndCompute(scan_gray, None)
356
+ kp2, des2 = orb.detectAndCompute(ref_resized, None)
 
 
357
  if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
358
  return None, 0
 
359
  matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
360
  matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
361
+ good = matches[:max(10, len(matches) // 3)]  # keep the best third of matches (at least 10)
 
 
362
  if len(good) < 10:
363
  return None, 0
 
364
  src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
365
  dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
 
366
  M, mask = _cv2.estimateAffinePartial2D(
367
+ src_pts, dst_pts, method=_cv2.RANSAC, ransacReprojThreshold=5.0)
 
368
  if M is None:
369
  return None, 0
 
370
  inliers = int(mask.sum()) if mask is not None else 0
371
+ aligned = _cv2.warpAffine(scan_rgb, M, (w, h),
372
+ flags=_cv2.INTER_LINEAR,
373
+ borderMode=_cv2.BORDER_REPLICATE)
374
+ print(f'[align] ORB applied ({inliers} inliers)')
375
  return aligned, inliers
376
 
377
 
378
  def _orb_inliers(scan_gray: np.ndarray, ref_gray: np.ndarray) -> int:
379
+ orb = _cv2.ORB_create(nfeatures=3000)
380
  kp1, des1 = orb.detectAndCompute(scan_gray, None)
381
+ kp2, des2 = orb.detectAndCompute(ref_gray, None)
382
  if des1 is None or des2 is None or len(kp1) < 10 or len(kp2) < 10:
383
  return 0
384
  matcher = _cv2.BFMatcher(_cv2.NORM_HAMMING, crossCheck=True)
385
  matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
386
+ good = matches[:max(10, len(matches) // 3)]
387
  if len(good) < 10:
388
  return 0
389
  src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
 dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
 M, mask = _cv2.estimateAffinePartial2D(
 src_pts, dst_pts, method=_cv2.RANSAC, ransacReprojThreshold=5.0)
392
  return int(mask.sum()) if mask is not None else 0
393
 
394
 
395
+ def align_to_reference(img: Image.Image, form_type: str) -> tuple[Image.Image, int]:
396
  """
397
+ Four-stage alignment cascade:
398
+ Stage 0 β€” Perspective correction
399
+ Stage 1 β€” ECC EUCLIDEAN
400
+ Stage 2 β€” ORB RANSAC affine
401
+ Stage 3 β€” Resize only
402
+ Returns (aligned_image, orb_inlier_count).
403
  """
404
  if not _CV2_OK:
405
  return img, 0
 
406
  ref_path = REFERENCE_IMAGES.get(form_type)
407
  if not ref_path or not os.path.exists(ref_path):
408
+ print(f'[align] No reference for form {form_type}')
409
  return img, 0
 
410
  ref_gray = _cv2.imread(ref_path, _cv2.IMREAD_GRAYSCALE)
411
  if ref_gray is None:
412
  return img, 0
 
 
 
413
  ref_h, ref_w = ref_gray.shape
414
+ scan_rgb = np.array(img.convert('RGB'))
415
 
416
+ scan_rgb = _correct_perspective(scan_rgb, ref_w, ref_h)
417
+ scan_rgb_rs = _cv2.resize(scan_rgb, (ref_w, ref_h))
418
+ scan_gray_rs = _cv2.cvtColor(scan_rgb_rs, _cv2.COLOR_RGB2GRAY)
419
 
 
420
  print(f'[align] Form {form_type}: trying ECC...')
421
  aligned = _ecc_align(scan_gray_rs, ref_gray, scan_rgb_rs)
422
  if aligned is not None:
423
+ return Image.fromarray(aligned), 25
 
424
 
 
425
  print(f'[align] Form {form_type}: ECC failed, trying ORB...')
426
  aligned, inliers = _orb_align(scan_gray_rs, ref_gray, scan_rgb_rs)
427
  if aligned is not None:
 
428
  return Image.fromarray(aligned), inliers
429
 
430
+ print(f'[align] Form {form_type}: all alignment failed, resizing only')
431
  resized = _cv2.resize(scan_rgb, (ref_w, ref_h))
432
  return Image.fromarray(resized), 0
433
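Typical call, with a hypothetical upload path:

    page = Image.open('uploads/scan_102.jpg').convert('RGB')   # hypothetical path
    aligned, score = align_to_reference(page, '102')
    print(aligned.size, score)   # reference dimensions; score 0 means resize-only fallback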
 
434
 
435
+ # ── Image preprocessing ───────────────────────────────────────────
436
+
437
  def _deskew(gray: np.ndarray) -> np.ndarray:
 
438
  if not _CV2_OK:
439
  return gray
440
  edges = _cv2.Canny(gray, 50, 150, apertureSize=3)
441
+ lines = _cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
442
  minLineLength=100, maxLineGap=10)
443
+ if lines is None:
444
  return gray
445
+ angles = [np.degrees(np.arctan2(y2-y1, x2-x1))
446
+ for x1, y1, x2, y2 in lines[:, 0]
447
+ if -15 < np.degrees(np.arctan2(y2-y1, x2-x1)) < 15]
 
 
448
  if not angles:
449
  return gray
450
+ angle = float(np.median(angles))
451
+ if abs(angle) < 0.3:
452
  return gray
453
  h, w = gray.shape
454
+ M = _cv2.getRotationMatrix2D((w/2, h/2), angle, 1.0)
455
  return _cv2.warpAffine(gray, M, (w, h),
456
  flags=_cv2.INTER_CUBIC,
457
  borderMode=_cv2.BORDER_REPLICATE)
458
 
459
460
  def _preprocess(img: Image.Image) -> Image.Image:
461
  if not _CV2_OK:
462
  return img.convert('L')
463
  gray = np.array(img.convert('L'))
 gray = _deskew(gray)
 return Image.fromarray(gray)
466
 
467
 
468
  def _crop_field(img: Image.Image, x1r, y1r, x2r, y2r) -> Image.Image:
 
469
  w, h = img.size
470
+ pad = 4
471
+ x1 = max(0, int(x1r * w) - pad); y1 = max(0, int(y1r * h) - pad)
472
+ x2 = min(w, int(x2r * w) + pad); y2 = min(h, int(y2r * h) + pad)
473
  return img.crop((x1, y1, x2, y2))
474
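The 4 px pad is applied after scaling the fractions to pixels and is clamped to
the image bounds. A worked example on a synthetic 1000x1000 px page:

    crop = _crop_field(Image.new('L', (1000, 1000)), 0.10, 0.20, 0.30, 0.25)
    # x1 = 100-4 = 96, y1 = 200-4 = 196, x2 = 300+4 = 304, y2 = 250+4 = 254
    print(crop.size)   # (208, 58)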
 
475
 
476
+ def _ocr(crop: Image.Image) -> str:
477
+ """Run CRNN+CTC on a cropped field image."""
478
+ return _crnn_read(crop)
479
 
480
 
481
+ # ── Form type detection ───────────────────────────────────────────
482
 
483
  def detect_form_type(image_path: str) -> str:
484
+ """Auto-detect form type using ORB inlier scoring, falling back to OCR title."""
485
  if _CV2_OK:
486
  try:
487
+ img = Image.open(image_path).convert('RGB')
488
  scan_rgb = np.array(img)
489
  scan_gray = _cv2.cvtColor(scan_rgb, _cv2.COLOR_RGB2GRAY)
490
+ best_type, best_inliers = None, 0
491
+ DET_W = 800
492
  for ft, ref_path in REFERENCE_IMAGES.items():
493
  if not os.path.exists(ref_path):
494
  continue
 ref_gray = _cv2.imread(ref_path, _cv2.IMREAD_GRAYSCALE)
496
  if ref_gray is None:
497
  continue
498
  ref_h, ref_w = ref_gray.shape
 
499
  sc = min(1.0, DET_W / ref_w)
500
+ dw, dh = max(1, int(ref_w*sc)), max(1, int(ref_h*sc))
501
+ ref_ds = _cv2.resize(ref_gray, (dw, dh))
502
+ scan_ds = _cv2.resize(_cv2.resize(scan_gray, (ref_w, ref_h)), (dw, dh))
503
+ count = _orb_inliers(scan_ds, ref_ds)
504
  print(f'[detect] Form {ft}: {count} ORB inliers')
505
  if count > best_inliers:
506
+ best_inliers, best_type = count, ft
 
 
507
  if best_type and best_inliers >= 15:
508
+ print(f'[detect] Best: Form {best_type} ({best_inliers} inliers)')
509
  return best_type
510
+ print(f'[detect] ORB inconclusive ({best_inliers}), trying OCR title')
 
 
511
  except Exception as e:
512
  print(f'[template_matcher] detect_form_type ORB error: {e}')
513
+ # CRNN+CTC title fallback
 
514
  try:
515
  img_l = Image.open(image_path).convert('L')
516
  w, h = img_l.size
517
+ title = _crnn_read(img_l.crop((0, int(h*0.04), w, int(h*0.15)))).upper()
 
518
  if title:
519
  if 'LIVE BIRTH' in title or ('BIRTH' in title
520
  and 'DEATH' not in title and 'MARRIAGE' not in title):
521
  return '102'
522
+ if 'DEATH' in title:
523
  return '103'
524
+ if 'MARRIAGE' in title and 'LICENSE' in title:
525
  return '90'
526
+ if 'MARRIAGE' in title:
527
  return '97'
528
+ print('[detect] Could not detect form type; defaulting to 102.')
 
529
  except Exception as e:
530
  print(f'[template_matcher] detect_form_type OCR error: {e}')
531
  return '102'
532
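Usage sketch (hypothetical path); the returned code feeds straight into
extract_fields:

    ft = detect_form_type('uploads/scan.jpg')   # hypothetical upload
    print(ft)                                   # '102', '103', '90' or '97'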
 
533
 
534
+ # ── Main extraction ───────────────────────────────────────────────
535
+
536
  def extract_fields(image_path: str, form_type: str) -> dict:
537
  """
538
+ Extract handwritten field values from a civil registry form scan.
539
 
540
  Args:
541
+ image_path : path to uploaded form image (PNG / JPG / PDF page)
542
+ form_type : '102' | '103' | '90' | '97'
543
 
544
  Returns:
545
  dict of { field_name: extracted_text }
 """
 template = TEMPLATES.get(form_type)
548
  if template is None:
549
  print(f'[template_matcher] No template for form type: {form_type}')
550
  return {}
551
+ if _get_crnn() is None:
552
+ print('[template_matcher] CRNN+CTC not available')
 
553
  return {}
554
  try:
555
  img = Image.open(image_path).convert('RGB')
556
  except Exception as e:
557
  print(f'[template_matcher] Cannot open image: {e}')
558
  return {}
559
 
 
 
560
  img, orb_inliers = align_to_reference(img, form_type)
561
+ processed = _preprocess(img)
562
 
563
+ # Anchor detection disabled: CRNN+CTC is trained on handwritten text and
564
+ # reads printed labels inconsistently, causing fields to jump between
565
+ # anchor-relative and absolute positions across runs.
566
+ # After ECC/ORB alignment the absolute coordinates are stable and sufficient.
567
  form_w, form_h = img.size
568
+ field_names, crops = [], []
569
 
570
  for field_name, coords in template.items():
571
+ x1r, y1r, x2r, y2r, _ = coords
572
+ crop = _crop_field(processed, x1r, y1r, x2r, y2r)
573
574
  field_names.append(field_name)
575
  crops.append(crop)
576
577
 
578
+ fields = {}
579
580
+ for field_name, crop in zip(field_names, crops):
581
  text = _postprocess(_ocr(crop), field_name)
582
  if text:
583
  fields[field_name] = text
584
 
585
+ print(f'[template_matcher] Extracted: {len(fields)}/{len(template)} fields '
+ f'(alignment score: {orb_inliers})')
 
587
  return fields
588
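A result-handling sketch with a hypothetical path:

    fields = extract_fields('uploads/scan_103.jpg', '103')   # hypothetical upload
    print(fields.get('deceased_name', '<empty>'))
    print(f"{len(fields)} of {len(TEMPLATES['103'])} fields extracted")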
 
589
 
590
+ # ── Debug visualisation ───────────────────────────────────────────
591
 
592
+ def debug_draw_boxes(image_path: str, form_type: str, out_path: str = None) -> str:
593
  """
594
+ Draw all field boxes on the aligned image and save it.
 
 
595
 
596
+ All boxes are drawn in blue at their absolute template coordinates;
+ the GREEN/RED/ORANGE anchor states from v2 no longer exist.
601
  """
602
  from PIL import ImageDraw, ImageFont
603
 
604
  template = TEMPLATES.get(form_type)
605
  if not template:
606
  print(f'No template for {form_type}')
607
+ return None
608
 
609
+ img, _ = align_to_reference(Image.open(image_path).convert('RGB'), form_type)
610
+ draw = ImageDraw.Draw(img)
 
611
  w, h = img.size
612
 
613
  try:
614
+ font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 11)
615
+ except Exception:
616
+ try:
617
+ font = ImageFont.truetype('C:/Windows/Fonts/arial.ttf', 11)
618
+ except Exception:
619
+ font = ImageFont.load_default()
620
 
621
+ for field_name, coords in template.items():
 
622
  x1r, y1r, x2r, y2r, _ = coords
623
+ bx1, by1 = int(x1r*w), int(y1r*h)
624
+ bx2, by2 = int(x2r*w), int(y2r*h)
625
+ draw.rectangle([bx1, by1, bx2, by2], outline='#1a6fd4', width=1)
626
+ draw.text((bx1+2, by1+2), field_name, fill='#1a6fd4', font=font)
627
 
628
  base, ext = os.path.splitext(image_path)
629
  out = out_path or f'{base}_debug_{form_type}{ext}'
630
  img.save(out)
631
  print(f'[template_matcher] Debug image saved: {out}')
632
+ print(' BLUE = absolute template coordinates (after alignment)')
636
  return out
637
 
638
 
639
+ # ── PDF helper ────────────────────────────────────────────────────
640
+
641
+ def pdf_to_image(pdf_path: str, page: int = 0) -> str:
642
+ try:
643
+ from pdf2image import convert_from_path
644
+ pages = convert_from_path(pdf_path, dpi=150)
645
+ out_path = pdf_path.replace('.pdf', f'_page{page}.png')
646
+ pages[page].save(out_path, 'PNG')
647
+ return out_path
648
+ except ImportError:
649
+ print('[template_matcher] pdf2image not installed.')
650
+ return None
651
+ except Exception as e:
652
+ print(f'[template_matcher] PDF conversion failed: {e}')
653
+ return None
654
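Putting the pieces together for a PDF upload (hypothetical path):

    png = pdf_to_image('uploads/certificate.pdf')   # hypothetical upload
    if png:
        ft = detect_form_type(png)
        print(extract_fields(png, ft))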
+
655
+
656
+ # ── CLI ───────────────────────────────────────────────────────────
657
+
658
  if __name__ == '__main__':
659
  if len(sys.argv) < 3:
660
+ print('Usage: python template_matcher.py <image_path> <form_type> [out_path]')
661
  print(' form_type: 102 | 103 | 90 | 97')
 
662
  sys.exit(1)
663
 
664
  img_path = sys.argv[1]
665
  form_type = sys.argv[2]
666
+ out_path = sys.argv[3] if len(sys.argv) > 3 else None
667
 
668
+ out = debug_draw_boxes(img_path, form_type, out_path)
669
+ print(f'\nDebug image: {out}')
670
+ print(' All boxes use absolute template coordinates\n')
671
 
 
672
  result = extract_fields(img_path, form_type)
673
+ print(f'Extracted fields ({len(result)}):')
674
  for k, v in result.items():
675
+ print(f' {k:<40} = {v}')
676
+
677
  template = TEMPLATES.get(form_type, {})
678
+ missing = [k for k in template if k not in result]
679
  if missing:
680
  print(f'\nEmpty fields ({len(missing)}):')
681
  for k in missing:
682
+ print(f' {k}')