Spaces:

hanz245
/

ocr

Running

App Files Files Community

ocr / field_extractor.py

hanz245

set up

7111e1a 9 days ago

raw

history blame contribute delete

35.6 kB

	"""
	Philippine Civil Registry — Field Extractor (Dynamic)
	======================================================
	Automatically detects form borders on ANY scan/photo and aligns field
	extraction to the detected boundary — no hardcoded pixel positions.

	Field coordinates calibrated directly from official PDF renders at 200 DPI:
	Form 102 (Birth): 1700 x 2800 px
	Form 103 (Death): 1700 x 2878 px
	Form 97 (Marriage): 1700 x 2600 px
	Form 90 (License): 1700 x 2600 px

	Usage:
	python field_extractor.py --pdf FORM_102.pdf --form birth
	python field_extractor.py --pdf FORM_97.pdf --form marriage --visualize
	python field_extractor.py --pdf FORM_103.pdf --form death --output results.json
	python field_extractor.py --image form102.png --form birth --visualize
	python field_extractor.py --pdf FORM_102.pdf --form birth --checkpoint checkpoints/best_model_emnist.pth

	.env file (same folder as this script) — each team member sets their own:
	POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
	"""

	import argparse
	import os
	import sys
	import json
	import cv2
	import numpy as np
	from pathlib import Path

	import torch
	from dotenv import load_dotenv

	# Resolve the folder that holds this script so the .env lookup does not
	# depend on the current working directory.
	_script_dir = Path(__file__).parent.resolve()
	load_dotenv(dotenv_path=_script_dir.joinpath(".env"))

	# Folder containing the Poppler binaries, taken from the environment
	# (which the .env file above may populate). When it is unset, no
	# poppler_path is passed to pdf2image.
	POPPLER_PATH = os.environ.get("POPPLER_PATH")

	# Checkpoint used when --checkpoint is not given on the command line.
	DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"


	# ══════════════════════════════════════════════════════════════════════════════
	# FIELD RATIO MAPS
	# Format: field_name: (x1, y1, x2, y2) — ratios 0.0–1.0
	# Coordinates are relative to the DETECTED FORM BOUNDARY (not full image).
	# x = left→right, y = top→bottom
	# ══════════════════════════════════════════════════════════════════════════════

	# Form 102 → Certificate of Live Birth (Form 1A)
	# Maps each field name to its (x1, y1, x2, y2) box, expressed as 0.0–1.0
	# ratios of the detected form boundary (x: left→right, y: top→bottom).
	# Every box must satisfy x1 < x2 and y1 < y2, otherwise the extractor
	# silently drops the field.
	BIRTH_FIELDS = {
	    # Header
	    "province": (0.02, 0.068, 0.30, 0.088),
	    "registry_number": (0.66, 0.068, 0.99, 0.108),
	    "city_municipality": (0.02, 0.090, 0.65, 0.108),

	    # Item 1 — Child Name
	    "child_first_name": (0.03, 0.109, 0.40, 0.141),
	    "child_middle_name": (0.40, 0.109, 0.64, 0.141),
	    "child_last_name": (0.64, 0.109, 0.99, 0.141),

	    # Items 2-3 — Sex / Date of Birth
	    # The date cell is split into three contiguous columns (day, month, year).
	    # Previously dob_month was (0.80, ..., 0.60, ...) — x2 < x1, so it was never
	    # extracted — and dob_day ran to 0.80, covering the month column.
	    # NOTE(review): the 0.60 day/month split is inferred from the swapped
	    # values; confirm against the 200 DPI render of Form 102.
	    "sex": (0.03, 0.142, 0.30, 0.167),
	    "dob_day": (0.40, 0.142, 0.60, 0.167),
	    "dob_month": (0.60, 0.142, 0.80, 0.167),
	    "dob_year": (0.80, 0.142, 0.99, 0.167),

	    # Item 4 — Place of Birth
	    "place_birth_hospital": (0.03, 0.169, 0.46, 0.197),
	    "place_birth_city": (0.47, 0.169, 0.70, 0.199),
	    "place_birth_province": (0.71, 0.169, 0.99, 0.199),

	    # Mother section
	    "mother_first_name": (0.03, 0.248, 0.40, 0.276),
	    "mother_middle_name": (0.40, 0.248, 0.64, 0.276),
	    "mother_last_name": (0.64, 0.248, 0.99, 0.276),
	    "mother_citizenship": (0.03, 0.277, 0.50, 0.305),

	    # Father section
	    "father_first_name": (0.03, 0.380, 0.40, 0.410),
	    "father_middle_name": (0.40, 0.380, 0.64, 0.410),
	    "father_last_name": (0.64, 0.380, 0.99, 0.410),
	    "father_citizenship": (0.03, 0.411, 0.28, 0.445),

	    # Item 20 — Marriage of Parents
	    "parents_marriage_month": (0.03, 0.496, 0.19, 0.526),
	    "parents_marriage_day": (0.19, 0.496, 0.27, 0.526),
	    "parents_marriage_year": (0.27, 0.496, 0.38, 0.526),
	    "parents_marriage_city": (0.41, 0.496, 0.68, 0.526),
	    "parents_marriage_province": (0.68, 0.496, 0.84, 0.526),
	}

	# Form 103 → Certificate of Death (Form 2A)
	# Maps each field name to its (x1, y1, x2, y2) box, expressed as 0.0–1.0
	# ratios of the detected form boundary (x: left→right, y: top→bottom).
	DEATH_FIELDS = {
	    # Header
	    "province": (0.04, 0.128, 0.40, 0.144),
	    "registry_number": (0.52, 0.128, 0.75, 0.144),
	    "city_municipality": (0.04, 0.145, 0.45, 0.160),

	    # Item 1 — Name
	    "deceased_first_name": (0.10, 0.162, 0.34, 0.178),
	    "deceased_middle_name": (0.34, 0.162, 0.56, 0.178),
	    "deceased_last_name": (0.56, 0.162, 0.75, 0.178),

	    # Items 2-4 — Sex / Religion / Age (no religion box is mapped here)
	    "sex": (0.04, 0.182, 0.13, 0.220),
	    "age_years": (0.28, 0.182, 0.38, 0.202),

	    # Item 5 — Place of Death
	    "place_death_hospital": (0.13, 0.224, 0.42, 0.242),
	    "place_death_city": (0.42, 0.224, 0.58, 0.242),
	    "place_death_province": (0.58, 0.224, 0.75, 0.242),

	    # Items 6-7 — Date of Death / Citizenship
	    "dod_day": (0.10, 0.252, 0.22, 0.268),
	    "dod_month": (0.22, 0.252, 0.38, 0.268),
	    "dod_year": (0.38, 0.252, 0.52, 0.268),
	    "citizenship": (0.52, 0.252, 0.75, 0.268),

	    # Item 8 — Residence
	    "residence_house": (0.13, 0.278, 0.40, 0.294),
	    "residence_city": (0.40, 0.278, 0.56, 0.294),
	    "residence_province": (0.56, 0.278, 0.75, 0.294),

	    # Items 9-10 — Civil Status / Occupation
	    "civil_status": (0.04, 0.302, 0.38, 0.360),
	    "occupation": (0.44, 0.302, 0.75, 0.360),

	    # Item 17 — Causes of Death (four stacked rows sharing one column)
	    "cause_immediate": (0.18, 0.402, 0.58, 0.418),
	    "cause_antecedent": (0.18, 0.424, 0.58, 0.440),
	    "cause_underlying": (0.18, 0.446, 0.58, 0.462),
	    "cause_other": (0.18, 0.468, 0.58, 0.484),

	    # Item 25 — Informant
	    "informant_name": (0.04, 0.808, 0.35, 0.822),
	    "informant_address": (0.04, 0.822, 0.35, 0.836),
	    "informant_date": (0.35, 0.836, 0.58, 0.850),
	}

	# Form 97 → Certificate of Marriage (Form 3A)
	# Maps each field name to its (x1, y1, x2, y2) box, expressed as 0.0–1.0
	# ratios of the detected form boundary (x: left→right, y: top→bottom).
	# Only the fields that flow through bridge.py → spaCy NER → SpouseOutput/Form3A.
	# Removed: province, city_municipality, dob_day/month/year (×2),
	# place_birth_city/prov/country (×2), sex (×2), residence (×2),
	# religion (×2), civil_status (×2).
	MARRIAGE_FIELDS = {
	    # ── Header ───────────────────────────────────────────────────────────────
	    "registry_number": (0.62, 0.088, 0.97, 0.104),  # → Form3A.registry_number

	    # ── Item 1 — Name (HUSBAND left / WIFE right) ────────────────────────────
	    # First / middle / last name are three stacked rows in each column.
	    # NOTE(review): these name boxes use columns 0.23–0.56 / 0.65–0.98, while
	    # most other items below use 0.14–0.47 / 0.53–0.86 — confirm the calibration.
	    "husband_first_name": (0.23, 0.121, 0.56, 0.139),
	    "husband_middle_name": (0.23, 0.141, 0.56, 0.159),
	    "husband_last_name": (0.23, 0.160, 0.56, 0.178),
	    "wife_first_name": (0.65, 0.121, 0.98, 0.139),
	    "wife_middle_name": (0.65, 0.141, 0.98, 0.159),
	    "wife_last_name": (0.65, 0.160, 0.98, 0.178),

	    # Earlier calibration of the Item 1 boxes, disabled in favour of the
	    # active values above; retained for reference only.
	    # "husband_first_name": (0.14, 0.138, 0.47, 0.156),
	    # "husband_middle_name": (0.14, 0.156, 0.47, 0.174),
	    # "husband_last_name": (0.14, 0.174, 0.47, 0.192),
	    # "wife_first_name": (0.53, 0.138, 0.86, 0.156),
	    # "wife_middle_name": (0.53, 0.156, 0.86, 0.174),
	    # "wife_last_name": (0.53, 0.174, 0.86, 0.192),

	    # ── Item 2b — Age ────────────────────────────────────────────────────────
	    "husband_age": (0.40, 0.198, 0.47, 0.216),  # → husband.age
	    "wife_age": (0.78, 0.198, 0.86, 0.216),  # → wife.age

	    # ── Item 4b — Citizenship ────────────────────────────────────────────────
	    "husband_citizenship": (0.22, 0.252, 0.47, 0.270),  # → husband.nationality
	    "wife_citizenship": (0.62, 0.252, 0.86, 0.270),  # → wife.nationality

	    # ── Item 8 — Name of Father ──────────────────────────────────────────────
	    "husband_father_first": (0.14, 0.396, 0.24, 0.414),
	    "husband_father_middle": (0.24, 0.396, 0.34, 0.414),
	    "husband_father_last": (0.34, 0.396, 0.47, 0.414),
	    "wife_father_first": (0.53, 0.396, 0.63, 0.414),
	    "wife_father_middle": (0.63, 0.396, 0.73, 0.414),
	    "wife_father_last": (0.73, 0.396, 0.86, 0.414),

	    # ── Item 9 — Citizenship of Father ──────────────────────────────────────
	    "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436),  # → husband.nationality_of_father
	    "wife_father_citizenship": (0.53, 0.420, 0.86, 0.436),  # → wife.nationality_of_father

	    # ── Item 10 — Name of Mother ─────────────────────────────────────────────
	    "husband_mother_first": (0.14, 0.444, 0.24, 0.462),
	    "husband_mother_middle": (0.24, 0.444, 0.34, 0.462),
	    "husband_mother_last": (0.34, 0.444, 0.47, 0.462),
	    "wife_mother_first": (0.53, 0.444, 0.63, 0.462),
	    "wife_mother_middle": (0.63, 0.444, 0.73, 0.462),
	    "wife_mother_last": (0.73, 0.444, 0.86, 0.462),

	    # ── Item 11 — Citizenship of Mother ─────────────────────────────────────
	    "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484),  # → husband.nationality_of_mother
	    "wife_mother_citizenship": (0.53, 0.468, 0.86, 0.484),  # → wife.nationality_of_mother

	    # ── Items 15–16 — Place / Date of Marriage ───────────────────────────────
	    "place_marriage_office": (0.14, 0.596, 0.44, 0.614),
	    "place_marriage_city": (0.44, 0.596, 0.68, 0.614),
	    "place_marriage_province": (0.68, 0.596, 0.88, 0.614),
	    "date_marriage_day": (0.14, 0.630, 0.24, 0.648),
	    "date_marriage_month": (0.24, 0.630, 0.38, 0.648),
	    "date_marriage_year": (0.38, 0.630, 0.48, 0.648),
	}

	# Form 90 → Application for Marriage License
	# Maps each field name to its (x1, y1, x2, y2) box, expressed as 0.0–1.0
	# ratios of the detected form boundary (x: left→right, y: top→bottom).
	# Below the header, groom_* boxes sit in the left column and bride_* boxes
	# in the right column. Items 8 and 11 have no entries in this map.
	MARRIAGE_LICENSE_FIELDS = {
	    # Header
	    "province": (0.12, 0.092, 0.48, 0.108),
	    "registry_number": (0.56, 0.092, 0.97, 0.108),
	    "city_municipality": (0.12, 0.108, 0.48, 0.124),
	    "received_by": (0.12, 0.124, 0.48, 0.140),
	    "date_of_receipt": (0.12, 0.140, 0.48, 0.156),
	    "marriage_license_number": (0.56, 0.124, 0.97, 0.140),
	    "date_of_issuance": (0.56, 0.140, 0.97, 0.156),

	    # Item 1 — Name of Applicant (GROOM left / BRIDE right)
	    "groom_first_name": (0.02, 0.278, 0.46, 0.294),
	    "bride_first_name": (0.54, 0.278, 0.97, 0.294),
	    "groom_middle_name": (0.02, 0.296, 0.46, 0.312),
	    "bride_middle_name": (0.54, 0.296, 0.97, 0.312),
	    "groom_last_name": (0.02, 0.314, 0.46, 0.330),
	    "bride_last_name": (0.54, 0.314, 0.97, 0.330),

	    # Item 2 — Date of Birth / Age
	    "groom_dob_day": (0.02, 0.334, 0.12, 0.350),
	    "groom_dob_month": (0.12, 0.334, 0.24, 0.350),
	    "groom_dob_year": (0.24, 0.334, 0.34, 0.350),
	    "groom_age": (0.34, 0.334, 0.46, 0.350),
	    "bride_dob_day": (0.54, 0.334, 0.62, 0.350),
	    "bride_dob_month": (0.62, 0.334, 0.74, 0.350),
	    "bride_dob_year": (0.74, 0.334, 0.84, 0.350),
	    "bride_age": (0.84, 0.334, 0.97, 0.350),

	    # Item 3 — Place of Birth
	    "groom_place_birth_city": (0.02, 0.354, 0.18, 0.370),
	    "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
	    "groom_place_birth_country": (0.32, 0.354, 0.46, 0.370),
	    "bride_place_birth_city": (0.54, 0.354, 0.70, 0.370),
	    "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
	    "bride_place_birth_country": (0.84, 0.354, 0.97, 0.370),

	    # Item 4 — Sex / Citizenship
	    "groom_sex": (0.02, 0.374, 0.16, 0.390),
	    "groom_citizenship": (0.16, 0.374, 0.46, 0.390),
	    "bride_sex": (0.54, 0.374, 0.68, 0.390),
	    "bride_citizenship": (0.68, 0.374, 0.97, 0.390),

	    # Item 5 — Residence
	    "groom_residence": (0.02, 0.394, 0.46, 0.412),
	    "bride_residence": (0.54, 0.394, 0.97, 0.412),

	    # Item 6 — Religion
	    "groom_religion": (0.02, 0.424, 0.46, 0.440),
	    "bride_religion": (0.54, 0.424, 0.97, 0.440),

	    # Item 7 — Civil Status
	    "groom_civil_status": (0.02, 0.452, 0.46, 0.468),
	    "bride_civil_status": (0.54, 0.452, 0.97, 0.468),

	    # Item 9 — Place where dissolved
	    "groom_dissolution_city": (0.02, 0.496, 0.16, 0.512),
	    "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
	    "groom_dissolution_country": (0.30, 0.496, 0.46, 0.512),
	    "bride_dissolution_city": (0.54, 0.496, 0.68, 0.512),
	    "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
	    "bride_dissolution_country": (0.82, 0.496, 0.97, 0.512),

	    # Item 10 — Date when dissolved
	    "groom_dissolution_day": (0.02, 0.520, 0.12, 0.536),
	    "groom_dissolution_month": (0.12, 0.520, 0.24, 0.536),
	    "groom_dissolution_year": (0.24, 0.520, 0.34, 0.536),
	    "bride_dissolution_day": (0.54, 0.520, 0.62, 0.536),
	    "bride_dissolution_month": (0.62, 0.520, 0.74, 0.536),
	    "bride_dissolution_year": (0.74, 0.520, 0.84, 0.536),

	    # Item 12 — Father Name
	    "groom_father_first": (0.02, 0.594, 0.16, 0.610),
	    "groom_father_middle": (0.16, 0.594, 0.28, 0.610),
	    "groom_father_last": (0.28, 0.594, 0.46, 0.610),
	    "bride_father_first": (0.54, 0.594, 0.66, 0.610),
	    "bride_father_middle": (0.66, 0.594, 0.78, 0.610),
	    "bride_father_last": (0.78, 0.594, 0.97, 0.610),

	    # Item 13 — Father Citizenship
	    "groom_father_citizenship": (0.02, 0.620, 0.46, 0.636),
	    "bride_father_citizenship": (0.54, 0.620, 0.97, 0.636),

	    # Item 14 — Father Residence
	    "groom_father_residence": (0.02, 0.644, 0.46, 0.660),
	    "bride_father_residence": (0.54, 0.644, 0.97, 0.660),

	    # Item 15 — Mother Name
	    "groom_mother_first": (0.02, 0.674, 0.16, 0.690),
	    "groom_mother_middle": (0.16, 0.674, 0.28, 0.690),
	    "groom_mother_last": (0.28, 0.674, 0.46, 0.690),
	    "bride_mother_first": (0.54, 0.674, 0.66, 0.690),
	    "bride_mother_middle": (0.66, 0.674, 0.78, 0.690),
	    "bride_mother_last": (0.78, 0.674, 0.97, 0.690),

	    # Item 16 — Mother Citizenship
	    "groom_mother_citizenship": (0.02, 0.696, 0.46, 0.712),
	    "bride_mother_citizenship": (0.54, 0.696, 0.97, 0.712),

	    # Item 17 — Mother Residence
	    "groom_mother_residence": (0.02, 0.720, 0.46, 0.736),
	    "bride_mother_residence": (0.54, 0.720, 0.97, 0.736),
	}

	# Lookup from a form-type name (the --form choices) to that form's
	# field-ratio map.
	FORM_FIELDS = dict(
	    birth=BIRTH_FIELDS,
	    death=DEATH_FIELDS,
	    marriage=MARRIAGE_FIELDS,
	    marriage_license=MARRIAGE_LICENSE_FIELDS,
	)

	# Colour triples cycled through (by field index) when drawing field boxes.
	COLOURS = [
	    (0, 200, 0),
	    (0, 150, 255),
	    (200, 0, 200),
	    (0, 200, 200),
	    (200, 200, 0),
	    (220, 20, 60),
	    (255, 140, 0),
	    (150, 50, 200),
	    (0, 160, 80),
	    (30, 144, 255),
	    (255, 20, 147),
	    (100, 200, 100),
	]


	# ══════════════════════════════════════════════════════════════════════════════
	# FORM BOUNDS DETECTOR
	# Finds the outer border of a civil registry form using line detection.
	# Falls back to full image if detection fails.
	# ══════════════════════════════════════════════════════════════════════════════

	class FormBoundsDetector:
	    """Locate the outer border of a form from its long ruled lines.

	    The page is binarized, long horizontal and vertical runs are isolated
	    with morphological opening, and the extreme rows/columns that contain
	    such runs are taken as the form boundary.
	    """

	    def __init__(self, verbose=False):
	        # When True, detection results and failures are printed.
	        self.verbose = verbose

	    def detect(self, image_bgr):
	        """Return the form boundary as ``(left, top, right, bottom)`` pixels.

	        Parameters
	        ----------
	        image_bgr : numpy array, either 3-channel BGR or single-channel
	            grayscale (used as-is).

	        Returns
	        -------
	        tuple of four ints. When line detection fails the whole image,
	        ``(0, 0, width, height)``, is returned instead.
	        """
	        h, w = image_bgr.shape[:2]
	        # A 2-D array is already grayscale; cvtColor would reject it.
	        if image_bgr.ndim == 2:
	            gray = image_bgr
	        else:
	            gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
	        bounds = self._detect_by_lines(gray, w, h)
	        if bounds is None:
	            if self.verbose:
	                print(" [Bounds] Line detection failed — using full image")
	            return (0, 0, w, h)
	        if self.verbose:
	            print(f" [Bounds] Detected: {bounds}")
	        return bounds

	    def _detect_by_lines(self, gray, w, h):
	        """Return ``(left, top, right, bottom)`` from ruled lines, or None.

	        None is returned when no long lines are found, when the detected box
	        covers less than 40% of the image in either direction, or when any
	        step raises (detection is best-effort; the caller falls back to the
	        full image).
	        """
	        try:
	            # Inverted binary image: ink becomes 255, paper becomes 0.
	            thresh = cv2.adaptiveThreshold(
	                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                cv2.THRESH_BINARY_INV, 11, 2)

	            # Opening with a wide, 1-pixel-tall kernel keeps only horizontal
	            # runs at least a fifth of the image width long.
	            hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
	            h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
	            # Count line pixels per row. Summing the raw 0/255 values would
	            # compare intensities against a threshold meant as a pixel count.
	            h_rows = np.where(np.count_nonzero(h_lines, axis=1) > w * 0.15)[0]

	            # Same idea for vertical runs, with a tall, 1-pixel-wide kernel.
	            vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
	            v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
	            v_cols = np.where(np.count_nonzero(v_lines, axis=0) > h * 0.08)[0]

	            if len(h_rows) == 0 or len(v_cols) == 0:
	                return None
	            top, bottom = int(h_rows.min()), int(h_rows.max())
	            left, right = int(v_cols.min()), int(v_cols.max())
	            # Reject boxes too small to plausibly be the form outline.
	            if (right - left) < w * 0.4 or (bottom - top) < h * 0.4:
	                return None
	            return (left, top, right, bottom)
	        except Exception as e:
	            if self.verbose:
	                print(f" [Bounds error] {e}")
	            return None


	# ══════════════════════════════════════════════════════════════════════════════
	# DYNAMIC FIELD EXTRACTOR
	# Crops each field region relative to the detected form boundary.
	# Adapts to image size, DPI and scan margins; rotation/skew is not corrected (the detected bounds are axis-aligned).
	# ══════════════════════════════════════════════════════════════════════════════

	class DynamicFieldExtractor:
	    """Crop form fields relative to an automatically detected form boundary.

	    Field positions come from the ratio map selected by ``form_type``; each
	    ratio box is scaled onto the boundary reported by ``FormBoundsDetector``.
	    """

	    def __init__(self, form_type="birth", verbose=False):
	        self.form_type = form_type.lower()
	        # Unknown form types fall back to the birth-certificate layout.
	        self.field_map = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
	        self.detector = FormBoundsDetector(verbose=verbose)
	        self.verbose = verbose
	        # Boundary found by the most recent extract()/visualize() call.
	        self._last_bounds = None

	    def _to_bgr(self, image):
	        """Convert a PIL image or numpy array to a 3-channel BGR array.

	        Raises
	        ------
	        TypeError
	            If ``image`` is neither a PIL image nor a numpy array.
	        """
	        try:
	            from PIL import Image as PILImage
	            if isinstance(image, PILImage.Image):
	                arr = np.array(image.convert("RGB"))
	                return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
	        except ImportError:
	            # Pillow is optional; without it only numpy input is supported.
	            pass
	        if isinstance(image, np.ndarray):
	            if len(image.shape) == 2:
	                return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
	            if image.shape[2] == 4:
	                return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
	            return image
	        raise TypeError(f"Unsupported image type: {type(image)}")

	    @staticmethod
	    def _field_box(ratios, bounds, img_w, img_h):
	        """Scale one ratio box onto image pixels.

	        Returns ``(x1, y1, x2, y2)`` where x2/y2 are exclusive slice ends and
	        may therefore equal the image width/height.
	        """
	        rx1, ry1, rx2, ry2 = ratios
	        left, top, right, bottom = bounds
	        form_w = right - left
	        form_h = bottom - top
	        x1 = max(0, min(int(left + rx1 * form_w), img_w - 1))
	        y1 = max(0, min(int(top + ry1 * form_h), img_h - 1))
	        # Clamp the exclusive ends to w/h, not w-1/h-1, so a box touching the
	        # right or bottom edge keeps its last column/row.
	        x2 = max(0, min(int(left + rx2 * form_w), img_w))
	        y2 = max(0, min(int(top + ry2 * form_h), img_h))
	        return x1, y1, x2, y2

	    def extract(self, image):
	        """Returns {field_name: BGR numpy array}.

	        Fields whose box is empty after scaling (x2 <= x1 or y2 <= y1) are
	        omitted from the result; in verbose mode each omission is reported.
	        """
	        image = self._to_bgr(image)
	        h, w = image.shape[:2]
	        bounds = tuple(self.detector.detect(image))
	        self._last_bounds = bounds
	        left, top, right, bottom = bounds
	        form_w = right - left
	        form_h = bottom - top
	        if self.verbose:
	            print(f" [Extract] Image={w}x{h} "
	                  f" Form={form_w}x{form_h} @ ({left},{top})-({right},{bottom})")
	        crops = {}
	        for name, ratios in self.field_map.items():
	            x1, y1, x2, y2 = self._field_box(ratios, bounds, w, h)
	            if x2 > x1 and y2 > y1:
	                crops[name] = image[y1:y2, x1:x2]
	            elif self.verbose:
	                print(f" [Extract] Skipped '{name}': empty box "
	                      f"({x1},{y1})-({x2},{y2})")
	        return crops

	    def visualize(self, image, output_path=None):
	        """Draw detected boundary + field boxes. Returns annotated BGR image.

	        When ``output_path`` is given the annotated image is also written
	        there and a confirmation line is printed.
	        """
	        image = self._to_bgr(image)
	        vis = image.copy()
	        h, w = vis.shape[:2]
	        bounds = tuple(self.detector.detect(image))
	        self._last_bounds = bounds
	        left, top, right, bottom = bounds
	        cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 3)
	        cv2.putText(vis, "DETECTED FORM BOUNDARY",
	                    (left, max(0, top - 8)),
	                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 140, 255), 1)
	        for idx, (name, ratios) in enumerate(self.field_map.items()):
	            x1, y1, x2, y2 = self._field_box(ratios, bounds, w, h)
	            c = COLOURS[idx % len(COLOURS)]
	            # x2/y2 are exclusive; step back one pixel so the outline stays
	            # inside the image.
	            cv2.rectangle(vis, (x1, y1), (x2 - 1, y2 - 1), c, 2)
	            cv2.putText(vis, name[:22], (x1 + 2, max(0, y1 - 2)),
	                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, c, 1)
	        if output_path:
	            cv2.imwrite(str(output_path), vis)
	            print(f" Field map saved -> {output_path}")
	        return vis


	# ══════════════════════════════════════════════════════════════════════════════
	# FIELD NORMALIZER — prepares a BGR crop for CRNN inference
	# ══════════════════════════════════════════════════════════════════════════════

	class FieldNormalizer:
	    """Turn a field crop into a fixed-size binarized image for the CRNN.

	    Pipeline (see ``normalize``): grayscale → denoise → crop to the inked
	    area → aspect-preserving resize onto a white canvas → binarize.
	    """

	    def __init__(self, target_height=64, target_width=512):
	        self.H = target_height
	        self.W = target_width

	    def _blank_canvas(self):
	        """Return an all-white (255) uint8 canvas of the target size."""
	        return np.ones((self.H, self.W), dtype=np.uint8) * 255

	    def _crop_to_text(self, gray):
	        """Trim ``gray`` to the bounding box of its dark pixels plus padding.

	        Pixels darker than 235 (inverted value above 20) count as ink. The
	        input is returned unchanged when no such pixel exists.
	        """
	        inv = cv2.bitwise_not(gray)
	        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
	        coords = np.column_stack(np.where(thresh > 0))
	        if len(coords) == 0:
	            return gray
	        y_min, x_min = (int(v) for v in coords.min(axis=0))
	        y_max, x_max = (int(v) for v in coords.max(axis=0))
	        # Pad by 15% of the ink height, but never less than 4 pixels.
	        pad = max(4, int((y_max - y_min) * 0.15))
	        y_min = max(0, y_min - pad)
	        x_min = max(0, x_min - pad)
	        y_max = min(gray.shape[0] - 1, y_max + pad)
	        x_max = min(gray.shape[1] - 1, x_max + pad)
	        return gray[y_min:y_max + 1, x_min:x_max + 1]

	    def _smart_resize(self, gray):
	        """Resize ``gray`` to fit the target size, centred on a white canvas.

	        The aspect ratio is preserved: the image is scaled to the target
	        height, or to the target width when that would overflow it.
	        """
	        h, w = gray.shape
	        if h == 0 or w == 0:
	            return self._blank_canvas()
	        scale = self.H / h
	        new_w = int(w * scale)
	        new_h = self.H
	        if new_w > self.W:
	            scale = self.W / w
	            new_h = int(h * scale)
	            new_w = self.W
	        # Extreme aspect ratios can round a dimension down to 0, which
	        # cv2.resize rejects; keep at least one pixel.
	        new_w = max(1, new_w)
	        new_h = max(1, new_h)
	        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
	        canvas = self._blank_canvas()
	        y_off = (self.H - new_h) // 2
	        x_off = (self.W - new_w) // 2
	        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
	        return canvas

	    def _binarize(self, img):
	        """Binarize with Otsu, falling back to adaptive thresholding.

	        The fallback is used when Otsu leaves less than 30% or more than 97%
	        of the pixels white.
	        """
	        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
	        white_ratio = np.mean(otsu == 255)
	        if white_ratio < 0.30 or white_ratio > 0.97:
	            return cv2.adaptiveThreshold(
	                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                cv2.THRESH_BINARY, 11, 2)
	        return otsu

	    def normalize(self, crop) -> np.ndarray:
	        """Accept BGR numpy array or PIL image, return normalized binary array.

	        An empty crop (zero pixels) yields an all-white canvas of the target
	        size instead of an OpenCV error.
	        """
	        try:
	            from PIL import Image as PILImage
	            if isinstance(crop, PILImage.Image):
	                crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
	        except ImportError:
	            # Pillow is optional; without it only numpy input is supported.
	            pass
	        if crop.size == 0:
	            return self._blank_canvas()
	        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
	        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
	        gray = self._crop_to_text(gray)
	        gray = self._smart_resize(gray)
	        return self._binarize(gray)

	    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
	        """Scale ``img`` to the 0–1 range and add two leading size-1 dimensions."""
	        return torch.FloatTensor(
	            img.astype(np.float32) / 255.0
	        ).unsqueeze(0).unsqueeze(0)


	# ══════════════════════════════════════════════════════════════════════════════
	# CRNN MODEL LOADER
	# ══════════════════════════════════════════════════════════════════════════════

	def load_crnn_model(checkpoint_path: str, device: torch.device):
	    """Load a CRNN checkpoint and rebuild the model in eval mode.

	    The checkpoint is read as a dict with the keys ``idx_to_char`` and
	    ``model_state_dict`` (which must contain ``fc.weight``) and the optional
	    keys ``config``, ``val_cer`` and ``val_loss``.

	    Parameters
	    ----------
	    checkpoint_path : path of the checkpoint file.
	    device : device the weights are mapped to and the model is moved to.

	    Returns
	    -------
	    tuple ``(model, idx_to_char, img_height, img_width)``; the two sizes
	    default to 64 and 512 when the checkpoint config does not set them.
	    """
	    # Make the sibling crnn_model module importable. Only insert when this
	    # folder is not already first, so repeated calls do not grow sys.path.
	    module_dir = str(Path(__file__).parent)
	    if not sys.path or sys.path[0] != module_dir:
	        sys.path.insert(0, module_dir)
	    from crnn_model import get_crnn_model

	    print(f" Loading CRNN model from: {checkpoint_path}")
	    # SECURITY: weights_only=False unpickles arbitrary Python objects, which
	    # can execute code. Only load checkpoints from a trusted source.
	    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
	    config = c.get("config", {})
	    idx_to_char = c["idx_to_char"]
	    # The output layer's row count gives the number of character classes.
	    num_chars = c["model_state_dict"]["fc.weight"].shape[0]

	    model = get_crnn_model(
	        model_type=config.get("model_type", "standard"),
	        img_height=config.get("img_height", 64),
	        num_chars=num_chars,
	        hidden_size=config.get("hidden_size", 128),
	        num_lstm_layers=config.get("num_lstm_layers", 1),
	    ).to(device)
	    model.load_state_dict(c["model_state_dict"])
	    model.eval()

	    val_cer = c.get("val_cer", None)
	    val_loss = c.get("val_loss", None)
	    # Compare against None: a stored metric of exactly 0.0 is still a metric.
	    if val_cer is not None:
	        metric = f"val_cer={val_cer:.2f}%"
	    elif val_loss is not None:
	        metric = f"val_loss={val_loss:.4f}"
	    else:
	        metric = "no metric"
	    print(f" Model loaded | {metric} | chars={num_chars}")
	    return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)


	# ══════════════════════════════════════════════════════════════════════════════
	# GREEDY CTC DECODE
	# ══════════════════════════════════════════════════════════════════════════════

	def greedy_decode(outputs: torch.Tensor, idx_to_char: dict, blank: int = 0) -> str:
	    """Greedy (best-path) CTC decoding of the first sequence in a batch.

	    Parameters
	    ----------
	    outputs : per-step class scores, indexed here as (time, batch, classes):
	        the arg-max is taken over dim 2 and only batch item 0 is decoded.
	    idx_to_char : mapping from integer class index to its character.
	        Indices missing from the mapping are skipped.
	    blank : class index treated as the CTC blank (default 0).

	    Returns
	    -------
	    The decoded text; an empty string when ``outputs`` has no elements.
	    """
	    if outputs.numel() == 0:
	        return ""
	    # One bulk conversion to Python ints instead of an .item() call per step.
	    best_path = torch.argmax(outputs, dim=2)[:, 0].tolist()
	    chars = []
	    prev = -1
	    for idx in best_path:
	        # Emit a character only when it is not blank and not a repeat of the
	        # previous step.
	        if idx != blank and idx != prev and idx in idx_to_char:
	            chars.append(idx_to_char[idx])
	        prev = idx
	    return "".join(chars)


	# ══════════════════════════════════════════════════════════════════════════════
	# PDF → PIL IMAGE
	# ══════════════════════════════════════════════════════════════════════════════

	def pdf_to_image(pdf_path: str, dpi: int = 200):
	    """Render the first page of a PDF at ``dpi`` and return that page image.

	    When the module-level POPPLER_PATH is set, it is passed to pdf2image as
	    the Poppler location.
	    """
	    from pdf2image import convert_from_path

	    # Resolve to absolute path — fixes "Unable to get page count" on Windows
	    absolute_pdf = str(Path(pdf_path).resolve())
	    render_options = dict(dpi=dpi, first_page=1, last_page=1)
	    if POPPLER_PATH:
	        render_options["poppler_path"] = str(Path(POPPLER_PATH).resolve())
	    pages = convert_from_path(absolute_pdf, **render_options)
	    return pages[0]


	# ══════════════════════════════════════════════════════════════════════════════
	# CRNN OCR — runs on extracted field crops
	# ══════════════════════════════════════════════════════════════════════════════

	def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
	                 img_h: int, img_w: int, device: torch.device) -> dict:
	    """Recognize the text in every field crop.

	    Each crop is normalized to ``img_h`` x ``img_w``, passed through
	    ``model`` with gradients disabled, and greedily decoded.

	    Returns
	    -------
	    dict {field_name: text}. A field whose processing raises is recorded as
	    the string ``"[ERROR: <message>]"`` rather than aborting the run.
	    """
	    prep = FieldNormalizer(target_height=img_h, target_width=img_w)
	    recognized = {}
	    with torch.no_grad():
	        for field_name, field_img in crops.items():
	            try:
	                batch = prep.to_tensor(prep.normalize(field_img)).to(device)
	                scores = model(batch).cpu()
	                recognized[field_name] = greedy_decode(scores, idx_to_char)
	            except Exception as e:
	                recognized[field_name] = f"[ERROR: {e}]"
	    return recognized


	# ══════════════════════════════════════════════════════════════════════════════
	# CONVENIENCE WRAPPER — for other scripts that import this module
	# ══════════════════════════════════════════════════════════════════════════════

	def extract_field_images(image, form_type="birth", verbose=False):
	    """Crop every mapped field of a form using dynamic boundary detection.

	    Parameters
	    ----------
	    image : PIL Image or BGR numpy array
	    form_type : str, one of 'birth' | 'death' | 'marriage' | 'marriage_license'
	    verbose : bool, print detection details when True

	    Returns
	    -------
	    dict {field_name: BGR numpy array}
	    """
	    extractor = DynamicFieldExtractor(form_type=form_type, verbose=verbose)
	    return extractor.extract(image)


	# Backward-compatible alias: the function used to be exported under this name.
	extract_field_images_dynamic = extract_field_images


	# ══════════════════════════════════════════════════════════════════════════════
	# MAIN
	# ══════════════════════════════════════════════════════════════════════════════

	def main():
	    """Command-line entry point.

	    Parses the arguments, loads the CRNN checkpoint and the input page
	    (PDF or image), optionally saves an annotated field map, extracts the
	    field crops, runs OCR on them, prints a results table and optionally
	    writes the results to JSON. Exits with status 1 when the input file or
	    checkpoint is missing or the page cannot be loaded.
	    """
	    parser = argparse.ArgumentParser(
	        description="PH Civil Registry Field Extractor — Dynamic CRNN OCR")
	    group = parser.add_mutually_exclusive_group(required=True)
	    group.add_argument("--pdf", help="Path to scanned PDF")
	    group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
	    parser.add_argument("--form", required=True,
	                        choices=["birth", "death", "marriage", "marriage_license"])
	    parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
	    parser.add_argument("--visualize", action="store_true",
	                        help="Save annotated field-map image")
	    parser.add_argument("--output", default=None,
	                        help="Save extracted fields to JSON")
	    parser.add_argument("--poppler", default=None,
	                        help="Override Poppler bin path (overrides .env)")
	    parser.add_argument("--dpi", type=int, default=200)
	    parser.add_argument("--verbose", action="store_true")
	    args = parser.parse_args()

	    # pdf_to_image reads the module-level POPPLER_PATH, so a command-line
	    # override has to replace that global.
	    global POPPLER_PATH
	    if args.poppler:
	        POPPLER_PATH = args.poppler

	    form_labels = {
	        "birth": "Form 102 — Certificate of Live Birth",
	        "death": "Form 103 — Certificate of Death",
	        "marriage": "Form 97 — Certificate of Marriage",
	        "marriage_license": "Form 90 — Application for Marriage License",
	    }
	    input_file = args.pdf or args.image

	    print("\nPhilippine Civil Registry OCR — Dynamic Field Extractor")
	    print("=" * 65)
	    print(f" Form : {form_labels[args.form]}")
	    print(f" File : {input_file}")
	    print(f" Checkpoint : {args.checkpoint}")

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f" Device : {device}\n")

	    # Check the input before the (slow) model load, and so that a missing PDF
	    # is not reported as a Poppler configuration problem further down.
	    if not os.path.exists(input_file):
	        print(f"ERROR: Input file not found: {input_file}")
	        sys.exit(1)

	    if not os.path.exists(args.checkpoint):
	        print(f"ERROR: Checkpoint not found: {args.checkpoint}")
	        sys.exit(1)

	    model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)

	    # Load image
	    if args.pdf:
	        print(f" Converting PDF to image at {args.dpi} DPI...")
	        try:
	            pil_img = pdf_to_image(args.pdf, dpi=args.dpi)
	            page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
	        except Exception as e:
	            print(f"\nERROR converting PDF: {e}")
	            print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
	            sys.exit(1)
	    else:
	        page_image = cv2.imread(args.image)
	        if page_image is None:
	            print(f"ERROR: Could not load image: {args.image}")
	            sys.exit(1)

	    h, w = page_image.shape[:2]
	    print(f" Page size : {w} x {h} px")

	    extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)

	    if args.visualize:
	        stem = Path(input_file).stem
	        out_path = stem + "_field_map.jpg"
	        # visualize() prints its own "Field map saved" line; printing it here
	        # as well produced the message twice.
	        extractor.visualize(page_image, output_path=out_path)

	    print("\n Detecting form boundary and extracting fields...")
	    crops = extractor.extract(page_image)
	    print(f" {len(crops)} field crops extracted")

	    print(f"\n Running CRNN OCR on {len(crops)} fields...")
	    results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)

	    print(f"\n{'─'*65}")
	    print(f" {'FIELD':<42} TEXT")
	    print(f"{'─'*65}")
	    for name, text in results.items():
	        print(f" {name:<42} {text if text.strip() else '(empty)'}")
	    print(f"{'─'*65}")

	    # run_crnn_ocr records failures as "[ERROR: ...]" strings; those must not
	    # be counted as recognized text.
	    failed = sum(1 for t in results.values() if t.startswith("[ERROR:"))
	    recognized = sum(
	        1 for t in results.values()
	        if t.strip() and not t.startswith("[ERROR:"))
	    print(f"\n Fields recognized : {recognized} / {len(results)}")
	    if failed:
	        print(f" Fields failed : {failed}")

	    if args.output:
	        with open(args.output, "w", encoding="utf-8") as f:
	            json.dump({"form": form_labels[args.form], "file": input_file,
	                       "fields": results}, f, ensure_ascii=False, indent=2)
	        print(f"\n Results saved -> {args.output}")
	    print()


	if __name__ == "__main__":
	    main()