"""
Philippine Civil Registry — Field Extractor (Dynamic)
======================================================
Automatically detects form borders on ANY scan/photo and aligns field
extraction to the detected boundary — no hardcoded pixel positions.

Field coordinates calibrated directly from official PDF renders at 200 DPI:
    Form 102 (Birth):    1700 x 2800 px
    Form 103 (Death):    1700 x 2878 px
    Form 97  (Marriage): 1700 x 2600 px
    Form 90  (License):  1700 x 2600 px

Usage:
    python field_extractor.py --pdf FORM_102.pdf --form birth
    python field_extractor.py --pdf FORM_97.pdf --form marriage --visualize
    python field_extractor.py --pdf FORM_103.pdf --form death --output results.json
    python field_extractor.py --image form102.png --form birth --visualize
    python field_extractor.py --pdf FORM_102.pdf --form birth --checkpoint checkpoints/best_model_emnist.pth

.env file (project root) — each team member sets their own:
    POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
"""
|
|
| import argparse |
| import os |
| import sys |
| import json |
| import cv2 |
| import numpy as np |
| from pathlib import Path |
|
|
| import torch |
| from dotenv import load_dotenv |
|
|
| |
# Directory holding this script; the .env file is looked up right beside it.
_script_dir = Path(__file__).parent.resolve()
_dotenv_file = _script_dir / ".env"
load_dotenv(dotenv_path=_dotenv_file)

# Poppler location handed to pdf2image by pdf_to_image(); None when the
# POPPLER_PATH environment variable is not set.
POPPLER_PATH = os.environ.get("POPPLER_PATH")

# Checkpoint used when --checkpoint is not supplied on the command line.
DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"
|
|
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
# Field boxes for the birth form (FORM_FIELDS key "birth").
#
# Each value is (x1, y1, x2, y2) expressed as a fraction of the detected form
# width (x) and height (y); DynamicFieldExtractor scales these to pixels.
# A box must satisfy x1 < x2 and y1 < y2, otherwise the extractor skips it.
BIRTH_FIELDS = {
    # Header: province, registry number, city / municipality
    "province": (0.02, 0.068, 0.30, 0.088),
    "registry_number": (0.66, 0.068, 0.99, 0.108),
    "city_municipality": (0.02, 0.090, 0.65, 0.108),

    # Child's name
    "child_first_name": (0.03, 0.109, 0.40, 0.141),
    "child_middle_name": (0.40, 0.109, 0.64, 0.141),
    "child_last_name": (0.64, 0.109, 0.99, 0.141),

    # Sex and date of birth.
    # The previous table had dob_day spanning 0.40-0.80 and dob_month as
    # (0.80, ..., 0.60, ...): an inverted box that was never cropped.  The
    # three date cells now tile 0.40-0.60-0.80-0.99 like the other rows.
    # NOTE(review): confirm the 0.60 boundary with --visualize on a real scan.
    "sex": (0.03, 0.142, 0.30, 0.167),
    "dob_day": (0.40, 0.142, 0.60, 0.167),
    "dob_month": (0.60, 0.142, 0.80, 0.167),
    "dob_year": (0.80, 0.142, 0.99, 0.167),

    # Place of birth
    "place_birth_hospital": (0.03, 0.169, 0.46, 0.197),
    "place_birth_city": (0.47, 0.169, 0.70, 0.199),
    "place_birth_province": (0.71, 0.169, 0.99, 0.199),

    # Mother
    "mother_first_name": (0.03, 0.248, 0.40, 0.276),
    "mother_middle_name": (0.40, 0.248, 0.64, 0.276),
    "mother_last_name": (0.64, 0.248, 0.99, 0.276),
    "mother_citizenship": (0.03, 0.277, 0.50, 0.305),

    # Father
    "father_first_name": (0.03, 0.380, 0.40, 0.410),
    "father_middle_name": (0.40, 0.380, 0.64, 0.410),
    "father_last_name": (0.64, 0.380, 0.99, 0.410),
    "father_citizenship": (0.03, 0.411, 0.28, 0.445),

    # Date and place of the parents' marriage
    "parents_marriage_month": (0.03, 0.496, 0.19, 0.526),
    "parents_marriage_day": (0.19, 0.496, 0.27, 0.526),
    "parents_marriage_year": (0.27, 0.496, 0.38, 0.526),
    "parents_marriage_city": (0.41, 0.496, 0.68, 0.526),
    "parents_marriage_province": (0.68, 0.496, 0.84, 0.526),
}
|
|
| |
# Field boxes for the death form (FORM_FIELDS key "death").
# Values are (x1, y1, x2, y2) as fractions of the detected form width/height.
DEATH_FIELDS = {
    # Header: province, registry number, city / municipality
    "province":             (0.04, 0.128, 0.40, 0.144),
    "registry_number":      (0.52, 0.128, 0.75, 0.144),
    "city_municipality":    (0.04, 0.145, 0.45, 0.160),

    # Name of the deceased
    "deceased_first_name":  (0.10, 0.162, 0.34, 0.178),
    "deceased_middle_name": (0.34, 0.162, 0.56, 0.178),
    "deceased_last_name":   (0.56, 0.162, 0.75, 0.178),

    # Sex and age
    "sex":                  (0.04, 0.182, 0.13, 0.220),
    "age_years":            (0.28, 0.182, 0.38, 0.202),

    # Place of death
    "place_death_hospital": (0.13, 0.224, 0.42, 0.242),
    "place_death_city":     (0.42, 0.224, 0.58, 0.242),
    "place_death_province": (0.58, 0.224, 0.75, 0.242),

    # Date of death and citizenship
    "dod_day":              (0.10, 0.252, 0.22, 0.268),
    "dod_month":            (0.22, 0.252, 0.38, 0.268),
    "dod_year":             (0.38, 0.252, 0.52, 0.268),
    "citizenship":          (0.52, 0.252, 0.75, 0.268),

    # Residence
    "residence_house":      (0.13, 0.278, 0.40, 0.294),
    "residence_city":       (0.40, 0.278, 0.56, 0.294),
    "residence_province":   (0.56, 0.278, 0.75, 0.294),

    # Civil status and occupation
    "civil_status":         (0.04, 0.302, 0.38, 0.360),
    "occupation":           (0.44, 0.302, 0.75, 0.360),

    # Causes of death
    "cause_immediate":      (0.18, 0.402, 0.58, 0.418),
    "cause_antecedent":     (0.18, 0.424, 0.58, 0.440),
    "cause_underlying":     (0.18, 0.446, 0.58, 0.462),
    "cause_other":          (0.18, 0.468, 0.58, 0.484),

    # Informant
    "informant_name":       (0.04, 0.808, 0.35, 0.822),
    "informant_address":    (0.04, 0.822, 0.35, 0.836),
    "informant_date":       (0.35, 0.836, 0.58, 0.850),
}
|
|
| |
| |
| |
| |
| |
# Field boxes for the marriage form (FORM_FIELDS key "marriage").
# Values are (x1, y1, x2, y2) as fractions of the detected form width/height.
# Husband fields and wife fields are listed as left/right pairs per row.
MARRIAGE_FIELDS = {
    # Header
    "registry_number":            (0.62, 0.088, 0.97, 0.104),

    # Names of husband and wife (first / middle / last on separate rows)
    "husband_first_name":         (0.23, 0.121, 0.56, 0.139),
    "husband_middle_name":        (0.23, 0.141, 0.56, 0.159),
    "husband_last_name":          (0.23, 0.160, 0.56, 0.178),
    "wife_first_name":            (0.65, 0.121, 0.98, 0.139),
    "wife_middle_name":           (0.65, 0.141, 0.98, 0.159),
    "wife_last_name":             (0.65, 0.160, 0.98, 0.178),

    # Age
    "husband_age":                (0.40, 0.198, 0.47, 0.216),
    "wife_age":                   (0.78, 0.198, 0.86, 0.216),

    # Citizenship
    "husband_citizenship":        (0.22, 0.252, 0.47, 0.270),
    "wife_citizenship":           (0.62, 0.252, 0.86, 0.270),

    # Name of father
    "husband_father_first":       (0.14, 0.396, 0.24, 0.414),
    "husband_father_middle":      (0.24, 0.396, 0.34, 0.414),
    "husband_father_last":        (0.34, 0.396, 0.47, 0.414),
    "wife_father_first":          (0.53, 0.396, 0.63, 0.414),
    "wife_father_middle":         (0.63, 0.396, 0.73, 0.414),
    "wife_father_last":           (0.73, 0.396, 0.86, 0.414),

    # Citizenship of father
    "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436),
    "wife_father_citizenship":    (0.53, 0.420, 0.86, 0.436),

    # Name of mother
    "husband_mother_first":       (0.14, 0.444, 0.24, 0.462),
    "husband_mother_middle":      (0.24, 0.444, 0.34, 0.462),
    "husband_mother_last":        (0.34, 0.444, 0.47, 0.462),
    "wife_mother_first":          (0.53, 0.444, 0.63, 0.462),
    "wife_mother_middle":         (0.63, 0.444, 0.73, 0.462),
    "wife_mother_last":           (0.73, 0.444, 0.86, 0.462),

    # Citizenship of mother
    "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484),
    "wife_mother_citizenship":    (0.53, 0.468, 0.86, 0.484),

    # Place and date of marriage
    "place_marriage_office":      (0.14, 0.596, 0.44, 0.614),
    "place_marriage_city":        (0.44, 0.596, 0.68, 0.614),
    "place_marriage_province":    (0.68, 0.596, 0.88, 0.614),
    "date_marriage_day":          (0.14, 0.630, 0.24, 0.648),
    "date_marriage_month":        (0.24, 0.630, 0.38, 0.648),
    "date_marriage_year":         (0.38, 0.630, 0.48, 0.648),
}
|
|
| |
# Field boxes for the marriage-license application (FORM_FIELDS key
# "marriage_license").  Values are (x1, y1, x2, y2) as fractions of the
# detected form width/height.  Groom boxes occupy the left half of each row
# (x up to 0.46) and bride boxes the right half (x from 0.54).
MARRIAGE_LICENSE_FIELDS = {
    # Header
    "province":                   (0.12, 0.092, 0.48, 0.108),
    "registry_number":            (0.56, 0.092, 0.97, 0.108),
    "city_municipality":          (0.12, 0.108, 0.48, 0.124),
    "received_by":                (0.12, 0.124, 0.48, 0.140),
    "date_of_receipt":            (0.12, 0.140, 0.48, 0.156),
    "marriage_license_number":    (0.56, 0.124, 0.97, 0.140),
    "date_of_issuance":           (0.56, 0.140, 0.97, 0.156),

    # Applicant names
    "groom_first_name":           (0.02, 0.278, 0.46, 0.294),
    "bride_first_name":           (0.54, 0.278, 0.97, 0.294),
    "groom_middle_name":          (0.02, 0.296, 0.46, 0.312),
    "bride_middle_name":          (0.54, 0.296, 0.97, 0.312),
    "groom_last_name":            (0.02, 0.314, 0.46, 0.330),
    "bride_last_name":            (0.54, 0.314, 0.97, 0.330),

    # Date of birth and age
    "groom_dob_day":              (0.02, 0.334, 0.12, 0.350),
    "groom_dob_month":            (0.12, 0.334, 0.24, 0.350),
    "groom_dob_year":             (0.24, 0.334, 0.34, 0.350),
    "groom_age":                  (0.34, 0.334, 0.46, 0.350),
    "bride_dob_day":              (0.54, 0.334, 0.62, 0.350),
    "bride_dob_month":            (0.62, 0.334, 0.74, 0.350),
    "bride_dob_year":             (0.74, 0.334, 0.84, 0.350),
    "bride_age":                  (0.84, 0.334, 0.97, 0.350),

    # Place of birth
    "groom_place_birth_city":     (0.02, 0.354, 0.18, 0.370),
    "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
    "groom_place_birth_country":  (0.32, 0.354, 0.46, 0.370),
    "bride_place_birth_city":     (0.54, 0.354, 0.70, 0.370),
    "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
    "bride_place_birth_country":  (0.84, 0.354, 0.97, 0.370),

    # Sex and citizenship
    "groom_sex":                  (0.02, 0.374, 0.16, 0.390),
    "groom_citizenship":          (0.16, 0.374, 0.46, 0.390),
    "bride_sex":                  (0.54, 0.374, 0.68, 0.390),
    "bride_citizenship":          (0.68, 0.374, 0.97, 0.390),

    # Residence
    "groom_residence":            (0.02, 0.394, 0.46, 0.412),
    "bride_residence":            (0.54, 0.394, 0.97, 0.412),

    # Religion
    "groom_religion":             (0.02, 0.424, 0.46, 0.440),
    "bride_religion":             (0.54, 0.424, 0.97, 0.440),

    # Civil status
    "groom_civil_status":         (0.02, 0.452, 0.46, 0.468),
    "bride_civil_status":         (0.54, 0.452, 0.97, 0.468),

    # Place of dissolution
    "groom_dissolution_city":     (0.02, 0.496, 0.16, 0.512),
    "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
    "groom_dissolution_country":  (0.30, 0.496, 0.46, 0.512),
    "bride_dissolution_city":     (0.54, 0.496, 0.68, 0.512),
    "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
    "bride_dissolution_country":  (0.82, 0.496, 0.97, 0.512),

    # Date of dissolution
    "groom_dissolution_day":      (0.02, 0.520, 0.12, 0.536),
    "groom_dissolution_month":    (0.12, 0.520, 0.24, 0.536),
    "groom_dissolution_year":     (0.24, 0.520, 0.34, 0.536),
    "bride_dissolution_day":      (0.54, 0.520, 0.62, 0.536),
    "bride_dissolution_month":    (0.62, 0.520, 0.74, 0.536),
    "bride_dissolution_year":     (0.74, 0.520, 0.84, 0.536),

    # Father's name
    "groom_father_first":         (0.02, 0.594, 0.16, 0.610),
    "groom_father_middle":        (0.16, 0.594, 0.28, 0.610),
    "groom_father_last":          (0.28, 0.594, 0.46, 0.610),
    "bride_father_first":         (0.54, 0.594, 0.66, 0.610),
    "bride_father_middle":        (0.66, 0.594, 0.78, 0.610),
    "bride_father_last":          (0.78, 0.594, 0.97, 0.610),

    # Father's citizenship
    "groom_father_citizenship":   (0.02, 0.620, 0.46, 0.636),
    "bride_father_citizenship":   (0.54, 0.620, 0.97, 0.636),

    # Father's residence
    "groom_father_residence":     (0.02, 0.644, 0.46, 0.660),
    "bride_father_residence":     (0.54, 0.644, 0.97, 0.660),

    # Mother's name
    "groom_mother_first":         (0.02, 0.674, 0.16, 0.690),
    "groom_mother_middle":        (0.16, 0.674, 0.28, 0.690),
    "groom_mother_last":          (0.28, 0.674, 0.46, 0.690),
    "bride_mother_first":         (0.54, 0.674, 0.66, 0.690),
    "bride_mother_middle":        (0.66, 0.674, 0.78, 0.690),
    "bride_mother_last":          (0.78, 0.674, 0.97, 0.690),

    # Mother's citizenship
    "groom_mother_citizenship":   (0.02, 0.696, 0.46, 0.712),
    "bride_mother_citizenship":   (0.54, 0.696, 0.97, 0.712),

    # Mother's residence
    "groom_mother_residence":     (0.02, 0.720, 0.46, 0.736),
    "bride_mother_residence":     (0.54, 0.720, 0.97, 0.736),
}
|
|
# Maps a form-type key (the values accepted by --form) to its field-box table.
FORM_FIELDS = dict(
    birth=BIRTH_FIELDS,
    death=DEATH_FIELDS,
    marriage=MARRIAGE_FIELDS,
    marriage_license=MARRIAGE_LICENSE_FIELDS,
)
|
|
# Colour cycle for the field overlay drawn by DynamicFieldExtractor.visualize();
# each triple is handed to OpenCV drawing calls on a BGR image.
COLOURS = [
    (0, 200, 0),
    (0, 150, 255),
    (200, 0, 200),
    (0, 200, 200),
    (200, 200, 0),
    (220, 20, 60),
    (255, 140, 0),
    (150, 50, 200),
    (0, 160, 80),
    (30, 144, 255),
    (255, 20, 147),
    (100, 200, 100),
]
|
|
|
|
| |
| |
| |
| |
| |
|
|
class FormBoundsDetector:
    """Find the outer border of a form from its long horizontal/vertical rules."""

    def __init__(self, verbose=False):
        # When True, detection results and errors are printed to stdout.
        self.verbose = verbose

    def detect(self, image_bgr):
        """Return the form boundary as ``(left, top, right, bottom)`` in pixels.

        Falls back to the whole image, ``(0, 0, width, height)``, when the
        line-based detection returns nothing.
        """
        height, width = image_bgr.shape[:2]
        grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        found = self._detect_by_lines(grayscale, width, height)
        if found is not None:
            if self.verbose:
                print(f" [Bounds] Detected: {found}")
            return found
        if self.verbose:
            # NOTE(review): the "β" looks like a mis-encoded dash; it is runtime
            # output, so it is reproduced unchanged here.
            print(" [Bounds] Line detection failed β using full image")
        return (0, 0, width, height)

    def _detect_by_lines(self, gray, w, h):
        """Return the extent of the detected rules, or None if unusable.

        Any exception raised along the way is reported (when verbose) and
        turned into a None result so the caller can fall back.
        """
        try:
            binary = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV, 11, 2)

            # Morphological opening with a long, 1-pixel-thick kernel keeps
            # only strokes at least that long in the given direction.
            horizontal_kernel = cv2.getStructuringElement(
                cv2.MORPH_RECT, (max(w // 5, 10), 1))
            vertical_kernel = cv2.getStructuringElement(
                cv2.MORPH_RECT, (1, max(h // 5, 10)))
            horizontal_mask = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)
            vertical_mask = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)

            # The masks hold 0/255 values, so these sums are 255 x pixel count.
            row_totals = horizontal_mask.sum(axis=1)
            col_totals = vertical_mask.sum(axis=0)
            line_rows = np.where(row_totals > w * 0.15)[0]
            line_cols = np.where(col_totals > h * 0.08)[0]
            if line_rows.size == 0 or line_cols.size == 0:
                return None

            # np.where yields ascending indices: first is the min, last the max.
            top, bottom = int(line_rows[0]), int(line_rows[-1])
            left, right = int(line_cols[0]), int(line_cols[-1])

            # Accept the box only if it spans at least 40% of each dimension.
            wide_enough = (right - left) >= w * 0.4
            tall_enough = (bottom - top) >= h * 0.4
            if wide_enough and tall_enough:
                return (left, top, right, bottom)
            return None
        except Exception as e:
            if self.verbose:
                print(f" [Bounds error] {e}")
            return None
|
|
|
|
| |
| |
| |
| |
| |
|
|
class DynamicFieldExtractor:
    """Crop (or draw) a form's fields relative to its detected boundary."""

    def __init__(self, form_type="birth", verbose=False):
        self.verbose = verbose
        self.form_type = form_type.lower()
        # Unknown form types fall back to the birth layout.
        self.field_map = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
        self.detector = FormBoundsDetector(verbose=verbose)
        # Boundary found by the most recent extract() call, or None before it.
        self._last_bounds = None

    def _to_bgr(self, image):
        """Convert a PIL image or numpy array to a 3-channel BGR array.

        Raises TypeError for anything that is neither.
        """
        try:
            from PIL import Image as PILImage
            if isinstance(image, PILImage.Image):
                rgb = np.array(image.convert("RGB"))
                return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        except ImportError:
            # Pillow is optional; without it only numpy input is supported.
            pass
        if not isinstance(image, np.ndarray):
            raise TypeError(f"Unsupported image type: {type(image)}")
        if len(image.shape) == 2:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        if image.shape[2] == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
        return image

    @staticmethod
    def _pixel_box(rel_box, bounds, img_w, img_h):
        """Scale a relative (x1, y1, x2, y2) box to pixels, clamped to the image."""
        rx1, ry1, rx2, ry2 = rel_box
        left, top, right, bottom = bounds
        form_w = right - left
        form_h = bottom - top

        def clamp(value, upper):
            return max(0, min(int(value), upper))

        return (
            clamp(left + rx1 * form_w, img_w - 1),
            clamp(top + ry1 * form_h, img_h - 1),
            clamp(left + rx2 * form_w, img_w - 1),
            clamp(top + ry2 * form_h, img_h - 1),
        )

    def extract(self, image):
        """Return ``{field_name: BGR crop}`` for every non-empty field box.

        The detected boundary is remembered in ``_last_bounds``.  Boxes that
        collapse to zero (or negative) width or height are left out.
        """
        bgr = self._to_bgr(image)
        h, w = bgr.shape[:2]
        left, top, right, bottom = self.detector.detect(bgr)
        bounds = (left, top, right, bottom)
        self._last_bounds = bounds
        if self.verbose:
            form_w = right - left
            form_h = bottom - top
            print(f" [Extract] Image={w}x{h} "
                  f" Form={form_w}x{form_h} @ ({left},{top})-({right},{bottom})")
        crops = {}
        for name, rel_box in self.field_map.items():
            x1, y1, x2, y2 = self._pixel_box(rel_box, bounds, w, h)
            if x2 <= x1 or y2 <= y1:
                continue
            crops[name] = bgr[y1:y2, x1:x2]
        return crops

    def visualize(self, image, output_path=None):
        """Draw the detected boundary and all field boxes; return the BGR overlay.

        When ``output_path`` is given the overlay is also written there and a
        confirmation line is printed.
        """
        bgr = self._to_bgr(image)
        canvas = bgr.copy()
        h, w = canvas.shape[:2]

        # Run extraction only for its side effect of refreshing _last_bounds.
        self.extract(bgr)
        bounds = self._last_bounds
        left, top, right, bottom = bounds

        boundary_colour = (0, 140, 255)
        cv2.rectangle(canvas, (left, top), (right, bottom), boundary_colour, 3)
        cv2.putText(canvas, "DETECTED FORM BOUNDARY",
                    (left, max(0, top - 8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, boundary_colour, 1)

        for position, (name, rel_box) in enumerate(self.field_map.items()):
            x1, y1, x2, y2 = self._pixel_box(rel_box, bounds, w, h)
            colour = COLOURS[position % len(COLOURS)]
            cv2.rectangle(canvas, (x1, y1), (x2, y2), colour, 2)
            cv2.putText(canvas, name[:22], (x1 + 2, max(0, y1 - 2)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, colour, 1)

        if output_path:
            cv2.imwrite(str(output_path), canvas)
            print(f" Field map saved -> {output_path}")
        return canvas
|
|
|
|
| |
| |
| |
|
|
class FieldNormalizer:
    """Turn a raw field crop into a fixed-size binarized image (and tensor).

    Pipeline used by ``normalize``: grayscale -> denoise -> crop to the inked
    region -> aspect-preserving resize onto a white canvas -> binarize.
    """

    def __init__(self, target_height=64, target_width=512):
        self.H = target_height
        self.W = target_width

    def _crop_to_text(self, gray):
        """Trim ``gray`` to the bounding box of its dark pixels plus padding.

        Pixels darker than 235 (inverted value above 20) count as ink.  The
        input is returned unchanged when no such pixel exists.
        """
        inv = cv2.bitwise_not(gray)
        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
        coords = np.column_stack(np.where(thresh > 0))
        if len(coords) == 0:
            return gray
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)
        # Padding is 15% of the ink height, but never less than 4 pixels.
        pad = max(4, int((y_max - y_min) * 0.15))
        y_min = max(0, y_min - pad)
        x_min = max(0, x_min - pad)
        y_max = min(gray.shape[0] - 1, y_max + pad)
        x_max = min(gray.shape[1] - 1, x_max + pad)
        return gray[y_min:y_max + 1, x_min:x_max + 1]

    def _smart_resize(self, gray):
        """Resize ``gray`` to fit (H, W) keeping aspect ratio, centred on white.

        An empty input yields an all-white canvas.
        """
        h, w = gray.shape
        if h == 0 or w == 0:
            return np.ones((self.H, self.W), dtype=np.uint8) * 255
        # Fit to the target height first; switch to fitting the width when the
        # result would overflow the canvas horizontally.
        scale = self.H / h
        new_w = max(1, int(w * scale))
        new_h = self.H
        if new_w > self.W:
            scale = self.W / w
            new_h = max(1, int(h * scale))
            new_w = self.W
        # The max(1, ...) clamps above keep extreme aspect ratios from rounding
        # a side down to 0, which cv2.resize rejects.
        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
        y_off = (self.H - new_h) // 2
        x_off = (self.W - new_w) // 2
        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
        return canvas

    def _binarize(self, img):
        """Binarize with Otsu; use adaptive thresholding when Otsu looks degenerate.

        "Degenerate" means less than 30% or more than 97% of the Otsu output
        is white.
        """
        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        white_ratio = np.mean(otsu == 255)
        if white_ratio < 0.30 or white_ratio > 0.97:
            return cv2.adaptiveThreshold(
                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
        return otsu

    def normalize(self, crop) -> np.ndarray:
        """Accept BGR numpy array or PIL image, return normalized binary array."""
        try:
            from PIL import Image as PILImage
            if isinstance(crop, PILImage.Image):
                crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
        except ImportError:
            # Pillow is optional; without it only numpy input is supported.
            pass
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = self._crop_to_text(gray)
        gray = self._smart_resize(gray)
        return self._binarize(gray)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        """Scale ``img`` from 0-255 to 0-1 and prepend two singleton dimensions.

        For a 2-D (H, W) input the result has shape (1, 1, H, W).
        """
        return torch.FloatTensor(
            img.astype(np.float32) / 255.0
        ).unsqueeze(0).unsqueeze(0)
|
|
|
|
| |
| |
| |
|
|
def load_crnn_model(checkpoint_path: str, device: torch.device):
    """Load a CRNN checkpoint and return the model ready for inference.

    Parameters
    ----------
    checkpoint_path : str
        Path to a checkpoint containing ``model_state_dict`` and
        ``idx_to_char``; ``config``, ``val_cer`` and ``val_loss`` are optional.
    device : torch.device
        Device the weights are mapped to and the model is moved to.

    Returns
    -------
    tuple
        ``(model, idx_to_char, img_height, img_width)``; the two sizes come
        from the checkpoint config and default to 64 and 512.

    Raises
    ------
    KeyError
        If the checkpoint lacks ``idx_to_char`` or ``model_state_dict``.
    """
    # crnn_model.py is expected to sit next to this script.
    sys.path.insert(0, str(Path(__file__).parent))
    from crnn_model import get_crnn_model

    print(f" Loading CRNN model from: {checkpoint_path}")
    # SECURITY: weights_only=False unpickles arbitrary objects and can execute
    # code from the file - only load checkpoints from a trusted source.
    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
    config = c.get("config", {})
    idx_to_char = c["idx_to_char"]
    state_dict = c["model_state_dict"]
    # The output layer's row count is the size of the character set.
    num_chars = state_dict["fc.weight"].shape[0]

    model = get_crnn_model(
        model_type=config.get("model_type", "standard"),
        img_height=config.get("img_height", 64),
        num_chars=num_chars,
        hidden_size=config.get("hidden_size", 128),
        num_lstm_layers=config.get("num_lstm_layers", 1),
    ).to(device)
    model.load_state_dict(state_dict)
    model.eval()

    # Compare against None explicitly: a metric of exactly 0.0 is a valid
    # value and must still be reported.
    val_cer = c.get("val_cer")
    val_loss = c.get("val_loss")
    if val_cer is not None:
        metric = f"val_cer={val_cer:.2f}%"
    elif val_loss is not None:
        metric = f"val_loss={val_loss:.4f}"
    else:
        metric = "no metric"
    print(f" Model loaded | {metric} | chars={num_chars}")
    return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)
|
|
|
|
| |
| |
| |
|
|
def greedy_decode(outputs: torch.Tensor, idx_to_char: dict) -> str:
    """Greedy-decode the first sequence of a batch of per-step class scores.

    The best class is taken along dimension 2 and the sequence at index 0 of
    dimension 1 is decoded: consecutive repeats are collapsed, index 0 is
    dropped as the blank, and indices missing from ``idx_to_char`` are skipped.
    """
    best_path = torch.argmax(outputs, dim=2)[:, 0].tolist()
    # Pair every step with the step before it (-1 stands in before the first).
    preceding = [-1] + best_path[:-1]
    return "".join(
        idx_to_char[current]
        for current, previous in zip(best_path, preceding)
        if current != 0 and current != previous and current in idx_to_char
    )
|
|
|
|
| |
| |
| |
|
|
def pdf_to_image(pdf_path: str, dpi: int = 200):
    """Render page 1 of ``pdf_path`` at ``dpi`` with pdf2image and return it.

    The module-level ``POPPLER_PATH`` is forwarded as ``poppler_path`` when it
    is set.
    """
    from pdf2image import convert_from_path

    resolved_pdf = str(Path(pdf_path).resolve())
    options = dict(dpi=dpi, first_page=1, last_page=1)
    if POPPLER_PATH:
        options["poppler_path"] = str(Path(POPPLER_PATH).resolve())
    pages = convert_from_path(resolved_pdf, **options)
    return pages[0]
|
|
|
|
| |
| |
| |
|
|
def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
                 img_h: int, img_w: int, device: torch.device) -> dict:
    """Recognise the text of every crop and return ``{field_name: text}``.

    A field whose processing raises is not fatal: its entry becomes the string
    ``"[ERROR: <message>]"`` and the remaining fields are still processed.
    """
    normalizer = FieldNormalizer(target_height=img_h, target_width=img_w)

    def recognise(crop):
        try:
            prepared = normalizer.normalize(crop)
            batch = normalizer.to_tensor(prepared).to(device)
            scores = model(batch).cpu()
            return greedy_decode(scores, idx_to_char)
        except Exception as e:
            return f"[ERROR: {e}]"

    # Inference only, so gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        return {name: recognise(crop) for name, crop in crops.items()}
|
|
|
|
| |
| |
| |
|
|
def extract_field_images(image, form_type="birth", verbose=False):
    """Crop all fields of a form image using dynamic boundary detection.

    Parameters
    ----------
    image : PIL Image or BGR numpy array
    form_type : str
        One of 'birth', 'death', 'marriage', 'marriage_license'.
    verbose : bool
        Forwarded to the extractor to enable its progress output.

    Returns
    -------
    dict
        {field_name: BGR numpy array}
    """
    extractor = DynamicFieldExtractor(form_type=form_type, verbose=verbose)
    field_crops = extractor.extract(image)
    return field_crops


# Second public name bound to the same function.
extract_field_images_dynamic = extract_field_images
|
|
|
|
| |
| |
| |
|
|
def main():
    """Command-line entry point.

    Parses the arguments, loads the CRNN checkpoint, loads the page (first PDF
    page or an image file), optionally saves an annotated field map, runs OCR
    on every field crop, prints a result table and optionally writes JSON.
    Exits with status 1 when the checkpoint or the input cannot be loaded.
    """
    parser = argparse.ArgumentParser(
        description="PH Civil Registry Field Extractor — Dynamic CRNN OCR")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pdf", help="Path to scanned PDF")
    group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
    parser.add_argument("--form", required=True,
                        choices=["birth", "death", "marriage", "marriage_license"])
    parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
    parser.add_argument("--visualize", action="store_true",
                        help="Save annotated field-map image")
    parser.add_argument("--output", default=None,
                        help="Save extracted fields to JSON")
    parser.add_argument("--poppler", default=None,
                        help="Override Poppler bin path (overrides .env)")
    parser.add_argument("--dpi", type=int, default=200)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # pdf_to_image() reads the module-level POPPLER_PATH, so a command-line
    # override has to replace the global.
    global POPPLER_PATH
    if args.poppler:
        POPPLER_PATH = args.poppler

    form_labels = {
        "birth": "Form 102 — Certificate of Live Birth",
        "death": "Form 103 — Certificate of Death",
        "marriage": "Form 97 — Certificate of Marriage",
        "marriage_license": "Form 90 — Application for Marriage License",
    }
    input_file = args.pdf or args.image

    print("\nPhilippine Civil Registry OCR — Dynamic Field Extractor")
    print("=" * 65)
    print(f" Form : {form_labels[args.form]}")
    print(f" File : {input_file}")
    print(f" Checkpoint : {args.checkpoint}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device : {device}\n")

    if not os.path.exists(args.checkpoint):
        print(f"ERROR: Checkpoint not found: {args.checkpoint}")
        sys.exit(1)

    model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)

    # Load the page as a BGR array, from the PDF's first page or an image file.
    if args.pdf:
        print(f" Converting PDF to image at {args.dpi} DPI...")
        try:
            pil_img = pdf_to_image(args.pdf, dpi=args.dpi)
            page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
        except Exception as e:
            print(f"\nERROR converting PDF: {e}")
            print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
            sys.exit(1)
    else:
        page_image = cv2.imread(args.image)
        if page_image is None:
            print(f"ERROR: Could not load image: {args.image}")
            sys.exit(1)

    h, w = page_image.shape[:2]
    print(f" Page size : {w} x {h} px")

    extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)

    if args.visualize:
        out_path = Path(input_file).stem + "_field_map.jpg"
        # visualize() prints the "Field map saved" line itself, so it is not
        # repeated here.
        extractor.visualize(page_image, output_path=out_path)

    print("\n Detecting form boundary and extracting fields...")
    crops = extractor.extract(page_image)
    print(f" {len(crops)} field crops extracted")

    print(f"\n Running CRNN OCR on {len(crops)} fields...")
    results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)

    rule = "-" * 65
    print(f"\n{rule}")
    print(f" {'FIELD':<42} TEXT")
    print(rule)
    for name, text in results.items():
        print(f" {name:<42} {text if text.strip() else '(empty)'}")
    print(rule)

    # run_crnn_ocr reports a failed field as an "[ERROR: ...]" string; such
    # entries are not recognised text and must not inflate the count.
    recognized = sum(
        1 for text in results.values()
        if text.strip() and not text.startswith("[ERROR:")
    )
    print(f"\n Fields recognized : {recognized} / {len(results)}")

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump({"form": form_labels[args.form], "file": input_file,
                       "fields": results}, f, ensure_ascii=False, indent=2)
        print(f"\n Results saved -> {args.output}")
    print()


if __name__ == "__main__":
    main()