"""
Philippine Civil Registry — Field Extractor (Dynamic)
======================================================
Automatically detects form borders on ANY scan/photo and aligns field
extraction to the detected boundary — no hardcoded pixel positions.

Field coordinates calibrated directly from official PDF renders at 200 DPI:
  Form 102 (Birth):    1700 x 2800 px
  Form 103 (Death):    1700 x 2878 px
  Form 97  (Marriage): 1700 x 2600 px
  Form 90  (License):  1700 x 2600 px

Usage:
    python field_extractor.py --pdf  FORM_102.pdf --form birth
    python field_extractor.py --pdf  FORM_97.pdf  --form marriage --visualize
    python field_extractor.py --pdf  FORM_103.pdf --form death    --output results.json
    python field_extractor.py --image form102.png --form birth    --visualize
    python field_extractor.py --pdf  FORM_102.pdf --form birth    --checkpoint checkpoints/best_model_emnist.pth

.env file (project root) — each team member sets their own:
    POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
"""

import argparse
import os
import sys
import json
import cv2
import numpy as np
from pathlib import Path

import torch
from dotenv import load_dotenv

# Load .env from same folder as this script (works regardless of cwd)
# NOTE: this runs at import time, so importing the module also populates
# os.environ from that .env file.
_script_dir = Path(__file__).parent.resolve()
load_dotenv(dotenv_path=_script_dir / ".env")

# Poppler path — from .env or None (Linux/Mac auto-detects)
# main() overwrites this global when --poppler is given; pdf_to_image() reads it.
POPPLER_PATH = os.environ.get("POPPLER_PATH", None)
# Default for --checkpoint. It is a relative path string, so main()'s
# os.path.exists() check resolves it against the current working directory,
# not against the script directory.
DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"


# ══════════════════════════════════════════════════════════════════════════════
#  FIELD RATIO MAPS
#  Format: field_name: (x1, y1, x2, y2) — ratios 0.0–1.0
#  Coordinates are relative to the DETECTED FORM BOUNDARY (not full image).
#  x = left→right,  y = top→bottom
# ══════════════════════════════════════════════════════════════════════════════

# Form 102 → Certificate of Live Birth (Form 1A)
BIRTH_FIELDS = {
    # Header
    "province":                   (0.02, 0.068, 0.30, 0.088),
    "registry_number":            (0.66, 0.068, 0.99, 0.108),
    "city_municipality":          (0.02, 0.090, 0.65, 0.108),

    # Item 1 — Child Name
    "child_first_name":           (0.03, 0.109, 0.40, 0.141),
    "child_middle_name":          (0.40, 0.109, 0.64, 0.141),
    "child_last_name":            (0.64, 0.109, 0.99, 0.141),

    # Items 2-3 — Sex / Date of Birth
    "sex":                        (0.03, 0.142, 0.30, 0.167),
    "dob_day":                    (0.40, 0.142, 0.80, 0.167),
    "dob_month":                  (0.80, 0.142, 0.60, 0.167),
    "dob_year":                   (0.80, 0.142, 0.99, 0.167),

    # Item 4 — Place of Birth
    "place_birth_hospital":       (0.03, 0.169, 0.46, 0.197),
    "place_birth_city":           (0.47, 0.169, 0.70, 0.199),
    "place_birth_province":       (0.71, 0.169, 0.99, 0.199),

  
    # Mother section
    "mother_first_name":          (0.03, 0.248, 0.40, 0.276),
    "mother_middle_name":         (0.40, 0.248, 0.64, 0.276),
    "mother_last_name":           (0.64, 0.248, 0.99, 0.276),
    "mother_citizenship":         (0.03, 0.277, 0.50, 0.305),
    

    # Father section
    "father_first_name":          (0.03, 0.380, 0.40, 0.410),
    "father_middle_name":         (0.40, 0.380, 0.64, 0.410),
    "father_last_name":           (0.64, 0.380, 0.99, 0.410),
    "father_citizenship":         (0.03, 0.411, 0.28, 0.445),
   

    # Item 20 — Marriage of Parents
    "parents_marriage_month":     (0.03, 0.496, 0.19, 0.526),
    "parents_marriage_day":       (0.19, 0.496, 0.27, 0.526),
    "parents_marriage_year":      (0.27, 0.496, 0.38, 0.526),
    
    "parents_marriage_city":      (0.41, 0.496, 0.68, 0.526),
    "parents_marriage_province":  (0.68, 0.496, 0.84, 0.526),

   
}

# Form 103 → Certificate of Death (Form 2A)
# Each entry is (x1, y1, x2, y2) as a fraction of the detected form boundary
# (x left→right, y top→bottom). Insertion order is the order in which fields
# are cropped, coloured in the field map, and printed.
DEATH_FIELDS = {
    # Header
    "province":                   (0.04, 0.128, 0.40, 0.144),
    "registry_number":            (0.52, 0.128, 0.75, 0.144),
    "city_municipality":          (0.04, 0.145, 0.45, 0.160),

    # Item 1 — Name
    "deceased_first_name":        (0.10, 0.162, 0.34, 0.178),
    "deceased_middle_name":       (0.34, 0.162, 0.56, 0.178),
    "deceased_last_name":         (0.56, 0.162, 0.75, 0.178),

    # Items 2-4 — Sex / Religion / Age
    # (only sex and age are mapped here; there is no religion entry)
    "sex":                        (0.04, 0.182, 0.13, 0.220),
    "age_years":                  (0.28, 0.182, 0.38, 0.202),

    # Item 5 — Place of Death
    "place_death_hospital":       (0.13, 0.224, 0.42, 0.242),
    "place_death_city":           (0.42, 0.224, 0.58, 0.242),
    "place_death_province":       (0.58, 0.224, 0.75, 0.242),

    # Items 6-7 — Date of Death / Citizenship
    "dod_day":                    (0.10, 0.252, 0.22, 0.268),
    "dod_month":                  (0.22, 0.252, 0.38, 0.268),
    "dod_year":                   (0.38, 0.252, 0.52, 0.268),
    "citizenship":                (0.52, 0.252, 0.75, 0.268),

    # Item 8 — Residence
    "residence_house":            (0.13, 0.278, 0.40, 0.294),
    "residence_city":             (0.40, 0.278, 0.56, 0.294),
    "residence_province":         (0.56, 0.278, 0.75, 0.294),

    # Items 9-10 — Civil Status / Occupation
    "civil_status":               (0.04, 0.302, 0.38, 0.360),
    "occupation":                 (0.44, 0.302, 0.75, 0.360),

    # Item 17 — Causes of Death
    "cause_immediate":            (0.18, 0.402, 0.58, 0.418),
    "cause_antecedent":           (0.18, 0.424, 0.58, 0.440),
    "cause_underlying":           (0.18, 0.446, 0.58, 0.462),
    "cause_other":                (0.18, 0.468, 0.58, 0.484),

    # Item 25 — Informant
    # NOTE(review): informant_date starts at x=0.35 / y=0.836, i.e. to the right
    # of and below the name/address boxes — confirm this matches the form layout.
    "informant_name":             (0.04, 0.808, 0.35, 0.822),
    "informant_address":          (0.04, 0.822, 0.35, 0.836),
    "informant_date":             (0.35, 0.836, 0.58, 0.850),
}

# Form 97 → Certificate of Marriage (Form 3A)
# Only the fields that flow through bridge.py → spaCy NER → SpouseOutput/Form3A.
# Removed: province, city_municipality, dob_day/month/year (×2),
#   place_birth_city/prov/country (×2), sex (×2), residence (×2),
#   religion (×2), civil_status (×2).
# Each entry is (x1, y1, x2, y2) as a fraction of the detected form boundary
# (x left→right, y top→bottom). Husband fields sit in the left column, wife
# fields in the right column.
MARRIAGE_FIELDS = {
    # ── Header ───────────────────────────────────────────────────────────────
    "registry_number":            (0.62, 0.088, 0.97, 0.104),  # → Form3A.registry_number
    
    #"registry_number":            (0.62, 0.088, 0.97, 0.104),  # → Form3A.registry_number

    # ── Item 1 — Name (HUSBAND left / WIFE right) ────────────────────────────
    # NOTE(review): these six rows use x=0.23–0.56 / 0.65–0.98, whereas every
    # other row below still uses x=0.14–0.47 / 0.53–0.86 (the same columns as
    # the commented-out values kept underneath). It looks like only the name
    # rows were recalibrated — confirm whether the remaining rows need the
    # same shift.
    "husband_first_name":         (0.23, 0.121, 0.56, 0.139),
    "husband_middle_name":        (0.23, 0.141, 0.56, 0.159),
    "husband_last_name":          (0.23, 0.160, 0.56, 0.178),
    "wife_first_name":            (0.65, 0.121, 0.98, 0.139),
    "wife_middle_name":           (0.65, 0.141, 0.98, 0.159),
    "wife_last_name":             (0.65, 0.160, 0.98, 0.178),

   # Earlier values for the name rows, kept for reference:
   # "husband_first_name":         (0.14, 0.138, 0.47, 0.156),
   # "husband_middle_name":        (0.14, 0.156, 0.47, 0.174),
   # "husband_last_name":          (0.14, 0.174, 0.47, 0.192),
   # "wife_first_name":            (0.53, 0.138, 0.86, 0.156),
   # "wife_middle_name":           (0.53, 0.156, 0.86, 0.174),
   # "wife_last_name":             (0.53, 0.174, 0.86, 0.192),

    # ── Item 2b — Age ────────────────────────────────────────────────────────
    "husband_age":                (0.40, 0.198, 0.47, 0.216),  # → husband.age
    "wife_age":                   (0.78, 0.198, 0.86, 0.216),  # → wife.age

    # ── Item 4b — Citizenship ────────────────────────────────────────────────
    "husband_citizenship":        (0.22, 0.252, 0.47, 0.270),  # → husband.nationality
    "wife_citizenship":           (0.62, 0.252, 0.86, 0.270),  # → wife.nationality

    # ── Item 8 — Name of Father ──────────────────────────────────────────────
    "husband_father_first":       (0.14, 0.396, 0.24, 0.414),
    "husband_father_middle":      (0.24, 0.396, 0.34, 0.414),
    "husband_father_last":        (0.34, 0.396, 0.47, 0.414),
    "wife_father_first":          (0.53, 0.396, 0.63, 0.414),
    "wife_father_middle":         (0.63, 0.396, 0.73, 0.414),
    "wife_father_last":           (0.73, 0.396, 0.86, 0.414),

    # ── Item 9 — Citizenship of Father ──────────────────────────────────────
    "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436),  # → husband.nationality_of_father
    "wife_father_citizenship":    (0.53, 0.420, 0.86, 0.436),  # → wife.nationality_of_father

    # ── Item 10 — Name of Mother ─────────────────────────────────────────────
    "husband_mother_first":       (0.14, 0.444, 0.24, 0.462),
    "husband_mother_middle":      (0.24, 0.444, 0.34, 0.462),
    "husband_mother_last":        (0.34, 0.444, 0.47, 0.462),
    "wife_mother_first":          (0.53, 0.444, 0.63, 0.462),
    "wife_mother_middle":         (0.63, 0.444, 0.73, 0.462),
    "wife_mother_last":           (0.73, 0.444, 0.86, 0.462),

    # ── Item 11 — Citizenship of Mother ─────────────────────────────────────
    "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484),  # → husband.nationality_of_mother
    "wife_mother_citizenship":    (0.53, 0.468, 0.86, 0.484),  # → wife.nationality_of_mother

    # ── Items 15–16 — Place / Date of Marriage ───────────────────────────────
    "place_marriage_office":      (0.14, 0.596, 0.44, 0.614),
    "place_marriage_city":        (0.44, 0.596, 0.68, 0.614),
    "place_marriage_province":    (0.68, 0.596, 0.88, 0.614),
    "date_marriage_day":          (0.14, 0.630, 0.24, 0.648),
    "date_marriage_month":        (0.24, 0.630, 0.38, 0.648),
    "date_marriage_year":         (0.38, 0.630, 0.48, 0.648),
}

# Form 90 → Application for Marriage License
# Each entry is (x1, y1, x2, y2) as a fraction of the detected form boundary
# (x left→right, y top→bottom). Groom fields occupy x=0.02–0.46 and bride
# fields x=0.54–0.97; within each item the rows are listed groom first.
MARRIAGE_LICENSE_FIELDS = {
    # Header
    "province":                   (0.12, 0.092, 0.48, 0.108),
    "registry_number":            (0.56, 0.092, 0.97, 0.108),
    "city_municipality":          (0.12, 0.108, 0.48, 0.124),
    "received_by":                (0.12, 0.124, 0.48, 0.140),
    "date_of_receipt":            (0.12, 0.140, 0.48, 0.156),
    "marriage_license_number":    (0.56, 0.124, 0.97, 0.140),
    "date_of_issuance":           (0.56, 0.140, 0.97, 0.156),

    # Item 1 — Name of Applicant (GROOM left / BRIDE right)
    "groom_first_name":           (0.02, 0.278, 0.46, 0.294),
    "bride_first_name":           (0.54, 0.278, 0.97, 0.294),
    "groom_middle_name":          (0.02, 0.296, 0.46, 0.312),
    "bride_middle_name":          (0.54, 0.296, 0.97, 0.312),
    "groom_last_name":            (0.02, 0.314, 0.46, 0.330),
    "bride_last_name":            (0.54, 0.314, 0.97, 0.330),

    # Item 2 — Date of Birth / Age
    "groom_dob_day":              (0.02, 0.334, 0.12, 0.350),
    "groom_dob_month":            (0.12, 0.334, 0.24, 0.350),
    "groom_dob_year":             (0.24, 0.334, 0.34, 0.350),
    "groom_age":                  (0.34, 0.334, 0.46, 0.350),
    "bride_dob_day":              (0.54, 0.334, 0.62, 0.350),
    "bride_dob_month":            (0.62, 0.334, 0.74, 0.350),
    "bride_dob_year":             (0.74, 0.334, 0.84, 0.350),
    "bride_age":                  (0.84, 0.334, 0.97, 0.350),

    # Item 3 — Place of Birth
    "groom_place_birth_city":     (0.02, 0.354, 0.18, 0.370),
    "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
    "groom_place_birth_country":  (0.32, 0.354, 0.46, 0.370),
    "bride_place_birth_city":     (0.54, 0.354, 0.70, 0.370),
    "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
    "bride_place_birth_country":  (0.84, 0.354, 0.97, 0.370),

    # Item 4 — Sex / Citizenship
    "groom_sex":                  (0.02, 0.374, 0.16, 0.390),
    "groom_citizenship":          (0.16, 0.374, 0.46, 0.390),
    "bride_sex":                  (0.54, 0.374, 0.68, 0.390),
    "bride_citizenship":          (0.68, 0.374, 0.97, 0.390),

    # Item 5 — Residence
    "groom_residence":            (0.02, 0.394, 0.46, 0.412),
    "bride_residence":            (0.54, 0.394, 0.97, 0.412),

    # Item 6 — Religion
    "groom_religion":             (0.02, 0.424, 0.46, 0.440),
    "bride_religion":             (0.54, 0.424, 0.97, 0.440),

    # Item 7 — Civil Status
    "groom_civil_status":         (0.02, 0.452, 0.46, 0.468),
    "bride_civil_status":         (0.54, 0.452, 0.97, 0.468),

    # Item 9 — Place where dissolved
    "groom_dissolution_city":     (0.02, 0.496, 0.16, 0.512),
    "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
    "groom_dissolution_country":  (0.30, 0.496, 0.46, 0.512),
    "bride_dissolution_city":     (0.54, 0.496, 0.68, 0.512),
    "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
    "bride_dissolution_country":  (0.82, 0.496, 0.97, 0.512),

    # Item 10 — Date when dissolved
    "groom_dissolution_day":      (0.02, 0.520, 0.12, 0.536),
    "groom_dissolution_month":    (0.12, 0.520, 0.24, 0.536),
    "groom_dissolution_year":     (0.24, 0.520, 0.34, 0.536),
    "bride_dissolution_day":      (0.54, 0.520, 0.62, 0.536),
    "bride_dissolution_month":    (0.62, 0.520, 0.74, 0.536),
    "bride_dissolution_year":     (0.74, 0.520, 0.84, 0.536),

    # Item 12 — Father Name
    "groom_father_first":         (0.02, 0.594, 0.16, 0.610),
    "groom_father_middle":        (0.16, 0.594, 0.28, 0.610),
    "groom_father_last":          (0.28, 0.594, 0.46, 0.610),
    "bride_father_first":         (0.54, 0.594, 0.66, 0.610),
    "bride_father_middle":        (0.66, 0.594, 0.78, 0.610),
    "bride_father_last":          (0.78, 0.594, 0.97, 0.610),

    # Item 13 — Father Citizenship
    "groom_father_citizenship":   (0.02, 0.620, 0.46, 0.636),
    "bride_father_citizenship":   (0.54, 0.620, 0.97, 0.636),

    # Item 14 — Father Residence
    "groom_father_residence":     (0.02, 0.644, 0.46, 0.660),
    "bride_father_residence":     (0.54, 0.644, 0.97, 0.660),

    # Item 15 — Mother Name
    "groom_mother_first":         (0.02, 0.674, 0.16, 0.690),
    "groom_mother_middle":        (0.16, 0.674, 0.28, 0.690),
    "groom_mother_last":          (0.28, 0.674, 0.46, 0.690),
    "bride_mother_first":         (0.54, 0.674, 0.66, 0.690),
    "bride_mother_middle":        (0.66, 0.674, 0.78, 0.690),
    "bride_mother_last":          (0.78, 0.674, 0.97, 0.690),

    # Item 16 — Mother Citizenship
    "groom_mother_citizenship":   (0.02, 0.696, 0.46, 0.712),
    "bride_mother_citizenship":   (0.54, 0.696, 0.97, 0.712),

    # Item 17 — Mother Residence
    "groom_mother_residence":     (0.02, 0.720, 0.46, 0.736),
    "bride_mother_residence":     (0.54, 0.720, 0.97, 0.736),
}

# Lookup from form-type key to its field ratio map. The keys are the same
# strings accepted by the --form CLI option in main(). DynamicFieldExtractor
# falls back to BIRTH_FIELDS when given a key that is not listed here.
FORM_FIELDS = {
    "birth":            BIRTH_FIELDS,
    "death":            DEATH_FIELDS,
    "marriage":         MARRIAGE_FIELDS,
    "marriage_license": MARRIAGE_LICENSE_FIELDS,
}

# Palette cycled (index modulo length) for the field boxes drawn by
# DynamicFieldExtractor.visualize(). The tuples are passed straight to
# cv2.rectangle / cv2.putText on a BGR image.
COLOURS = [
    (0,200,0),(0,150,255),(200,0,200),(0,200,200),(200,200,0),(220,20,60),
    (255,140,0),(150,50,200),(0,160,80),(30,144,255),(255,20,147),(100,200,100),
]


# ══════════════════════════════════════════════════════════════════════════════
#  FORM BOUNDS DETECTOR
#  Finds the outer border of a civil registry form using line detection.
#  Falls back to full image if detection fails.
# ══════════════════════════════════════════════════════════════════════════════

class FormBoundsDetector:
    """Find the axis-aligned box enclosing the long ruled lines of a form.

    The page is binarised, then morphological opening with a long horizontal
    and a long vertical kernel keeps only line-like strokes. The extreme rows
    and columns that still contain such strokes become the form boundary.
    """

    def __init__(self, verbose=False):
        # When True, detection results and failures are printed to stdout.
        self.verbose = verbose

    def detect(self, image_bgr):
        """Return ``(left, top, right, bottom)`` for a BGR image.

        Falls back to ``(0, 0, width, height)`` when line detection yields
        nothing usable. Note that a successful detection returns the extreme
        row/column *indices*, whereas the fallback returns the image size.
        """
        height, width = image_bgr.shape[:2]
        grey = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        found = self._detect_by_lines(grey, width, height)
        if found is not None:
            if self.verbose:
                print(f"  [Bounds] Detected: {found}")
            return found
        if self.verbose:
            print("  [Bounds] Line detection failed — using full image")
        return (0, 0, width, height)

    def _detect_by_lines(self, gray, w, h):
        """Return the line-based bounding box, or ``None`` if none was found.

        ``None`` is returned when no horizontal or vertical strokes survive,
        when the resulting box covers less than 40% of the page in either
        direction, or when any step raises.
        """
        try:
            # Ink becomes 255, paper becomes 0.
            binary = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV, 11, 2)

            # Opening removes every stroke shorter than the kernel, so only
            # lines at least a fifth of the page wide / tall remain.
            wide_kernel = cv2.getStructuringElement(
                cv2.MORPH_RECT, (max(w // 5, 10), 1))
            tall_kernel = cv2.getStructuringElement(
                cv2.MORPH_RECT, (1, max(h // 5, 10)))
            horizontal = cv2.morphologyEx(binary, cv2.MORPH_OPEN, wide_kernel)
            vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, tall_kernel)

            # The sums are over 0/255 pixel values, not pixel counts.
            row_hits = np.where(np.sum(horizontal, axis=1) > w * 0.15)[0]
            col_hits = np.where(np.sum(vertical, axis=0) > h * 0.08)[0]
            if row_hits.size == 0 or col_hits.size == 0:
                return None

            top = int(row_hits.min())
            bottom = int(row_hits.max())
            left = int(col_hits.min())
            right = int(col_hits.max())

            wide_enough = (right - left) >= w * 0.4
            tall_enough = (bottom - top) >= h * 0.4
            if wide_enough and tall_enough:
                return (left, top, right, bottom)
            return None
        except Exception as e:
            if self.verbose:
                print(f"  [Bounds error] {e}")
            return None


# ══════════════════════════════════════════════════════════════════════════════
#  DYNAMIC FIELD EXTRACTOR
#  Crops each field region relative to the detected form boundary.
#  Works on any image size, DPI, or scan margin. The boundary is an
#  axis-aligned box and no deskewing is done, so rotated scans will drift.
# ══════════════════════════════════════════════════════════════════════════════

class DynamicFieldExtractor:
    """Crop form fields relative to the automatically detected form boundary.

    Field positions come from the ratio map registered for ``form_type`` in
    ``FORM_FIELDS``; an unknown form type falls back to ``BIRTH_FIELDS``.
    """

    def __init__(self, form_type="birth", verbose=False):
        self.form_type    = form_type.lower()
        self.field_map    = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
        self.detector     = FormBoundsDetector(verbose=verbose)
        self.verbose      = verbose
        # (left, top, right, bottom) from the most recent extract()/visualize().
        self._last_bounds = None

    def _to_bgr(self, image):
        """Convert a PIL image or a grey/BGR/BGRA numpy array to BGR.

        Raises:
            TypeError: if ``image`` is neither a PIL image nor a numpy array.
        """
        try:
            from PIL import Image as PILImage
            if isinstance(image, PILImage.Image):
                arr = np.array(image.convert("RGB"))
                return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        except ImportError:
            # Pillow is optional; without it only numpy input is supported.
            pass
        if isinstance(image, np.ndarray):
            if len(image.shape) == 2:
                return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
            if image.shape[2] == 4:
                return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
            return image
        raise TypeError(f"Unsupported image type: {type(image)}")

    def _field_boxes(self, bounds, img_w, img_h):
        """Yield ``(name, (x1, y1, x2, y2))`` pixel boxes for every field.

        ``x2``/``y2`` are exclusive slice ends, so they are clamped to the
        image size itself (not size - 1); otherwise a field touching the
        right or bottom edge would lose its last column or row.
        """
        left, top, right, bottom = bounds
        form_w = right - left
        form_h = bottom - top
        for name, (rx1, ry1, rx2, ry2) in self.field_map.items():
            x1 = max(0, min(int(left + rx1 * form_w), img_w - 1))
            y1 = max(0, min(int(top  + ry1 * form_h), img_h - 1))
            x2 = max(0, min(int(left + rx2 * form_w), img_w))
            y2 = max(0, min(int(top  + ry2 * form_h), img_h))
            yield name, (x1, y1, x2, y2)

    def extract(self, image):
        """Returns {field_name: BGR numpy array}.

        Fields whose pixel box is empty or inverted are omitted from the
        result; with ``verbose`` set, each omission is reported.
        """
        image = self._to_bgr(image)
        h, w  = image.shape[:2]
        bounds = tuple(self.detector.detect(image))
        self._last_bounds = bounds
        left, top, right, bottom = bounds
        if self.verbose:
            print(f"  [Extract] Image={w}x{h} "
                  f" Form={right - left}x{bottom - top} @ ({left},{top})-({right},{bottom})")
        crops = {}
        for name, (x1, y1, x2, y2) in self._field_boxes(bounds, w, h):
            if x2 > x1 and y2 > y1:
                crops[name] = image[y1:y2, x1:x2]
            elif self.verbose:
                print(f"  [Extract] Skipped '{name}': empty box "
                      f"({x1},{y1})-({x2},{y2})")
        return crops

    def visualize(self, image, output_path=None):
        """Draw detected boundary + field boxes. Returns annotated BGR image.

        Only the boxes that extract() would actually crop are drawn, so a
        missing box in the field map points at a bad ratio entry. When
        ``output_path`` is given the image is also written to that path.
        """
        image = self._to_bgr(image)
        vis   = image.copy()
        h, w  = vis.shape[:2]
        bounds = tuple(self.detector.detect(image))
        self._last_bounds = bounds
        left, top, right, bottom = bounds
        cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 3)
        cv2.putText(vis, "DETECTED FORM BOUNDARY",
                    (left, max(0, top - 8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 140, 255), 1)
        boxes = self._field_boxes(bounds, w, h)
        for idx, (name, (x1, y1, x2, y2)) in enumerate(boxes):
            if x2 <= x1 or y2 <= y1:
                continue  # same rule as extract(): nothing would be cropped
            c = COLOURS[idx % len(COLOURS)]
            cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
            cv2.putText(vis, name[:22], (x1 + 2, max(0, y1 - 2)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, c, 1)
        if output_path:
            cv2.imwrite(str(output_path), vis)
            print(f"  Field map saved -> {output_path}")
        return vis


# ══════════════════════════════════════════════════════════════════════════════
#  FIELD NORMALIZER — prepares a BGR crop for CRNN inference
# ══════════════════════════════════════════════════════════════════════════════

class FieldNormalizer:
    """Turn a field crop into a fixed-size binary image / tensor.

    Pipeline (see ``normalize``): greyscale → denoise → crop to ink →
    aspect-preserving resize onto a white canvas → binarise.
    """

    def __init__(self, target_height=64, target_width=512):
        self.H = target_height
        self.W = target_width

    def _crop_to_text(self, gray):
        """Trim ``gray`` to the bounding box of its dark pixels plus padding.

        A pixel counts as ink when its inverted value exceeds 20. The input
        is returned unchanged when no ink is found.
        """
        inv = cv2.bitwise_not(gray)
        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
        coords = np.column_stack(np.where(thresh > 0))
        if len(coords) == 0:
            return gray
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)
        # Padding is 15% of the ink height, but never less than 4 px.
        pad   = max(4, int((y_max - y_min) * 0.15))
        y_min = max(0, y_min - pad)
        x_min = max(0, x_min - pad)
        y_max = min(gray.shape[0] - 1, y_max + pad)
        x_max = min(gray.shape[1] - 1, x_max + pad)
        return gray[y_min:y_max + 1, x_min:x_max + 1]

    def _smart_resize(self, gray):
        """Scale ``gray`` to fit H x W keeping aspect ratio, centred on white.

        An empty input yields an all-white canvas.
        """
        h, w = gray.shape
        if h == 0 or w == 0:
            return np.full((self.H, self.W), 255, dtype=np.uint8)
        # First try to fill the full height; if that overflows the width,
        # fill the width instead.
        scale = self.H / h
        new_w = int(w * scale)
        new_h = self.H
        if new_w > self.W:
            scale = self.W / w
            new_h = int(h * scale)
            new_w = self.W
        # A very thin crop (e.g. a lone underline) can round to 0 px, which
        # cv2.resize rejects; keep at least one pixel in each direction.
        new_w = max(1, new_w)
        new_h = max(1, new_h)
        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
        canvas  = np.full((self.H, self.W), 255, dtype=np.uint8)
        y_off   = (self.H - new_h) // 2
        x_off   = (self.W - new_w) // 2
        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
        return canvas

    def _binarize(self, img):
        """Binarise with Otsu; use adaptive thresholding if Otsu looks off.

        Otsu output is rejected when less than 30% or more than 97% of the
        pixels end up white.
        """
        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        white_ratio = np.mean(otsu == 255)
        if white_ratio < 0.30 or white_ratio > 0.97:
            return cv2.adaptiveThreshold(
                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
        return otsu

    def normalize(self, crop) -> np.ndarray:
        """Accept BGR numpy array or PIL image, return normalized binary array.

        A 2-D numpy array is treated as already greyscale.
        """
        try:
            from PIL import Image as PILImage
            if isinstance(crop, PILImage.Image):
                crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
        except ImportError:
            # Pillow is optional; without it only numpy input is supported.
            pass
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = self._crop_to_text(gray)
        gray = self._smart_resize(gray)
        return self._binarize(gray)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        """Scale ``img`` to [0, 1] float32 and add two leading size-1 axes."""
        scaled = img.astype(np.float32) / 255.0
        return torch.from_numpy(scaled).unsqueeze(0).unsqueeze(0)


# ══════════════════════════════════════════════════════════════════════════════
#  CRNN MODEL LOADER
# ══════════════════════════════════════════════════════════════════════════════

def load_crnn_model(checkpoint_path: str, device: torch.device):
    """Load a CRNN checkpoint and return it ready for inference.

    Parameters
    ----------
    checkpoint_path : path to a checkpoint containing ``model_state_dict`` and
                      ``idx_to_char`` (and optionally ``config``, ``val_cer``,
                      ``val_loss``).
    device          : device the weights are mapped to.

    Returns
    -------
    tuple  (model in eval mode, idx_to_char, img_height, img_width)
           Height and width come from the checkpoint config and default to
           64 and 512.

    Raises
    ------
    KeyError  if the checkpoint lacks ``idx_to_char`` or ``model_state_dict``.
    """
    # crnn_model lives next to this script; make it importable without
    # growing sys.path on every call.
    module_dir = str(Path(__file__).parent)
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)
    from crnn_model import get_crnn_model

    print(f"  Loading CRNN model from: {checkpoint_path}")
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints from a trusted source.
    c           = torch.load(checkpoint_path, map_location=device, weights_only=False)
    config      = c.get("config", {})
    idx_to_char = c["idx_to_char"]
    # The output layer's row count is the authoritative vocabulary size.
    num_chars   = c["model_state_dict"]["fc.weight"].shape[0]

    model = get_crnn_model(
        model_type=config.get("model_type", "standard"),
        img_height=config.get("img_height", 64),
        num_chars=num_chars,
        hidden_size=config.get("hidden_size", 128),
        num_lstm_layers=config.get("num_lstm_layers", 1),
    ).to(device)
    model.load_state_dict(c["model_state_dict"])
    model.eval()

    # Compare against None: a metric of exactly 0.0 is valid and must still
    # be reported rather than treated as missing.
    val_cer  = c.get("val_cer")
    val_loss = c.get("val_loss")
    if val_cer is not None:
        metric = f"val_cer={val_cer:.2f}%"
    elif val_loss is not None:
        metric = f"val_loss={val_loss:.4f}"
    else:
        metric = "no metric"
    print(f"  Model loaded  |  {metric}  |  chars={num_chars}")
    return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)


# ══════════════════════════════════════════════════════════════════════════════
#  GREEDY CTC DECODE
# ══════════════════════════════════════════════════════════════════════════════

def greedy_decode(outputs: torch.Tensor, idx_to_char: dict) -> str:
    """Greedy CTC decoding of the first sequence in ``outputs``.

    The arg-max over dim 2 is taken at every step, consecutive repeats are
    collapsed, index 0 is treated as the blank, and indices missing from
    ``idx_to_char`` are dropped.
    """
    best_path = torch.argmax(outputs, dim=2).permute(1, 0)[0].tolist()
    decoded = []
    last_label = -1
    for label in best_path:
        is_blank = label == 0
        is_repeat = label == last_label
        # Track the previous label even when this step emits nothing.
        last_label = label
        if is_blank or is_repeat:
            continue
        if label in idx_to_char:
            decoded.append(idx_to_char[label])
    return "".join(decoded)


# ══════════════════════════════════════════════════════════════════════════════
#  PDF → PIL IMAGE
# ══════════════════════════════════════════════════════════════════════════════

def pdf_to_image(pdf_path: str, dpi: int = 200):
    """Render the first page of ``pdf_path`` at ``dpi`` and return it.

    Uses the module-level ``POPPLER_PATH`` as the Poppler location when it
    is set.
    """
    from pdf2image import convert_from_path

    # Resolve to absolute path — fixes "Unable to get page count" on Windows
    absolute_pdf = str(Path(pdf_path).resolve())
    options = dict(dpi=dpi, first_page=1, last_page=1)
    if POPPLER_PATH:
        options["poppler_path"] = str(Path(POPPLER_PATH).resolve())
    pages = convert_from_path(absolute_pdf, **options)
    return pages[0]


# ══════════════════════════════════════════════════════════════════════════════
#  CRNN OCR — runs on extracted field crops
# ══════════════════════════════════════════════════════════════════════════════

def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
                 img_h: int, img_w: int, device: torch.device) -> dict:
    """Recognise the text in every field crop.

    Returns ``{field_name: text}``. A field whose processing raises is not
    fatal: its value becomes an ``"[ERROR: ...]"`` placeholder instead.
    """
    prep = FieldNormalizer(target_height=img_h, target_width=img_w)

    def _recognise(field_crop):
        # normalise → tensor on device → model → greedy CTC decode on CPU
        binary = prep.normalize(field_crop)
        batch = prep.to_tensor(binary).to(device)
        logits = model(batch).cpu()
        return greedy_decode(logits, idx_to_char)

    recognised = {}
    with torch.no_grad():
        for field_name in crops:
            try:
                recognised[field_name] = _recognise(crops[field_name])
            except Exception as exc:
                recognised[field_name] = f"[ERROR: {exc}]"
    return recognised


# ══════════════════════════════════════════════════════════════════════════════
#  CONVENIENCE WRAPPER — for other scripts that import this module
# ══════════════════════════════════════════════════════════════════════════════

def extract_field_images(image, form_type="birth", verbose=False):
    """Crop all fields of one form page in a single call.

    Builds a fresh DynamicFieldExtractor for ``form_type`` and runs it on
    ``image``.

    Parameters
    ----------
    image     : PIL Image or BGR numpy array
    form_type : str  'birth' | 'death' | 'marriage' | 'marriage_license'
    verbose   : bool

    Returns
    -------
    dict  {field_name: BGR numpy array}
    """
    extractor = DynamicFieldExtractor(form_type=form_type, verbose=verbose)
    field_crops = extractor.extract(image)
    return field_crops


# Keep old name as alias so any existing code doesn't break
extract_field_images_dynamic = extract_field_images


# ══════════════════════════════════════════════════════════════════════════════
#  MAIN
# ══════════════════════════════════════════════════════════════════════════════

def main():
    """Command-line entry point: load a page, crop its fields, OCR them.

    Exits with status 1 when the checkpoint is missing, the PDF cannot be
    converted, or the image cannot be read.
    """
    parser = argparse.ArgumentParser(
        description="PH Civil Registry Field Extractor — Dynamic CRNN OCR")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pdf",   help="Path to scanned PDF")
    group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
    parser.add_argument("--form",       required=True,
                        choices=["birth", "death", "marriage", "marriage_license"])
    parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
    parser.add_argument("--visualize",  action="store_true",
                        help="Save annotated field-map image")
    parser.add_argument("--output",     default=None,
                        help="Save extracted fields to JSON")
    parser.add_argument("--poppler",    default=None,
                        help="Override Poppler bin path (overrides .env)")
    parser.add_argument("--dpi",        type=int, default=200)
    parser.add_argument("--verbose",    action="store_true")
    args = parser.parse_args()

    # pdf_to_image() reads the module-level POPPLER_PATH, so the CLI override
    # has to be written back to the global.
    global POPPLER_PATH
    if args.poppler:
        POPPLER_PATH = args.poppler

    form_labels = {
        "birth":            "Form 102 — Certificate of Live Birth",
        "death":            "Form 103 — Certificate of Death",
        "marriage":         "Form 97  — Certificate of Marriage",
        "marriage_license": "Form 90  — Application for Marriage License",
    }
    input_file = args.pdf or args.image

    print("\nPhilippine Civil Registry OCR — Dynamic Field Extractor")
    print("=" * 65)
    print(f"  Form       : {form_labels[args.form]}")
    print(f"  File       : {input_file}")
    print(f"  Checkpoint : {args.checkpoint}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"  Device     : {device}\n")

    if not os.path.exists(args.checkpoint):
        print(f"ERROR: Checkpoint not found: {args.checkpoint}")
        sys.exit(1)

    model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)

    # Load image
    if args.pdf:
        print(f"  Converting PDF to image at {args.dpi} DPI...")
        try:
            pil_img    = pdf_to_image(args.pdf, dpi=args.dpi)
            page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
        except Exception as e:
            print(f"\nERROR converting PDF: {e}")
            print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
            sys.exit(1)
    else:
        page_image = cv2.imread(args.image)
        if page_image is None:
            print(f"ERROR: Could not load image: {args.image}")
            sys.exit(1)

    h, w = page_image.shape[:2]
    print(f"  Page size  : {w} x {h} px")

    extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)

    if args.visualize:
        # The field map is written to the current directory, named after the
        # input file. visualize() prints the "saved" message itself, so it is
        # not repeated here.
        out_path = f"{Path(input_file).stem}_field_map.jpg"
        extractor.visualize(page_image, output_path=out_path)

    print("\n  Detecting form boundary and extracting fields...")
    crops = extractor.extract(page_image)
    print(f"  {len(crops)} field crops extracted")

    print(f"\n  Running CRNN OCR on {len(crops)} fields...")
    results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)

    print(f"\n{'─'*65}")
    print(f"  {'FIELD':<42} TEXT")
    print(f"{'─'*65}")
    for name, text in results.items():
        print(f"  {name:<42} {text if text.strip() else '(empty)'}")
    print(f"{'─'*65}")
    # run_crnn_ocr() stores "[ERROR: ...]" placeholders for fields that
    # failed; those are not recognised text and must not inflate the count.
    recognized = sum(
        1 for t in results.values()
        if t.strip() and not t.startswith("[ERROR:")
    )
    print(f"\n  Fields recognized : {recognized} / {len(results)}")

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump({"form": form_labels[args.form], "file": input_file,
                       "fields": results}, f, ensure_ascii=False, indent=2)
        print(f"\n  Results saved -> {args.output}")
    print()


if __name__ == "__main__":
    main()