Hanz Pillerva committed on
Commit
25a1178
·
1 Parent(s): 94ec9c8

Updated files

Browse files
CRNN+CTC/finetune.py CHANGED
@@ -3,14 +3,14 @@ finetune.py
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
- Loads best_model_iam.pth (already knows real handwriting from IAM),
7
- then trains on real_annotations.json (Filipino names on real form backgrounds).
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
- checkpoints/best_model_final.pth
14
  """
15
 
16
  import os
@@ -25,12 +25,12 @@ from crnn_model import get_crnn_model
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
- CHECKPOINT_IN = "checkpoints/best_model_iam.pth"
29
- CHECKPOINT_OUT = "checkpoints/best_model_final.pth"
30
 
31
- REAL_ANN = "data/real_annotations.json" # generated by generate_form_samples.py
32
- SYNTH_ANN = "data/train_annotations.json" # original synthetic data
33
- VAL_ANN = "data/val_annotations.json" # validation set
34
 
35
  IMG_HEIGHT = 64
36
  IMG_WIDTH = 512
@@ -53,35 +53,45 @@ def main():
53
  print(f" Device : {DEVICE}")
54
  print(f" Checkpoint : {CHECKPOINT_IN}")
55
 
56
- # ── Check files ───────────────────────────────────────────
57
- for f in [CHECKPOINT_IN, REAL_ANN, VAL_ANN]:
58
  if not os.path.exists(f):
59
  print(f"ERROR: {f} not found.")
60
  sys.exit(1)
61
 
62
  # ── Datasets ──────────────────────────────────────────────
63
- real_dataset = CivilRegistryDataset(
64
- data_dir="data/train", annotations_file=REAL_ANN,
65
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
66
- )
67
- val_dataset = CivilRegistryDataset(
68
- data_dir="data/val", annotations_file=VAL_ANN,
69
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
70
- )
 
 
 
 
71
 
72
- # Mix in original synthetic data so model doesn't forget
73
- train_dataset = real_dataset
74
  if os.path.exists(SYNTH_ANN):
75
  synth_dataset = CivilRegistryDataset(
76
  data_dir="data/train", annotations_file=SYNTH_ANN,
77
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
78
  )
79
- train_dataset = ConcatDataset([real_dataset, synth_dataset])
80
- print(f" Real crops : {len(real_dataset)}")
81
- print(f" Synth crops : {len(synth_dataset)}")
82
- else:
83
- print(f" Real crops : {len(real_dataset)}")
 
 
 
 
 
 
84
 
 
85
  print(f" Total train : {len(train_dataset)}")
86
  print(f" Val : {len(val_dataset)}")
87
 
@@ -95,10 +105,11 @@ def main():
95
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
96
  config = ckpt.get('config', {})
97
 
 
98
  model = get_crnn_model(
99
  model_type = config.get('model_type', 'standard'),
100
  img_height = config.get('img_height', 64),
101
- num_chars = real_dataset.num_chars,
102
  hidden_size = config.get('hidden_size', 128),
103
  num_lstm_layers = config.get('num_lstm_layers', 1),
104
  ).to(DEVICE)
@@ -123,8 +134,8 @@ def main():
123
  batch_size = images.size(0)
124
  if training:
125
  optimizer.zero_grad()
126
- outputs = F.log_softmax(model(images), dim=2)
127
- seq_len = outputs.size(0)
128
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
129
  loss = criterion(outputs, targets, input_lengths, target_lengths)
130
  if not torch.isnan(loss) and not torch.isinf(loss):
@@ -167,8 +178,8 @@ def main():
167
  torch.save({
168
  'model_state_dict': model.state_dict(),
169
  'config': config,
170
- 'char_to_idx': real_dataset.char_to_idx,
171
- 'idx_to_char': real_dataset.idx_to_char,
172
  'epoch': epoch,
173
  'val_loss': vl,
174
  }, CHECKPOINT_OUT)
@@ -188,4 +199,4 @@ def main():
188
 
189
 
190
  if __name__ == '__main__':
191
- main()
 
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
+ Loads best_model_final.pth (pretrained), continues training on
7
+ actual_annotations.json + train_annotations.json.
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
+ checkpoints/best_model_v2.pth
14
  """
15
 
16
  import os
 
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
+ CHECKPOINT_IN = "checkpoints/best_model_final.pth"
29
+ CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"
30
 
31
+ ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
32
+ SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
33
+ VAL_ANN = "data/val_annotations.json" # validation set
34
 
35
  IMG_HEIGHT = 64
36
  IMG_WIDTH = 512
 
53
  print(f" Device : {DEVICE}")
54
  print(f" Checkpoint : {CHECKPOINT_IN}")
55
 
56
+ # ── Check required files ──────────────────────────────────
57
+ for f in [CHECKPOINT_IN, VAL_ANN]:
58
  if not os.path.exists(f):
59
  print(f"ERROR: {f} not found.")
60
  sys.exit(1)
61
 
62
  # ── Datasets ──────────────────────────────────────────────
63
+ datasets_to_merge = []
64
+
65
+ # 1. Actual scanned forms (highest priority — real data)
66
+ if os.path.exists(ACTUAL_ANN):
67
+ actual_dataset = CivilRegistryDataset(
68
+ data_dir=".", annotations_file=ACTUAL_ANN,
69
+ img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
70
+ )
71
+ datasets_to_merge.append(actual_dataset)
72
+ print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
73
+ else:
74
+ print(f" [!] {ACTUAL_ANN} not found — run extract_actual_data.py first")
75
 
76
+ # 2. Fully synthetic — keep so model doesn't forget basic characters
 
77
  if os.path.exists(SYNTH_ANN):
78
  synth_dataset = CivilRegistryDataset(
79
  data_dir="data/train", annotations_file=SYNTH_ANN,
80
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
81
  )
82
+ datasets_to_merge.append(synth_dataset)
83
+ print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
84
+
85
+ if not datasets_to_merge:
86
+ print("ERROR: No training data found. Run extract_actual_data.py first.")
87
+ sys.exit(1)
88
+
89
+ val_dataset = CivilRegistryDataset(
90
+ data_dir="data/val", annotations_file=VAL_ANN,
91
+ img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
92
+ )
93
 
94
+ train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
95
  print(f" Total train : {len(train_dataset)}")
96
  print(f" Val : {len(val_dataset)}")
97
 
 
105
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
106
  config = ckpt.get('config', {})
107
 
108
+ ref_dataset = datasets_to_merge[0]
109
  model = get_crnn_model(
110
  model_type = config.get('model_type', 'standard'),
111
  img_height = config.get('img_height', 64),
112
+ num_chars = ref_dataset.num_chars,
113
  hidden_size = config.get('hidden_size', 128),
114
  num_lstm_layers = config.get('num_lstm_layers', 1),
115
  ).to(DEVICE)
 
134
  batch_size = images.size(0)
135
  if training:
136
  optimizer.zero_grad()
137
+ outputs = F.log_softmax(model(images), dim=2)
138
+ seq_len = outputs.size(0)
139
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
140
  loss = criterion(outputs, targets, input_lengths, target_lengths)
141
  if not torch.isnan(loss) and not torch.isinf(loss):
 
178
  torch.save({
179
  'model_state_dict': model.state_dict(),
180
  'config': config,
181
+ 'char_to_idx': ref_dataset.char_to_idx,
182
+ 'idx_to_char': ref_dataset.idx_to_char,
183
  'epoch': epoch,
184
  'val_loss': vl,
185
  }, CHECKPOINT_OUT)
 
199
 
200
 
201
  if __name__ == '__main__':
202
+ main()
CRNN+CTC/fix_annotations.py DELETED
@@ -1,40 +0,0 @@
1
- import json, os
2
-
3
- # Maps any image path to its correct form subfolder.
4
- # FIXED: was only handling form1a/form2a — missed form3a and form90.
5
- def detect_folder(image_path):
6
- for form in ['form1a', 'form2a', 'form3a', 'form90']:
7
- if form in image_path:
8
- return form
9
- return 'form1a' # safe fallback
10
-
11
- for split in ['train', 'val']:
12
- ann_file = f'data/{split}_annotations.json'
13
- if not os.path.exists(ann_file):
14
- print(f'SKIP: {ann_file} not found')
15
- continue
16
-
17
- with open(ann_file) as f:
18
- data = json.load(f)
19
-
20
- fixed = []
21
- skipped = 0
22
- for d in data:
23
- # Support both old key names ('image'/'label') and new ('image_path'/'text')
24
- image_val = d.get('image') or d.get('image_path', '')
25
- text_val = d.get('label') or d.get('text', '')
26
-
27
- if not image_val or not text_val:
28
- skipped += 1
29
- continue
30
-
31
- filename = os.path.basename(image_val)
32
- folder = detect_folder(image_val)
33
- fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})
34
-
35
- with open(ann_file, 'w') as f:
36
- json.dump(fixed, f, indent=2)
37
-
38
- print(f'{split}: {len(fixed)} fixed, {skipped} skipped')
39
-
40
- print('Done!')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/fix_data.py DELETED
@@ -1,770 +0,0 @@
1
- """
2
- fix_data.py
3
- ===========
4
- Generates synthetic training images for the Civil Registry OCR system.
5
-
6
- Run this ONCE before training to create your dataset.
7
-
8
- STEP ORDER:
9
- 1. python generate_ph_names.py <- generates data/ph_names.json
10
- 2. python fix_data.py <- generates all training images (THIS FILE)
11
- 3. python train.py <- trains the CRNN model
12
-
13
- WHAT IT GENERATES:
14
- - Printed text images of names, dates, places, and other form fields
15
- - Covers all 4 form types: birth, death, marriage, marriage license
16
- - Splits into train (90%) and val (10%)
17
- - Writes data/train_annotations.json and data/val_annotations.json
18
-
19
- OUTPUT STRUCTURE:
20
- data/
21
- train/
22
- form1a/ <- birth certificate fields
23
- form2a/ <- death certificate fields
24
- form3a/ <- marriage certificate fields
25
- form90/ <- marriage license fields
26
- val/
27
- form1a/
28
- form2a/
29
- form3a/
30
- form90/
31
- train_annotations.json
32
- val_annotations.json
33
- """
34
-
35
- import os
36
- import json
37
- import random
38
- import numpy as np
39
- from pathlib import Path
40
- from PIL import Image, ImageDraw, ImageFont, ImageFilter
41
-
42
-
43
- # ─────────────────────────────────────────────────────────────────────────────
44
- # CONFIG
45
- # ─────────────────────────────────────────────────────────────────────────────
46
-
47
- IMG_WIDTH = 512
48
- IMG_HEIGHT = 64
49
- FONT_SIZE = 22
50
- VAL_SPLIT = 0.10
51
- RANDOM_SEED = 42
52
-
53
- SAMPLES_PER_FORM = {
54
- 'form1a': 6000,
55
- 'form2a': 4000,
56
- 'form3a': 4000,
57
- 'form90': 2000,
58
- }
59
-
60
- PH_NAMES_FILE = 'data/ph_names.json'
61
-
62
- random.seed(RANDOM_SEED)
63
-
64
-
65
- # ─────────────────────────────────────────────────────────────────────────────
66
- # FONT LOADER
67
- # ─────────────────────────────────────────────────────────────────────────────
68
-
69
- def load_font(size=FONT_SIZE):
70
- """Load a single font β€” used as fallback. Prefer load_font_pool()."""
71
- for fp in [
72
- 'arial.ttf', 'Arial.ttf',
73
- '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
74
- '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
75
- '/System/Library/Fonts/Helvetica.ttc',
76
- 'C:/Windows/Fonts/arial.ttf',
77
- 'C:/Windows/Fonts/calibri.ttf',
78
- ]:
79
- try:
80
- return ImageFont.truetype(fp, size)
81
- except Exception:
82
- continue
83
- print("WARNING: Could not load a TrueType font. Using default bitmap font.")
84
- print(" Prediction accuracy may be lower.")
85
- return ImageFont.load_default()
86
-
87
-
88
- def load_font_pool(size=FONT_SIZE):
89
- """
90
- Load a pool of diverse fonts so the model trains on varied typefaces.
91
- Using only one font causes the model to overfit to that font's style and
92
- fail on real civil registry documents which use mixed fonts.
93
- Returns a list of at least 1 font; caller picks randomly per image.
94
- """
95
- candidates = [
96
- # Sans-serif (most common in PH civil registry printed forms)
97
- 'arial.ttf', 'Arial.ttf',
98
- '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
99
- '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
100
- '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
101
- 'C:/Windows/Fonts/arial.ttf',
102
- 'C:/Windows/Fonts/arialbd.ttf',
103
- 'C:/Windows/Fonts/calibri.ttf',
104
- 'C:/Windows/Fonts/calibrib.ttf',
105
- # Serif (used in older typewriter-style registry entries)
106
- '/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf',
107
- '/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf',
108
- 'C:/Windows/Fonts/times.ttf',
109
- 'C:/Windows/Fonts/Georgia.ttf',
110
- '/System/Library/Fonts/Times.ttc',
111
- # Mono (typewriter β€” common in pre-2000 civil registry forms)
112
- '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
113
- '/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
114
- 'C:/Windows/Fonts/cour.ttf',
115
- # Condensed / narrow (space-saving fonts used in registry tables)
116
- 'C:/Windows/Fonts/arialn.ttf',
117
- '/usr/share/fonts/truetype/ubuntu/UbuntuCondensed-Regular.ttf',
118
- ]
119
- pool = []
120
- for fp in candidates:
121
- try:
122
- pool.append(ImageFont.truetype(fp, size))
123
- except Exception:
124
- continue
125
- if not pool:
126
- print("WARNING: No TrueType fonts found. Using default bitmap font.")
127
- pool.append(ImageFont.load_default())
128
- else:
129
- print(f" ✓ Font pool loaded: {len(pool)} font(s) available")
130
- return pool
131
-
132
-
133
- # ─────────────────────────────────────────────────────────────────────────────
134
- # IMAGE RENDERER
135
- # ─────────────────────────────────────────────────────────────────────────────
136
-
137
- def render_text_image(text: str, font, width=IMG_WIDTH, height=IMG_HEIGHT,
138
- handwriting=False) -> Image.Image:
139
- """
140
- Render text on a white background, centered.
141
- handwriting=True applies handwriting-style augmentations.
142
- """
143
- img = Image.new('RGB', (width, height), color=(255, 255, 255))
144
- draw = ImageDraw.Draw(img)
145
-
146
- bbox = draw.textbbox((0, 0), text, font=font)
147
- tw = bbox[2] - bbox[0]
148
- th = bbox[3] - bbox[1]
149
- x = max(4, (width - tw) // 2)
150
- y = max(4, (height - th) // 2)
151
-
152
- if not handwriting:
153
- # ── PRINTED mode ──────────────────────────────────────
154
- shade = random.randint(0, 40)
155
- draw.text((x, y), text, fill=(shade, shade, shade), font=font)
156
-
157
- else:
158
- # ── HANDWRITING simulation mode ───────────────────────
159
- # 1. Pen color β€” blue-black ballpen
160
- r = random.randint(0, 60)
161
- g = random.randint(0, 60)
162
- b = random.randint(0, 120)
163
- ink_color = (r, g, b)
164
-
165
- # 2. Per-character y-wobble (unsteady hand)
166
- if random.choice([True, False]) and len(text) > 1:
167
- char_x = x
168
- for ch in text:
169
- y_offset = random.randint(-2, 2)
170
- draw.text((char_x, y + y_offset), ch, fill=ink_color, font=font)
171
- ch_bbox = draw.textbbox((0, 0), ch, font=font)
172
- char_x += (ch_bbox[2] - ch_bbox[0]) + random.randint(-1, 1)
173
- else:
174
- draw.text((x, y), text, fill=ink_color, font=font)
175
-
176
- # 3. Pixel-level augmentation
177
- arr = np.array(img).astype(np.float32)
178
-
179
- # 4. Ink bleed
180
- if random.random() < 0.5:
181
- img_pil = Image.fromarray(arr.astype(np.uint8))
182
- img_pil = img_pil.filter(
183
- ImageFilter.GaussianBlur(radius=random.uniform(0.3, 0.7)))
184
- arr = np.array(img_pil).astype(np.float32)
185
-
186
- # 5. Paper texture noise
187
- noise_map = np.random.normal(0, random.uniform(3, 10), arr.shape)
188
- arr = np.clip(arr + noise_map, 0, 255)
189
-
190
- # 6. Scan shadow patch
191
- if random.random() < 0.3:
192
- patch_x = random.randint(0, width - 20)
193
- patch_w = random.randint(10, 60)
194
- arr[:, patch_x:patch_x + patch_w] *= random.uniform(0.88, 0.97)
195
- arr = np.clip(arr, 0, 255)
196
-
197
- img = Image.fromarray(arr.astype(np.uint8))
198
-
199
- # 7. Pen tilt rotation (+-3 degrees)
200
- if random.random() < 0.6:
201
- angle = random.uniform(-3, 3)
202
- img = img.rotate(angle, fillcolor=(255, 255, 255), expand=False)
203
-
204
- return img
205
-
206
-
207
- # ─────────────────────────────────────────────────────────────────────────────
208
- # NAME / DATA POOLS
209
- # ─────────────────────────────────────────────────────────────────────────────
210
-
211
- # Populated at runtime from ph_names.json via load_ph_names()
212
- MIDDLE_NAMES = []
213
-
214
- SUFFIXES = ['Jr.', 'Sr.', 'II', 'III', '']
215
-
216
- MONTHS = [
217
- 'January', 'February', 'March', 'April', 'May', 'June',
218
- 'July', 'August', 'September', 'October', 'November', 'December',
219
- ]
220
-
221
- CITIES = [
222
- # NCR
223
- 'Manila', 'Quezon City', 'Caloocan', 'Pasig', 'Makati',
224
- 'Taguig', 'Paranaque', 'Pasay', 'Las Pinas', 'Muntinlupa',
225
- 'Marikina', 'Valenzuela', 'Malabon', 'Navotas', 'Mandaluyong',
226
- 'San Juan', 'Pateros',
227
- # Luzon
228
- 'Tarlac City', 'Angeles City', 'San Fernando', 'Olongapo',
229
- 'Cabanatuan', 'San Jose del Monte', 'Bacoor', 'Imus', 'Dasmarinas',
230
- 'Antipolo', 'Binangonan', 'Taytay', 'Santa Rosa', 'Calamba',
231
- 'San Pablo', 'Lucena', 'Batangas City', 'Lipa', 'Naga City',
232
- 'Legazpi', 'Sorsogon City', 'Tuguegarao', 'Ilagan', 'Santiago City',
233
- 'Cauayan', 'San Fernando (La Union)', 'Vigan', 'Laoag',
234
- 'Dagupan', 'San Carlos', 'Urdaneta', 'Baguio City',
235
- # Visayas
236
- 'Cebu City', 'Mandaue', 'Lapu-Lapu', 'Talisay', 'Danao',
237
- 'Toledo', 'Carcar', 'Bacolod', 'Bago', 'Sagay', 'Victorias',
238
- 'Iloilo City', 'Passi', 'Roxas City', 'Kalibo',
239
- 'Tacloban', 'Ormoc', 'Palo', 'Catbalogan', 'Calbayog',
240
- 'Tagbilaran', 'Dumaguete', 'Tanjay', 'Bayawan', 'Kabankalan',
241
- # Mindanao
242
- 'Davao City', 'Tagum', 'Panabo', 'Digos', 'Mati',
243
- 'General Santos', 'Koronadal', 'Kidapawan', 'Cotabato City',
244
- 'Cagayan de Oro', 'Iligan', 'Ozamiz', 'Oroquieta', 'Tangub',
245
- 'Butuan', 'Cabadbaran', 'Surigao City', 'Bislig', 'Bayugan',
246
- 'Zamboanga City', 'Pagadian', 'Dipolog', 'Dapitan',
247
- 'Marawi', 'Malaybalay', 'Valencia',
248
- ]
249
-
250
- PROVINCES = [
251
- # Luzon
252
- 'Tarlac', 'Pampanga', 'Bulacan', 'Nueva Ecija', 'Bataan',
253
- 'Zambales', 'Aurora', 'Rizal', 'Cavite', 'Laguna',
254
- 'Batangas', 'Quezon', 'Marinduque', 'Occidental Mindoro',
255
- 'Oriental Mindoro', 'Palawan', 'Romblon',
256
- 'Camarines Norte', 'Camarines Sur', 'Albay', 'Sorsogon',
257
- 'Catanduanes', 'Masbate',
258
- 'Pangasinan', 'La Union', 'Benguet', 'Ifugao', 'Mountain Province',
259
- 'Kalinga', 'Apayao', 'Abra', 'Ilocos Norte', 'Ilocos Sur',
260
- 'Cagayan', 'Isabela', 'Nueva Vizcaya', 'Quirino',
261
- 'Metro Manila',
262
- # Visayas
263
- 'Cebu', 'Bohol', 'Negros Oriental', 'Siquijor',
264
- 'Negros Occidental', 'Iloilo', 'Capiz', 'Aklan', 'Antique',
265
- 'Guimaras', 'Leyte', 'Southern Leyte', 'Samar', 'Eastern Samar',
266
- 'Northern Samar', 'Biliran',
267
- # Mindanao
268
- 'Davao del Sur', 'Davao del Norte', 'Davao Oriental',
269
- 'Davao Occidental', 'Davao de Oro',
270
- 'South Cotabato', 'Sarangani', 'Sultan Kudarat', 'North Cotabato',
271
- 'Misamis Oriental', 'Misamis Occidental', 'Camiguin',
272
- 'Bukidnon', 'Lanao del Norte', 'Lanao del Sur',
273
- 'Maguindanao', 'Basilan', 'Sulu', 'Tawi-Tawi',
274
- 'Zamboanga del Sur', 'Zamboanga del Norte', 'Zamboanga Sibugay',
275
- 'Agusan del Norte', 'Agusan del Sur', 'Surigao del Norte',
276
- 'Surigao del Sur', 'Dinagat Islands',
277
- ]
278
-
279
- BARANGAYS = [
280
- 'Brgy. San Jose', 'Brgy. Sta. Maria', 'Brgy. San Antonio',
281
- 'Brgy. Santo Nino', 'Brgy. Poblacion', 'Brgy. San Isidro',
282
- 'Brgy. San Pedro', 'Brgy. San Miguel', 'Brgy. Mabini',
283
- 'Brgy. Rizal', 'Brgy. Magsaysay', 'Brgy. Quezon',
284
- 'Brgy. Bagong Silang', 'Brgy. Bagumbayan', 'Brgy. Batasan Hills',
285
- 'Brgy. Commonwealth', 'Brgy. Culiat', 'Brgy. Fairview',
286
- 'Brgy. Holy Spirit', 'Brgy. Kamuning', 'Brgy. Laging Handa',
287
- 'Brgy. Malaya', 'Brgy. Masagana', 'Brgy. Pinyahan',
288
- 'Brgy. Roxas', 'Brgy. Sacred Heart', 'Brgy. San Roque',
289
- 'Brgy. Santa Cruz', 'Brgy. Santa Teresita', 'Brgy. Santo Domingo',
290
- 'Brgy. Silangan', 'Brgy. South Triangle', 'Brgy. Tagumpay',
291
- 'Brgy. Tandang Sora', 'Brgy. Vasra', 'Brgy. White Plains',
292
- ]
293
-
294
- STREETS = [
295
- 'Mabini St.', 'Rizal Ave.', 'MacArthur Hwy.', 'Quezon Blvd.',
296
- 'Gen. Luna St.', 'Bonifacio St.', 'Aguinaldo St.', 'Burgos St.',
297
- 'Del Pilar St.', 'Gomez St.', 'Jacinto St.', 'Lapu-Lapu St.',
298
- 'Lopez Jaena St.', 'Luna St.', 'Osmena Blvd.', 'Padre Faura St.',
299
- 'Palma St.', 'Plaridel St.', 'Recto Ave.', 'Roxas Blvd.',
300
- 'San Andres St.', 'Shaw Blvd.', 'Taft Ave.', 'Tandang Sora Ave.',
301
- 'Timog Ave.', 'Tuazon Blvd.', 'Visayas Ave.', 'Aurora Blvd.',
302
- 'EDSA', 'Espana Blvd.', 'Katipunan Ave.', 'Marcos Hwy.',
303
- 'Ortigas Ave.', 'Quirino Ave.',
304
- ]
305
-
306
- RELIGIONS = [
307
- 'Roman Catholic', 'Catholic', 'Islam', 'Muslim',
308
- 'Iglesia ni Cristo', 'INC', 'Baptist', 'Methodist',
309
- 'Seventh Day Adventist', 'Born Again Christian', 'Aglipayan',
310
- ]
311
-
312
- OCCUPATIONS = [
313
- 'Farmer', 'Teacher', 'Engineer', 'Nurse', 'Doctor',
314
- 'Laborer', 'Housewife', 'Driver', 'Carpenter', 'Vendor',
315
- 'Student', 'OFW', 'Fisherman', 'Mechanic', 'Electrician',
316
- 'Police Officer', 'Military', 'Government Employee',
317
- 'Business Owner', 'Retired',
318
- ]
319
-
320
- CIVIL_STATUSES = ['Single', 'Married', 'Widowed', 'Legally Separated']
321
-
322
- CITIZENSHIPS = ['Filipino', 'Filipino', 'Filipino', 'American',
323
- 'Chinese', 'Japanese', 'Korean']
324
-
325
- DEATH_CAUSES = [
326
- 'Cardio-Respiratory Arrest', 'Hypertensive Cardiovascular Disease',
327
- 'Acute Myocardial Infarction', 'Cerebrovascular Accident',
328
- 'Pneumonia', 'Septicemia', 'Renal Failure', 'Diabetes Mellitus',
329
- 'Pulmonary Tuberculosis', 'Cancer of the Lung',
330
- 'Chronic Obstructive Pulmonary Disease', 'Liver Cirrhosis',
331
- 'Dengue Hemorrhagic Fever', 'Acute Gastroenteritis',
332
- 'Congestive Heart Failure',
333
- ]
334
-
335
- ATTENDANT_TYPES = [
336
- 'Private Physician', 'Public Health Officer',
337
- 'Hospital Authority', 'Hilot', 'None',
338
- ]
339
-
340
-
341
- # ─────────────────────────────────────────────────────────────────────────────
342
- # NAME LOADER
343
- # ─────────────────────────────────────────────────────────────────────────────
344
-
345
- def load_ph_names():
346
- """
347
- Load Filipino names from ph_names.json.
348
- Returns (first_names, last_names, middle_names).
349
- Falls back to built-in lists if JSON not found.
350
- """
351
- if os.path.exists(PH_NAMES_FILE):
352
- with open(PH_NAMES_FILE, 'r', encoding='utf-8') as f:
353
- data = json.load(f)
354
- first_names = data['first_names']['all']
355
- last_names = data['last_names']
356
- # Load middle_names from JSON (added by updated generate_ph_names.py)
357
- # Falls back to last_names if key missing (older ph_names.json)
358
- middle_names = data.get('middle_names', last_names)
359
- print(f" Loaded ph_names.json: "
360
- f"{len(first_names)} first, "
361
- f"{len(last_names)} last, "
362
- f"{len(middle_names)} middle names")
363
- else:
364
- print(f" WARNING: {PH_NAMES_FILE} not found.")
365
- print(f" Using built-in fallback names.")
366
- print(f" For better results run: python generate_ph_names.py first.")
367
- first_names = [
368
- 'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos',
369
- 'Elena', 'Ramon', 'Lucia', 'Eduardo', 'Carmen', 'Antonio',
370
- 'Isabel', 'Francisco', 'Gloria', 'Roberto', 'Corazon',
371
- 'Ricardo', 'Remedios', 'Manuel', 'Teresita', 'Andres',
372
- 'Lourdes', 'Fernando', 'Maricel', 'Rolando', 'Rowena',
373
- 'Danilo', 'Cristina', 'Ernesto', 'Marilou', 'Renato',
374
- 'Felicidad', 'Alfredo', 'Natividad', 'Domingo', 'Milagros',
375
- ]
376
- last_names = [
377
- 'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo', 'Garcia',
378
- 'Mendoza', 'Torres', 'Flores', 'Aquino', 'Dela Cruz',
379
- 'Del Rosario', 'San Jose', 'De Guzman', 'Villanueva',
380
- 'Gonzales', 'Ramos', 'Diaz', 'Castro', 'Morales',
381
- 'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong',
382
- 'Macaraeg', 'Pascual', 'Buenaventura', 'Concepcion',
383
- 'Manalo', 'Soriano', 'Evangelista', 'Salazar', 'Tolentino',
384
- ]
385
- middle_names = last_names
386
- return first_names, last_names, middle_names
387
-
388
-
389
- # ─────────────────────────────────────────────────────────────────────────────
390
- # TEXT GENERATORS
391
- # ─────────────────────────────────────────────────────────────────────────────
392
-
393
- def gen_full_name(first_names, last_names, with_suffix=True):
394
- first = random.choice(first_names)
395
- middle = random.choice(MIDDLE_NAMES) if MIDDLE_NAMES else random.choice(last_names)
396
- last = random.choice(last_names)
397
- suffix = random.choice(SUFFIXES) if with_suffix else ''
398
- name = f"{first} {middle} {last}"
399
- if suffix:
400
- name += f" {suffix}"
401
- return name
402
-
403
-
404
- def gen_first_name(first_names):
405
- return random.choice(first_names)
406
-
407
-
408
- def gen_last_name(last_names):
409
- return random.choice(last_names)
410
-
411
-
412
- def gen_middle_name(last_names):
413
- # Always draw from MIDDLE_NAMES (700+ entries from ph_names.json)
414
- pool = MIDDLE_NAMES if MIDDLE_NAMES else last_names
415
- return random.choice(pool)
416
-
417
-
418
- def gen_date_slash():
419
- month = random.randint(1, 12)
420
- day = random.randint(1, 28)
421
- year = random.randint(1930, 2024)
422
- return f"{month:02d}/{day:02d}/{year}"
423
-
424
-
425
- def gen_date_long():
426
- month = random.choice(MONTHS)
427
- day = random.randint(1, 28)
428
- year = random.randint(1930, 2024)
429
- return f"{month} {day}, {year}"
430
-
431
-
432
- def gen_date_day():
433
- return str(random.randint(1, 28))
434
-
435
-
436
- def gen_date_month():
437
- return random.choice(MONTHS)
438
-
439
-
440
- def gen_date_year():
441
- return str(random.randint(1930, 2024))
442
-
443
-
444
- def gen_age():
445
- return str(random.randint(1, 95))
446
-
447
-
448
- def gen_place_full():
449
- return (f"{random.choice(BARANGAYS)}, "
450
- f"{random.choice(CITIES)}, "
451
- f"{random.choice(PROVINCES)}")
452
-
453
-
454
- def gen_place_city():
455
- return random.choice(CITIES)
456
-
457
-
458
- def gen_place_province():
459
- return random.choice(PROVINCES)
460
-
461
-
462
- def gen_address():
463
- num = random.randint(1, 999)
464
- st = random.choice(STREETS)
465
- return f"{num} {st}, {random.choice(CITIES)}"
466
-
467
-
468
- def gen_registry_no():
469
- year = random.randint(2000, 2024)
470
- seq = random.randint(1, 9999)
471
- return f"{year}-{seq:04d}"
472
-
473
-
474
- def gen_sex():
475
- return random.choice(['Male', 'Female'])
476
-
477
-
478
- def gen_religion():
479
- return random.choice(RELIGIONS)
480
-
481
-
482
- def gen_occupation():
483
- return random.choice(OCCUPATIONS)
484
-
485
-
486
- def gen_civil_status():
487
- return random.choice(CIVIL_STATUSES)
488
-
489
-
490
- def gen_citizenship():
491
- return random.choice(CITIZENSHIPS)
492
-
493
-
494
- def gen_weight():
495
- return f"{random.randint(1500, 4500)} grams"
496
-
497
-
498
- def gen_death_cause():
499
- return random.choice(DEATH_CAUSES)
500
-
501
-
502
- def gen_attendant():
503
- return random.choice(ATTENDANT_TYPES)
504
-
505
-
506
- # ─────────────────────────────────────────────────────────────────────────────
507
- # FORM FIELD DEFINITIONS
508
- # ─────────────────────────────────────────────────────────────────────────────
509
-
510
- def get_form_fields(form_type, first_names, last_names):
511
- fn = first_names
512
- ln = last_names
513
-
514
- if form_type == 'form1a': # Birth Certificate
515
- return [
516
- ('province', lambda: gen_place_province()),
517
- ('registry_no', lambda: gen_registry_no()),
518
- ('city_municipality', lambda: gen_place_city()),
519
- ('child_first_name', lambda: gen_first_name(fn)),
520
- ('child_middle_name', lambda: gen_middle_name(ln)),
521
- ('child_last_name', lambda: gen_last_name(ln)),
522
- ('sex', lambda: gen_sex()),
523
- ('dob_day', lambda: gen_date_day()),
524
- ('dob_month', lambda: gen_date_month()),
525
- ('dob_year', lambda: gen_date_year()),
526
- ('place_birth_hospital', lambda: f"Ospital ng {gen_place_city()}"),
527
- ('place_birth_city', lambda: gen_place_city()),
528
- ('place_birth_province', lambda: gen_place_province()),
529
- ('weight_at_birth', lambda: gen_weight()),
530
- ('type_of_birth', lambda: random.choice(['Single', 'Twin', 'Triplet'])),
531
- ('mother_first_name', lambda: gen_first_name(fn)),
532
- ('mother_middle_name', lambda: gen_middle_name(ln)),
533
- ('mother_last_name', lambda: gen_last_name(ln)),
534
- ('mother_citizenship', lambda: gen_citizenship()),
535
- ('mother_religion', lambda: gen_religion()),
536
- ('mother_occupation', lambda: gen_occupation()),
537
- ('mother_age_at_birth', lambda: str(random.randint(16, 45))),
538
- ('mother_residence_house', lambda: gen_address()),
539
- ('mother_residence_city', lambda: gen_place_city()),
540
- ('mother_residence_province', lambda: gen_place_province()),
541
- ('father_first_name', lambda: gen_first_name(fn)),
542
- ('father_middle_name', lambda: gen_middle_name(ln)),
543
- ('father_last_name', lambda: gen_last_name(ln)),
544
- ('father_citizenship', lambda: gen_citizenship()),
545
- ('father_religion', lambda: gen_religion()),
546
- ('father_occupation', lambda: gen_occupation()),
547
- ('father_age_at_birth', lambda: str(random.randint(18, 55))),
548
- ('parents_marriage_month', lambda: gen_date_month()),
549
- ('parents_marriage_day', lambda: gen_date_day()),
550
- ('parents_marriage_year', lambda: gen_date_year()),
551
- ('parents_marriage_city', lambda: gen_place_city()),
552
- ('informant_name', lambda: gen_full_name(fn, ln, False)),
553
- ('informant_address', lambda: gen_address()),
554
- ('informant_date', lambda: gen_date_slash()),
555
- ]
556
-
557
- elif form_type == 'form2a': # Death Certificate
558
- return [
559
- ('province', lambda: gen_place_province()),
560
- ('registry_no', lambda: gen_registry_no()),
561
- ('city_municipality', lambda: gen_place_city()),
562
- ('deceased_first_name', lambda: gen_first_name(fn)),
563
- ('deceased_middle_name', lambda: gen_middle_name(ln)),
564
- ('deceased_last_name', lambda: gen_last_name(ln)),
565
- ('sex', lambda: gen_sex()),
566
- ('religion', lambda: gen_religion()),
567
- ('age_years', lambda: gen_age()),
568
- ('place_death_full', lambda: f"{gen_place_city()}, {gen_place_province()}"),
569
- ('dod_day', lambda: gen_date_day()),
570
- ('dod_month', lambda: gen_date_month()),
571
- ('dod_year', lambda: gen_date_year()),
572
- ('citizenship', lambda: gen_citizenship()),
573
- ('residence_full', lambda: gen_address()),
574
- ('civil_status', lambda: gen_civil_status()),
575
- ('occupation', lambda: gen_occupation()),
576
- ('cause_immediate', lambda: gen_death_cause()),
577
- ('cause_antecedent', lambda: gen_death_cause()),
578
- ('cause_underlying', lambda: gen_death_cause()),
579
- ('cause_other', lambda: gen_death_cause()),
580
- ('informant_name', lambda: gen_full_name(fn, ln, False)),
581
- ('informant_address', lambda: gen_address()),
582
- ('informant_date', lambda: gen_date_slash()),
583
- ]
584
-
585
- elif form_type == 'form3a': # Marriage Certificate
586
- return [
587
- ('province', lambda: gen_place_province()),
588
- ('city_municipality', lambda: gen_place_city()),
589
- ('registry_no', lambda: gen_registry_no()),
590
- ('husband_first_name', lambda: gen_first_name(fn)),
591
- ('husband_middle_name', lambda: gen_middle_name(ln)),
592
- ('husband_last_name', lambda: gen_last_name(ln)),
593
- ('wife_first_name', lambda: gen_first_name(fn)),
594
- ('wife_middle_name', lambda: gen_middle_name(ln)),
595
- ('wife_last_name', lambda: gen_last_name(ln)),
596
- ('husband_dob_day', lambda: gen_date_day()),
597
- ('husband_dob_month', lambda: gen_date_month()),
598
- ('husband_dob_year', lambda: gen_date_year()),
599
- ('husband_age', lambda: gen_age()),
600
- ('wife_dob_day', lambda: gen_date_day()),
601
- ('wife_dob_month', lambda: gen_date_month()),
602
- ('wife_dob_year', lambda: gen_date_year()),
603
- ('wife_age', lambda: gen_age()),
604
- ('husband_place_birth_city', lambda: gen_place_city()),
605
- ('husband_place_birth_province', lambda: gen_place_province()),
606
- ('wife_place_birth_city', lambda: gen_place_city()),
607
- ('wife_place_birth_province', lambda: gen_place_province()),
608
- ('husband_citizenship', lambda: gen_citizenship()),
609
- ('wife_citizenship', lambda: gen_citizenship()),
610
- ('husband_religion', lambda: gen_religion()),
611
- ('wife_religion', lambda: gen_religion()),
612
- ('husband_civil_status', lambda: gen_civil_status()),
613
- ('wife_civil_status', lambda: gen_civil_status()),
614
- ('husband_father_first', lambda: gen_first_name(fn)),
615
- ('husband_father_last', lambda: gen_last_name(ln)),
616
- ('wife_father_first', lambda: gen_first_name(fn)),
617
- ('wife_father_last', lambda: gen_last_name(ln)),
618
- ('husband_mother_first', lambda: gen_first_name(fn)),
619
- ('husband_mother_last', lambda: gen_last_name(ln)),
620
- ('wife_mother_first', lambda: gen_first_name(fn)),
621
- ('wife_mother_last', lambda: gen_last_name(ln)),
622
- ('place_marriage_city', lambda: gen_place_city()),
623
- ('place_marriage_province', lambda: gen_place_province()),
624
- ('date_marriage_day', lambda: gen_date_day()),
625
- ('date_marriage_month', lambda: gen_date_month()),
626
- ('date_marriage_year', lambda: gen_date_year()),
627
- ]
628
-
629
- elif form_type == 'form90': # Marriage License Application
630
- return [
631
- ('province', lambda: gen_place_province()),
632
- ('city_municipality', lambda: gen_place_city()),
633
- ('registry_no', lambda: gen_registry_no()),
634
- ('husband_first_name', lambda: gen_first_name(fn)),
635
- ('husband_middle_name', lambda: gen_middle_name(ln)),
636
- ('husband_last_name', lambda: gen_last_name(ln)),
637
- ('wife_first_name', lambda: gen_first_name(fn)),
638
- ('wife_middle_name', lambda: gen_middle_name(ln)),
639
- ('wife_last_name', lambda: gen_last_name(ln)),
640
- ('husband_age', lambda: gen_age()),
641
- ('wife_age', lambda: gen_age()),
642
- ('husband_citizenship', lambda: gen_citizenship()),
643
- ('wife_citizenship', lambda: gen_citizenship()),
644
- ('husband_residence', lambda: gen_address()),
645
- ('wife_residence', lambda: gen_address()),
646
- ('application_date', lambda: gen_date_slash()),
647
- ]
648
-
649
- return []
650
-
651
-
652
- # ─────────────────────────────────────────────────────────────────────────────
653
- # MAIN GENERATOR
654
- # ─────────────────────────────────────────────────────────────────────────────
655
-
656
- def generate_dataset():
657
- print("=" * 65)
658
- print(" fix_data.py β€” Synthetic Training Data Generator")
659
- print("=" * 65)
660
-
661
- # Load Filipino names
662
- print("\n[1/4] Loading Filipino names...")
663
- first_names, last_names, middle_names = load_ph_names()
664
-
665
- # Populate global MIDDLE_NAMES so all generators use the full 700+ pool
666
- global MIDDLE_NAMES
667
- MIDDLE_NAMES.clear()
668
- MIDDLE_NAMES.extend(middle_names)
669
- print(f" Middle names pool active: {len(MIDDLE_NAMES)} entries")
670
-
671
- # Create output directories
672
- print("\n[2/4] Creating output directories...")
673
- for split in ['train', 'val']:
674
- for form in ['form1a', 'form2a', 'form3a', 'form90']:
675
- Path(f'data/{split}/{form}').mkdir(parents=True, exist_ok=True)
676
- print(" βœ“ Directories ready")
677
-
678
- # Load font pool β€” multiple typefaces so model generalises across fonts
679
- print("\n[3/4] Loading fonts...")
680
- font_pool = load_font_pool(FONT_SIZE)
681
- print(f" βœ“ {len(font_pool)} font(s) loaded")
682
-
683
- # Generate images
684
- print("\n[4/4] Generating images...")
685
- print(f" {'Form':<10} {'Total':>7} {'Train':>7} {'Val':>7}")
686
- print(f" {'-'*35}")
687
-
688
- train_annotations = []
689
- val_annotations = []
690
- total_generated = 0
691
-
692
- for form_type, n_samples in SAMPLES_PER_FORM.items():
693
- fields = get_form_fields(form_type, first_names, last_names)
694
- samples_per_field = max(1, n_samples // len(fields))
695
- form_train = 0
696
- form_val = 0
697
-
698
- # Pre-build shuffled val assignment for unbiased 10% split
699
- total_this_form = samples_per_field * len(fields)
700
- val_flags = [False] * total_this_form
701
- val_indices = random.sample(
702
- range(total_this_form),
703
- max(1, int(total_this_form * VAL_SPLIT))
704
- )
705
- for vi in val_indices:
706
- val_flags[vi] = True
707
-
708
- img_idx = 0
709
- for field_name, generator in fields:
710
- for _ in range(samples_per_field):
711
- text = generator()
712
- if not text or not text.strip():
713
- img_idx += 1
714
- continue
715
-
716
- # 70% handwriting / 30% printed
717
- use_handwriting = random.random() < 0.70
718
- # Pick a random font from the pool each image β€” forces
719
- # the model to generalise across typefaces, not memorise one font
720
- font = random.choice(font_pool)
721
- img = render_text_image(text, font, handwriting=use_handwriting)
722
- fname = f"{field_name}_{img_idx:06d}.jpg"
723
-
724
- is_val = val_flags[img_idx] if img_idx < len(val_flags) else False
725
-
726
- if is_val:
727
- out_path = f"data/val/{form_type}/{fname}"
728
- val_annotations.append({
729
- 'image_path': f"{form_type}/{fname}",
730
- 'text': text,
731
- })
732
- form_val += 1
733
- else:
734
- out_path = f"data/train/{form_type}/{fname}"
735
- train_annotations.append({
736
- 'image_path': f"{form_type}/{fname}",
737
- 'text': text,
738
- })
739
- form_train += 1
740
-
741
- img.save(out_path, quality=95)
742
- img_idx += 1
743
-
744
- total_generated += form_train + form_val
745
- print(f" {form_type:<10} {form_train + form_val:>7,} "
746
- f"{form_train:>7,} {form_val:>7,}")
747
-
748
- # Save annotation files
749
- with open('data/train_annotations.json', 'w', encoding='utf-8') as f:
750
- json.dump(train_annotations, f, indent=2, ensure_ascii=False)
751
-
752
- with open('data/val_annotations.json', 'w', encoding='utf-8') as f:
753
- json.dump(val_annotations, f, indent=2, ensure_ascii=False)
754
-
755
- # Summary
756
- print(f"\n{'=' * 65}")
757
- print(f" DONE!")
758
- print(f"{'=' * 65}")
759
- print(f" Total images generated : {total_generated:,}")
760
- print(f" Train images : {len(train_annotations):,}")
761
- print(f" Val images : {len(val_annotations):,}")
762
- print(f"\n Saved:")
763
- print(f" data/train_annotations.json ({len(train_annotations)} entries)")
764
- print(f" data/val_annotations.json ({len(val_annotations)} entries)")
765
- print(f"\n Next step: python train.py")
766
- print(f"{'=' * 65}")
767
-
768
-
769
- if __name__ == '__main__':
770
- generate_dataset()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/generate_form_samples.py DELETED
@@ -1,389 +0,0 @@
1
- """
2
- generate_form_samples.py
3
- ========================
4
- Generates thousands of synthetic filled civil registry form images
5
- using the blank PDF forms + template_matcher.py coordinates.
6
-
7
- Each form is filled with random Filipino names/dates in handwriting fonts.
8
- Crops are saved with labels β†’ ready for CRNN+CTC fine-tuning.
9
-
10
- Usage:
11
- python generate_form_samples.py
12
-
13
- Output:
14
- data/train/real_forms/ -- cropped field images
15
- data/real_annotations.json -- labels for fine-tuning
16
- """
17
-
18
- import os
19
- import sys
20
- import json
21
- import random
22
- import datetime
23
-
24
- from PIL import Image, ImageDraw, ImageFont
25
-
26
- # ── Paths ─────────────────────────────────────────────────────
27
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
- ROOT_DIR = os.path.dirname(BASE_DIR)
29
- PYTHON_DIR = ROOT_DIR # template_matcher.py is here
30
-
31
- NAMES_FILE = os.path.join(BASE_DIR, 'data', 'ph_names.json')
32
- OUT_IMG_DIR = os.path.join(BASE_DIR, 'data', 'train', 'real_forms')
33
- OUT_ANN = os.path.join(BASE_DIR, 'data', 'real_annotations.json')
34
-
35
- FONTS_DIR = os.path.join(ROOT_DIR, 'test_images', 'handwriting_fonts')
36
-
37
- # Only verified-working Google Fonts URLs
38
- GOOGLE_FONTS = {
39
- 'Kalam-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Regular.ttf',
40
- 'Kalam-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Bold.ttf',
41
- 'Kalam-Light.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Light.ttf',
42
- 'PatrickHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/patrickhand/PatrickHand-Regular.ttf',
43
- 'IndieFlower-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/indieflower/IndieFlower-Regular.ttf',
44
- 'Handlee-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/handlee/Handlee-Regular.ttf',
45
- 'GochiHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/gochihand/GochiHand-Regular.ttf',
46
- 'ArchitectsDaughter.ttf': 'https://github.com/google/fonts/raw/main/ofl/architectsdaughter/ArchitectsDaughter-Regular.ttf',
47
- 'ShadowsIntoLight.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolight/ShadowsIntoLight.ttf',
48
- 'ShadowsIntoLightTwo.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolighttwo/ShadowsIntoLightTwo-Regular.ttf',
49
- 'Kristi-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kristi/Kristi-Regular.ttf',
50
- 'AmaticSC-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Regular.ttf',
51
- 'AmaticSC-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Bold.ttf',
52
- 'BadScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/badscript/BadScript-Regular.ttf',
53
- 'Sacramento-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/sacramento/Sacramento-Regular.ttf',
54
- 'GreatVibes-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/greatvibes/GreatVibes-Regular.ttf',
55
- 'Allura-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/allura/Allura-Regular.ttf',
56
- 'AlexBrush-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/alexbrush/AlexBrush-Regular.ttf',
57
- 'Parisienne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/parisienne/Parisienne-Regular.ttf',
58
- 'Tangerine-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Regular.ttf',
59
- 'Tangerine-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Bold.ttf',
60
- 'Courgette-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/courgette/Courgette-Regular.ttf',
61
- 'Niconne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/niconne/Niconne-Regular.ttf',
62
- 'MarckScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/marckscript/MarckScript-Regular.ttf',
63
- 'Norican-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/norican/Norican-Regular.ttf',
64
- 'Damion-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/damion/Damion-Regular.ttf',
65
- 'Satisfy-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/satisfy/Satisfy-Regular.ttf',
66
- 'Pacifico-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pacifico/Pacifico-Regular.ttf',
67
- 'Italianno-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/italianno/Italianno-Regular.ttf',
68
- 'Pompiere-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pompiere/Pompiere-Regular.ttf',
69
- }
70
-
71
- FONT_PATHS = [
72
- # Downloaded handwriting fonts
73
- *[os.path.join(FONTS_DIR, name) for name in GOOGLE_FONTS],
74
- # Already available
75
- os.path.join(ROOT_DIR, 'test_images', 'Caveat-Regular.ttf'),
76
- # Windows fallbacks
77
- r'C:\Windows\Fonts\segoepr.ttf',
78
- r'C:\Windows\Fonts\segoeprb.ttf',
79
- r'C:\Windows\Fonts\comic.ttf',
80
- ]
81
-
82
- def download_fonts():
83
- """Download handwriting fonts from Google Fonts if not present."""
84
- import urllib.request
85
- os.makedirs(FONTS_DIR, exist_ok=True)
86
- ok = 0
87
- for fname, url in GOOGLE_FONTS.items():
88
- dest = os.path.join(FONTS_DIR, fname)
89
- if os.path.exists(dest) and os.path.getsize(dest) > 10000:
90
- ok += 1
91
- continue
92
- try:
93
- print(f" Downloading {fname}...")
94
- with urllib.request.urlopen(url, timeout=10) as r, open(dest, 'wb') as f:
95
- f.write(r.read())
96
- # Validate: real TTF files are > 10KB
97
- if os.path.getsize(dest) < 10000:
98
- os.remove(dest)
99
- print(f" Skipped {fname} (invalid file)")
100
- else:
101
- ok += 1
102
- except Exception as e:
103
- print(f" Failed {fname}: {e}")
104
- if os.path.exists(dest):
105
- os.remove(dest)
106
- print(f" {ok} fonts ready")
107
-
108
- PDF_FORMS = {
109
- '97': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 97 (MARRIAGE CERTIFICATE).pdf'),
110
- '102': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 102 (BIRTH CERTIFICATE).pdf'),
111
- '103': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 103 (DEATH CERTIFICATE).pdf'),
112
- '90': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 90-MARRIAGE-LICENCE-FORM.pdf'),
113
- }
114
-
115
- SAMPLES_PER_FORM = 1000 # forms to generate per type
116
- IMG_W = 64
117
- IMG_H = 512
118
-
119
- # ── Load TEMPLATES from template_matcher ─────────────────────
120
- sys.path.insert(0, PYTHON_DIR)
121
- from template_matcher import TEMPLATES
122
-
123
- # ── Load Filipino names ───────────────────────────────────────
124
- def load_names():
125
- if not os.path.exists(NAMES_FILE):
126
- print(f"ERROR: {NAMES_FILE} not found. Run generate_ph_names.py first.")
127
- sys.exit(1)
128
- with open(NAMES_FILE) as f:
129
- data = json.load(f)
130
- return data
131
-
132
- # ── Random data generators ────────────────────────────────────
133
- MONTHS = ['January','February','March','April','May','June',
134
- 'July','August','September','October','November','December']
135
- RELIGIONS = ['Roman Catholic','Islam','Baptist','Iglesia ni Cristo',
136
- 'Seventh Day Adventist','Born Again Christian']
137
- CIVIL_STATUSES = ['Single','Married','Widowed','Legally Separated']
138
- CITIZENSHIPS = ['Filipino','American','Chinese','Japanese']
139
- PROVINCES = ['Cebu','Davao del Sur','Metro Manila','Iloilo','Pampanga',
140
- 'Batangas','Laguna','Cavite','Bulacan','Quezon City']
141
- CITIES = ['Cebu City','Davao City','Manila','Iloilo City','San Fernando',
142
- 'Batangas City','Santa Rosa','Bacoor','Malolos','Quezon City']
143
-
144
- def rand_name(names, key):
145
- pool = names.get(key, ['Juan'])
146
- return random.choice(pool).upper()
147
-
148
- def rand_date():
149
- y = random.randint(1950, 2005)
150
- m = random.randint(1, 12)
151
- d = random.randint(1, 28)
152
- return f"{d:02d}", MONTHS[m-1], str(y)
153
-
154
- def rand_age():
155
- return str(random.randint(18, 80))
156
-
157
- def rand_province():
158
- return random.choice(PROVINCES).upper()
159
-
160
- def rand_city():
161
- return random.choice(CITIES).upper()
162
-
163
- def rand_religion():
164
- return random.choice(RELIGIONS).upper()
165
-
166
- def rand_civil_status():
167
- return random.choice(CIVIL_STATUSES).upper()
168
-
169
- def rand_citizenship():
170
- return random.choice(CITIZENSHIPS).upper()
171
-
172
- def rand_registry_no():
173
- return f"{random.randint(2000,2024)}-{random.randint(1000,9999)}"
174
-
175
- def rand_time():
176
- h = random.randint(6, 18)
177
- m = random.choice(['00','15','30','45'])
178
- return f"{h:02d}:{m} {'AM' if h < 12 else 'PM'}"
179
-
180
- def generate_field_value(field_name, names):
181
- """Generate a plausible random value for a given field name."""
182
- f = field_name.lower()
183
- if 'province' in f: return rand_province()
184
- if 'registry' in f: return rand_registry_no()
185
- if 'city' in f or 'municipality' in f: return rand_city()
186
- if 'first' in f and ('name' in f or 'father' in f or 'mother' in f):
187
- return rand_name(names, 'first')
188
- if 'middle' in f: return rand_name(names, 'middle')
189
- if 'last' in f: return rand_name(names, 'last')
190
- if '_name' in f and 'father' not in f and 'mother' not in f:
191
- return rand_name(names, 'first')
192
- if 'father_name' in f or 'mother_name' in f:
193
- return f"{rand_name(names,'first')} {rand_name(names,'middle')} {rand_name(names,'last')}"
194
- if 'dob_day' in f or 'day' in f: return rand_date()[0]
195
- if 'dob_month' in f or 'month' in f: return rand_date()[1]
196
- if 'dob_year' in f or 'year' in f: return rand_date()[2]
197
- if 'dob' in f and 'day' not in f and 'month' not in f and 'year' not in f:
198
- d,m,y = rand_date(); return f"{d} {m} {y}"
199
- if 'age' in f: return rand_age()
200
- if 'birth' in f and 'place' in f: return rand_city()
201
- if 'place_of_birth' in f: return rand_city()
202
- if 'sex' in f: return random.choice(['MALE','FEMALE'])
203
- if 'citizenship' in f: return rand_citizenship()
204
- if 'residence' in f: return f"{rand_city()}, {rand_province()}"
205
- if 'religion' in f: return rand_religion()
206
- if 'civil_status' in f: return rand_civil_status()
207
- if 'place_of_marriage' in f: return rand_city()
208
- if 'date_of_marriage' in f:
209
- d,m,y = rand_date(); return f"{d} {m} {y}"
210
- if 'time_of_marriage' in f: return rand_time()
211
- if 'marriage_date' in f:
212
- d,m,y = rand_date(); return f"{d} {m} {y}"
213
- if 'marriage_place' in f: return rand_city()
214
- if 'marriage_license' in f: return rand_registry_no()
215
- if 'date_issued' in f:
216
- d,m,y = rand_date(); return f"{d} {m} {y}"
217
- if 'occupation' in f: return random.choice(['FARMER','TEACHER','NURSE','ENGINEER','DRIVER','HOUSEWIFE'])
218
- if 'type_of_birth' in f: return random.choice(['SINGLE','TWIN','TRIPLET'])
219
- if 'birth_order' in f: return random.choice(['1ST','2ND','3RD','4TH'])
220
- if 'weight' in f: return f"{random.randint(2,5)}.{random.randint(0,9)} KG"
221
- if 'cause' in f: return random.choice(['CARDIAC ARREST','PNEUMONIA','DIABETES','HYPERTENSION'])
222
- if 'father_name' in f: return f"{rand_name(names,'first')} {rand_name(names,'last')}"
223
- if 'mother_name' in f: return f"{rand_name(names,'first')} {rand_name(names,'last')}"
224
- return rand_name(names, 'first')
225
-
226
- # ── Load fonts ────────────────────────────────────────────────
227
- def load_fonts():
228
- fonts = []
229
- for path in FONT_PATHS:
230
- if os.path.exists(path):
231
- for size in [14, 16, 18, 20]:
232
- try:
233
- fonts.append(ImageFont.truetype(path, size))
234
- except:
235
- pass
236
- if not fonts:
237
- fonts = [ImageFont.load_default()]
238
- print(f" Loaded {len(fonts)} font variants")
239
- return fonts
240
-
241
- # ── Load blank form image ─────────────────────────────────────
242
- def load_blank_form(form_type):
243
- """Convert PDF to image or use a reference scan as background."""
244
- pdf_path = PDF_FORMS.get(form_type)
245
-
246
- # Try pdf2image first
247
- if pdf_path and os.path.exists(pdf_path):
248
- try:
249
- from pdf2image import convert_from_path
250
- pages = convert_from_path(pdf_path, dpi=150)
251
- if pages:
252
- return pages[0].convert('RGB')
253
- except Exception as e:
254
- print(f" pdf2image failed: {e}")
255
-
256
- # Fallback: use reference image (try png, jpg, jpeg)
257
- for ext in ['png', 'jpg', 'jpeg']:
258
- ref_path = os.path.join(ROOT_DIR, 'references', f'reference_{form_type}.{ext}')
259
- if os.path.exists(ref_path):
260
- return Image.open(ref_path).convert('RGB')
261
- # Also try hyphen variant (e.g. reference-90.jpg)
262
- for ext in ['png', 'jpg', 'jpeg']:
263
- ref_path = os.path.join(ROOT_DIR, 'references', f'reference-{form_type}.{ext}')
264
- if os.path.exists(ref_path):
265
- return Image.open(ref_path).convert('RGB')
266
-
267
- print(f" WARNING: No blank form found for {form_type} β€” skipping")
268
- return None
269
-
270
- # ── Render text on form ───────────────────────────────────────
271
- def render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts):
272
- """Draw handwritten-style text in a field box."""
273
- x1 = int(x1r * img_w)
274
- y1 = int(y1r * img_h)
275
- x2 = int(x2r * img_w)
276
- y2 = int(y2r * img_h)
277
-
278
- box_w = max(x2 - x1, 1)
279
- box_h = max(y2 - y1, 1)
280
-
281
- # Pick a font that fits
282
- font = random.choice(fonts)
283
- for f in fonts:
284
- bbox = f.getbbox(text)
285
- fw = bbox[2] - bbox[0]
286
- fh = bbox[3] - bbox[1]
287
- if fw <= box_w * 0.95 and fh <= box_h * 1.2:
288
- font = f
289
- break
290
-
291
- # Random pen color (dark blue/black like ballpen)
292
- r = random.randint(0, 40)
293
- g = random.randint(0, 40)
294
- b = random.randint(60, 120)
295
- color = (r, g, b)
296
-
297
- # Center text vertically in box
298
- bbox = font.getbbox(text)
299
- fh = bbox[3] - bbox[1]
300
- ty = y1 + (box_h - fh) // 2
301
-
302
- # Slight random x offset
303
- tx = x1 + random.randint(2, max(3, box_w // 10))
304
-
305
- draw.text((tx, ty), text, fill=color, font=font)
306
-
307
- # ── Crop a field ──────────────────────────────────────────────
308
- def crop_field(img, x1r, y1r, x2r, y2r):
309
- w, h = img.size
310
- x1 = max(0, int(x1r * w) - 4)
311
- y1 = max(0, int(y1r * h) - 4)
312
- x2 = min(w, int(x2r * w) + 4)
313
- y2 = min(h, int(y2r * h) + 4)
314
- return img.crop((x1, y1, x2, y2))
315
-
316
- # ── Main ──────────────────────────────────────────────────────
317
- def main():
318
- print("=" * 60)
319
- print(" Form Sample Generator")
320
- print("=" * 60)
321
-
322
- os.makedirs(OUT_IMG_DIR, exist_ok=True)
323
- print("\n Downloading handwriting fonts...")
324
- download_fonts()
325
- names = load_names()
326
- fonts = load_fonts()
327
- annotations = []
328
- total = 0
329
-
330
- for form_type, template in TEMPLATES.items():
331
- print(f"\n Generating Form {form_type}...")
332
-
333
- blank = load_blank_form(form_type)
334
- if blank is None:
335
- continue
336
-
337
- for i in range(SAMPLES_PER_FORM):
338
- # Fresh copy of blank form
339
- form_img = blank.copy()
340
- draw = ImageDraw.Draw(form_img)
341
- img_w, img_h = form_img.size
342
-
343
- field_values = {}
344
- for field_name, coords in template.items():
345
- x1r, y1r, x2r, y2r, _ = coords
346
- text = generate_field_value(field_name, names)
347
- field_values[field_name] = text
348
- render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts)
349
-
350
- # Save full form preview (first sample only)
351
- if i == 0:
352
- preview_path = os.path.join(OUT_IMG_DIR, f'form{form_type}_preview.png')
353
- form_img.save(preview_path)
354
- print(f" Preview saved: {preview_path}")
355
-
356
- # Crop each field and save
357
- for field_name, coords in template.items():
358
- x1r, y1r, x2r, y2r, _ = coords
359
- crop = crop_field(form_img, x1r, y1r, x2r, y2r)
360
- crop = crop.convert('L') # grayscale
361
-
362
- fname = f"form{form_type}_{i:05d}_{field_name}.png"
363
- fpath = os.path.join(OUT_IMG_DIR, fname)
364
- crop.save(fpath)
365
-
366
- annotations.append({
367
- "image_path": f"real_forms/{fname}",
368
- "text": field_values[field_name]
369
- })
370
- total += 1
371
-
372
- if (i + 1) % 100 == 0:
373
- print(f" {i+1}/{SAMPLES_PER_FORM} forms done ({total} crops so far)")
374
-
375
- print(f" Form {form_type} done.")
376
-
377
- # Save annotations
378
- with open(OUT_ANN, 'w') as f:
379
- json.dump(annotations, f, indent=2)
380
-
381
- print(f"\n{'='*60}")
382
- print(f" DONE!")
383
- print(f" Total crops : {total}")
384
- print(f" Annotations : {OUT_ANN}")
385
- print(f" Next step : upload to Kaggle and run fine-tune")
386
- print(f"{'='*60}")
387
-
388
- if __name__ == '__main__':
389
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/inference.py CHANGED
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
- checkpoint_path : path to best_model.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
 
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
+ checkpoint_path : path to best_model_v4.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
debug_and_retrain.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import matplotlib.pyplot as plt
3
+
4
+ # Load and show the image
5
+ img = cv2.imread('your_image.png')
6
+ plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
7
+ plt.title('Original Image')
8
+ plt.show()
9
+
10
+ # Preprocess and show
11
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
12
+ thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
13
+ plt.imshow(thresh, cmap='gray')
14
+ plt.title('Thresholded Image')
15
+ plt.show()
16
+
17
+ # Run OCR and print output
18
+ import pytesseract
19
+ text = pytesseract.image_to_string(thresh)
20
+ print("OCR Output:", text)
finetune.py CHANGED
@@ -3,18 +3,20 @@ finetune.py
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
- Loads best_model_iam.pth (already knows real handwriting from IAM),
7
- then trains on real_annotations.json (Filipino names on real form backgrounds).
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
- checkpoints/best_model_final.pth
14
  """
15
 
16
  import os
17
  import sys
 
 
18
  import torch
19
  import torch.nn.functional as F
20
  import torch.optim as optim
@@ -25,13 +27,14 @@ from crnn_model import get_crnn_model
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
- CHECKPOINT_IN = "checkpoints/best_model_iam.pth"
29
- CHECKPOINT_OUT = "checkpoints/best_model_final.pth"
30
 
31
- REAL_ANN = "data/real_annotations.json" # generated by generate_form_samples.py
32
- ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms (extract_actual_data.py)
33
- SYNTH_ANN = "data/train_annotations.json" # original synthetic data
34
- VAL_ANN = "data/val_annotations.json" # validation set
 
35
 
36
  IMG_HEIGHT = 64
37
  IMG_WIDTH = 512
@@ -42,10 +45,26 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
  # ── Phase settings ────────────────────────────────────────────
43
  PHASES = [
44
  # (name, epochs, lr, freeze_cnn, patience)
45
- ("Phase 1 β€” CNN frozen, adapt to form crops", 20, 1e-4, True, 5),
46
- ("Phase 2 β€” Full model, low LR polish", 15, 1e-5, False, 4),
 
 
47
  ]
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ── Main ──────────────────────────────────────────────────────
50
  def main():
51
  print("=" * 60)
@@ -60,6 +79,11 @@ def main():
60
  print(f"ERROR: {f} not found.")
61
  sys.exit(1)
62
 
 
 
 
 
 
63
  # ── Datasets ──────────────────────────────────────────────
64
  datasets_to_merge = []
65
 
@@ -72,32 +96,23 @@ def main():
72
  datasets_to_merge.append(actual_dataset)
73
  print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
74
  else:
75
- print(f" [!] {ACTUAL_ANN} not found β€” run extract_actual_data.py first")
76
 
77
- # 2. Synthetic on real form backgrounds
78
- if os.path.exists(REAL_ANN):
79
- real_dataset = CivilRegistryDataset(
80
- data_dir="data/train", annotations_file=REAL_ANN,
81
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
82
- )
83
- datasets_to_merge.append(real_dataset)
84
- print(f" Real crops : {len(real_dataset)} (synthetic on real backgrounds)")
85
-
86
- # 3. Fully synthetic β€” keep so model doesn't forget basic characters
87
  if os.path.exists(SYNTH_ANN):
88
  synth_dataset = CivilRegistryDataset(
89
- data_dir="data/train", annotations_file=SYNTH_ANN,
90
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
91
  )
92
  datasets_to_merge.append(synth_dataset)
93
  print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
94
 
95
  if not datasets_to_merge:
96
- print("ERROR: No training data found. Run extract_actual_data.py first.")
97
  sys.exit(1)
98
 
99
  val_dataset = CivilRegistryDataset(
100
- data_dir="data/val", annotations_file=VAL_ANN,
101
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
102
  )
103
 
@@ -115,7 +130,7 @@ def main():
115
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
116
  config = ckpt.get('config', {})
117
 
118
- ref_dataset = datasets_to_merge[0] # use whichever dataset was loaded first
119
  model = get_crnn_model(
120
  model_type = config.get('model_type', 'standard'),
121
  img_height = config.get('img_height', 64),
@@ -144,8 +159,8 @@ def main():
144
  batch_size = images.size(0)
145
  if training:
146
  optimizer.zero_grad()
147
- outputs = F.log_softmax(model(images), dim=2)
148
- seq_len = outputs.size(0)
149
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
150
  loss = criterion(outputs, targets, input_lengths, target_lengths)
151
  if not torch.isnan(loss) and not torch.isinf(loss):
@@ -186,6 +201,7 @@ def main():
186
  if vl < best_overall:
187
  best_overall = vl
188
  torch.save({
 
189
  'model_state_dict': model.state_dict(),
190
  'config': config,
191
  'char_to_idx': ref_dataset.char_to_idx,
@@ -201,6 +217,11 @@ def main():
201
  print(f" Early stopping.")
202
  break
203
 
 
 
 
 
 
204
  print(f"\n{'='*60}")
205
  print(f" Fine-tuning complete!")
206
  print(f" Best val loss : {best_overall:.4f}")
@@ -209,4 +230,4 @@ def main():
209
 
210
 
211
  if __name__ == '__main__':
212
- main()
 
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
+ Continues from best_model_v2.pth, trains on actual_annotations.json
7
+ + train_annotations.json, saves to best_model_v4.pth.
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
+ checkpoints/best_model_v4.pth
14
  """
15
 
16
  import os
17
  import sys
18
+ import json
19
+ import shutil
20
  import torch
21
  import torch.nn.functional as F
22
  import torch.optim as optim
 
27
  from dataset import CivilRegistryDataset, collate_fn
28
 
29
  # ── Config ────────────────────────────────────────────────────
30
+ CHECKPOINT_IN = "checkpoints/best_model_v3.pth"
31
+ CHECKPOINT_OUT = "checkpoints/best_model_v4.pth"
32
 
33
+ ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
34
+ SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
35
+ VAL_ANN = "data/val_annotations.json" # validation set
36
+
37
+ DRIVE_BACKUP = "/content/drive/MyDrive/crnn_finetune/CRNN+CTC/checkpoints/best_model_v4.pth"
38
 
39
  IMG_HEIGHT = 64
40
  IMG_WIDTH = 512
 
45
  # ── Phase settings ────────────────────────────────────────────
46
  PHASES = [
47
  # (name, epochs, lr, freeze_cnn, patience)
48
+ ("Phase 1 β€” CNN frozen, warm up on actual crops", 20, 1e-4, True, 5),
49
+ ("Phase 2 β€” Full model, main training", 30, 1e-5, False, 6),
50
+ ("Phase 3 β€” Full model, slow burn", 30, 5e-6, False, 6),
51
+ ("Phase 4 β€” Full model, final polish", 20, 1e-6, False, 5),
52
  ]
53
 
54
+ # ── Fix Windows backslash paths ───────────────────────────────
55
+ def fix_paths(json_path):
56
+ with open(json_path) as f:
57
+ ann = json.load(f)
58
+ changed = False
59
+ for a in ann:
60
+ if 'image_path' in a and '\\' in a['image_path']:
61
+ a['image_path'] = a['image_path'].replace('\\', '/')
62
+ changed = True
63
+ if changed:
64
+ with open(json_path, 'w') as f:
65
+ json.dump(ann, f)
66
+ print(f" Fixed backslash paths in {json_path}")
67
+
68
  # ── Main ──────────────────────────────────────────────────────
69
  def main():
70
  print("=" * 60)
 
79
  print(f"ERROR: {f} not found.")
80
  sys.exit(1)
81
 
82
+ # ── Fix backslash paths ───────────────────────────────────
83
+ for ann_file in [ACTUAL_ANN, SYNTH_ANN, VAL_ANN]:
84
+ if os.path.exists(ann_file):
85
+ fix_paths(ann_file)
86
+
87
  # ── Datasets ──────────────────────────────────────────────
88
  datasets_to_merge = []
89
 
 
96
  datasets_to_merge.append(actual_dataset)
97
  print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
98
  else:
99
+ print(f" [!] {ACTUAL_ANN} not found")
100
 
101
+ # 2. Fully synthetic β€” keep so model doesn't forget basic characters
 
 
 
 
 
 
 
 
 
102
  if os.path.exists(SYNTH_ANN):
103
  synth_dataset = CivilRegistryDataset(
104
+ data_dir=".", annotations_file=SYNTH_ANN,
105
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
106
  )
107
  datasets_to_merge.append(synth_dataset)
108
  print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
109
 
110
  if not datasets_to_merge:
111
+ print("ERROR: No training data found.")
112
  sys.exit(1)
113
 
114
  val_dataset = CivilRegistryDataset(
115
+ data_dir=".", annotations_file=VAL_ANN,
116
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
117
  )
118
 
 
130
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
131
  config = ckpt.get('config', {})
132
 
133
+ ref_dataset = datasets_to_merge[0]
134
  model = get_crnn_model(
135
  model_type = config.get('model_type', 'standard'),
136
  img_height = config.get('img_height', 64),
 
159
  batch_size = images.size(0)
160
  if training:
161
  optimizer.zero_grad()
162
+ outputs = F.log_softmax(model(images), dim=2)
163
+ seq_len = outputs.size(0)
164
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
165
  loss = criterion(outputs, targets, input_lengths, target_lengths)
166
  if not torch.isnan(loss) and not torch.isinf(loss):
 
201
  if vl < best_overall:
202
  best_overall = vl
203
  torch.save({
204
+ **ckpt,
205
  'model_state_dict': model.state_dict(),
206
  'config': config,
207
  'char_to_idx': ref_dataset.char_to_idx,
 
217
  print(f" Early stopping.")
218
  break
219
 
220
+ # ── Drive backup ──────────────────────────────────────────
221
+ if os.path.exists(CHECKPOINT_OUT) and os.path.exists(os.path.dirname(DRIVE_BACKUP)):
222
+ shutil.copy(CHECKPOINT_OUT, DRIVE_BACKUP)
223
+ print(f"\n Backed up to Drive: {DRIVE_BACKUP}")
224
+
225
  print(f"\n{'='*60}")
226
  print(f" Fine-tuning complete!")
227
  print(f" Best val loss : {best_overall:.4f}")
 
230
 
231
 
232
  if __name__ == '__main__':
233
+ main()
inference.py CHANGED
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
- checkpoint_path : path to best_model.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
@@ -340,9 +340,9 @@ def demo_inference():
340
  print("=" * 70)
341
 
342
  ocr = CivilRegistryOCR(
343
- checkpoint_path='checkpoints/best_model.pth',
344
  device='cuda',
345
- mode='auto',
346
  verbose=True # shows which mode each image triggers
347
  )
348
 
 
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
+ checkpoint_path : path to best_model_v4.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
 
340
  print("=" * 70)
341
 
342
  ocr = CivilRegistryOCR(
343
+ checkpoint_path='checkpoints/best_model_v4.pth',
344
  device='cuda',
345
+ mode='adaptive', # force adaptive for demo images (many are zoomed/physical)
346
  verbose=True # shows which mode each image triggers
347
  )
348
 
spacyNER/debug_and_retrain.py DELETED
@@ -1,316 +0,0 @@
1
- #!/usr/bin/env python3
2
- # debug_and_retrain.py
3
- # ============================================================
4
- # USE THIS WHEN: training crashes with E024 or any span error
5
- #
6
- # WHAT IT DOES (in order):
7
- # 1. Checks all .spacy files for bad spans (whitespace, empty)
8
- # 2. Runs spaCy's official debug data command
9
- # 3. Deletes corrupted .spacy files so they get rebuilt clean
10
- # 4. Rebuilds: prepare_data β†’ funsd_integration β†’ train
11
- #
12
- # USAGE:
13
- # python debug_and_retrain.py ← full check + retrain
14
- # python debug_and_retrain.py --check ← check only, no retrain
15
- # python debug_and_retrain.py --retrain ← skip check, just retrain
16
- # ============================================================
17
-
18
- import subprocess
19
- import sys
20
- import argparse
21
- from pathlib import Path
22
-
23
-
24
- # ── All .spacy files to check ─────────────────────────────
25
- SPACY_FILES = {
26
- "train.spacy": "data/training/train.spacy",
27
- "dev.spacy": "data/training/dev.spacy",
28
- "funsd_train.spacy": "data/training/funsd_train.spacy",
29
- "funsd_dev.spacy": "data/training/funsd_dev.spacy",
30
- "merged_train.spacy": "data/training/merged_train.spacy",
31
- "merged_dev.spacy": "data/training/merged_dev.spacy",
32
- }
33
-
34
- # Files that get REBUILT (delete these before retraining)
35
- REBUILT_FILES = list(SPACY_FILES.values())
36
-
37
- CFG = "training/config.cfg"
38
-
39
-
40
- # ══════════════════════════════════════════════════════════
41
- # STEP 1 β€” INSPECT .spacy FILES FOR BAD SPANS
42
- # ══════════════════════════════════════════════════════════
43
-
44
- def inspect_spacy_file(path: str):
45
- """
46
- Load a .spacy file and scan every entity span for problems.
47
- Returns (total_docs, total_ents, bad_spans_list).
48
-
49
- Bad span types that cause E024:
50
- - Leading whitespace: span.text starts with ' ' or '\\n'
51
- - Trailing whitespace: span.text ends with ' ' or '\\n'
52
- - Empty span: span.text == ''
53
- - Punctuation-only: e.g. '.' or ','
54
- """
55
- import spacy
56
- from spacy.tokens import DocBin
57
-
58
- nlp = spacy.blank("en")
59
- db = DocBin().from_disk(path)
60
- docs = list(db.get_docs(nlp.vocab))
61
-
62
- total_ents = 0
63
- bad_spans = []
64
-
65
- for i, doc in enumerate(docs):
66
- for ent in doc.ents:
67
- total_ents += 1
68
- t = ent.text
69
-
70
- if not t.strip():
71
- bad_spans.append({
72
- "doc": i, "label": ent.label_, "text": repr(t),
73
- "reason": "EMPTY or whitespace-only"
74
- })
75
- elif t != t.strip():
76
- bad_spans.append({
77
- "doc": i, "label": ent.label_, "text": repr(t),
78
- "reason": f"WHITESPACE β€” leading={repr(t[0])} trailing={repr(t[-1])}"
79
- })
80
- elif len(t) == 1 and not t.isalnum():
81
- bad_spans.append({
82
- "doc": i, "label": ent.label_, "text": repr(t),
83
- "reason": "SINGLE PUNCTUATION CHAR"
84
- })
85
-
86
- return len(docs), total_ents, bad_spans
87
-
88
-
89
- def check_all_spacy_files():
90
- """Check every .spacy file and report problems."""
91
- try:
92
- import spacy
93
- except ImportError:
94
- print(" ❌ spaCy not installed. Run: pip install spacy")
95
- return False
96
-
97
- print("\n" + "=" * 62)
98
- print(" STEP 1 β€” SCANNING .spacy FILES FOR BAD SPANS")
99
- print("=" * 62)
100
-
101
- any_problems = False
102
-
103
- for name, path in SPACY_FILES.items():
104
- if not Path(path).exists():
105
- print(f"\n βšͺ {name:30s} not found β€” will be created")
106
- continue
107
-
108
- print(f"\n πŸ“„ {name}")
109
- try:
110
- n_docs, n_ents, bad = inspect_spacy_file(path)
111
- print(f" docs: {n_docs} entities: {n_ents} bad spans: {len(bad)}")
112
-
113
- if bad:
114
- any_problems = True
115
- print(f" ❌ {len(bad)} PROBLEM SPAN(S):")
116
- for b in bad[:10]: # show first 10
117
- print(f" doc {b['doc']:>3} [{b['label']}] {b['text']:30s} ← {b['reason']}")
118
- if len(bad) > 10:
119
- print(f" ... and {len(bad) - 10} more")
120
- else:
121
- print(f" βœ… All spans clean")
122
-
123
- except Exception as e:
124
- print(f" ❌ Could not read file: {e}")
125
- any_problems = True
126
-
127
- return any_problems
128
-
129
-
130
- # ══════════════════════════════════════════════════════════
131
- # STEP 2 β€” spaCy OFFICIAL DEBUG DATA
132
- # ═══════════════════════════════════════���══════════════════
133
-
134
- def run_spacy_debug():
135
- """
136
- Run spaCy's built-in debug data command.
137
- This catches problems our scanner might miss.
138
- """
139
- print("\n" + "=" * 62)
140
- print(" STEP 2 β€” spaCy OFFICIAL DEBUG DATA")
141
- print("=" * 62)
142
-
143
- train = "data/training/merged_train.spacy"
144
- dev = "data/training/merged_dev.spacy"
145
-
146
- # Fall back to civil-only if merged doesn't exist
147
- if not Path(train).exists():
148
- train = "data/training/train.spacy"
149
- dev = "data/training/dev.spacy"
150
-
151
- if not Path(train).exists():
152
- print("\n βšͺ No training data found yet β€” skipping debug.")
153
- print(" β†’ Run: python training/prepare_data.py first")
154
- return
155
-
156
- if not Path(CFG).exists():
157
- print(f"\n βšͺ Config not found: {CFG} β€” skipping debug.")
158
- return
159
-
160
- print(f"\n Checking: {train}")
161
- print(f" Dev: {dev}\n")
162
-
163
- result = subprocess.run([
164
- sys.executable, "-m", "spacy", "debug", "data", CFG,
165
- "--paths.train", train,
166
- "--paths.dev", dev,
167
- ])
168
-
169
- if result.returncode != 0:
170
- print("\n ⚠️ debug data found issues β€” see above.")
171
- else:
172
- print("\n βœ… debug data passed β€” no issues found.")
173
-
174
-
175
- # ══════════════════════════════════════════════════════════
176
- # STEP 3 β€” DELETE OLD .spacy FILES
177
- # ══════════════════════════════════════════════════════════
178
-
179
- def delete_spacy_files():
180
- """Delete all generated .spacy files so they get rebuilt clean."""
181
- print("\n" + "=" * 62)
182
- print(" STEP 3 β€” DELETING OLD .spacy FILES")
183
- print("=" * 62)
184
-
185
- deleted = 0
186
- for path in REBUILT_FILES:
187
- p = Path(path)
188
- if p.exists():
189
- p.unlink()
190
- print(f" πŸ—‘οΈ Deleted: {path}")
191
- deleted += 1
192
-
193
- if deleted == 0:
194
- print(" βšͺ Nothing to delete.")
195
- else:
196
- print(f"\n βœ… Deleted {deleted} file(s) β€” will be rebuilt clean.")
197
-
198
-
199
- # ══════════════════════════════════════════════════════════
200
- # STEP 4 β€” REBUILD + RETRAIN
201
- # ══════════════════════════════════════════════════════════
202
-
203
- def run_script(script: str, label: str) -> bool:
204
- """Run a training script. Returns True on success."""
205
- print(f"\n{'─' * 62}")
206
- print(f" β–Ά {label}")
207
- print(f" Script: {script}")
208
- print(f"{'─' * 62}\n")
209
-
210
- if not Path(script).exists():
211
- print(f" ❌ Script not found: {script}")
212
- return False
213
-
214
- result = subprocess.run([sys.executable, script])
215
- if result.returncode != 0:
216
- print(f"\n ❌ {label} failed.")
217
- return False
218
-
219
- print(f"\n βœ… {label} complete.")
220
- return True
221
-
222
-
223
- def retrain():
224
- """Run the full rebuild pipeline: prepare β†’ funsd β†’ train."""
225
- print("\n" + "=" * 62)
226
- print(" STEP 4 β€” REBUILD + RETRAIN")
227
- print("=" * 62)
228
-
229
- steps = [
230
- ("training/prepare_data.py", "Step 1/3: Build civil registry data"),
231
- ("training/funsd_integration.py", "Step 2/3: Merge FUNSD + civil registry"),
232
- ("training/train.py", "Step 3/3: Train NER model"),
233
- ]
234
-
235
- for script, label in steps:
236
- ok = run_script(script, label)
237
- if not ok:
238
- print(f"\n ❌ Pipeline stopped at: {script}")
239
- print(f" Fix the error above, then re-run:")
240
- print(f" python debug_and_retrain.py --retrain")
241
- sys.exit(1)
242
-
243
- print("\n" + "=" * 62)
244
- print(" βœ… RETRAIN COMPLETE")
245
- print("=" * 62)
246
- print("\n Best model β†’ models/civil_registry_model/model-best/")
247
- print("\n NEXT: python training/evaluate.py")
248
-
249
-
250
- # ══════════════════════════════════════════════════════════
251
- # MAIN
252
- # ══════════════════════════════════════════════════════════
253
-
254
- def main():
255
- parser = argparse.ArgumentParser(
256
- description="Debug FUNSD/civil data and retrain NER model"
257
- )
258
- parser.add_argument("--check", action="store_true",
259
- help="Check for bad spans only β€” don't retrain")
260
- parser.add_argument("--retrain", action="store_true",
261
- help="Skip check β€” delete old files and retrain immediately")
262
- args = parser.parse_args()
263
-
264
- print("\n" + "=" * 62)
265
- print(" CIVIL REGISTRY NER β€” DEBUG & RETRAIN")
266
- print("=" * 62)
267
- print("\n This script fixes the E024 'bad span' training error.")
268
- print(" Root causes: whitespace in spans, wrong alignment_mode,")
269
- print(" offset shift from text.strip() after build.")
270
-
271
- if args.retrain:
272
- # Skip checking β€” just delete and rebuild
273
- delete_spacy_files()
274
- retrain()
275
- return
276
-
277
- # ── Always run checks ─────────────────────────────────
278
- has_problems = check_all_spacy_files()
279
- run_spacy_debug()
280
-
281
- if args.check:
282
- # Check-only mode β€” stop here
283
- print("\n" + "=" * 62)
284
- if has_problems:
285
- print(" ⚠️ Problems found β€” run without --check to fix:")
286
- print(" python debug_and_retrain.py")
287
- else:
288
- print(" βœ… No problems found β€” safe to train:")
289
- print(" python training/train.py")
290
- print("=" * 62)
291
- return
292
-
293
- # ── Ask before deleting ───────────────────────────────
294
- print("\n" + "=" * 62)
295
- if has_problems:
296
- print(" ⚠️ Bad spans detected in .spacy files.")
297
- print(" The fixed funsd_integration.py will rebuild them cleanly.")
298
- else:
299
- print(" βœ… No bad spans detected in existing files.")
300
-
301
- print("\n Proceeding to delete old .spacy files and retrain...")
302
- print(" (Ctrl+C now to cancel)")
303
- print("=" * 62)
304
-
305
- try:
306
- input("\n Press ENTER to continue, Ctrl+C to cancel...\n")
307
- except KeyboardInterrupt:
308
- print("\n Cancelled.")
309
- return
310
-
311
- delete_spacy_files()
312
- retrain()
313
-
314
-
315
- if __name__ == "__main__":
316
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spacyNER/models/phase1_funsd/model-last/vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff
 
spacyNER/models/phase1_funsd/model-last/vocab/vectors.cfg CHANGED
@@ -1,3 +1,3 @@
1
- {
2
- "mode":"default"
3
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4359091952c8cd16f1f0482f5770fb82d1707368d5cca3c46aa501f552e3c5
3
+ size 22
template_matcher.py CHANGED
@@ -41,7 +41,7 @@ _CRNN_DIR = os.path.join(os.path.dirname(__file__), 'CRNN+CTC')
41
  if _CRNN_DIR not in _sys.path:
42
  _sys.path.insert(0, _CRNN_DIR)
43
 
44
- _CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'latest_checkpoint.pth')
45
  _crnn_ocr = None
46
  _crnn_decode = None # reference to decode_ctc_predictions
47
 
 
41
  if _CRNN_DIR not in _sys.path:
42
  _sys.path.insert(0, _CRNN_DIR)
43
 
44
+ _CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'best_model_v4.pth')
45
  _crnn_ocr = None
46
  _crnn_decode = None # reference to decode_ctc_predictions
47