Upload 124 files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.
- .gitattributes +37 -0
- CRNN+CTC/.env +1 -0
- CRNN+CTC/.gitignore +14 -0
- CRNN+CTC/IAM_train.py +332 -0
- CRNN+CTC/README.md +449 -0
- CRNN+CTC/calibrate_fields.py +196 -0
- CRNN+CTC/calibrated_fields.py +7 -0
- CRNN+CTC/check_cer.py +331 -0
- CRNN+CTC/checkpoints/best_model.pth +3 -0
- CRNN+CTC/checkpoints/best_model_final.pth +3 -0
- CRNN+CTC/checkpoints/best_model_iam.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v2.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v3.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v4.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v5.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v6.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v7.pth +3 -0
- CRNN+CTC/checkpoints/best_model_v732.pth +3 -0
- CRNN+CTC/checkpoints/checkpoint_epoch_10.pth +3 -0
- CRNN+CTC/checkpoints/latest_checkpoint.pth +3 -0
- CRNN+CTC/compare_checkpoints.py +34 -0
- CRNN+CTC/compare_live_cer.py +158 -0
- CRNN+CTC/create_test_images.py +50 -0
- CRNN+CTC/crnn_model.py +119 -0
- CRNN+CTC/dataset.py +401 -0
- CRNN+CTC/field_extractor.py +735 -0
- CRNN+CTC/finetune.py +202 -0
- CRNN+CTC/generate_ph_names.py +350 -0
- CRNN+CTC/inference.py +395 -0
- CRNN+CTC/prepare_emnist.py +97 -0
- CRNN+CTC/requirements.txt +61 -0
- CRNN+CTC/train.py +438 -0
- CRNN+CTC/train_emnist.py +15 -0
- CRNN+CTC/train_mnist.py +42 -0
- CRNN+CTC/train_with_emnist.py +169 -0
- CRNN+CTC/utils.py +397 -0
- MNB/__init__.py +4 -0
- MNB/classifier.py +292 -0
- MNB/form_classifier.py +466 -0
- MNB/keywords.py +127 -0
- MNB/mnb_metadata.json +17 -0
- MNB/models/mnb_classifier.pkl +3 -0
- MNB/models/mnb_metadata.json +13 -0
- MNB/models/tfidf_vectorizer.pkl +3 -0
- references/12 +3 -0
- references/321 +3 -0
- references/321321 +3 -0
- references/old.jpg +3 -0
- references/reference-102.png +3 -0
- references/reference-103.png +3 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
+CRNN+CTC/checkpoints/best_model_final.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_iam.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v2.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v3.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v4.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v5.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v6.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v7.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model_v732.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/best_model.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/checkpoint_epoch_10.pth filter=lfs diff=lfs merge=lfs -text
+CRNN+CTC/checkpoints/latest_checkpoint.pth filter=lfs diff=lfs merge=lfs -text
+MNB/models/mnb_classifier.pkl filter=lfs diff=lfs merge=lfs -text
+MNB/models/tfidf_vectorizer.pkl filter=lfs diff=lfs merge=lfs -text
+references/12 filter=lfs diff=lfs merge=lfs -text
+references/321 filter=lfs diff=lfs merge=lfs -text
+references/321321 filter=lfs diff=lfs merge=lfs -text
+references/old.jpg filter=lfs diff=lfs merge=lfs -text
+references/reference-102.png filter=lfs diff=lfs merge=lfs -text
+references/reference-103.png filter=lfs diff=lfs merge=lfs -text
+references/reference-97.png filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/civil_registry_model/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-best/vocab/vectors filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/ner/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/vocab/key2row filter=lfs diff=lfs merge=lfs -text
+spacyNER/models/phase1_funsd/model-last/vocab/vectors filter=lfs diff=lfs merge=lfs -text
CRNN+CTC/.env
ADDED
@@ -0,0 +1 @@
+POPPLER_PATH=C:\Program Files\poppler-25.12.0\Library\bin
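Note: this `.env` entry is the path to the Poppler binaries used when rasterizing scanned PDFs. A minimal sketch of how such a value could be consumed, assuming `python-dotenv` and `pdf2image` are among the project's dependencies (both are assumptions, not confirmed by this commit):

```python
# Sketch: reading POPPLER_PATH from the .env file in the working directory.
# python-dotenv and pdf2image usage here is an assumption for illustration.
import os
from dotenv import load_dotenv
from pdf2image import convert_from_path

load_dotenv()  # loads POPPLER_PATH into the environment
poppler = os.getenv("POPPLER_PATH")

# Rasterize a scanned PDF page to a PIL image before OCR preprocessing
pages = convert_from_path("scan.pdf", dpi=300, poppler_path=poppler)
pages[0].save("scan_page1.png")
```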
CRNN+CTC/.gitignore
ADDED
@@ -0,0 +1,14 @@
+datasets/
+checkpoints/
+logs/
+test_images/
+data/
+__pycache__/
+*.png
+*.jpg
+*.jpeg
+*.npy
+*.h5
+*.pkl
+*.pyc
+iam-handwriting-word-database/
CRNN+CTC/IAM_train.py
ADDED
@@ -0,0 +1,332 @@
+"""
+IAM_train.py
+============
+Fine-tune the CRNN model using the IAM Handwriting Word Database.
+Builds on top of EMNIST-trained model (best_model_emnist.pth).
+
+FIXES vs old version:
+- IMG_WIDTH 400 -> 512 (must match pipeline)
+- Added log_softmax before CTCLoss (was missing, which caused catastrophic forgetting)
+- Phase 1: CNN FROZEN, only RNN+FC trained
+- Phase 2: Full model at very low LR
+- Loads from best_model_emnist.pth, falls back to best_model.pth
+- Uses get_crnn_model() with correct architecture from checkpoint config
+
+DATASET:
+Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database
+Expected structure:
+    data/IAM/iam_words/
+        words/       <- word image folders (a01, a02, ...)
+        words.txt    <- annotation file
+
+USAGE:
+    python IAM_train.py --prepare          # convert IAM -> annotation JSON
+    python IAM_train.py --train            # fine-tune model
+    python IAM_train.py --prepare --train  # do both
+"""
+
+import os
+import sys
+import json
+import argparse
+import random
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, ConcatDataset
+
+sys.path.append('.')
+from crnn_model import get_crnn_model
+from dataset import CivilRegistryDataset, collate_fn
+
+# ---------------------------------------------
+# CONFIG
+# ---------------------------------------------
+IAM_ROOT = "data/IAM/iam_words"
+IAM_WORDS_TXT = f"{IAM_ROOT}/words.txt"
+IAM_WORDS_DIR = f"{IAM_ROOT}/words"
+
+TRAIN_ANN = "data/iam_train_annotations.json"
+IAM_VAL_ANN = "data/iam_val_annotations.json"  # written by --prepare (IAM word images)
+SYNTH_VAL_ANN = "data/val_annotations.json"    # real civil registry val set, never overwritten
+TRAIN_IMG_DIR = "data/train/iam"
+VAL_IMG_DIR = "data/val/iam"
+
+IMG_HEIGHT = 64
+IMG_WIDTH = 512  # FIXED: was 400, must match pipeline
+BATCH_SIZE = 32
+VAL_SPLIT = 0.1
+MAX_SAMPLES = 50000
+
+# Load from EMNIST checkpoint, fall back to synthetic if not found
+CHECKPOINT_IN = "checkpoints/best_model_emnist.pth"
+CHECKPOINT_IN2 = "checkpoints/best_model.pth"  # fallback
+CHECKPOINT_OUT = "checkpoints/best_model_iam.pth"
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+# ---------------------------------------------
+# STEP 1 - PREPARE
+# ---------------------------------------------
+def prepare_iam():
+    from PIL import Image
+
+    print("\n" + "=" * 50)
+    print("STEP 1 - Preparing IAM dataset")
+    print("=" * 50)
+
+    if not os.path.exists(IAM_WORDS_TXT):
+        print(f"ERROR: {IAM_WORDS_TXT} not found!")
+        print("Download from: https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database")
+        print("Expected structure:")
+        print("  data/IAM/iam_words/words.txt")
+        print("  data/IAM/iam_words/words/")
+        sys.exit(1)
+
+    os.makedirs(TRAIN_IMG_DIR, exist_ok=True)
+    os.makedirs(VAL_IMG_DIR, exist_ok=True)
+
+    entries = []
+    print(f"  Reading {IAM_WORDS_TXT} ...")
+    with open(IAM_WORDS_TXT, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            parts = line.split(" ")
+            if len(parts) < 9:
+                continue
+            word_id = parts[0]
+            seg_result = parts[1]
+            text = parts[-1]
+            if seg_result != "ok":
+                continue
+            if len(text) < 1 or len(text) > 32:
+                continue
+            parts_id = word_id.split("-")
+            img_path = os.path.join(
+                IAM_WORDS_DIR,
+                parts_id[0],
+                f"{parts_id[0]}-{parts_id[1]}",
+                f"{word_id}.png"
+            )
+            if not os.path.exists(img_path):
+                continue
+            entries.append((img_path, text))
+
+    print(f"  Found {len(entries)} valid word entries")
+
+    if MAX_SAMPLES and len(entries) > MAX_SAMPLES:
+        random.shuffle(entries)
+        entries = entries[:MAX_SAMPLES]
+        print(f"  Limiting to {MAX_SAMPLES} samples")
+
+    random.shuffle(entries)
+    split_idx = int(len(entries) * (1 - VAL_SPLIT))
+    train_entries = entries[:split_idx]
+    val_entries = entries[split_idx:]
+    print(f"  Train: {len(train_entries)} | Val: {len(val_entries)}")
+    print("  Copying and resizing images...")
+
+    def process_entries(entry_list, out_dir, prefix):
+        annotations = []
+        for i, (src_path, text) in enumerate(entry_list):
+            try:
+                img = Image.open(src_path).convert("RGB")
+                img = img.resize((IMG_WIDTH, IMG_HEIGHT))  # FIXED: 512x64
+                fname = f"iam_{prefix}_{i:06d}.jpg"
+                out_path = os.path.join(out_dir, fname)
+                img.save(out_path, quality=90)
+                annotations.append({"image_path": f"iam/{fname}", "text": text})
+            except Exception:
+                continue
+            if i % 5000 == 0:
+                print(f"    {i}/{len(entry_list)} processed...")
+        return annotations
+
+    train_ann = process_entries(train_entries, TRAIN_IMG_DIR, "train")
+    val_ann = process_entries(val_entries, VAL_IMG_DIR, "val")
+
+    with open(TRAIN_ANN, "w") as f:
+        json.dump(train_ann, f, indent=2)
+    with open(IAM_VAL_ANN, "w") as f:
+        json.dump(val_ann, f, indent=2)
+
+    print(f"\n  Train annotations -> {TRAIN_ANN} ({len(train_ann)} entries)")
+    print(f"  Val annotations   -> {IAM_VAL_ANN} ({len(val_ann)} entries)")
+    print("\n  Done! Now run: python IAM_train.py --train")
+
+
+# ---------------------------------------------
+# STEP 2 - TRAIN
+# ---------------------------------------------
+def train_iam():
+    print("\n" + "=" * 55)
+    print("STEP 2 - Fine-tuning CRNN with IAM dataset")
+    print("=" * 55)
+    print(f"  Device : {DEVICE}")
+
+    for ann_file in [TRAIN_ANN, SYNTH_VAL_ANN]:
+        if not os.path.exists(ann_file):
+            print(f"ERROR: {ann_file} not found! Run --prepare first.")
+            sys.exit(1)
+
+    train_dataset = CivilRegistryDataset(
+        data_dir="data/train", annotations_file=TRAIN_ANN,
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+    )
+    # FIXED: mix synthetic data in so the model never forgets Filipino multi-word sequences
+    synth_dataset = CivilRegistryDataset(
+        data_dir="data/train", annotations_file="data/train_annotations.json",
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
+    )
+    mixed_train = ConcatDataset([train_dataset, synth_dataset])
+    val_dataset = CivilRegistryDataset(
+        data_dir="data/val", annotations_file=SYNTH_VAL_ANN,
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
+    )
+    print(f"  IAM train      : {len(train_dataset)}")
+    print(f"  Synthetic train: {len(synth_dataset)}")
+    print(f"  Mixed train    : {len(mixed_train)}")
+    print(f"  Val            : {len(val_dataset)}")
+
+    train_loader = DataLoader(mixed_train, batch_size=BATCH_SIZE,
+                              shuffle=True, num_workers=0, collate_fn=collate_fn)
+    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
+                            shuffle=False, num_workers=0, collate_fn=collate_fn)
+
+    # -- Load checkpoint (EMNIST preferred, synthetic fallback) --
+    ckpt_path = CHECKPOINT_IN if os.path.exists(CHECKPOINT_IN) else CHECKPOINT_IN2
+    if not os.path.exists(ckpt_path):
+        print(f"ERROR: No checkpoint found at {CHECKPOINT_IN} or {CHECKPOINT_IN2}")
+        print("Run: python train.py then python train_with_emnist.py")
+        sys.exit(1)
+
+    print(f"  Loading: {ckpt_path}")
+    ckpt = torch.load(ckpt_path, map_location=DEVICE, weights_only=False)
+    config = ckpt.get('config', {})
+
+    model = get_crnn_model(
+        model_type=config.get('model_type', 'standard'),
+        img_height=config.get('img_height', 64),
+        num_chars=train_dataset.num_chars,
+        hidden_size=config.get('hidden_size', 128),
+        num_lstm_layers=config.get('num_lstm_layers', 1),
+    ).to(DEVICE)
+
+    missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+    if missing:
+        print(f"  Note: {len(missing)} layers re-initialized")
+    print(f"  Loaded epoch {ckpt.get('epoch', 'N/A')} "
+          f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")
+
+    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+    os.makedirs("checkpoints", exist_ok=True)
+
+    def run_epoch(loader, training, optimizer=None):
+        model.train() if training else model.eval()
+        total, n = 0, 0
+        ctx = torch.enable_grad() if training else torch.no_grad()
+        with ctx:
+            for images, targets, target_lengths, _ in loader:
+                images = images.to(DEVICE)
+                batch_size = images.size(0)
+                if training:
+                    optimizer.zero_grad()
+                # CRITICAL: log_softmax before CTCLoss
+                outputs = F.log_softmax(model(images), dim=2)
+                seq_len = outputs.size(0)
+                input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+                loss = criterion(outputs, targets, input_lengths, target_lengths)
+                if not torch.isnan(loss) and not torch.isinf(loss):
+                    if training:
+                        loss.backward()
+                        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                        optimizer.step()
+                    total += loss.item()
+                    n += 1
+        return total / max(n, 1)
+
+    def run_phase(num, epochs, lr, freeze_cnn, patience):
+        print(f"\n{'='*55}")
+        print(f"  PHASE {num} - "
+              f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
+              f"  LR={lr}")
+        print(f"{'='*55}")
+
+        for name, param in model.named_parameters():
+            param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f"  Trainable params : {trainable:,}")
+
+        opt = optim.Adam(
+            filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+        sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
+        best = float('inf')
+        counter = 0
+
+        for epoch in range(1, epochs + 1):
+            tr = run_epoch(train_loader, True, opt)
+            vl = run_epoch(val_loader, False, None)
+            sched.step(vl)
+
+            if vl < best:
+                best = vl
+                counter = 0
+                torch.save({
+                    'model_state_dict': model.state_dict(),
+                    'config': config,
+                    'char_to_idx': train_dataset.char_to_idx,
+                    'idx_to_char': train_dataset.idx_to_char,
+                    'epoch': epoch,
+                    'val_loss': vl,  # FIXED: renamed from val_cer, this is val loss, not CER%
+                }, CHECKPOINT_OUT)
+                print(f"  Epoch {epoch:02d}/{epochs}  "
+                      f"Train={tr:.4f}  Val={vl:.4f}  <- saved")
+            else:
+                counter += 1
+                print(f"  Epoch {epoch:02d}/{epochs}  "
+                      f"Train={tr:.4f}  Val={vl:.4f}  "
+                      f"(patience {counter}/{patience})")
+                if counter >= patience:
+                    print(f"  Early stopping at epoch {epoch}.")
+                    break
+        return best
+
+    # Phase 1: Freeze CNN
+    p1 = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
+    # Phase 2: Full model, very low LR
+    p2 = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
+
+    print(f"\n{'='*55}")
+    print("IAM fine-tuning complete!")
+    print(f"  Phase 1 best val loss : {p1:.4f}")
+    print(f"  Phase 2 best val loss : {p2:.4f}")
+    print(f"  Saved : {CHECKPOINT_OUT}")
+    print("\nNext step: collect physical certificate scans")
+
+
+# ---------------------------------------------
+# MAIN
+# ---------------------------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prepare", action="store_true")
+    parser.add_argument("--train", action="store_true")
+    args = parser.parse_args()
+
+    if not args.prepare and not args.train:
+        print("Usage:")
+        print("  python IAM_train.py --prepare          # prepare dataset")
+        print("  python IAM_train.py --train            # train model")
+        print("  python IAM_train.py --prepare --train  # do both")
+        sys.exit(0)
+
+    if args.prepare:
+        prepare_iam()
+    if args.train:
+        train_iam()
CRNN+CTC/README.md
ADDED
@@ -0,0 +1,449 @@
+# Local Civil Registry Document Digitization and Data Extraction
+
+## Using CRNN+CTC, Multinomial Naive Bayes, and Named Entity Recognition
+
+**Thesis Project by:**
+- Shane Mark C. Blanco
+- Princess A. Pasamonte
+- Irish Faith G. Ramirez
+
+**Institution:** Tarlac State University, College of Computer Studies
+
+---
+
+## Project Overview
+
+This system automates the digitization and data extraction of Philippine Civil Registry documents using advanced machine learning algorithms:
+
+### Target Documents:
+- **Form 1A** - Birth Certificate
+- **Form 2A** - Death Certificate
+- **Form 3A** - Marriage Certificate
+- **Form 90** - Application for Marriage License
+
+### Key Features:
+✅ OCR for printed and handwritten text
+✅ Automatic document classification
+✅ Named entity extraction (names, dates, places)
+✅ Auto-fill digital forms
+✅ MySQL database storage
+✅ Searchable digital archive
+✅ Data visualization dashboard
+
+---
+
+## System Architecture
+
+```
+Input: Scanned Civil Registry Form
+        ↓
+1. Image Preprocessing
+        ↓
+2. CRNN+CTC → Text Recognition
+        ↓
+3. Multinomial Naive Bayes → Document Classification
+        ↓
+4. spaCy NER → Entity Extraction
+        ↓
+5. Data Validation & Storage → MySQL Database
+        ↓
+Output: Digitized & Searchable Record
+```
+
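+These five stages map onto separate modules in this repository. As a rough sketch of how they might be chained (only `CivilRegistryOCR` is documented below; the classifier and NER calls here are illustrative assumptions based on the model files shipped in MNB/models and spacyNER/models):
+
+```python
+# Sketch of the end-to-end pipeline; interfaces other than
+# CivilRegistryOCR.predict() are assumptions for illustration.
+import pickle
+import spacy
+from inference import CivilRegistryOCR
+
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+with open('MNB/models/tfidf_vectorizer.pkl', 'rb') as f:
+    vectorizer = pickle.load(f)
+with open('MNB/models/mnb_classifier.pkl', 'rb') as f:
+    classifier = pickle.load(f)
+nlp = spacy.load('spacyNER/models/civil_registry_model/model-best')
+
+text = ocr.predict('scans/birth_cert.jpg')                        # 2. text recognition
+form_type = classifier.predict(vectorizer.transform([text]))[0]   # 3. classification
+entities = [(ent.text, ent.label_) for ent in nlp(text).ents]     # 4. extraction
+print(form_type, entities)
+```
+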
+---
+
+## Quick Start
+
+### Prerequisites
+
+- Python 3.8+
+- CUDA-capable GPU (recommended) or CPU
+- 8GB RAM minimum
+
+### Installation
+
+```bash
+# 1. Clone or download the project
+cd civil_registry_ocr
+
+# 2. Create virtual environment
+python -m venv venv
+source venv/bin/activate   # Linux/Mac
+venv\Scripts\activate      # Windows
+
+# 3. Install dependencies
+pip install -r requirements.txt
+
+# 4. Download spaCy model
+python -m spacy download en_core_web_sm
+```
+
+### Quick Test
+
+```python
+from inference import CivilRegistryOCR
+
+# Load model
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+
+# Recognize text
+text = ocr.predict('test_images/sample_name.jpg')
+print(f"Recognized: {text}")
+```
+
+---
+
+## Project Files
+
+### Core Implementation Files:
+
+1. **crnn_model.py** - CRNN+CTC neural network architecture
+2. **dataset.py** - Data loading and preprocessing
+3. **train.py** - Model training script
+4. **inference.py** - Prediction and inference
+5. **utils.py** - Helper functions and metrics
+6. **requirements.txt** - Python dependencies
+7. **IMPLEMENTATION_GUIDE.md** - Detailed implementation guide
+
+### Additional Components (To be created):
+
+8. **document_classifier.py** - Multinomial Naive Bayes classifier
+9. **ner_extractor.py** - Named Entity Recognition
+10. **web_app.py** - Web application (Flask/FastAPI)
+11. **database.py** - MySQL integration
+
+---
+
+## Training the Model
+
+### 1. Prepare Your Data
+
+Organize images and labels:
+```
+data/
+  train/
+    form1a/
+      name_001.jpg
+      name_001.txt
+    form2a/
+      ...
+  val/
+    ...
+```
+
+### 2. Create Annotations
+
+```python
+from dataset import create_annotation_file
+
+create_annotation_file('data/train', 'data/train_annotations.json')
+create_annotation_file('data/val', 'data/val_annotations.json')
+```
+
+### 3. Train Model
+
+```bash
+python train.py
+```
+
+Monitor metrics:
+- Character Error Rate (CER)
+- Word Error Rate (WER)
+- Training/Validation Loss
+
+### 4. Evaluate
+
+```python
+from utils import calculate_cer, calculate_wer
+
+predictions = [ocr.predict(img) for img in test_images]
+cer = calculate_cer(predictions, ground_truths)
+print(f"CER: {cer:.2f}%")
+```
+
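+CER is character-level edit distance divided by the total ground-truth length. A minimal sketch of such metrics using the `editdistance` package, the same approach `check_cer.py` takes (the exact signatures in utils.py may differ):
+
+```python
+import editdistance
+
+def cer_percent(predictions, ground_truths):
+    """Total character edits / total ground-truth characters, as a %."""
+    edits = sum(editdistance.eval(p, g) for p, g in zip(predictions, ground_truths))
+    chars = sum(len(g) for g in ground_truths)
+    return 100.0 * edits / max(chars, 1)
+
+def wer_percent(predictions, ground_truths):
+    """Same idea at word level: edits over whitespace-split tokens."""
+    edits = sum(editdistance.eval(p.split(), g.split())
+                for p, g in zip(predictions, ground_truths))
+    words = sum(len(g.split()) for g in ground_truths)
+    return 100.0 * edits / max(words, 1)
+```
+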
+---
+
+## Web Application
+
+### Start the Server
+
+```bash
+python web_app.py
+```
+
+### API Endpoints
+
+**POST /api/ocr** - Process document
+```bash
+curl -X POST -F "file=@birth_cert.jpg" http://localhost:8000/api/ocr
+```
+
+**Response:**
+```json
+{
+  "text": "Juan Dela Cruz\n01/15/1990\nTarlac City",
+  "form_type": "form1a",
+  "entities": {
+    "persons": ["Juan Dela Cruz"],
+    "dates": ["01/15/1990"],
+    "locations": ["Tarlac City"]
+  }
+}
+```
+
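+Since web_app.py is listed above as still to be created, the following is only one possible Flask sketch of the endpoint, not the project's actual implementation; all names and wiring here are assumptions:
+
+```python
+# Hypothetical /api/ocr endpoint sketch.
+import os
+import tempfile
+from flask import Flask, request, jsonify
+from inference import CivilRegistryOCR
+
+app = Flask(__name__)
+ocr = CivilRegistryOCR('checkpoints/best_model.pth')
+
+@app.route('/api/ocr', methods=['POST'])
+def run_ocr():
+    f = request.files['file']
+    tmp = os.path.join(tempfile.gettempdir(), f.filename)
+    f.save(tmp)
+    text = ocr.predict(tmp)
+    # form_type and entities would come from the MNB and NER stages
+    return jsonify({'text': text})
+
+if __name__ == '__main__':
+    app.run(port=8000)
+```
+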
+---
+
+## Expected Performance
+
+Based on thesis objectives:
+
+### CRNN+CTC Model:
+- **Target CER:** < 5%
+- **Target Accuracy:** > 95%
+- Handles both printed and handwritten text
+
+### Document Classifier (MNB):
+- **Target Accuracy:** > 90%
+- Fast classification (< 100ms)
+
+### NER (spaCy):
+- **F1 Score:** > 85%
+- Extracts: Names, Dates, Places
+
+---
+
+## Testing
+
+### ISO 25010 Evaluation
+
+**Usability Testing:**
+```python
+# Metrics to measure:
+# - Task completion rate
+# - Average time per task
+# - User satisfaction score (SUS)
+```
+
+**Reliability Testing:**
+```python
+# Metrics to measure:
+# - System uptime %
+# - Error rate
+# - Recovery time
+```
+
+### Confusion Matrix
+
+```python
+from sklearn.metrics import confusion_matrix
+import seaborn as sns
+
+cm = confusion_matrix(true_labels, predicted_labels)
+sns.heatmap(cm, annot=True)
+```
+
+---
+
+## Database Schema
+
+### Birth Certificates Table
+```sql
+CREATE TABLE birth_certificates (
+    id INT PRIMARY KEY AUTO_INCREMENT,
+    child_name VARCHAR(255),
+    date_of_birth DATE,
+    place_of_birth VARCHAR(255),
+    sex CHAR(1),
+    father_name VARCHAR(255),
+    mother_name VARCHAR(255),
+    raw_text TEXT,
+    form_image LONGBLOB,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+```
+
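+A matching insert from database.py (also still to be created) might look like this parameterized sketch, assuming `mysql-connector-python`; the connection details and values below are illustrative only:
+
+```python
+import mysql.connector
+
+conn = mysql.connector.connect(host='localhost', user='registry',
+                               password='...', database='civil_registry')
+cur = conn.cursor()
+
+raw_text = "Juan Dela Cruz\n01/15/1990\nTarlac City"
+cur.execute(
+    "INSERT INTO birth_certificates "
+    "(child_name, date_of_birth, place_of_birth, sex, raw_text) "
+    "VALUES (%s, %s, %s, %s, %s)",          # parameterized, never string-formatted
+    ("Juan Dela Cruz", "1990-01-15", "Tarlac City", "M", raw_text),
+)
+conn.commit()
+```
+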
+---
+
+## System Requirements
+
+### Minimum:
+- CPU: Intel i5 or equivalent
+- RAM: 8GB
+- Storage: 10GB
+- OS: Windows 10, Ubuntu 18.04, macOS 10.14
+
+### Recommended:
+- CPU: Intel i7 or equivalent
+- GPU: NVIDIA GTX 1060 or better
+- RAM: 16GB
+- Storage: 50GB SSD
+
+---
+
+## Data Privacy & Security
+
+Following the Philippine Data Privacy Act (RA 10173):
+
+- ✅ Encrypted data transmission
+- ✅ Access control and authentication
+- ✅ Audit logging
+- ✅ Regular security updates
+- ✅ Data retention policies
+
+---
+
+## Key Algorithms
+
+### 1. CRNN+CTC
+**Purpose:** Text recognition from images
+**Strengths:** Handles variable-length sequences, no character segmentation needed
+**Reference:** Shi et al. (2016)
+
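+The "no segmentation" property comes from CTC decoding: collapse repeated per-frame predictions, then drop blanks. A minimal greedy decoder (check_cer.py in this repo contains an equivalent one):
+
+```python
+def ctc_greedy_decode(frame_indices, idx_to_char, blank=0):
+    """frame_indices: per-timestep argmax class ids for one image."""
+    out, prev = [], None
+    for idx in frame_indices:
+        if idx != blank and idx != prev:   # collapse repeats, skip blank
+            out.append(idx_to_char[idx])
+        prev = idx
+    return ''.join(out)
+```
+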
+### 2. Multinomial Naive Bayes
+**Purpose:** Document classification
+**Strengths:** Fast, efficient, works well with text data
+**Reference:** McCallum & Nigam (1998)
+
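+The repository already ships MNB/models/tfidf_vectorizer.pkl and mnb_classifier.pkl; a minimal training sketch of that pairing with scikit-learn (the hyperparameters and sample texts here are illustrative):
+
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+
+texts = ["certificate of live birth ...", "certificate of death ..."]
+labels = ["form1a", "form2a"]
+
+vectorizer = TfidfVectorizer(ngram_range=(1, 2))
+X = vectorizer.fit_transform(texts)      # sparse TF-IDF term-weight matrix
+clf = MultinomialNB().fit(X, labels)
+
+print(clf.predict(vectorizer.transform(["certificate of live birth of juan"])))
+```
+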
+### 3. Named Entity Recognition
+**Purpose:** Extract entities (names, dates, places)
+**Strengths:** Pre-trained, accurate, easy to use
+**Reference:** spaCy (Honnibal & Montani, 2017)
+
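+Entity extraction with the trained pipeline in spacyNER/models is a few lines (the label names in the comment are assumptions about how that model was trained):
+
+```python
+import spacy
+
+nlp = spacy.load("spacyNER/models/civil_registry_model/model-best")
+doc = nlp("Juan Dela Cruz was born on 01/15/1990 in Tarlac City.")
+for ent in doc.ents:
+    print(ent.text, ent.label_)   # e.g. PERSON / DATE / GPE, depending on training
+```
+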
+---
+
+## Troubleshooting
+
+### Low Accuracy?
+1. Increase training data (target: 10,000+ samples)
+2. Use data augmentation
+3. Train longer (100+ epochs)
+4. Clean your dataset
+
+### Out of Memory?
+1. Reduce batch size
+2. Use smaller image dimensions
+3. Use gradient accumulation (see the sketch below)
+4. Enable mixed precision
+
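+A minimal sketch of gradient accumulation for the CTC training loop. It assumes `model`, `criterion`, `optimizer`, `train_loader`, and `DEVICE` are set up as in IAM_train.py; this exact code is illustrative, not the repo's implementation:
+
+```python
+import torch
+import torch.nn.functional as F
+
+accum = 4  # effective batch size = accum * BATCH_SIZE
+optimizer.zero_grad()
+for step, (images, targets, target_lengths, _) in enumerate(train_loader):
+    outputs = F.log_softmax(model(images.to(DEVICE)), dim=2)
+    input_lengths = torch.full((images.size(0),), outputs.size(0), dtype=torch.long)
+    loss = criterion(outputs, targets, input_lengths, target_lengths) / accum
+    loss.backward()                      # gradients add up across steps
+    if (step + 1) % accum == 0:
+        optimizer.step()                 # update once per accum mini-batches
+        optimizer.zero_grad()
+```
+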
+### Slow Inference?
+1. Use GPU if available
+2. Batch process images
+3. Optimize model (ONNX)
+4. Cache frequent results
+
+---
+
+## Documentation
+
+- **IMPLEMENTATION_GUIDE.md** - Complete step-by-step guide
+- **API_DOCUMENTATION.md** - API reference (to be created)
+- **USER_MANUAL.md** - End-user guide (to be created)
+
+---
+
+## Academic References
+
+### Key Papers:
+
+1. **CRNN**
+   Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. *IEEE TPAMI*.
+
+2. **CTC Loss**
+   Graves, A., et al. (2006). Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks. *ICML*.
+
+3. **Naive Bayes**
+   McCallum, A., & Nigam, K. (1998). A comparison of event models for naive Bayes text classification. *AAAI Workshop*.
+
+4. **spaCy**
+   Honnibal, M., & Montani, I. (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.
+
+---
+
+## Contributors
+
+**Researchers:**
+- Shane Mark C. Blanco
+- Princess A. Pasamonte
+- Irish Faith G. Ramirez
+
+**Advisers:**
+- Mr. Rengel V. Corpuz (Technical Adviser)
+- Mr. Joselito T. Tan (Subject Teacher)
+
+**Institution:**
+Tarlac State University
+College of Computer Studies
+Bachelor of Science in Computer Science
+
+---
+
+## Support
+
+For questions regarding this implementation:
+
+1. Review IMPLEMENTATION_GUIDE.md
+2. Check code documentation
+3. Consult with thesis advisers
+
+---
+
+## License
+
+This project is for academic purposes as part of a thesis requirement.
+
+---
+
+## ✅ Implementation Checklist
+
+### Phase 1: Setup ✅
+- [x] Install dependencies
+- [x] Set up project structure
+- [x] Prepare development environment
+
+### Phase 2: Data Preparation
+- [ ] Collect civil registry form images
+- [ ] Create annotations
+- [ ] Split into train/val/test sets
+
+### Phase 3: Model Development
+- [ ] Train CRNN+CTC model
+- [ ] Train document classifier
+- [ ] Integrate NER system
+
+### Phase 4: Web Application
+- [ ] Develop Flask/FastAPI backend
+- [ ] Create frontend interface
+- [ ] Implement database integration
+
+### Phase 5: Testing
+- [ ] Accuracy testing
+- [ ] Black-box testing
+- [ ] ISO 25010 evaluation
+- [ ] User acceptance testing
+
+### Phase 6: Deployment
+- [ ] Optimize for production
+- [ ] Set up server
+- [ ] Deploy application
+- [ ] Monitor performance
+
+---
+
+## Success Metrics
+
+Target metrics for thesis evaluation:
+
+| Metric | Target | Status |
+|--------|--------|--------|
+| OCR Accuracy | > 95% | Pending |
+| CER | < 5% | Pending |
+| Classifier Accuracy | > 90% | Pending |
+| NER F1 Score | > 85% | Pending |
+| Response Time | < 2s | Pending |
+| System Uptime | > 99% | Pending |
+
+---
+
+**Good luck with your thesis defense!**
+
+For detailed implementation instructions, see **IMPLEMENTATION_GUIDE.md**
CRNN+CTC/calibrate_fields.py
ADDED
@@ -0,0 +1,196 @@
+"""
+calibrate_fields.py
+===================
+Click-to-measure tool for recalibrating field ratios in field_extractor.py.
+
+Usage:
+    python calibrate_fields.py --image your_scan.png --form birth
+
+Controls:
+    • Click and drag  -> draw a field box
+    • After releasing -> enter the field name in the terminal
+    • Press S         -> save all measured ratios to calibrated_fields.py
+    • Press Z         -> undo last box
+    • Press Q / ESC   -> quit without saving
+
+Output:
+    calibrated_fields.py -> copy-paste the dict into field_extractor.py
+"""
+
+import argparse
+import json
+import cv2
+import numpy as np
+from pathlib import Path
+
+# -- state ---------------------------------------------------------------------
+drawing = False
+ix, iy = -1, -1
+ex, ey = -1, -1
+boxes = []          # list of (name, rx1, ry1, rx2, ry2)
+form_name = "birth"
+
+COLOURS = [
+    (0,200,0),(0,150,255),(200,0,200),(0,200,200),(200,200,0),(220,20,60),
+    (255,140,0),(150,50,200),(0,160,80),(30,144,255),(255,20,147),(100,200,100),
+]
+
+def draw_boxes(img, bounds):
+    left, top, right, bottom = bounds
+    h, w = img.shape[:2]
+
+    vis = img.copy()
+    # form boundary
+    cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 2)
+
+    for idx, (name, rx1, ry1, rx2, ry2) in enumerate(boxes):
+        x1 = int(rx1 * w)
+        y1 = int(ry1 * h)
+        x2 = int(rx2 * w)
+        y2 = int(ry2 * h)
+        c = COLOURS[idx % len(COLOURS)]
+        cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
+        cv2.putText(vis, name[:25], (x1 + 2, max(0, y1 - 3)),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, c, 1)
+
+    # live cursor box
+    if drawing and ix >= 0 and ex >= 0:
+        cv2.rectangle(vis, (ix, iy), (ex, ey), (255, 255, 255), 1)
+
+    # instructions
+    cv2.putText(vis, "Drag=draw box | S=save | Z=undo | Q=quit",
+                (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1)
+    cv2.putText(vis, f"Boxes: {len(boxes)}",
+                (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
+    return vis
+
+
+def detect_bounds(image_bgr):
+    """Simple form boundary detection (reuses logic from FormBoundsDetector)."""
+    h, w = image_bgr.shape[:2]
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    try:
+        thresh = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY_INV, 11, 2)
+        hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
+        h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
+        h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
+        vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
+        v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
+        v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
+        if len(h_rows) == 0 or len(v_cols) == 0:
+            return (0, 0, w, h)
+        top_b, bottom_b = int(h_rows.min()), int(h_rows.max())
+        left_b, right_b = int(v_cols.min()), int(v_cols.max())
+        if (right_b - left_b) < w * 0.4 or (bottom_b - top_b) < h * 0.4:
+            return (0, 0, w, h)
+        return (left_b, top_b, right_b, bottom_b)
+    except Exception:
+        return (0, 0, w, h)
+
+
+def save_calibration(output_path, form):
+    dict_name = {
+        "birth": "BIRTH_FIELDS",
+        "death": "DEATH_FIELDS",
+        "marriage": "MARRIAGE_FIELDS",
+        "marriage_license": "MARRIAGE_LICENSE_FIELDS",
+    }.get(form, "CALIBRATED_FIELDS")
+
+    lines = ["# Auto-calibrated -> copy-paste into field_extractor.py\n",
+             f"{dict_name} = {{\n"]
+    for name, rx1, ry1, rx2, ry2 in boxes:
+        lines.append(f'    "{name}":{" " * max(1, 34 - len(name))}'
+                     f'({rx1:.4f}, {ry1:.4f}, {rx2:.4f}, {ry2:.4f}),\n')
+    lines.append("}\n")
+
+    with open(output_path, "w") as f:
+        f.writelines(lines)
+    print(f"\n  Saved {len(boxes)} fields -> {output_path}")
+
+
+def main():
+    global drawing, ix, iy, ex, ey, form_name
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--image", required=True)
+    parser.add_argument("--form", default="birth",
+                        choices=["birth","death","marriage","marriage_license"])
+    parser.add_argument("--output", default="calibrated_fields.py")
+    parser.add_argument("--scale", type=float, default=1.0,
+                        help="Scale factor to fit image on screen (e.g. 0.5)")
+    args = parser.parse_args()
+    form_name = args.form
+
+    img_orig = cv2.imread(args.image)
+    if img_orig is None:
+        print(f"ERROR: Cannot load {args.image}")
+        return
+
+    scale = args.scale
+    if scale != 1.0:
+        img_orig = cv2.resize(img_orig, None, fx=scale, fy=scale)
+
+    bounds = detect_bounds(img_orig)
+    left, top, right, bottom = bounds
+    fw = right - left
+    fh = bottom - top
+    print(f"  Form boundary detected: {bounds} ({fw}x{fh} px)")
+    print(f"  Scale: {scale}")
+    print("\n  Instructions:")
+    print("    Drag            -> draw a field box")
+    print("    After releasing -> type field name in terminal, press Enter")
+    print("    S               -> save all boxes")
+    print("    Z               -> undo last box")
+    print("    Q/ESC           -> quit\n")
+
+    win = "Calibrate Fields"
+    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
+
+    def mouse(event, x, y, flags, param):
+        global drawing, ix, iy, ex, ey
+        if event == cv2.EVENT_LBUTTONDOWN:
+            drawing = True
+            ix, iy = x, y
+            ex, ey = x, y
+        elif event == cv2.EVENT_MOUSEMOVE and drawing:
+            ex, ey = x, y
+        elif event == cv2.EVENT_LBUTTONUP:
+            drawing = False
+            ex, ey = x, y
+            ih, iw = img_orig.shape[:2]
+            x1r = min(ix, ex) / iw
+            y1r = min(iy, ey) / ih
+            x2r = max(ix, ex) / iw
+            y2r = max(iy, ey) / ih
+            x1r, y1r = max(0.0, x1r), max(0.0, y1r)
+            x2r, y2r = min(1.0, x2r), min(1.0, y2r)
+            if (x2r - x1r) > 0.005 and (y2r - y1r) > 0.003:
+                name = input(f"  Field name for ({x1r:.3f},{y1r:.3f},{x2r:.3f},{y2r:.3f}): ").strip()
+                if name:
+                    boxes.append((name, x1r, y1r, x2r, y2r))
+                    print(f"  Added '{name}' (total: {len(boxes)})")
+
+    cv2.setMouseCallback(win, mouse)
+
+    while True:
+        vis = draw_boxes(img_orig, bounds)
+        cv2.imshow(win, vis)
+        key = cv2.waitKey(20) & 0xFF
+
+        if key in (ord('q'), 27):
+            print("  Quit, no file saved.")
+            break
+        elif key == ord('s'):
+            save_calibration(args.output, form_name)
+            break
+        elif key == ord('z') and boxes:
+            removed = boxes.pop()
+            print(f"  Undone: '{removed[0]}'")
+
+    cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    main()
CRNN+CTC/calibrated_fields.py
ADDED
@@ -0,0 +1,7 @@
+# Auto-calibrated -> copy-paste into field_extractor.py
+BIRTH_FIELDS = {
+    "Province":          (0.0941, 0.0701, 0.6361, 0.0848),
+    "City/Municipality": (0.1621, 0.0880, 0.6429, 0.1086),
+    "first_name":        (0.0465, 0.1183, 0.3265, 0.1375),
+    "middle_name":       (0.3469, 0.1189, 0.6916, 0.1375),
+}
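Note: each tuple stores (x1, y1, x2, y2) as fractions of the image size, which is what `save_calibration()` in calibrate_fields.py writes out. A minimal sketch of turning one ratio tuple back into a pixel crop (field_extractor.py presumably does something equivalent; this snippet is illustrative only):

```python
import cv2
from calibrated_fields import BIRTH_FIELDS

img = cv2.imread("scans/birth_form.png")
h, w = img.shape[:2]

rx1, ry1, rx2, ry2 = BIRTH_FIELDS["first_name"]   # ratios relative to the full image
crop = img[int(ry1 * h):int(ry2 * h), int(rx1 * w):int(rx2 * w)]
cv2.imwrite("first_name_crop.png", crop)          # region to feed into the CRNN
```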
CRNN+CTC/check_cer.py
ADDED
@@ -0,0 +1,331 @@
+"""
+check_cer.py
+============
+Measures TRUE CER by actually running the model on images.
+
+Usage:
+    python check_cer.py                        # live CER on val set
+    python check_cer.py --saved                # old behavior (fast, unreliable)
+    python check_cer.py --images test_images/  # run on any image folder
+"""
+
+import os
+import sys
+import json
+import random
+import cv2
+import numpy as np
+import editdistance
+from pathlib import Path
+
+try:
+    import torch
+except ImportError:
+    print("ERROR: torch not installed. Run: pip install torch")
+    sys.exit(1)
+
+USE_SAVED = '--saved' in sys.argv
+IMAGE_DIR = None
+for i, arg in enumerate(sys.argv[1:], 1):
+    if arg == '--images' and i < len(sys.argv) - 1:
+        IMAGE_DIR = sys.argv[i + 1]
+    elif arg.startswith('--images='):
+        IMAGE_DIR = arg.split('=', 1)[1]
+
+CHECKPOINTS = [
+    'checkpoint_epoch_50.pth',
+    'checkpoint_epoch_60.pth',
+    'checkpoint_epoch_70.pth',
+    'checkpoint_epoch_80.pth',
+    'checkpoint_epoch_90.pth',
+    'checkpoint_epoch_100.pth',
+]
+CHECKPOINT_DIR = 'checkpoints'
+VAL_DATA_DIR = 'data/val'
+VAL_ANN_FILE = 'data/val_annotations.json'
+
+
+class AdaptiveImageNormalizer:
+    def __init__(self, target_height=64, target_width=512):
+        self.H = target_height
+        self.W = target_width
+
+    def _crop_to_text(self, gray):
+        inv = cv2.bitwise_not(gray)
+        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+        coords = np.column_stack(np.where(thresh > 0))
+        if len(coords) == 0:
+            return gray
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0] - 1, y_max + pad)
+        x_max = min(gray.shape[1] - 1, x_max + pad)
+        return gray[y_min:y_max + 1, x_min:x_max + 1]
+
+    def _smart_resize_gray(self, gray):
+        h, w = gray.shape
+        if h == 0 or w == 0:
+            return np.ones((self.H, self.W), dtype=np.uint8) * 255
+        scale = self.H / h
+        new_w = int(w * scale)
+        new_h = self.H
+        if new_w > self.W:
+            scale = self.W / w
+            new_h = int(h * scale)
+            new_w = self.W
+        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
+        y_off = (self.H - new_h) // 2
+        x_off = (self.W - new_w) // 2
+        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
+        return canvas
+
+    def _binarize(self, img):
+        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        white_ratio = np.mean(otsu == 255)
+        if white_ratio < 0.30 or white_ratio > 0.97:
+            return cv2.adaptiveThreshold(
+                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 11, 2)
+        return otsu
+
+    def normalize(self, img):
+        if len(img.shape) == 3:
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = img.copy()
+        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+        gray = self._crop_to_text(gray)
+        gray = self._smart_resize_gray(gray)
+        return self._binarize(gray)
+
+    def to_tensor(self, img):
+        return torch.FloatTensor(
+            img.astype(np.float32) / 255.0
+        ).unsqueeze(0).unsqueeze(0)
+
+
+def greedy_decode(outputs, idx_to_char):
+    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
+    results = []
+    for seq in pred_indices:
+        chars, prev = [], -1
+        for idx in seq:
+            idx = idx.item()
+            if idx != 0 and idx != prev and idx in idx_to_char:
+                chars.append(idx_to_char[idx])
+            prev = idx
+        results.append(''.join(chars))
+    return results
+
+
+def measure_live_cer(model, idx_to_char, img_h, img_w,
+                     ann_file, data_dir, device, max_samples=200):
+    if not os.path.exists(ann_file):
+        return None, 0, f"Annotation file not found: {ann_file}"
+
+    with open(ann_file, 'r', encoding='utf-8') as f:
+        annotations = json.load(f)
+
+    if len(annotations) > max_samples:
+        random.seed(42)
+        annotations = random.sample(annotations, max_samples)
+
+    normalizer = AdaptiveImageNormalizer(img_h, img_w)
+    model.eval()
+
+    total_char_dist = 0
+    total_chars = 0
+    total_word_dist = 0
+    total_words = 0
+    n_exact = 0
+    n_evaluated = 0
+    worst_errors = []
+
+    with torch.no_grad():
+        for ann in annotations:
+            img_path = os.path.join(data_dir, ann['image_path'])
+            gt = ann['text']
+            if not os.path.exists(img_path):
+                continue
+            try:
+                raw = cv2.imread(img_path)
+                if raw is None:
+                    continue
+                norm = normalizer.normalize(raw)
+                tensor = normalizer.to_tensor(norm).to(device)
+                out = model(tensor)
+                pred = greedy_decode(out.cpu(), idx_to_char)[0]
+
+                cd = editdistance.eval(pred, gt)
+                wd = editdistance.eval(pred.split(), gt.split())
+
+                total_char_dist += cd
+                total_chars += len(gt)
+                total_word_dist += wd
+                total_words += len(gt.split())
+                if pred == gt:
+                    n_exact += 1
+                if cd > 0:
+                    worst_errors.append((gt, pred, cd))
+                n_evaluated += 1
+            except Exception:
+                continue
+
+    if n_evaluated == 0:
+        return None, 0, "No images could be evaluated"
+
+    cer = (total_char_dist / total_chars * 100) if total_chars > 0 else 0
+    wer = (total_word_dist / total_words * 100) if total_words > 0 else 0
+    acc = (n_exact / n_evaluated * 100)
+
+    return {
+        'cer': cer, 'wer': wer, 'exact_match': acc,
+        'n_evaluated': n_evaluated,
+        'errors': sorted(worst_errors, key=lambda x: x[2], reverse=True)[:5]
+    }, n_evaluated, None
+
+
+def run_on_folder(model, idx_to_char, img_h, img_w, folder, device):
+    normalizer = AdaptiveImageNormalizer(img_h, img_w)
+    model.eval()
+    exts = {'.jpg', '.jpeg', '.png', '.bmp'}
+    paths = sorted(p for p in Path(folder).rglob('*') if p.suffix.lower() in exts)
+    results = []
+    with torch.no_grad():
+        for p in paths:
+            try:
+                raw = cv2.imread(str(p))
+                norm = normalizer.normalize(raw)
+                tensor = normalizer.to_tensor(norm).to(device)
+                pred = greedy_decode(model(tensor).cpu(), idx_to_char)[0]
+                results.append((p.name, pred))
+            except Exception as e:
+                results.append((p.name, f'ERROR: {e}'))
+    return results
+
+
+# -----------------------------------------------------------------------------
+# MAIN
+# -----------------------------------------------------------------------------
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+if USE_SAVED:
+    print("=" * 65)
+    print(" SAVED CER (training-time value, may not reflect real accuracy)")
+    print(" Run without --saved for true live CER.")
+    print("=" * 65)
+    print("{:<8} {:<12} {:<12} {}".format("Epoch", "CER(%)", "WER(%)", "File"))
+    print("-" * 65)
+    best_cer, best_cp = float('inf'), None
+    for cp in CHECKPOINTS:
+        path = os.path.join(CHECKPOINT_DIR, cp)
+        if not os.path.exists(path):
+            continue
+        try:
+            c = torch.load(path, weights_only=False)
+            cer = c.get('val_cer', c.get('val_loss', 0))
+            epoch = c['epoch']
+            history = c.get('history', {})
+            wer_list = history.get('val_wer', [])
+            wer = wer_list[epoch - 1] if wer_list and epoch <= len(wer_list) else None
+            wer_s = f"{wer:.4f}%" if wer else 'N/A'
+            marker = ' <-- BEST' if cer < best_cer else ''
+            print("{:<8} {:<12} {:<12} {}{}".format(
+                epoch, f"{cer:.4f}%", wer_s, cp, marker))
+            if cer < best_cer:
+                best_cer, best_cp = cer, cp
+        except Exception as e:
+            print(f"  Could not load {cp}: {e}")
+    print("=" * 65)
+    print(f"\nBEST: {best_cp}  CER={best_cer:.4f}%")
+
+else:
print("=" * 78)
|
| 249 |
+
print(" LIVE CER β model actually runs on images (true accuracy)")
|
| 250 |
+
print("=" * 78)
|
| 251 |
+
print("{:<8} {:<10} {:<10} {:<12} {:<8} {}".format(
|
| 252 |
+
"Epoch", "CER(%)", "WER(%)", "ExactMatch", "N", "File"))
|
| 253 |
+
print("-" * 78)
|
| 254 |
+
|
| 255 |
+
best_cer, best_cp, best_metrics = float('inf'), None, None
|
| 256 |
+
|
| 257 |
+
for cp in CHECKPOINTS:
|
| 258 |
+
cp_path = os.path.join(CHECKPOINT_DIR, cp)
|
| 259 |
+
if not os.path.exists(cp_path):
|
| 260 |
+
print(f" (skipping {cp} β not found)")
|
| 261 |
+
continue
|
| 262 |
+
try:
|
| 263 |
+
from crnn_model import get_crnn_model
|
| 264 |
+
c = torch.load(cp_path, map_location=device, weights_only=False)
|
| 265 |
+
epoch = c['epoch']
|
| 266 |
+
idx_to_char = c['idx_to_char']
|
| 267 |
+
config = c.get('config', {})
|
| 268 |
+
img_h = config.get('img_height', 64)
|
| 269 |
+
img_w = config.get('img_width', 512)
|
| 270 |
+
saved_cer = c.get('val_cer', c.get('val_loss', None))
|
| 271 |
+
|
| 272 |
+
model = get_crnn_model(
|
| 273 |
+
model_type=config.get('model_type', 'standard'),
|
| 274 |
+
img_height=img_h,
|
| 275 |
+
num_chars=c['model_state_dict']['fc.weight'].shape[0],
|
| 276 |
+
hidden_size=config.get('hidden_size', 128), # FIXED: was 256
|
| 277 |
+
num_lstm_layers=config.get('num_lstm_layers', 1) # FIXED: was 2
|
| 278 |
+
).to(device)
|
| 279 |
+
model.load_state_dict(c['model_state_dict'])
|
| 280 |
+
|
| 281 |
+
if IMAGE_DIR:
|
| 282 |
+
print(f"\nPredictions from {cp}:")
|
| 283 |
+
for fname, pred in run_on_folder(
|
| 284 |
+
model, idx_to_char, img_h, img_w, IMAGE_DIR, device):
|
| 285 |
+
print(f" {fname:<35} -> {pred}")
|
| 286 |
+
continue
|
| 287 |
+
|
| 288 |
+
metrics, n, err = measure_live_cer(
|
| 289 |
+
model, idx_to_char, img_h, img_w,
|
| 290 |
+
VAL_ANN_FILE, VAL_DATA_DIR, device)
|
| 291 |
+
|
| 292 |
+
if metrics is None:
|
| 293 |
+
print(f" Epoch {epoch} SKIP: {err}")
|
| 294 |
+
continue
|
| 295 |
+
|
| 296 |
+
cer = metrics['cer']
|
| 297 |
+
marker = ' <-- BEST' if cer < best_cer else ''
|
| 298 |
+
print("{:<8} {:<10} {:<10} {:<12} {:<8} {}{}".format(
|
| 299 |
+
epoch,
|
| 300 |
+
f"{cer:.2f}%",
|
| 301 |
+
f"{metrics['wer']:.2f}%",
|
| 302 |
+
f"{metrics['exact_match']:.1f}%",
|
| 303 |
+
n, cp, marker))
|
| 304 |
+
|
| 305 |
+
if saved_cer and abs(cer - saved_cer) > 2.0:
|
| 306 |
+
print(f" ^ MISMATCH: saved={saved_cer:.2f}% live={cer:.2f}%"
|
| 307 |
+
f" diff={abs(cer - saved_cer):.2f}%")
|
| 308 |
+
print(f" Cause: model trained on clean synthetic only.")
|
| 309 |
+
print(f" Fix: regenerate data with fix_data.py + retrain.")
|
| 310 |
+
|
| 311 |
+
if cer < best_cer:
|
| 312 |
+
best_cer, best_cp, best_metrics = cer, cp, metrics
|
| 313 |
+
|
| 314 |
+
except Exception as e:
|
| 315 |
+
print(f" Could not evaluate {cp}: {e}")
|
| 316 |
+
|
| 317 |
+
if not IMAGE_DIR:
|
| 318 |
+
print("=" * 78)
|
| 319 |
+
print(f"\nBEST CHECKPOINT : {best_cp}")
|
| 320 |
+
print(f"BEST LIVE CER : {best_cer:.4f}%")
|
| 321 |
+
|
| 322 |
+
if best_metrics and best_metrics['errors']:
|
| 323 |
+
print(f"\nWorst predictions (GT -> Predicted):")
|
| 324 |
+
for gt, pred, dist in best_metrics['errors']:
|
| 325 |
+
print(f" [{dist:2d}] '{gt}'")
|
| 326 |
+
print(f" '{pred}'")
|
| 327 |
+
|
| 328 |
+
print(f"\nTo use best model:")
|
| 329 |
+
print(f" import shutil")
|
| 330 |
+
print(f" shutil.copy('checkpoints/{best_cp}', 'checkpoints/best_model.pth')")
|
| 331 |
+
print("=" * 78)
|
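The CTC greedy decoder above collapses consecutive repeated indices and drops the blank index 0, which is why a blank between two identical characters preserves both. A self-contained toy run (independent of this repo, with a hypothetical 3-class vocabulary) showing that behaviour:

import torch

def collapse(path, idx_to_char, blank=0):
    # same rule as greedy_decode above: drop blanks, drop consecutive repeats
    out, prev = [], -1
    for idx in path:
        if idx != blank and idx != prev and idx in idx_to_char:
            out.append(idx_to_char[idx])
        prev = idx
    return ''.join(out)

logits = torch.zeros(6, 1, 3)               # [T=6, B=1, C=3], like model output
for t, c in enumerate([1, 1, 0, 1, 2, 2]):  # force a best path per timestep
    logits[t, 0, c] = 5.0
path = torch.argmax(logits, dim=2)[:, 0].tolist()
print(collapse(path, {1: 'A', 2: 'B'}))     # prints 'AAB': the blank splits the repeated A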
CRNN+CTC/checkpoints/best_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f98f0590b354c11f40fefcab6fe172ae57cb37e49277062a00dbbe3f5aa6b8b5
+size 19204606
CRNN+CTC/checkpoints/best_model_final.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da91ba5cd78b602eebb9c9f63175d9bc47ec8cb6fbdac6a06c78814e2e6b8f2
+size 6407143
CRNN+CTC/checkpoints/best_model_iam.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7f4cdef044a163632be2cbf7fbed9d869b4a2e85977aef60e5f88501969e257
+size 6405834
CRNN+CTC/checkpoints/best_model_v2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:047c9af89f9486553a2c17736cafcc0a7a45a99e21619064ee00299e2cd6a8df
+size 6406990
CRNN+CTC/checkpoints/best_model_v3.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b1def35ee8c623aac01004ecb9f979d51d3ed3a486d8adf7a8acd67e5b03a31
+size 6406990
CRNN+CTC/checkpoints/best_model_v4.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73a939bab133573e8b771a6d48aca10c9a98e804cedd79f06eac4e24735df1d4
+size 6406201
CRNN+CTC/checkpoints/best_model_v5.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef986cbea34d4c5dc31b32aac3bc2dfaa20720cdb133d9d6c79a5d5123700942
+size 6406201
CRNN+CTC/checkpoints/best_model_v6.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dbcdecce6d83b7f7c74ae6df05ae9222b345668e2dff84de9aa108562bd71ac
+size 6406201
CRNN+CTC/checkpoints/best_model_v7.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+size 6406201
CRNN+CTC/checkpoints/best_model_v732.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7adea1b88ab7e4ecdf9354a8f1adbfbe7c95e26808319e307483ca6ea2555e0
+size 6406201
CRNN+CTC/checkpoints/checkpoint_epoch_10.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08f3a40e99411b95e8a8563ed42f6998f367b84dd799b8c0cbcffac1bdd5576f
+size 19201165
CRNN+CTC/checkpoints/latest_checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b10077d433edbd5946499fef7334421d8b7ba351f55d631fbeb085592c10545
+size 19201651
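The twelve .pth entries above are Git LFS pointer stubs (spec version, oid, size), not the weights themselves; after cloning, running `git lfs install` once and then `git lfs pull` fetches the actual binaries referenced by these oids.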
CRNN+CTC/compare_checkpoints.py
ADDED
@@ -0,0 +1,34 @@
+import torch
+import sys
+sys.path.append('.')
+from crnn_model import get_crnn_model
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+def test_model(path, label):
+    c = torch.load(path, map_location=device, weights_only=False)
+    config = c.get('config', {})
+    model = get_crnn_model(
+        model_type      = config.get('model_type', 'standard'),
+        img_height      = config.get('img_height', 64),
+        num_chars       = c['model_state_dict']['fc.weight'].shape[0],
+        hidden_size     = config.get('hidden_size', 128),
+        num_lstm_layers = config.get('num_lstm_layers', 1),
+    ).to(device)
+    model.load_state_dict(c['model_state_dict'], strict=False)
+    epoch = c.get('epoch', 'N/A')
+    val_loss = c.get('val_loss', None)  # fine-tuned checkpoints (EMNIST, IAM)
+    val_cer = c.get('val_cer', None)    # synthetic baseline checkpoint
+    if val_loss is not None:
+        metric_str = f"val_loss={val_loss:.4f}"
+    elif val_cer is not None:
+        metric_str = f"val_cer={val_cer:.4f}%"
+    else:
+        metric_str = "no metric saved"
+    print(f"{label}: epoch={epoch} {metric_str}")
+
+print("=" * 55)
+test_model('checkpoints/best_model.pth', 'Synthetic ')
+test_model('checkpoints/best_model_emnist.pth', 'EMNIST    ')
+test_model('checkpoints/best_model_iam.pth', 'IAM       ')
+print("=" * 55)
CRNN+CTC/compare_live_cer.py
ADDED
@@ -0,0 +1,158 @@
+"""
+compare_live_cer.py
+===================
+Runs live CER on all three checkpoints to find the best one.
+Usage: python compare_live_cer.py
+"""
+
+import os
+import sys
+import json
+import random
+import cv2
+import numpy as np
+import editdistance
+import torch
+import torch.nn.functional as F
+sys.path.append('.')
+from crnn_model import get_crnn_model
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+VAL_ANN = 'data/val_annotations.json'
+VAL_DIR = 'data/val'
+MAX_SAMPLES = 200
+
+CHECKPOINTS = {
+    'Synthetic' : 'checkpoints/best_model.pth',
+    'EMNIST'    : 'checkpoints/best_model_emnist.pth',
+    'IAM'       : 'checkpoints/best_model_iam.pth',
+}
+
+
+def normalize(img, H=64, W=512):
+    if len(img.shape) == 3:
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    else:
+        gray = img.copy()
+    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+    inv = cv2.bitwise_not(gray)
+    _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+    coords = np.column_stack(np.where(thresh > 0))
+    if len(coords) > 0:
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0]-1, y_max + pad)
+        x_max = min(gray.shape[1]-1, x_max + pad)
+        gray = gray[y_min:y_max+1, x_min:x_max+1]
+    h, w = gray.shape
+    if h == 0 or w == 0:
+        return np.ones((H, W), dtype=np.uint8) * 255
+    scale = H / h
+    new_w = int(w * scale)
+    if new_w > W:
+        scale = W / w
+        new_w = W
+        new_h = int(h * scale)
+    else:
+        new_h = H
+    resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    canvas = np.ones((H, W), dtype=np.uint8) * 255
+    canvas[(H-new_h)//2:(H-new_h)//2+new_h,
+           (W-new_w)//2:(W-new_w)//2+new_w] = resized
+    _, otsu = cv2.threshold(canvas, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    return otsu
+
+
+def greedy_decode(outputs, idx_to_char):
+    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
+    results = []
+    for seq in pred_indices:
+        chars, prev = [], -1
+        for idx in seq:
+            idx = idx.item()
+            if idx != 0 and idx != prev and idx in idx_to_char:
+                chars.append(idx_to_char[idx])
+            prev = idx
+        results.append(''.join(chars))
+    return results
+
+
+def evaluate(checkpoint_path, label):
+    if not os.path.exists(checkpoint_path):
+        print(f"  {label:<12}: FILE NOT FOUND – skipping")
+        return
+
+    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
+    config = c.get('config', {})
+
+    # Load idx_to_char from checkpoint if available
+    idx_to_char = c.get('idx_to_char', None)
+    if idx_to_char is None:
+        from dataset import build_char_maps
+        _, idx_to_char, _ = build_char_maps()
+
+    model = get_crnn_model(
+        model_type      = config.get('model_type', 'standard'),
+        img_height      = config.get('img_height', 64),
+        num_chars       = c['model_state_dict']['fc.weight'].shape[0],
+        hidden_size     = config.get('hidden_size', 128),
+        num_lstm_layers = config.get('num_lstm_layers', 1),
+    ).to(device)
+    model.load_state_dict(c['model_state_dict'], strict=False)
+    model.eval()
+
+    with open(VAL_ANN, 'r', encoding='utf-8') as f:
+        anns = json.load(f)
+    random.seed(42)
+    if len(anns) > MAX_SAMPLES:
+        anns = random.sample(anns, MAX_SAMPLES)
+
+    total_cd, total_c = 0, 0
+    exact, n = 0, 0
+    worst = []
+
+    with torch.no_grad():
+        for ann in anns:
+            img_path = os.path.join(VAL_DIR, ann['image_path'])
+            gt = ann['text']
+            if not os.path.exists(img_path):
+                continue
+            raw = cv2.imread(img_path)
+            if raw is None:
+                continue
+            norm = normalize(raw)
+            tensor = torch.FloatTensor(
+                norm.astype(np.float32) / 255.0
+            ).unsqueeze(0).unsqueeze(0).to(device)
+            out = model(tensor)
+            pred = greedy_decode(out.cpu(), idx_to_char)[0]
+            cd = editdistance.eval(pred, gt)
+            total_cd += cd
+            total_c += len(gt)
+            if pred == gt:
+                exact += 1
+            if cd > 0:
+                worst.append((gt, pred, cd))
+            n += 1
+
+    cer = (total_cd / total_c * 100) if total_c > 0 else 0
+    acc = (exact / n * 100) if n > 0 else 0
+    print(f"  {label:<12}: CER={cer:.2f}%  ExactMatch={acc:.1f}%  (n={n})")
+
+    if worst:
+        worst = sorted(worst, key=lambda x: x[2], reverse=True)[:2]
+        for gt, pred, d in worst:
+            print(f"      [{d}] '{gt}' -> '{pred}'")
+
+
+print("=" * 60)
+print(" LIVE CER COMPARISON – all checkpoints")
+print("=" * 60)
+for label, path in CHECKPOINTS.items():
+    evaluate(path, label)
+print("=" * 60)
+print("Use the checkpoint with the lowest CER for IAM/physical fine-tuning.")
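For reference, the CER printed by this script is the aggregate character edit distance divided by the aggregate ground-truth length. A tiny standalone illustration of the per-pair arithmetic (hypothetical strings, not drawn from the validation set):

import editdistance

gt, pred = "Maria Santos", "Marla Santos"     # one substituted character
cer = editdistance.eval(pred, gt) / len(gt) * 100
print(f"CER = {cer:.2f}%")                    # 1 edit over 12 chars -> 8.33%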
CRNN+CTC/create_test_images.py
ADDED
@@ -0,0 +1,50 @@
+import os
+from PIL import Image, ImageDraw, ImageFont
+
+os.makedirs('test_images', exist_ok=True)
+
+def load_font(size=22):  # FIXED: was 20 – must match fix_data.py FONT_SIZE=22
+    """Same font loader as fix_data.py – tries multiple paths."""
+    for fp in [
+        'arial.ttf', 'Arial.ttf',
+        '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
+        '/System/Library/Fonts/Helvetica.ttc',
+        'C:/Windows/Fonts/arial.ttf',
+    ]:
+        try:
+            return ImageFont.truetype(fp, size)
+        except Exception:
+            continue
+    print("WARNING: Could not load Arial/DejaVu font. Using default – predictions may be inaccurate.")
+    return ImageFont.load_default()
+
+def create_image(text, filename):
+    """Render text exactly the same way as fix_data.py training images."""
+    img = Image.new('RGB', (512, 64), color=(255, 255, 255))
+    draw = ImageDraw.Draw(img)
+    font = load_font(22)
+
+    bbox = draw.textbbox((0, 0), text, font=font)
+    x = max((512 - (bbox[2] - bbox[0])) // 2, 2)
+    y = max((64 - (bbox[3] - bbox[1])) // 2, 2)
+    draw.text((x, y), text, fill=(0, 0, 0), font=font)
+    img.save(filename)
+    print(f'Created: {filename}')
+
+# ── Test samples ──────────────────────────────────────────────
+create_image('Juan Dela Cruz', 'test_images/demo.jpg')
+create_image('Juan Dela Cruz', 'test_images/name1.jpg')
+create_image('01/15/1990', 'test_images/date1.jpg')
+create_image('Tarlac City', 'test_images/place1.jpg')
+create_image('Maria Santos', 'test_images/form1a_sample.jpg')
+
+# ── Extra test cases (names, dates, addresses) ────────────────
+create_image('Jose Dela Cruz Jr.', 'test_images/name2.jpg')
+create_image('Ana Marie Reyes', 'test_images/name3.jpg')
+create_image('03/22/1985', 'test_images/date2.jpg')
+create_image('07/04/2000', 'test_images/date3.jpg')
+create_image('Brgy. San Jose, Capas, Tarlac', 'test_images/place2.jpg')
+create_image('78 MacArthur Hwy., Tarlac City', 'test_images/place3.jpg')
+
+print('\nAll test images created!')
+print('Font used matches training data – predictions should be accurate.')
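An optional sanity check after running the script (a sketch, assuming it ran from the repo root) that the rendered images really are 512x64 RGB, matching the training geometry the comments above reference:

import cv2

img = cv2.imread('test_images/demo.jpg')
assert img is not None, "demo.jpg was not written"
assert img.shape == (64, 512, 3), img.shape  # H x W x C, as rendered above
print("test image geometry OK")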
CRNN+CTC/crnn_model.py
ADDED
@@ -0,0 +1,119 @@
+"""
+CRNN+CTC Model – simplified for small datasets (~5000-10000 samples)
+~700K parameters, converges reliably without CTC blank collapse.
+"""
+import torch
+import torch.nn as nn
+
+
+class CRNN_CivilRegistry(nn.Module):
+
+    def __init__(self, img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1,
+                 dropout=0.3):
+        super().__init__()
+
+        # CNN – width reductions for 512px input:
+        #   MaxPool(2,2): 512→256, MaxPool(2,2): 256→128
+        #   MaxPool(2,1): 128 (height only), MaxPool(2,1): 128 (height only)
+        #   Conv(k=2,p=0): 127 → seq_len=127, fits labels up to 64 chars
+        self.cnn = nn.Sequential(
+            nn.Conv2d(1, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(inplace=True),
+            nn.MaxPool2d(2, 2),
+
+            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
+            nn.MaxPool2d(2, 2),
+
+            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
+            nn.MaxPool2d((2, 1)),
+
+            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(inplace=True),
+            nn.MaxPool2d((2, 1)),
+
+            nn.Conv2d(256, 256, kernel_size=2, padding=0),
+            nn.BatchNorm2d(256), nn.ReLU(inplace=True),
+        )
+
+        # FIXED Bug 4: derive cnn_out_h from a real forward pass instead of
+        # a hardcoded formula – safer if architecture or img_height ever changes.
+        with torch.no_grad():
+            _dummy = torch.zeros(1, 1, img_height, 32)
+            _out = self.cnn(_dummy)
+            cnn_out_h = _out.shape[2]  # actual height after all CNN layers
+        rnn_input = 256 * cnn_out_h
+
+        self.rnn = nn.LSTM(
+            input_size=rnn_input,
+            hidden_size=hidden_size,
+            num_layers=num_lstm_layers,
+            bidirectional=True,
+            batch_first=False,
+        )
+        # Dropout before FC – prevents overfitting on small datasets.
+        # Applied after BiLSTM output, before character projection.
+        # p=0.3 is standard for CRNN OCR models (disabled at inference via model.eval()).
+        self.dropout = nn.Dropout(p=dropout)
+        self.fc = nn.Linear(hidden_size * 2, num_chars)
+
+    def forward(self, x):
+        f = self.cnn(x)
+        B, C, h, w = f.size()
+        f = f.permute(3, 0, 1, 2).reshape(w, B, C * h)
+        f, _ = self.rnn(f)
+        return self.fc(self.dropout(f))
+
+
+class CRNN_Ensemble(nn.Module):
+    def __init__(self, num_models=3, **kwargs):
+        super().__init__()
+        self.models = nn.ModuleList([CRNN_CivilRegistry(**kwargs) for _ in range(num_models)])
+
+    def forward(self, x):
+        # FIXED Rec 3: average softmax probabilities across models (correct ensemble),
+        # then return log of the average so CTCLoss receives log-probabilities –
+        # the same contract as CRNN_CivilRegistry (raw logits + log_softmax in trainer).
+        # Returning raw averaged probabilities caused CTCLoss to receive un-logged values.
+        probs = [torch.nn.functional.softmax(m(x), dim=2) for m in self.models]
+        avg_probs = torch.mean(torch.stack(probs), dim=0)
+        return torch.log(avg_probs.clamp(min=1e-9))  # log-probs, safe clamp avoids log(0)
+
+
+def get_crnn_model(model_type='standard', **kwargs):
+    if model_type == 'ensemble':
+        return CRNN_Ensemble(**kwargs)
+    return CRNN_CivilRegistry(**kwargs)
+
+
+def initialize_weights(model):
+    for m in model.modules():
+        if isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.BatchNorm2d):
+            nn.init.constant_(m.weight, 1)
+            nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, 0, 0.01)
+            nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LSTM):
+            for name, param in m.named_parameters():
+                if 'weight' in name:
+                    nn.init.orthogonal_(param)
+                elif 'bias' in name:
+                    nn.init.constant_(param, 0)
+                    # Rec 1: set forget gate bias to 1.0 – helps the model
+                    # remember across long sequences at the start of training.
+                    # LSTM gate order: [input | forget | cell | output]
+                    n = param.size(0)
+                    param.data[n // 4 : n // 2].fill_(1.0)
+
+
+if __name__ == "__main__":
+    model = get_crnn_model('standard', img_height=64, num_chars=96, hidden_size=128, num_lstm_layers=1)
+    initialize_weights(model)
+    x = torch.randn(2, 1, 64, 512)
+    out = model(x)
+    params = sum(p.numel() for p in model.parameters())
+    print(f"Output: {out.shape}  seq_len={out.shape[0]}")
+    print(f"Params: {params:,} (unchanged – dropout adds no parameters)")
+    print(f"Dropout p=0.3 active during training, disabled during model.eval()")
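The width-reduction arithmetic in the CNN comment can be checked empirically. A minimal sketch (assuming crnn_model.py is importable from the current directory) confirming the 127-step output sequence for a 64x512 input, with the constructor defaults shown above:

import torch
from crnn_model import get_crnn_model

model = get_crnn_model('standard', img_height=64, num_chars=96)
out = model(torch.randn(1, 1, 64, 512))  # -> [seq_len, batch, num_chars]
print(out.shape)                         # torch.Size([127, 1, 96])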
CRNN+CTC/dataset.py
ADDED
@@ -0,0 +1,401 @@
+"""
+dataset.py
+==========
+PyTorch Dataset and DataLoader utilities for the Civil Registry OCR system.
+"""
+
+import os
+import json
+import random
+from pathlib import Path
+from typing import List, Tuple, Dict, Optional
+
+import cv2
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# CHARACTER SET
+# ─────────────────────────────────────────────────────────────────────────────
+
+PRINTABLE_CHARS = [chr(i) for i in range(32, 127)]  # space (32) to ~ (126)
+
+
+def build_char_maps(extra_chars: Optional[List[str]] = None):
+    chars = PRINTABLE_CHARS.copy()
+    if extra_chars:
+        for c in extra_chars:
+            if c not in chars:
+                chars.append(c)
+    char_to_idx = {c: i + 1 for i, c in enumerate(chars)}
+    idx_to_char = {i + 1: c for i, c in enumerate(chars)}
+    num_chars = len(chars) + 1  # +1 for blank=0
+    return char_to_idx, idx_to_char, num_chars
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# IMAGE NORMALIZER
+# ─────────────────────────────────────────────────────────────────────────────
+
+class ImageNormalizer:
+
+    def __init__(self, target_height: int = 64, target_width: int = 512):
+        self.H = target_height
+        self.W = target_width
+
+    def _to_gray(self, img):
+        if len(img.shape) == 3:
+            return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        return img.copy()
+
+    def _crop_to_text(self, gray):
+        inv = cv2.bitwise_not(gray)
+        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
+        coords = np.column_stack(np.where(thresh > 0))
+        if len(coords) == 0:
+            return gray
+        y_min, x_min = coords.min(axis=0)
+        y_max, x_max = coords.max(axis=0)
+        pad = max(4, int((y_max - y_min) * 0.15))
+        y_min = max(0, y_min - pad)
+        x_min = max(0, x_min - pad)
+        y_max = min(gray.shape[0] - 1, y_max + pad)
+        x_max = min(gray.shape[1] - 1, x_max + pad)
+        return gray[y_min:y_max + 1, x_min:x_max + 1]
+
+    def _aspect_resize(self, gray):
+        h, w = gray.shape
+        if h == 0 or w == 0:
+            return np.ones((self.H, self.W), dtype=np.uint8) * 255
+        scale = self.H / h
+        new_w = int(w * scale)
+        new_h = self.H
+        if new_w > self.W:
+            scale = self.W / w
+            new_h = int(h * scale)
+            new_w = self.W
+        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
+        y_off = (self.H - new_h) // 2
+        x_off = (self.W - new_w) // 2
+        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
+        return canvas
+
+    def _binarize(self, img):
+        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        white_ratio = np.mean(otsu == 255)
+        if white_ratio < 0.30 or white_ratio > 0.97:
+            return cv2.adaptiveThreshold(
+                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 11, 2)
+        return otsu
+
+    def normalize(self, img: np.ndarray, augmenter=None) -> np.ndarray:
+        gray = self._to_gray(img)
+        # NOTE: fastNlMeansDenoising intentionally removed from training pipeline.
+        # It is slow (~200ms/image) and pointless on clean synthetic images.
+        # Denoising is only applied in check_cer.py / inference.py (AdaptiveNormalizer)
+        # which runs on real scanned documents where denoising actually helps.
+        gray = self._crop_to_text(gray)
+        gray = self._aspect_resize(gray)
+        # FIXED Bug 3: augment on grayscale BEFORE binarize.
+        # Brightness/contrast augmentation has zero effect on binary (0/255) pixels.
+        if augmenter is not None:
+            gray = augmenter(gray)
+        return self._binarize(gray)
+
+    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
+        return torch.FloatTensor(
+            img.astype(np.float32) / 255.0
+        ).unsqueeze(0)  # [1, H, W]
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# AUGMENTATION
+# ─────────────────────────────────────────────────────────────────────────────
+
+class Augmenter:
+
+    def __call__(self, img: np.ndarray) -> np.ndarray:
+        img = img.copy()
+
+        # Random slight rotation (±3°)
+        if random.random() < 0.3:
+            angle = random.uniform(-3, 3)
+            h, w = img.shape
+            M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
+            img = cv2.warpAffine(img, M, (w, h),
+                                 borderMode=cv2.BORDER_CONSTANT,
+                                 borderValue=255)
+
+        # Random brightness/contrast
+        if random.random() < 0.4:
+            alpha = random.uniform(0.8, 1.2)
+            beta = random.randint(-20, 20)
+            img = np.clip(alpha * img.astype(np.float32) + beta,
+                          0, 255).astype(np.uint8)
+
+        # Gaussian blur
+        if random.random() < 0.3:
+            ksize = random.choice([3, 5])
+            img = cv2.GaussianBlur(img, (ksize, ksize), 0)
+
+        # Salt-and-pepper noise
+        if random.random() < 0.2:
+            noise = np.random.randint(0, 100, img.shape)
+            img[noise < 2] = 0
+            img[noise > 97] = 255
+
+        # Random small horizontal shift
+        if random.random() < 0.2:
+            h, w = img.shape
+            shift = random.randint(-int(w * 0.05), int(w * 0.05))
+            M = np.float32([[1, 0, shift], [0, 1, 0]])
+            img = cv2.warpAffine(img, M, (w, h),
+                                 borderMode=cv2.BORDER_CONSTANT,
+                                 borderValue=255)
+
+        # ── NEW: Horizontal line noise ───────────────────────────────────────
+        # Simulates ruled form lines bleeding through behind the text.
+        # Civil registry forms have printed horizontal grid lines – scanners
+        # often pick these up as faint grey stripes across text fields.
+        if random.random() < 0.3:
+            h, w = img.shape
+            n_lines = random.randint(1, 3)
+            for _ in range(n_lines):
+                y = random.randint(0, h - 1)
+                thickness = random.choice([1, 1, 1, 2])  # mostly 1px
+                intensity = random.randint(160, 220)     # light grey, not black
+                cv2.line(img, (0, y), (w, y),
+                         color=intensity, thickness=thickness)
+
+        # ── NEW: Perspective warp ────────────────────────────────────────────
+        # Simulates documents scanned or photographed at a slight angle.
+        # Keystone distortion is common when forms are placed unevenly on
+        # a flatbed scanner or photographed with a phone camera.
+        if random.random() < 0.25:
+            h, w = img.shape
+            d = 0.03
+            dx = int(w * d)
+            dy = int(h * d)
+            src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
+            dst = np.float32([
+                [random.randint(0, dx), random.randint(0, dy)],
+                [w - random.randint(0, dx), random.randint(0, dy)],
+                [w - random.randint(0, dx), h - random.randint(0, dy)],
+                [random.randint(0, dx), h - random.randint(0, dy)],
+            ])
+            M = cv2.getPerspectiveTransform(src, dst)
+            img = cv2.warpPerspective(img, M, (w, h),
+                                      borderMode=cv2.BORDER_CONSTANT,
+                                      borderValue=255)
+
+        return img
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DATASET
+# ─────────────────────────────────────────────────────────────────────────────
+
+class CivilRegistryDataset(Dataset):
+    """
+    Args:
+        data_dir         : root folder containing image subfolders (e.g. 'data/train')
+        annotations_file : path to JSON file with image_path + text pairs
+        img_height       : target image height (default 64)
+        img_width        : target image width (default 512)
+        augment          : True = apply augmentation (training only)
+        form_type        : 'all' or filter by form e.g. 'form1a'
+
+    Properties used by train.py:
+        .num_chars    → passed to CRNN model
+        .char_to_idx  → saved in checkpoint
+        .idx_to_char  → used for decoding predictions
+
+    __getitem__ returns:
+        image_tensor   FloatTensor [1, H, W]
+        target         LongTensor [label_length]
+        target_length  int
+        text           str (original ground truth)
+    """
+
+    def __init__(
+        self,
+        data_dir: str,
+        annotations_file: str,
+        img_height: int = 64,
+        img_width: int = 512,
+        augment: bool = False,
+        form_type: str = 'all',
+        seed: Optional[int] = None,  # Rec 2: reproducible augmentation
+    ):
+        self.data_dir = Path(data_dir)
+        self.augment = augment
+        self.normalizer = ImageNormalizer(img_height, img_width)
+        self.augmenter = Augmenter()
+        if seed is not None:  # Rec 2: seed random for reproducibility
+            random.seed(seed)
+            np.random.seed(seed)
+
+        self.char_to_idx, self.idx_to_char, self.num_chars = build_char_maps()
+
+        with open(annotations_file, 'r', encoding='utf-8') as f:
+            all_annotations = json.load(f)
+
+        if form_type != 'all':
+            all_annotations = [
+                a for a in all_annotations
+                if form_type in a.get('image_path', '')
+            ]
+
+        self.samples: List[Dict] = []
+        missing = 0
+        for ann in all_annotations:
+            img_path = self.data_dir / ann['image_path']
+            if img_path.exists():
+                text = ann['text'].strip()
+                if text:
+                    self.samples.append({
+                        'image_path': str(img_path),
+                        'text': text,
+                    })
+            else:
+                missing += 1
+
+        if missing > 0:
+            print(f"  [Dataset] WARNING: {missing} image(s) not found and skipped.")
+
+        print(f"  [Dataset] Loaded {len(self.samples)} samples "
+              f"from {annotations_file} (augment={augment})")
+
+    def __len__(self) -> int:
+        return len(self.samples)
+
+    def __getitem__(self, idx: int):
+        sample = self.samples[idx]
+        text = sample['text']
+
+        img = cv2.imread(sample['image_path'])
+        if img is None:
+            img = np.ones((64, 512, 3), dtype=np.uint8) * 255
+
+        # FIXED Bug 3: pass augmenter into normalize() so it runs on grayscale
+        # (before binarization), not on the binary output where it has no effect.
+        aug = self.augmenter if self.augment else None
+        normalized = self.normalizer.normalize(img, augmenter=aug)
+
+        image_tensor = self.normalizer.to_tensor(normalized)  # [1, H, W]
+
+        encoded = [
+            self.char_to_idx[c]
+            for c in text
+            if c in self.char_to_idx
+        ]
+        if len(encoded) == 0:
+            encoded = [self.char_to_idx.get(' ', 1)]
+
+        target = torch.LongTensor(encoded)
+        target_length = len(encoded)
+
+        return image_tensor, target, target_length, text
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# COLLATE FUNCTION
+# ─────────────────────────────────────────────────────────────────────────────
+
+def collate_fn(batch):
+    """
+    CTC loss needs all labels packed into one flat 1D tensor.
+    PyTorch's default collator can't handle variable-length labels,
+    so this custom function packs them correctly.
+
+    Returns:
+        images          FloatTensor [B, 1, H, W]
+        targets         LongTensor [sum of all label lengths]
+        target_lengths  LongTensor [B]
+        texts           List[str]
+    """
+    images, targets, target_lengths, texts = zip(*batch)
+
+    images = torch.stack(images, dim=0)
+    targets = torch.cat([t for t in targets])
+    target_lengths = torch.LongTensor(target_lengths)
+
+    return images, targets, target_lengths, list(texts)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# HELPER: CREATE ANNOTATION FILE (run once to build your JSON)
+# ─────────────────────────────────────────────────────────────────────────────
+
+def create_annotation_file(data_dir: str, output_file: str,
+                           extensions=('.jpg', '.jpeg', '.png')):
+    """
+    Auto-generate annotations JSON by scanning data_dir.
+    For each image, looks for a sidecar .txt file with the same name.
+    If not found, uses the filename stem (underscores → spaces) as label.
+
+    Usage:
+        from dataset import create_annotation_file
+        create_annotation_file('data/train', 'data/train_annotations.json')
+        create_annotation_file('data/val', 'data/val_annotations.json')
+    """
+    data_path = Path(data_dir)
+    annotations = []
+
+    for img_path in sorted(data_path.rglob('*')):
+        if img_path.suffix.lower() not in extensions:
+            continue
+        txt_path = img_path.with_suffix('.txt')
+        if txt_path.exists():
+            label = txt_path.read_text(encoding='utf-8').strip()
+        else:
+            label = img_path.stem.replace('_', ' ')
+        if not label:
+            continue
+        rel_path = img_path.relative_to(data_path)
+        annotations.append({
+            'image_path': str(rel_path).replace('\\', '/'),
+            'text': label,
+        })
+
+    os.makedirs(Path(output_file).parent, exist_ok=True)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(annotations, f, indent=2, ensure_ascii=False)
+
+    print(f"✓ Saved {len(annotations)} entries → {output_file}")
+    return annotations
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# SELF-TEST (python dataset.py)
+# ─────────────────────────────────────────────────────────────────────────────
+
+if __name__ == '__main__':
+    print("=" * 55)
+    print(" dataset.py self-test")
+    print("=" * 55)
+
+    c2i, i2c, n = build_char_maps()
+    print(f"\n Vocab size : {n} (including blank=0)")
+    print(f" 'A'={c2i['A']} '0'={c2i['0']} ' '={c2i[' ']} '.'={c2i['.']}")
+
+    dummy = np.ones((80, 300, 3), dtype=np.uint8) * 200
+    norm = ImageNormalizer(64, 512)
+    out = norm.normalize(dummy)
+    t = norm.to_tensor(out)
+    print(f"\n Normalizer : {dummy.shape} → {out.shape} → tensor {t.shape}")
+
+    fake = [
+        (torch.zeros(1, 64, 512), torch.LongTensor([1, 2, 3]), 3, "ABC"),
+        (torch.zeros(1, 64, 512), torch.LongTensor([4, 5]), 2, "DE"),
+        (torch.zeros(1, 64, 512), torch.LongTensor([6, 7, 8, 9]), 4, "FGHI"),
+    ]
+    imgs, tgts, tlens, txts = collate_fn(fake)
+    print(f"\n collate_fn : images={imgs.shape} "
+          f"targets={tgts.shape} lengths={tlens.tolist()}")
+
+    print("\n ✓ All checks passed.\n")
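Since collate_fn is what train.py must hand to the DataLoader, here is a minimal wiring sketch (assuming data/train and its annotation JSON exist as described in the docstrings above):

from torch.utils.data import DataLoader
from dataset import CivilRegistryDataset, collate_fn

train_ds = CivilRegistryDataset('data/train', 'data/train_annotations.json',
                                augment=True, seed=42)
loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)
images, targets, target_lengths, texts = next(iter(loader))
print(images.shape, targets.shape, target_lengths.shape)  # [16,1,64,512], flat 1D labels, [16]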
CRNN+CTC/field_extractor.py
ADDED
|
@@ -0,0 +1,735 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Philippine Civil Registry β Field Extractor (Dynamic)
|
| 3 |
+
======================================================
|
| 4 |
+
Automatically detects form borders on ANY scan/photo and aligns field
|
| 5 |
+
extraction to the detected boundary β no hardcoded pixel positions.
|
| 6 |
+
|
| 7 |
+
Field coordinates calibrated directly from official PDF renders at 200 DPI:
|
| 8 |
+
Form 102 (Birth): 1700 x 2800 px
|
| 9 |
+
Form 103 (Death): 1700 x 2878 px
|
| 10 |
+
Form 97 (Marriage): 1700 x 2600 px
|
| 11 |
+
Form 90 (License): 1700 x 2600 px
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python field_extractor.py --pdf FORM_102.pdf --form birth
|
| 15 |
+
python field_extractor.py --pdf FORM_97.pdf --form marriage --visualize
|
| 16 |
+
python field_extractor.py --pdf FORM_103.pdf --form death --output results.json
|
| 17 |
+
python field_extractor.py --image form102.png --form birth --visualize
|
| 18 |
+
python field_extractor.py --pdf FORM_102.pdf --form birth --checkpoint checkpoints/best_model_emnist.pth
|
| 19 |
+
|
| 20 |
+
.env file (project root) β each team member sets their own:
|
| 21 |
+
POPPLER_PATH=C:\\your\\path\\to\\poppler\\Library\\bin
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
import json
|
| 28 |
+
import cv2
|
| 29 |
+
import numpy as np
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
|
| 32 |
+
import torch
|
| 33 |
+
from dotenv import load_dotenv
|
| 34 |
+
|
| 35 |
+
# Load .env from same folder as this script (works regardless of cwd)
|
| 36 |
+
_script_dir = Path(__file__).parent.resolve()
|
| 37 |
+
load_dotenv(dotenv_path=_script_dir / ".env")
|
| 38 |
+
|
| 39 |
+
# Poppler path β from .env or None (Linux/Mac auto-detects)
|
| 40 |
+
POPPLER_PATH = os.environ.get("POPPLER_PATH", None)
|
| 41 |
+
DEFAULT_CHECKPOINT = "checkpoints/best_model.pth"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
# FIELD RATIO MAPS
|
| 46 |
+
# Format: field_name: (x1, y1, x2, y2) β ratios 0.0β1.0
|
| 47 |
+
# Coordinates are relative to the DETECTED FORM BOUNDARY (not full image).
|
| 48 |
+
# x = leftβright, y = topβbottom
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
# Form 102 β Certificate of Live Birth (Form 1A)
|
| 52 |
+
BIRTH_FIELDS = {
|
| 53 |
+
# Header
|
| 54 |
+
"province": (0.02, 0.068, 0.30, 0.088),
|
| 55 |
+
"registry_number": (0.66, 0.068, 0.99, 0.108),
|
| 56 |
+
"city_municipality": (0.02, 0.090, 0.65, 0.108),
|
| 57 |
+
|
| 58 |
+
# Item 1 β Child Name
|
| 59 |
+
"child_first_name": (0.03, 0.109, 0.40, 0.141),
|
| 60 |
+
"child_middle_name": (0.40, 0.109, 0.64, 0.141),
|
| 61 |
+
"child_last_name": (0.64, 0.109, 0.99, 0.141),
|
| 62 |
+
|
| 63 |
+
# Items 2-3 β Sex / Date of Birth
|
| 64 |
+
"sex": (0.03, 0.142, 0.30, 0.167),
|
| 65 |
+
"dob_day": (0.40, 0.142, 0.80, 0.167),
|
| 66 |
+
"dob_month": (0.80, 0.142, 0.60, 0.167),
|
| 67 |
+
"dob_year": (0.80, 0.142, 0.99, 0.167),
|
| 68 |
+
|
| 69 |
+
# Item 4 β Place of Birth
|
| 70 |
+
"place_birth_hospital": (0.03, 0.169, 0.46, 0.197),
|
| 71 |
+
"place_birth_city": (0.47, 0.169, 0.70, 0.199),
|
| 72 |
+
"place_birth_province": (0.71, 0.169, 0.99, 0.199),
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Mother section
|
| 77 |
+
"mother_first_name": (0.03, 0.248, 0.40, 0.276),
|
| 78 |
+
"mother_middle_name": (0.40, 0.248, 0.64, 0.276),
|
| 79 |
+
"mother_last_name": (0.64, 0.248, 0.99, 0.276),
|
| 80 |
+
"mother_citizenship": (0.03, 0.277, 0.50, 0.305),
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Father section
|
| 84 |
+
"father_first_name": (0.03, 0.380, 0.40, 0.410),
|
| 85 |
+
"father_middle_name": (0.40, 0.380, 0.64, 0.410),
|
| 86 |
+
"father_last_name": (0.64, 0.380, 0.99, 0.410),
|
| 87 |
+
"father_citizenship": (0.03, 0.411, 0.28, 0.445),
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Item 20 β Marriage of Parents
|
| 91 |
+
"parents_marriage_month": (0.03, 0.496, 0.19, 0.526),
|
| 92 |
+
"parents_marriage_day": (0.19, 0.496, 0.27, 0.526),
|
| 93 |
+
"parents_marriage_year": (0.27, 0.496, 0.38, 0.526),
|
| 94 |
+
|
| 95 |
+
"parents_marriage_city": (0.41, 0.496, 0.68, 0.526),
|
| 96 |
+
"parents_marriage_province": (0.68, 0.496, 0.84, 0.526),
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
# Form 103 β Certificate of Death (Form 2A)
|
| 102 |
+
DEATH_FIELDS = {
|
| 103 |
+
# Header
|
| 104 |
+
"province": (0.04, 0.128, 0.40, 0.144),
|
| 105 |
+
"registry_number": (0.52, 0.128, 0.75, 0.144),
|
| 106 |
+
"city_municipality": (0.04, 0.145, 0.45, 0.160),
|
| 107 |
+
|
| 108 |
+
# Item 1 β Name
|
| 109 |
+
"deceased_first_name": (0.10, 0.162, 0.34, 0.178),
|
| 110 |
+
"deceased_middle_name": (0.34, 0.162, 0.56, 0.178),
|
| 111 |
+
"deceased_last_name": (0.56, 0.162, 0.75, 0.178),
|
| 112 |
+
|
| 113 |
+
# Items 2-4 β Sex / Religion / Age
|
| 114 |
+
"sex": (0.04, 0.182, 0.13, 0.220),
|
| 115 |
+
"age_years": (0.28, 0.182, 0.38, 0.202),
|
| 116 |
+
|
| 117 |
+
# Item 5 β Place of Death
|
| 118 |
+
"place_death_hospital": (0.13, 0.224, 0.42, 0.242),
|
| 119 |
+
"place_death_city": (0.42, 0.224, 0.58, 0.242),
|
| 120 |
+
"place_death_province": (0.58, 0.224, 0.75, 0.242),
|
| 121 |
+
|
| 122 |
+
# Items 6-7 β Date of Death / Citizenship
|
| 123 |
+
"dod_day": (0.10, 0.252, 0.22, 0.268),
|
| 124 |
+
"dod_month": (0.22, 0.252, 0.38, 0.268),
|
| 125 |
+
"dod_year": (0.38, 0.252, 0.52, 0.268),
|
| 126 |
+
"citizenship": (0.52, 0.252, 0.75, 0.268),
|
| 127 |
+
|
| 128 |
+
# Item 8 β Residence
|
| 129 |
+
"residence_house": (0.13, 0.278, 0.40, 0.294),
|
| 130 |
+
"residence_city": (0.40, 0.278, 0.56, 0.294),
|
| 131 |
+
"residence_province": (0.56, 0.278, 0.75, 0.294),
|
| 132 |
+
|
| 133 |
+
# Items 9-10 β Civil Status / Occupation
|
| 134 |
+
"civil_status": (0.04, 0.302, 0.38, 0.360),
|
| 135 |
+
"occupation": (0.44, 0.302, 0.75, 0.360),
|
| 136 |
+
|
| 137 |
+
# Item 17 β Causes of Death
|
| 138 |
+
"cause_immediate": (0.18, 0.402, 0.58, 0.418),
|
| 139 |
+
"cause_antecedent": (0.18, 0.424, 0.58, 0.440),
|
| 140 |
+
"cause_underlying": (0.18, 0.446, 0.58, 0.462),
|
| 141 |
+
"cause_other": (0.18, 0.468, 0.58, 0.484),
|
| 142 |
+
|
| 143 |
+
# Item 25 β Informant
|
| 144 |
+
"informant_name": (0.04, 0.808, 0.35, 0.822),
|
| 145 |
+
"informant_address": (0.04, 0.822, 0.35, 0.836),
|
| 146 |
+
"informant_date": (0.35, 0.836, 0.58, 0.850),
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
# Form 97 – Certificate of Marriage (Form 3A)
# Only the fields that flow through bridge.py → spaCy NER → SpouseOutput/Form3A.
# Removed: province, city_municipality, dob_day/month/year (×2),
#          place_birth_city/prov/country (×2), sex (×2), residence (×2),
#          religion (×2), civil_status (×2).
MARRIAGE_FIELDS = {
    # ── Header ─────────────────────────────────────────────────────────────
    "registry_number": (0.62, 0.088, 0.97, 0.104),  # → Form3A.registry_number

    # ── Item 1 – Name (HUSBAND left / WIFE right) ──────────────────────────
    "husband_first_name": (0.23, 0.121, 0.56, 0.139),
    "husband_middle_name": (0.23, 0.141, 0.56, 0.159),
    "husband_last_name": (0.23, 0.160, 0.56, 0.178),
    "wife_first_name": (0.65, 0.121, 0.98, 0.139),
    "wife_middle_name": (0.65, 0.141, 0.98, 0.159),
    "wife_last_name": (0.65, 0.160, 0.98, 0.178),

    # "husband_first_name": (0.14, 0.138, 0.47, 0.156),
    # "husband_middle_name": (0.14, 0.156, 0.47, 0.174),
    # "husband_last_name": (0.14, 0.174, 0.47, 0.192),
    # "wife_first_name": (0.53, 0.138, 0.86, 0.156),
    # "wife_middle_name": (0.53, 0.156, 0.86, 0.174),
    # "wife_last_name": (0.53, 0.174, 0.86, 0.192),

    # ── Item 2b – Age ──────────────────────────────────────────────────────
    "husband_age": (0.40, 0.198, 0.47, 0.216),  # → husband.age
    "wife_age": (0.78, 0.198, 0.86, 0.216),     # → wife.age

    # ── Item 4b – Citizenship ──────────────────────────────────────────────
    "husband_citizenship": (0.22, 0.252, 0.47, 0.270),  # → husband.nationality
    "wife_citizenship": (0.62, 0.252, 0.86, 0.270),     # → wife.nationality

    # ── Item 8 – Name of Father ────────────────────────────────────────────
    "husband_father_first": (0.14, 0.396, 0.24, 0.414),
    "husband_father_middle": (0.24, 0.396, 0.34, 0.414),
    "husband_father_last": (0.34, 0.396, 0.47, 0.414),
    "wife_father_first": (0.53, 0.396, 0.63, 0.414),
    "wife_father_middle": (0.63, 0.396, 0.73, 0.414),
    "wife_father_last": (0.73, 0.396, 0.86, 0.414),

    # ── Item 9 – Citizenship of Father ─────────────────────────────────────
    "husband_father_citizenship": (0.14, 0.420, 0.47, 0.436),  # → husband.nationality_of_father
    "wife_father_citizenship": (0.53, 0.420, 0.86, 0.436),     # → wife.nationality_of_father

    # ── Item 10 – Name of Mother ───────────────────────────────────────────
    "husband_mother_first": (0.14, 0.444, 0.24, 0.462),
    "husband_mother_middle": (0.24, 0.444, 0.34, 0.462),
    "husband_mother_last": (0.34, 0.444, 0.47, 0.462),
    "wife_mother_first": (0.53, 0.444, 0.63, 0.462),
    "wife_mother_middle": (0.63, 0.444, 0.73, 0.462),
    "wife_mother_last": (0.73, 0.444, 0.86, 0.462),

    # ── Item 11 – Citizenship of Mother ────────────────────────────────────
    "husband_mother_citizenship": (0.14, 0.468, 0.47, 0.484),  # → husband.nationality_of_mother
    "wife_mother_citizenship": (0.53, 0.468, 0.86, 0.484),     # → wife.nationality_of_mother

    # ── Items 15–16 – Place / Date of Marriage ─────────────────────────────
    "place_marriage_office": (0.14, 0.596, 0.44, 0.614),
    "place_marriage_city": (0.44, 0.596, 0.68, 0.614),
    "place_marriage_province": (0.68, 0.596, 0.88, 0.614),
    "date_marriage_day": (0.14, 0.630, 0.24, 0.648),
    "date_marriage_month": (0.24, 0.630, 0.38, 0.648),
    "date_marriage_year": (0.38, 0.630, 0.48, 0.648),
}

# Form 90 – Application for Marriage License
MARRIAGE_LICENSE_FIELDS = {
    # Header
    "province": (0.12, 0.092, 0.48, 0.108),
    "registry_number": (0.56, 0.092, 0.97, 0.108),
    "city_municipality": (0.12, 0.108, 0.48, 0.124),
    "received_by": (0.12, 0.124, 0.48, 0.140),
    "date_of_receipt": (0.12, 0.140, 0.48, 0.156),
    "marriage_license_number": (0.56, 0.124, 0.97, 0.140),
    "date_of_issuance": (0.56, 0.140, 0.97, 0.156),

    # Item 1 – Name of Applicant (GROOM left / BRIDE right)
    "groom_first_name": (0.02, 0.278, 0.46, 0.294),
    "bride_first_name": (0.54, 0.278, 0.97, 0.294),
    "groom_middle_name": (0.02, 0.296, 0.46, 0.312),
    "bride_middle_name": (0.54, 0.296, 0.97, 0.312),
    "groom_last_name": (0.02, 0.314, 0.46, 0.330),
    "bride_last_name": (0.54, 0.314, 0.97, 0.330),

    # Item 2 – Date of Birth / Age
    "groom_dob_day": (0.02, 0.334, 0.12, 0.350),
    "groom_dob_month": (0.12, 0.334, 0.24, 0.350),
    "groom_dob_year": (0.24, 0.334, 0.34, 0.350),
    "groom_age": (0.34, 0.334, 0.46, 0.350),
    "bride_dob_day": (0.54, 0.334, 0.62, 0.350),
    "bride_dob_month": (0.62, 0.334, 0.74, 0.350),
    "bride_dob_year": (0.74, 0.334, 0.84, 0.350),
    "bride_age": (0.84, 0.334, 0.97, 0.350),

    # Item 3 – Place of Birth
    "groom_place_birth_city": (0.02, 0.354, 0.18, 0.370),
    "groom_place_birth_province": (0.18, 0.354, 0.32, 0.370),
    "groom_place_birth_country": (0.32, 0.354, 0.46, 0.370),
    "bride_place_birth_city": (0.54, 0.354, 0.70, 0.370),
    "bride_place_birth_province": (0.70, 0.354, 0.84, 0.370),
    "bride_place_birth_country": (0.84, 0.354, 0.97, 0.370),

    # Item 4 – Sex / Citizenship
    "groom_sex": (0.02, 0.374, 0.16, 0.390),
    "groom_citizenship": (0.16, 0.374, 0.46, 0.390),
    "bride_sex": (0.54, 0.374, 0.68, 0.390),
    "bride_citizenship": (0.68, 0.374, 0.97, 0.390),

    # Item 5 – Residence
    "groom_residence": (0.02, 0.394, 0.46, 0.412),
    "bride_residence": (0.54, 0.394, 0.97, 0.412),

    # Item 6 – Religion
    "groom_religion": (0.02, 0.424, 0.46, 0.440),
    "bride_religion": (0.54, 0.424, 0.97, 0.440),

    # Item 7 – Civil Status
    "groom_civil_status": (0.02, 0.452, 0.46, 0.468),
    "bride_civil_status": (0.54, 0.452, 0.97, 0.468),

    # Item 9 – Place where dissolved
    "groom_dissolution_city": (0.02, 0.496, 0.16, 0.512),
    "groom_dissolution_province": (0.16, 0.496, 0.30, 0.512),
    "groom_dissolution_country": (0.30, 0.496, 0.46, 0.512),
    "bride_dissolution_city": (0.54, 0.496, 0.68, 0.512),
    "bride_dissolution_province": (0.68, 0.496, 0.82, 0.512),
    "bride_dissolution_country": (0.82, 0.496, 0.97, 0.512),

    # Item 10 – Date when dissolved
    "groom_dissolution_day": (0.02, 0.520, 0.12, 0.536),
    "groom_dissolution_month": (0.12, 0.520, 0.24, 0.536),
    "groom_dissolution_year": (0.24, 0.520, 0.34, 0.536),
    "bride_dissolution_day": (0.54, 0.520, 0.62, 0.536),
    "bride_dissolution_month": (0.62, 0.520, 0.74, 0.536),
    "bride_dissolution_year": (0.74, 0.520, 0.84, 0.536),

    # Item 12 – Father Name
    "groom_father_first": (0.02, 0.594, 0.16, 0.610),
    "groom_father_middle": (0.16, 0.594, 0.28, 0.610),
    "groom_father_last": (0.28, 0.594, 0.46, 0.610),
    "bride_father_first": (0.54, 0.594, 0.66, 0.610),
    "bride_father_middle": (0.66, 0.594, 0.78, 0.610),
    "bride_father_last": (0.78, 0.594, 0.97, 0.610),

    # Item 13 – Father Citizenship
    "groom_father_citizenship": (0.02, 0.620, 0.46, 0.636),
    "bride_father_citizenship": (0.54, 0.620, 0.97, 0.636),

    # Item 14 – Father Residence
    "groom_father_residence": (0.02, 0.644, 0.46, 0.660),
    "bride_father_residence": (0.54, 0.644, 0.97, 0.660),

    # Item 15 – Mother Name
    "groom_mother_first": (0.02, 0.674, 0.16, 0.690),
    "groom_mother_middle": (0.16, 0.674, 0.28, 0.690),
    "groom_mother_last": (0.28, 0.674, 0.46, 0.690),
    "bride_mother_first": (0.54, 0.674, 0.66, 0.690),
    "bride_mother_middle": (0.66, 0.674, 0.78, 0.690),
    "bride_mother_last": (0.78, 0.674, 0.97, 0.690),

    # Item 16 – Mother Citizenship
    "groom_mother_citizenship": (0.02, 0.696, 0.46, 0.712),
    "bride_mother_citizenship": (0.54, 0.696, 0.97, 0.712),

    # Item 17 – Mother Residence
    "groom_mother_residence": (0.02, 0.720, 0.46, 0.736),
    "bride_mother_residence": (0.54, 0.720, 0.97, 0.736),
}

FORM_FIELDS = {
    "birth": BIRTH_FIELDS,
    "death": DEATH_FIELDS,
    "marriage": MARRIAGE_FIELDS,
    "marriage_license": MARRIAGE_LICENSE_FIELDS,
}

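# Lookup note: unknown form types fall back to BIRTH_FIELDS, mirroring
# DynamicFieldExtractor.__init__ below – e.g. FORM_FIELDS.get("passport", BIRTH_FIELDS)
# returns BIRTH_FIELDS ("passport" is a made-up key for illustration).
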
COLOURS = [
    (0, 200, 0), (0, 150, 255), (200, 0, 200), (0, 200, 200), (200, 200, 0), (220, 20, 60),
    (255, 140, 0), (150, 50, 200), (0, 160, 80), (30, 144, 255), (255, 20, 147), (100, 200, 100),
]


# ──────────────────────────────────────────────────────────────────────────────
# FORM BOUNDS DETECTOR
# Finds the outer border of a civil registry form using line detection.
# Falls back to full image if detection fails.
# ──────────────────────────────────────────────────────────────────────────────

class FormBoundsDetector:
    def __init__(self, verbose=False):
        self.verbose = verbose

    def detect(self, image_bgr):
        h, w = image_bgr.shape[:2]
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        bounds = self._detect_by_lines(gray, w, h)
        if bounds is None:
            if self.verbose:
                print(" [Bounds] Line detection failed → using full image")
            return (0, 0, w, h)
        if self.verbose:
            print(f" [Bounds] Detected: {bounds}")
        return bounds

    def _detect_by_lines(self, gray, w, h):
        try:
            thresh = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV, 11, 2)
            hk = cv2.getStructuringElement(cv2.MORPH_RECT, (max(w // 5, 10), 1))
            h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, hk)
            h_rows = np.where(np.sum(h_lines, axis=1) > w * 0.15)[0]
            vk = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(h // 5, 10)))
            v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vk)
            v_cols = np.where(np.sum(v_lines, axis=0) > h * 0.08)[0]
            if len(h_rows) == 0 or len(v_cols) == 0:
                return None
            top, bottom = int(h_rows.min()), int(h_rows.max())
            left, right = int(v_cols.min()), int(v_cols.max())
            if (right - left) < w * 0.4 or (bottom - top) < h * 0.4:
                return None
            return (left, top, right, bottom)
        except Exception as e:
            if self.verbose:
                print(f" [Bounds error] {e}")
            return None

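# Minimal usage sketch (hypothetical scan path):
#   page = cv2.imread("scanned_form.jpg")
#   left, top, right, bottom = FormBoundsDetector(verbose=True).detect(page)
#   form = page[top:bottom, left:right]
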
# ──────────────────────────────────────────────────────────────────────────────
# DYNAMIC FIELD EXTRACTOR
# Crops each field region relative to the detected form boundary.
# Works on any image size, DPI, scan margin, or slight rotation.
# ──────────────────────────────────────────────────────────────────────────────

class DynamicFieldExtractor:
    def __init__(self, form_type="birth", verbose=False):
        self.form_type = form_type.lower()
        self.field_map = FORM_FIELDS.get(self.form_type, BIRTH_FIELDS)
        self.detector = FormBoundsDetector(verbose=verbose)
        self.verbose = verbose
        self._last_bounds = None

    def _to_bgr(self, image):
        try:
            from PIL import Image as PILImage
            if isinstance(image, PILImage.Image):
                arr = np.array(image.convert("RGB"))
                return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        except ImportError:
            pass
        if isinstance(image, np.ndarray):
            if len(image.shape) == 2:
                return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
            if image.shape[2] == 4:
                return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
            return image
        raise TypeError(f"Unsupported image type: {type(image)}")

    def extract(self, image):
        """Returns {field_name: BGR numpy array}."""
        image = self._to_bgr(image)
        h, w = image.shape[:2]
        left, top, right, bottom = self.detector.detect(image)
        self._last_bounds = (left, top, right, bottom)
        form_w = right - left
        form_h = bottom - top
        if self.verbose:
            print(f" [Extract] Image={w}x{h} "
                  f" Form={form_w}x{form_h} @ ({left},{top})-({right},{bottom})")
        crops = {}
        for name, (rx1, ry1, rx2, ry2) in self.field_map.items():
            x1 = max(0, min(int(left + rx1 * form_w), w - 1))
            y1 = max(0, min(int(top + ry1 * form_h), h - 1))
            x2 = max(0, min(int(left + rx2 * form_w), w - 1))
            y2 = max(0, min(int(top + ry2 * form_h), h - 1))
            if x2 > x1 and y2 > y1:
                crops[name] = image[y1:y2, x1:x2]
        return crops

    def visualize(self, image, output_path=None):
        """Draw detected boundary + field boxes. Returns annotated BGR image."""
        image = self._to_bgr(image)
        vis = image.copy()
        h, w = vis.shape[:2]
        self.extract(image)
        left, top, right, bottom = self._last_bounds
        form_w = right - left
        form_h = bottom - top
        cv2.rectangle(vis, (left, top), (right, bottom), (0, 140, 255), 3)
        cv2.putText(vis, "DETECTED FORM BOUNDARY",
                    (left, max(0, top - 8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 140, 255), 1)
        for idx, (name, (rx1, ry1, rx2, ry2)) in enumerate(self.field_map.items()):
            x1 = max(0, min(int(left + rx1 * form_w), w - 1))
            y1 = max(0, min(int(top + ry1 * form_h), h - 1))
            x2 = max(0, min(int(left + rx2 * form_w), w - 1))
            y2 = max(0, min(int(top + ry2 * form_h), h - 1))
            c = COLOURS[idx % len(COLOURS)]
            cv2.rectangle(vis, (x1, y1), (x2, y2), c, 2)
            cv2.putText(vis, name[:22], (x1 + 2, max(0, y1 - 2)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, c, 1)
        if output_path:
            cv2.imwrite(str(output_path), vis)
            print(f" Field map saved -> {output_path}")
        return vis

# ──────────────────────────────────────────────────────────────────────────────
# FIELD NORMALIZER – prepares a BGR crop for CRNN inference
# ──────────────────────────────────────────────────────────────────────────────

class FieldNormalizer:
    def __init__(self, target_height=64, target_width=512):
        self.H = target_height
        self.W = target_width

    def _crop_to_text(self, gray):
        inv = cv2.bitwise_not(gray)
        _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
        coords = np.column_stack(np.where(thresh > 0))
        if len(coords) == 0:
            return gray
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)
        pad = max(4, int((y_max - y_min) * 0.15))
        y_min = max(0, y_min - pad)
        x_min = max(0, x_min - pad)
        y_max = min(gray.shape[0] - 1, y_max + pad)
        x_max = min(gray.shape[1] - 1, x_max + pad)
        return gray[y_min:y_max + 1, x_min:x_max + 1]

    def _smart_resize(self, gray):
        h, w = gray.shape
        if h == 0 or w == 0:
            return np.ones((self.H, self.W), dtype=np.uint8) * 255
        scale = self.H / h
        new_w = int(w * scale)
        new_h = self.H
        if new_w > self.W:
            scale = self.W / w
            new_h = int(h * scale)
            new_w = self.W
        resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
        canvas = np.ones((self.H, self.W), dtype=np.uint8) * 255
        y_off = (self.H - new_h) // 2
        x_off = (self.W - new_w) // 2
        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
        return canvas

    def _binarize(self, img):
        _, otsu = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        white_ratio = np.mean(otsu == 255)
        if white_ratio < 0.30 or white_ratio > 0.97:
            return cv2.adaptiveThreshold(
                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
        return otsu

    def normalize(self, crop) -> np.ndarray:
        """Accept BGR numpy array or PIL image, return normalized binary array."""
        try:
            from PIL import Image as PILImage
            if isinstance(crop, PILImage.Image):
                crop = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2BGR)
        except ImportError:
            pass
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = self._crop_to_text(gray)
        gray = self._smart_resize(gray)
        return self._binarize(gray)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        return torch.FloatTensor(
            img.astype(np.float32) / 255.0
        ).unsqueeze(0).unsqueeze(0)

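# Shape note (illustrative): normalize() yields a (64, 512) uint8 array for the
# default sizes; to_tensor() then scales it to [0, 1] and adds batch and channel
# dims via the two unsqueeze(0) calls, giving a (1, 1, 64, 512) float tensor.
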
# ──────────────────────────────────────────────────────────────────────────────
# CRNN MODEL LOADER
# ──────────────────────────────────────────────────────────────────────────────

def load_crnn_model(checkpoint_path: str, device: torch.device):
    sys.path.insert(0, str(Path(__file__).parent))
    from crnn_model import get_crnn_model

    print(f" Loading CRNN model from: {checkpoint_path}")
    c = torch.load(checkpoint_path, map_location=device, weights_only=False)
    config = c.get("config", {})
    idx_to_char = c["idx_to_char"]
    num_chars = c["model_state_dict"]["fc.weight"].shape[0]

    model = get_crnn_model(
        model_type=config.get("model_type", "standard"),
        img_height=config.get("img_height", 64),
        num_chars=num_chars,
        hidden_size=config.get("hidden_size", 128),
        num_lstm_layers=config.get("num_lstm_layers", 1),
    ).to(device)
    model.load_state_dict(c["model_state_dict"])
    model.eval()

    val_cer = c.get("val_cer", None)
    val_loss = c.get("val_loss", None)
    metric = f"val_cer={val_cer:.2f}%" if val_cer is not None else \
             f"val_loss={val_loss:.4f}" if val_loss is not None else "no metric"
    print(f" Model loaded | {metric} | chars={num_chars}")
    return model, idx_to_char, config.get("img_height", 64), config.get("img_width", 512)

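# Usage sketch (mirrors what main() does below):
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model, idx_to_char, img_h, img_w = load_crnn_model(DEFAULT_CHECKPOINT, device)
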
# ──────────────────────────────────────────────────────────────────────────────
# GREEDY CTC DECODE
# ──────────────────────────────────────────────────────────────────────────────

def greedy_decode(outputs: torch.Tensor, idx_to_char: dict) -> str:
    pred_indices = torch.argmax(outputs, dim=2).permute(1, 0)
    chars, prev = [], -1
    for idx in pred_indices[0]:
        idx = idx.item()
        if idx != 0 and idx != prev and idx in idx_to_char:
            chars.append(idx_to_char[idx])
        prev = idx
    return "".join(chars)

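# Decoding example (assumed toy vocab {1: 'A', 2: 'N'}, blank = 0): a
# frame-wise argmax sequence [1, 1, 0, 2, 2, 0, 1] collapses repeats and
# drops blanks, so greedy_decode returns "ANA".
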
# ──────────────────────────────────────────────────────────────────────────────
# PDF → PIL IMAGE
# ──────────────────────────────────────────────────────────────────────────────

def pdf_to_image(pdf_path: str, dpi: int = 200):
    from pdf2image import convert_from_path
    # Resolve to absolute path – fixes "Unable to get page count" on Windows
    pdf_path = str(Path(pdf_path).resolve())
    kwargs = {"dpi": dpi, "first_page": 1, "last_page": 1}
    if POPPLER_PATH:
        kwargs["poppler_path"] = str(Path(POPPLER_PATH).resolve())
    return convert_from_path(pdf_path, **kwargs)[0]

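# Usage sketch (hypothetical path): pdf_to_image("scan.pdf", dpi=300) renders
# only page 1, honouring POPPLER_PATH when it is set.
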
# ──────────────────────────────────────────────────────────────────────────────
# CRNN OCR – runs on extracted field crops
# ──────────────────────────────────────────────────────────────────────────────

def run_crnn_ocr(crops: dict, model, idx_to_char: dict,
                 img_h: int, img_w: int, device: torch.device) -> dict:
    normalizer = FieldNormalizer(target_height=img_h, target_width=img_w)
    results = {}
    with torch.no_grad():
        for name, crop in crops.items():
            try:
                norm = normalizer.normalize(crop)
                tensor = normalizer.to_tensor(norm).to(device)
                text = greedy_decode(model(tensor).cpu(), idx_to_char)
                results[name] = text
            except Exception as e:
                results[name] = f"[ERROR: {e}]"
    return results

# ──────────────────────────────────────────────────────────────────────────────
# CONVENIENCE WRAPPER – for other scripts that import this module
# ──────────────────────────────────────────────────────────────────────────────

def extract_field_images(image, form_type="birth", verbose=False):
    """Extract field crops using dynamic boundary detection.

    Parameters
    ----------
    image : PIL Image or BGR numpy array
    form_type : str  'birth' | 'death' | 'marriage' | 'marriage_license'
    verbose : bool

    Returns
    -------
    dict {field_name: BGR numpy array}
    """
    return DynamicFieldExtractor(form_type=form_type, verbose=verbose).extract(image)


# Keep old name as alias so any existing code doesn't break
extract_field_images_dynamic = extract_field_images

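# Usage sketch for the wrapper (assumes a scanned image "scan.jpg" exists and
# that its "province" box detects cleanly):
#   crops = extract_field_images(cv2.imread("scan.jpg"), form_type="death")
#   print(sorted(crops))                             # field names from DEATH_FIELDS
#   cv2.imwrite("province.png", crops["province"])
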
# ──────────────────────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="PH Civil Registry Field Extractor – Dynamic CRNN OCR")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pdf", help="Path to scanned PDF")
    group.add_argument("--image", help="Path to scanned image (JPG/PNG)")
    parser.add_argument("--form", required=True,
                        choices=["birth", "death", "marriage", "marriage_license"])
    parser.add_argument("--checkpoint", default=DEFAULT_CHECKPOINT)
    parser.add_argument("--visualize", action="store_true",
                        help="Save annotated field-map image")
    parser.add_argument("--output", default=None,
                        help="Save extracted fields to JSON")
    parser.add_argument("--poppler", default=None,
                        help="Override Poppler bin path (overrides .env)")
    parser.add_argument("--dpi", type=int, default=200)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    global POPPLER_PATH
    if args.poppler:
        POPPLER_PATH = args.poppler

    form_labels = {
        "birth": "Form 102 – Certificate of Live Birth",
        "death": "Form 103 – Certificate of Death",
        "marriage": "Form 97 – Certificate of Marriage",
        "marriage_license": "Form 90 – Application for Marriage License",
    }
    input_file = args.pdf or args.image

    print("\nPhilippine Civil Registry OCR – Dynamic Field Extractor")
    print("=" * 65)
    print(f" Form : {form_labels[args.form]}")
    print(f" File : {input_file}")
    print(f" Checkpoint : {args.checkpoint}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device : {device}\n")

    if not os.path.exists(args.checkpoint):
        print(f"ERROR: Checkpoint not found: {args.checkpoint}")
        sys.exit(1)

    model, idx_to_char, img_h, img_w = load_crnn_model(args.checkpoint, device)

    # Load image
    if args.pdf:
        print(f" Converting PDF to image at {args.dpi} DPI...")
        try:
            pil_img = pdf_to_image(args.pdf, dpi=args.dpi)
            page_image = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
        except Exception as e:
            print(f"\nERROR converting PDF: {e}")
            print("Fix: add POPPLER_PATH=C:\\...\\poppler\\Library\\bin to your .env file")
            sys.exit(1)
    else:
        page_image = cv2.imread(args.image)
        if page_image is None:
            print(f"ERROR: Could not load image: {args.image}")
            sys.exit(1)

    h, w = page_image.shape[:2]
    print(f" Page size : {w} x {h} px")

    extractor = DynamicFieldExtractor(form_type=args.form, verbose=args.verbose)

    if args.visualize:
        stem = Path(input_file).stem
        out_path = stem + "_field_map.jpg"
        extractor.visualize(page_image, output_path=out_path)
        print(f" Field map saved -> {out_path}")

    print("\n Detecting form boundary and extracting fields...")
    crops = extractor.extract(page_image)
    print(f" {len(crops)} field crops extracted")

    print(f"\n Running CRNN OCR on {len(crops)} fields...")
    results = run_crnn_ocr(crops, model, idx_to_char, img_h, img_w, device)

    print(f"\n{'─'*65}")
    print(f" {'FIELD':<42} TEXT")
    print(f"{'─'*65}")
    for name, text in results.items():
        print(f" {name:<42} {text if text.strip() else '(empty)'}")
    print(f"{'─'*65}")
    print(f"\n Fields recognized : {sum(1 for t in results.values() if t.strip())} / {len(results)}")

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump({"form": form_labels[args.form], "file": input_file,
                       "fields": results}, f, ensure_ascii=False, indent=2)
        print(f"\n Results saved -> {args.output}")
    print()


if __name__ == "__main__":
    main()
CRNN+CTC/finetune.py
ADDED
@@ -0,0 +1,202 @@
"""
finetune.py
===========
Fine-tune CRNN+CTC on generated civil registry form crops.

Loads best_model_final.pth (pretrained), continues training on
actual_annotations.json + train_annotations.json.

Usage:
    python finetune.py

Output:
    checkpoints/best_model_v2.pth
"""

import os
import sys
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset

sys.path.append('.')
from crnn_model import get_crnn_model
from dataset import CivilRegistryDataset, collate_fn

# ── Config ────────────────────────────────────────────────────
CHECKPOINT_IN = "checkpoints/best_model_final.pth"
CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"

ACTUAL_ANN = "data/actual_annotations.json"   # real scanned forms
SYNTH_ANN = "data/train_annotations.json"     # synthetic / train split
VAL_ANN = "data/val_annotations.json"         # validation set

IMG_HEIGHT = 64
IMG_WIDTH = 512
BATCH_SIZE = 32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── Phase settings ────────────────────────────────────────────
PHASES = [
    # (name, epochs, lr, freeze_cnn, patience)
    ("Phase 1 – CNN frozen, adapt to form crops", 20, 1e-4, True, 5),
    ("Phase 2 – Full model, low LR polish", 15, 1e-5, False, 4),
]

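# Freezing sketch: with freeze_cnn=True, any parameter whose name contains
# "cnn" (e.g. a hypothetical "cnn.0.weight") gets requires_grad=False in the
# loop inside main(), so Phase 1 trains only the recurrent/FC head (assuming
# the CNN parameters are named that way); Phase 2 then unfreezes everything
# and polishes at a 10x lower learning rate.
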
# ── Main ──────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print(" Fine-tuning CRNN+CTC on civil registry form crops")
    print("=" * 60)
    print(f" Device : {DEVICE}")
    print(f" Checkpoint : {CHECKPOINT_IN}")

    # ── Check required files ──────────────────────────────────
    for f in [CHECKPOINT_IN, VAL_ANN]:
        if not os.path.exists(f):
            print(f"ERROR: {f} not found.")
            sys.exit(1)

    # ── Datasets ──────────────────────────────────────────────
    datasets_to_merge = []

    # 1. Actual scanned forms (highest priority – real data)
    if os.path.exists(ACTUAL_ANN):
        actual_dataset = CivilRegistryDataset(
            data_dir=".", annotations_file=ACTUAL_ANN,
            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
        )
        datasets_to_merge.append(actual_dataset)
        print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
    else:
        print(f" [!] {ACTUAL_ANN} not found – run extract_actual_data.py first")

    # 2. Fully synthetic – keep so model doesn't forget basic characters
    if os.path.exists(SYNTH_ANN):
        synth_dataset = CivilRegistryDataset(
            data_dir="data/train", annotations_file=SYNTH_ANN,
            img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
        )
        datasets_to_merge.append(synth_dataset)
        print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")

    if not datasets_to_merge:
        print("ERROR: No training data found. Run extract_actual_data.py first.")
        sys.exit(1)

    val_dataset = CivilRegistryDataset(
        data_dir="data/val", annotations_file=VAL_ANN,
        img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
    )

    train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
    print(f" Total train : {len(train_dataset)}")
    print(f" Val : {len(val_dataset)}")

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=0, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False, num_workers=0, collate_fn=collate_fn)

    # ── Load checkpoint ───────────────────────────────────────
    print(f"\n Loading {CHECKPOINT_IN}...")
    ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
    config = ckpt.get('config', {})

    ref_dataset = datasets_to_merge[0]
    model = get_crnn_model(
        model_type=config.get('model_type', 'standard'),
        img_height=config.get('img_height', 64),
        num_chars=ref_dataset.num_chars,
        hidden_size=config.get('hidden_size', 128),
        num_lstm_layers=config.get('num_lstm_layers', 1),
    ).to(DEVICE)

    missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
    if missing:
        print(f" Note: {len(missing)} layers re-initialized (expected if vocab size changed)")
    print(f" Loaded epoch {ckpt.get('epoch','?')} "
          f"val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f}")

    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
    os.makedirs("checkpoints", exist_ok=True)

    # ── Train/val loop ────────────────────────────────────────
    def run_epoch(loader, training, optimizer=None):
        model.train() if training else model.eval()
        total, n = 0, 0
        ctx = torch.enable_grad() if training else torch.no_grad()
        with ctx:
            for images, targets, target_lengths, _ in loader:
                images = images.to(DEVICE)
                batch_size = images.size(0)
                if training:
                    optimizer.zero_grad()
                outputs = F.log_softmax(model(images), dim=2)
                seq_len = outputs.size(0)
                input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
                loss = criterion(outputs, targets, input_lengths, target_lengths)
                if not torch.isnan(loss) and not torch.isinf(loss):
                    if training:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
                        optimizer.step()
                    total += loss.item()
                    n += 1
        return total / max(n, 1)

    best_overall = float('inf')

    for phase_name, epochs, lr, freeze_cnn, patience in PHASES:
        print(f"\n{'='*60}")
        print(f" {phase_name} LR={lr}")
        print(f"{'='*60}")

        for name, param in model.named_parameters():
            param.requires_grad = not (freeze_cnn and 'cnn' in name)

        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f" Trainable params : {trainable:,}")

        opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, factor=0.5)
        best = float('inf')
        wait = 0

        for epoch in range(1, epochs + 1):
            tr = run_epoch(train_loader, True, opt)
            vl = run_epoch(val_loader, False, None)
            sched.step(vl)

            if vl < best:
                best = vl
                wait = 0
                if vl < best_overall:
                    best_overall = vl
                    torch.save({
                        'model_state_dict': model.state_dict(),
                        'config': config,
                        'char_to_idx': ref_dataset.char_to_idx,
                        'idx_to_char': ref_dataset.idx_to_char,
                        'epoch': epoch,
                        'val_loss': vl,
                    }, CHECKPOINT_OUT)
                print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f} <- saved")
            else:
                wait += 1
                print(f" Epoch {epoch:02d}/{epochs} Train={tr:.4f} Val={vl:.4f} (patience {wait}/{patience})")
                if wait >= patience:
                    print(" Early stopping.")
                    break

    print(f"\n{'='*60}")
    print(" Fine-tuning complete!")
    print(f" Best val loss : {best_overall:.4f}")
    print(f" Saved : {CHECKPOINT_OUT}")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()
CRNN+CTC/generate_ph_names.py
ADDED
@@ -0,0 +1,350 @@
"""
generate_ph_names.py
====================
Run this file ONCE to extract Filipino names from the
names-dataset library and save them to data/ph_names.json.

Install first:
    pip install names-dataset

Usage:
    python generate_ph_names.py

Output:
    data/ph_names.json   <-- used by fix_data.py every run
"""

import json
import os

print("=" * 60)
print(" Filipino Name Extractor | names-dataset (PyPI)")
print("=" * 60)

# ── Step 1: Load NameDataset ──────────────────────────────────
print("\n[1/5] Loading NameDataset...")
print(" (This takes 30-60 seconds and needs ~3.2 GB RAM)")

try:
    from names_dataset import NameDataset
    nd = NameDataset()
    print(" OK - Dataset loaded!")
except ImportError:
    print("\n ERROR: names-dataset is not installed.")
    print(" Fix: pip install names-dataset")
    raise SystemExit(1)  # safer than bare exit(), which isn't guaranteed outside interactive runs
except MemoryError:
    print("\n ERROR: Not enough RAM. Need ~3.2 GB free.")
    raise SystemExit(1)

# ── Step 2: Extract Filipino FIRST names ──────────────────────
print("\n[2/5] Extracting Filipino first names (Male + Female)...")

ph_male = nd.get_top_names(n=300, gender='Male', country_alpha2='PH')
ph_female = nd.get_top_names(n=300, gender='Female', country_alpha2='PH')

# API returns: { 'PH': { 'M': [...] } }
male_first = ph_male.get('PH', {}).get('M', [])
female_first = ph_female.get('PH', {}).get('F', [])
all_first = male_first + female_first

print(f" Male first names : {len(male_first)}")
print(f" Female first names : {len(female_first)}")
print(f" Total first names : {len(all_first)}")
print(f" Sample (male) : {male_first[:5]}")
print(f" Sample (female) : {female_first[:5]}")

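# Illustrative call/return shape (names hypothetical), matching the comment
# above:
#   nd.get_top_names(n=2, gender='Male', country_alpha2='PH')
#   -> {'PH': {'M': ['Jose', 'Juan']}}
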
# ── Step 3: Extract Filipino LAST names ───────────────────────
print("\n[3/5] Extracting Filipino last names...")

ph_last_raw = nd.get_top_names(n=300, country_alpha2='PH', use_first_names=False)
print(f" Raw last name API type : {type(ph_last_raw)}")

ph_last_ph = ph_last_raw.get('PH', {})
print(f" PH entry type : {type(ph_last_ph)}")

raw_last = []

if isinstance(ph_last_ph, list):
    raw_last = ph_last_ph
elif isinstance(ph_last_ph, dict):
    first_val = next(iter(ph_last_ph.values()), None)
    if isinstance(first_val, list):
        for lst in ph_last_ph.values():
            raw_last.extend(lst)
    elif isinstance(first_val, dict):
        raw_last = list(ph_last_ph.keys())
    else:
        raw_last = list(ph_last_ph.keys())

# Deduplicate while preserving order
seen = set()
all_last = []
for name in raw_last:
    if isinstance(name, str) and name not in seen:
        seen.add(name)
        all_last.append(name)

print(f" Total last names : {len(all_last)}")
print(f" Sample : {all_last[:5]}")

if len(all_last) == 0:
    print("\n WARNING: Could not extract last names from API.")
    print(" Using common Filipino last names as fallback...")
    all_last = [
        'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo',
        'Garcia', 'Mendoza', 'Torres', 'Flores', 'Aquino',
        'Dela Cruz', 'Del Rosario', 'San Jose', 'De Guzman',
        'Villanueva', 'Gonzales', 'Ramos', 'Diaz', 'Castro',
        'Morales', 'Ortega', 'Gutierrez', 'Lopez', 'Ramirez',
        'Navarro', 'Aguilar', 'Espinosa', 'Mercado', 'Tolentino',
        'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong', 'Co',
        'Macaraeg', 'Macapagal', 'Magsaysay', 'Magno',
        'Pascual', 'Buenaventura', 'Concepcion', 'Resurreccion',
        'Ilagan', 'Manalo', 'Soriano', 'Evangelista', 'Salazar',
    ]
    print(f" Fallback last names: {len(all_last)}")

# ── Step 4: Build MIDDLE names pool ───────────────────────────
# Middle names in Filipino naming convention are the mother's
# maiden last name. We build a large pool by combining:
#   A) The last names pool already extracted (primary source)
#   B) A curated extended list of common Filipino surnames
#      used specifically as middle names
print("\n[4/5] Building middle names pool...")

EXTENDED_MIDDLE_NAMES = [
    # Common Filipino surnames used as middle names
    'Abad', 'Abaya', 'Abella', 'Ablaza', 'Abrera',
    'Acosta', 'Adriano', 'Afable', 'Africa', 'Agcaoili',
    'Agno', 'Agpalo', 'Aguinaldo', 'Agustin', 'Ahorro',
    'Alano', 'Alba', 'Albano', 'Alberto', 'Alcantara',
    'Alcazar', 'Alcon', 'Aldana', 'Alegre', 'Alejandro',
    'Aligaen', 'Alim', 'Alinea', 'Alipio', 'Almario',
    'Almeda', 'Almendras', 'Alminiana', 'Almodiel', 'Alonto',
    'Alvarado', 'Alvarez', 'Amante', 'Amaro', 'Ambrocio',
    'Amor', 'Amores', 'Amparo', 'Anastacio', 'Andal',
    'Andaya', 'Angeles', 'Angsioco', 'Antiporda', 'Antonio',
    'Apalisok', 'Apolinario', 'Apostol', 'Aquino', 'Araneta',
    'Aranas', 'Aranda', 'Arceo', 'Arenas', 'Arias',
    'Ariate', 'Arillo', 'Arimado', 'Arjona', 'Arlante',
    'Arnaldo', 'Arnaiz', 'Arnoco', 'Arocena', 'Arroyo',
    'Asejo', 'Asuncion', 'Austria', 'Avecilla', 'Avena',
    'Avila', 'Avinante', 'Ayala', 'Azucena', 'Azul',
    'Bacani', 'Bacunawa', 'Baguio', 'Bagunu', 'Balagtas',
    'Balangue', 'Balbin', 'Balde', 'Baldeo', 'Balgos',
    'Balili', 'Balinas', 'Balitaan', 'Balladares', 'Ballesteros',
    'Balmeo', 'Balmores', 'Banaag', 'Banaag', 'Bandola',
    'Bangayan', 'Bansil', 'Bansode', 'Bantigue', 'Bantug',
    'Barbin', 'Barcenas', 'Bareng', 'Barrion', 'Barroga',
    'Bartolome', 'Bases', 'Batac', 'Bataller', 'Batanes',
    'Batungbakal', 'Bautista', 'Bayani', 'Bayot', 'Baysic',
    'Belarmino', 'Beldia', 'Belen', 'Belgica', 'Bello',
    'Benavides', 'Bendaña', 'Benedicto', 'Benigno', 'Benitez',
    'Bernardino', 'Bernardo', 'Bernarte', 'Besares', 'Billones',
    'Binay', 'Binayas', 'Biscocho', 'Blanco', 'Bondoc',
    'Borja', 'Borromeo', 'Bravo', 'Buenaobra', 'Buenaflor',
    'Buenafe', 'Buenaseda', 'Buenconsejo', 'Buendia', 'Bugarin',
    'Bulalacao', 'Bulalacao', 'Bulatao', 'Bumanlag', 'Bunag',
    'Caballero', 'Cabigting', 'Cabral', 'Cabreros', 'Cacal',
    'Cagampan', 'Cagas', 'Caguioa', 'Cahilig', 'Cajucom',
    'Calagos', 'Calamba', 'Calasanz', 'Calatrava', 'Calderon',
    'Calimag', 'Calimutan', 'Calinawan', 'Calleja', 'Callejo',
    'Caluag', 'Calugay', 'Camacho', 'Camino', 'Campaner',
    'Camposano', 'Candelario', 'Canete', 'Caning', 'Canlas',
    'Caoile', 'Capili', 'Carandang', 'Carbonell', 'Cariaga',
    'Carino', 'Carunungan', 'Casaje', 'Casas', 'Casidsid',
    'Castañeda', 'Castillo', 'Castillo', 'Catalan', 'Catapang',
    'Cayabyab', 'Cayco', 'Celdran', 'Cerillo', 'Cervantes',
    'Chico', 'Chikiamco', 'Chiongbian', 'Cipriano', 'Clarin',
    'Claudio', 'Clavecillas', 'Climaco', 'Cobankiat', 'Colambo',
    'Collado', 'Comafay', 'Comia', 'Concepcion', 'Condino',
    'Consing', 'Contraras', 'Coquia', 'Cordero', 'Corotan',
    'Corpus', 'Cosico', 'Costales', 'Crisostomo', 'Cristobal',
    'Cueto', 'Culala', 'Cunanan', 'Cunanon', 'Curato',
    'Dadivas', 'Daep', 'Daez', 'Daguplo', 'Dalida',
    'Dalisay', 'Dalmacion', 'Dalusong', 'Damasco', 'Damo',
    'Danao', 'Dancel', 'Dandan', 'Danila', 'Daquigan',
    'Dario', 'Datoc', 'Datumanong', 'David', 'Dayao',
    'Dayrit', 'De Borja', 'De Castro', 'De Jesus', 'De Jose',
    'De La Cruz', 'De La Pena', 'De La Rosa', 'De Leon', 'De Lima',
    'De Los Angeles', 'De Los Reyes', 'De Los Santos', 'De Luna', 'De Mesa',
    'De Ocampo', 'De Paz', 'De Vera', 'De Villa', 'Delos Reyes',
    'Demaisip', 'Delos Santos', 'Demillo', 'Demonteverde', 'Denosta',
    'Derequito', 'Deri', 'Detablan', 'Deveraturda', 'Diaz',
    'Dichoso', 'Diego', 'Diesto', 'Dimaano', 'Dimabuyu',
    'Dimagiba', 'Dimaguila', 'Dimaio', 'Dimanlig', 'Dimayuga',
    'Dingal', 'Dinglasan', 'Dionisio', 'Dioquino', 'Ditan',
    'Diwata', 'Domingo', 'Dominguez', 'Donato', 'Dorado',
    'Doria', 'Duallo', 'Duenas', 'Duerme', 'Dulay',
    'Dumalaog', 'Dumpit', 'Duque', 'Duran', 'Durante',
    'Ebdane', 'Echavez', 'Echevarria', 'Edralin', 'Ejercito',
    'Elago', 'Elazegui', 'Elises', 'Elumba', 'Enage',
    'Encarnacion', 'Enriquez', 'Escobar', 'Escueta', 'Escutin',
    'Esguerra', 'Eslit', 'Espejo', 'Espeleta', 'Espinas',
    'Espino', 'Espiritu', 'Estepa', 'Esteves', 'Estrada',
    'Estrellas', 'Evangelista', 'Evasco', 'Evidente', 'Eyas',
    'Fabella', 'Fabros', 'Faelnar', 'Fajardo', 'Fajutag',
    'Famadico', 'Famador', 'Faustino', 'Favila', 'Feliciano',
    'Felipe', 'Fermin', 'Fernandez', 'Fernando', 'Ferrer',
    'Figueras', 'Fider', 'Florendo', 'Florentino', 'Floreta',
    'Flores', 'Florido', 'Floriza', 'Foja', 'Fonacier',
    'Fontanilla', 'Formoso', 'Fornier', 'Fortich', 'Fortuna',
    'Francisco', 'Frano', 'Frasco', 'Frias', 'Fuentes',
    'Gaabucayan', 'Gabutero', 'Gaerlan', 'Gaffud', 'Galapon',
    'Galera', 'Galicia', 'Galindez', 'Gallardo', 'Gallo',
    'Galvez', 'Gamalinda', 'Gamboa', 'Gammad', 'Gandionco',
    'Ganzon', 'Garado', 'Garayblas', 'Garcia', 'Garduce',
    'Garrido', 'Gatdula', 'Gatmaitan', 'Gatus', 'Gawat',
    'Gelera', 'Gelua', 'Gemora', 'Genato', 'Generoso',
    'Gequillana', 'Gerona', 'Gerundio', 'Gianan', 'Gimenez',
    'Gloria', 'Glorioso', 'Glova', 'Golez', 'Gomez',
    'Gonzaga', 'Gonzales', 'Gordoncillo', 'Gorre', 'Grafilo',
    'Gregorio', 'Griño', 'Guanzon', 'Guerrero', 'Guevara',
    'Guiao', 'Guillen', 'Guinto', 'Guison', 'Gullas',
    'Gutierrez', 'Guzman', 'Hernandez', 'Herrera', 'Hizon',
    'Honasan', 'Hontiveros', 'Horca', 'Hufana', 'Humilde',
    'Ibañez', 'Ignacio', 'Ilustre', 'Imbong', 'Imperial',
    'Infante', 'Inion', 'Inocentes', 'Inso', 'Iringan',
    'Jacinto', 'Javier', 'Jimenez', 'Jose', 'Joson',
    'Juan', 'Juico', 'Jurado', 'Kabigting', 'Kalaw',
    'Kho', 'Lacaba', 'Lacadin', 'Lacson', 'Ladesma',
    'Laderas', 'Lagman', 'Lagua', 'Laguna', 'Lainez',
    'Lajarca', 'Lamayo', 'Lambino', 'Lapid', 'Lapuz',
    'Lara', 'Largo', 'Lariza', 'Larizal', 'Laserna',
    'Latorre', 'Laurel', 'Laurente', 'Lazaro', 'Leano',
    'Legarda', 'Leonor', 'Leynes', 'Libunao', 'Licup',
    'Lim', 'Limkaichong', 'Limpag', 'Liwanag', 'Llanes',
    'Llamado', 'Llaneta', 'Locsin', 'Logarta', 'Lopez',
    'Lorenzo', 'Lorilla', 'Lozada', 'Lucero', 'Luistro',
    'Luna', 'Luneta', 'Luzon', 'Macalintal', 'Macam',
    'Maceda', 'Madera', 'Madrazo', 'Magtanggol', 'Malabanan',
    'Malacaman', 'Malajacan', 'Malanyaon', 'Malaya', 'Malbas',
    'Malcampo', 'Maldia', 'Maligalig', 'Malinao', 'Malonzo',
    'Mangahas', 'Mangubat', 'Manigbas', 'Manila', 'Manlangit',
    'Manlapaz', 'Manlongat', 'Manrique', 'Mansalay', 'Mante',
    'Manuel', 'Manzano', 'Marcelo', 'Marcos', 'Mariano',
    'Maristela', 'Marquez', 'Maravilla', 'Masangkay', 'Masapol',
    'Mateo', 'Matienzo', 'Matining', 'Matugas', 'Maula',
    'Maulion', 'Mayuga', 'Medina', 'Mejia', 'Melchor',
    'Melo', 'Menor', 'Mercado', 'Mesina', 'Miguel',
    'Miralles', 'Miranda', 'Molano', 'Molina', 'Mondejar',
    'Monreal', 'Montano', 'Montenegro', 'Montero', 'Montes',
    'Montesa', 'Montoya', 'Moraga', 'Moraleda', 'Moreno',
    'Morial', 'Muncal', 'Muñoz', 'Murillo', 'Musni',
    'Nacion', 'Nadal', 'Nagrampa', 'Nalzaro', 'Napeñas',
    'Narciso', 'Natividad', 'Navales', 'Navarro', 'Neri',
    'Nicolas', 'Nisperos', 'Nolasco', 'Noynay', 'Nuñez',
    'Oaminal', 'Ocampo', 'Ocfemia', 'Ochoa', 'Olaguera',
    'Olano', 'Oliva', 'Olivares', 'Oliveros', 'Olpindo',
    'Omadto', 'Ombion', 'Onate', 'Ong', 'Orbeta',
    'Orbita', 'Ordoño', 'Orendain', 'Orense', 'Orobia',
    'Orozco', 'Ortega', 'Osmeña', 'Osorio', 'Ostrea',
    'Ouano', 'Pabiton', 'Pableo', 'Pabriaga', 'Pacanan',
    'Padayao', 'Padilla', 'Padua', 'Paguio', 'Pagulayan',
    'Palad', 'Palacios', 'Palafox', 'Palaganas', 'Palattao',
    'Palencia', 'Palma', 'Palo', 'Paloma', 'Palomares',
    'Pamaran', 'Pamintuan', 'Panaligan', 'Panganiban', 'Pangilinan',
    'Panopio', 'Papa', 'Paqueo', 'Paras', 'Paredes',
    'Parreño', 'Pascua', 'Pascual', 'Pastor', 'Paterno',
    'Patron', 'Pavia', 'Pecaña', 'Pecho', 'Pedrosa',
    'Pelayo', 'Peña', 'Peñaflor', 'Peñaranda', 'Penarroyo',
    'Peralta', 'Perez', 'Perlas', 'Pernia', 'Pesquera',
    'Pestano', 'Piccio', 'Picardal', 'Pineda', 'Pimentel',
    'Pilapil', 'Pili', 'Piliin', 'Pillar', 'Pilorin',
    'Poblete', 'Poliquit', 'Ponce', 'Ponferrada', 'Porras',
    'Prado', 'Prieto', 'Prodigalidad', 'Prudente', 'Punsalan',
    'Quezon', 'Quiambao', 'Quiaoit', 'Quijano', 'Quimpo',
    'Quinit', 'Quinones', 'Quiogue', 'Quirino', 'Quisao',
    'Racelis', 'Rada', 'Ramirez', 'Ramon', 'Ramos',
    'Ravalo', 'Rayala', 'Razon', 'Recinto', 'Recometa',
    'Reforma', 'Regalado', 'Reganit', 'Regio', 'Regidor',
    'Regis', 'Reodica', 'Respicio', 'Revilla', 'Reyes',
    'Ricafort', 'Ricalde', 'Ridad', 'Rillo', 'Rivera',
    'Rivero', 'Rizal', 'Robles', 'Roca', 'Rocamora',
    'Rocero', 'Rodriguez', 'Rojas', 'Romero', 'Ronquillo',
    'Rosales', 'Rosario', 'Rosete', 'Rotor', 'Roxas',
    'Rubio', 'Rufino', 'Ruiz', 'Sabal', 'Sabando',
    'Sabido', 'Sabijon', 'Sabio', 'Saceda', 'Saclolo',
    'Sagum', 'Salceda', 'Salcedo', 'Salgado', 'Salinas',
    'Saludar', 'Saluta', 'Salvador', 'Sambrano', 'Samson',
    'Sanchez', 'Sandoval', 'Sangalang', 'Santiago', 'Santillan',
    'Sanz', 'Sarino', 'Sarmiento', 'Sarona', 'Savellano',
    'Sebastian', 'Segovia', 'Sendin', 'Seneres', 'Serafica',
    'Sereno', 'Senga', 'Serrano', 'Sierra', 'Sigua',
    'Silva', 'Silvestre', 'Simon', 'Sinco', 'Singson',
    'Siy', 'Sobejana', 'Soberano', 'Socrates', 'Soliman',
    'Solis', 'Soliven', 'Solomon', 'Sotto', 'Suansing',
    'Suarez', 'Subido', 'Sulit', 'Sultan', 'Sumagaysay',
    'Sunga', 'Tabamo', 'Tabinas', 'Tabuena', 'Tagle',
    'Taguba', 'Tajonera', 'Talabong', 'Talavera', 'Talento',
    'Taleon', 'Talosig', 'Tamano', 'Tambalo', 'Tanada',
    'Tandoc', 'Tañada', 'Tarriela', 'Tating', 'Tautho',
    'Tayag', 'Tayco', 'Tecson', 'Tejano', 'Tejero',
    'Teodoro', 'Tibay', 'Tigas', 'Tiglao', 'Timbol',
    'Tingzon', 'Tiongco', 'Tiongson', 'Tirol', 'Tobias',
    'Toledo', 'Tolentino', 'Tomelden', 'Tomas', 'Tomaro',
    'Tomaroy', 'Torino', 'Torralba', 'Torrente', 'Torno',
    'Trea', 'Trinidad', 'Tuazon', 'Tubig', 'Tubigan',
    'Tugade', 'Tumbocon', 'Tupas', 'Tuquero', 'Turla',
    'Umagat', 'Umali', 'Usman', 'Uson', 'Uy',
    'Valdez', 'Valencia', 'Valenciano', 'Valentin', 'Valera',
    'Valiao', 'Varela', 'Vargas', 'Vasquez', 'Velarde',
    'Velasco', 'Velasquez', 'Velez', 'Vera', 'Vergara',
    'Vibandor', 'Vicente', 'Victorino', 'Vidal', 'Viernes',
    'Villacorta', 'Villaflor', 'Villafranca', 'Villagomez', 'Villagonzalo',
    'Villanueva', 'Villar', 'Villareal', 'Villaruel', 'Villaverde',
    'Villena', 'Virata', 'Vista', 'Vivar', 'Vizconde',
    'Yabes', 'Yap', 'Yasay', 'Yatco', 'Ylagan',
    'Yñiguez', 'Yorac', 'Yulo', 'Zabala', 'Zaldivar',
    'Zamora', 'Zapanta', 'Zaragoza', 'Zosa', 'Zulueta',
]

# Combine last names pool + extended middle names, deduplicated
|
| 304 |
+
middle_seen = set()
|
| 305 |
+
all_middle = []
|
| 306 |
+
for name in (all_last + EXTENDED_MIDDLE_NAMES):
|
| 307 |
+
if isinstance(name, str) and name not in middle_seen:
|
| 308 |
+
middle_seen.add(name)
|
| 309 |
+
all_middle.append(name)
|
| 310 |
+
|
| 311 |
+
print(f" Total middle names : {len(all_middle)}")
|
| 312 |
+
print(f" Sample : {all_middle[:5]}")
|
| 313 |
+
|
| 314 |
+
# ββ Step 5: Save to JSON ββββββββββββββββββββββββββββββββββββββ
|
| 315 |
+
print("\n[5/5] Saving to data/ph_names.json ...")
|
| 316 |
+
|
| 317 |
+
os.makedirs('data', exist_ok=True)
|
| 318 |
+
|
| 319 |
+
output = {
|
| 320 |
+
"first_names": {
|
| 321 |
+
"male": male_first,
|
| 322 |
+
"female": female_first,
|
| 323 |
+
"all": all_first
|
| 324 |
+
},
|
| 325 |
+
"last_names": all_last,
|
| 326 |
+
"middle_names": all_middle,
|
| 327 |
+
"metadata": {
|
| 328 |
+
"source": "names-dataset (PyPI) -- country_alpha2='PH'",
|
| 329 |
+
"total_first": len(all_first),
|
| 330 |
+
"total_last": len(all_last),
|
| 331 |
+
"total_middle": len(all_middle),
|
| 332 |
+
"total_name_combos": len(all_first) * len(all_middle) * len(all_last),
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
with open('data/ph_names.json', 'w', encoding='utf-8') as f:
|
| 337 |
+
json.dump(output, f, indent=2, ensure_ascii=False)
|
| 338 |
+
|
| 339 |
+
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 340 |
+
print("\n" + "=" * 60)
|
| 341 |
+
print(" DONE!")
|
| 342 |
+
print("=" * 60)
|
| 343 |
+
print(f" Male first names : {len(male_first)}")
|
| 344 |
+
print(f" Female first names : {len(female_first)}")
|
| 345 |
+
print(f" Last names : {len(all_last)}")
|
| 346 |
+
print(f" Middle names : {len(all_middle)}")
|
| 347 |
+
print(f" Possible 3-part name combos : {len(all_first) * len(all_middle) * len(all_last):,}")
|
| 348 |
+
print(f"\n Saved to: data/ph_names.json")
|
| 349 |
+
print(f"\n Next step: python fix_data.py")
|
| 350 |
+
print("=" * 60)
|
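For downstream scripts, a minimal consumer sketch of the JSON written above; the schema is exactly the `output` dict, but the sampling helper itself is illustrative and not part of the repo:

import json
import random

with open('data/ph_names.json', encoding='utf-8') as f:
    names = json.load(f)

def random_full_name(rng=random):
    # Draw one FIRST MIDDLE LAST combo from the saved pools.
    return " ".join([
        rng.choice(names['first_names']['all']),
        rng.choice(names['middle_names']),
        rng.choice(names['last_names']),
    ])

print(random_full_name())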
CRNN+CTC/inference.py
ADDED
@@ -0,0 +1,395 @@
"""
Inference Script for CRNN+CTC Civil Registry OCR

TWO NORMALIZERS:
    SimpleNormalizer   → for PIL-rendered synthetic images (matches training exactly)
    AdaptiveNormalizer → for physical/scanned images (any zoom, any size)

AUTO-DETECT MODE: automatically decides which pipeline to use based on
text density in the image → zoomed-in images get adaptive treatment,
clean synthetic images get simple treatment.
"""

import torch
import cv2
import numpy as np
from pathlib import Path
from typing import Dict, List

from crnn_model import get_crnn_model
from utils import decode_ctc_predictions, extract_form_fields


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def _to_gray(img: np.ndarray) -> np.ndarray:
    if len(img.shape) == 3:
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return img.copy()


def _binarize(gray: np.ndarray) -> np.ndarray:
    """Otsu thresholding; falls back to adaptive for uneven backgrounds."""
    _, otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    white_ratio = np.mean(otsu == 255)
    if white_ratio < 0.30 or white_ratio > 0.97:
        return cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2)
    return otsu


def _crop_to_text(gray: np.ndarray, pad_ratio=0.15) -> np.ndarray:
    """Crop tightly around dark pixels (the text)."""
    inv = cv2.bitwise_not(gray)
    _, thresh = cv2.threshold(inv, 20, 255, cv2.THRESH_BINARY)
    coords = np.column_stack(np.where(thresh > 0))
    if len(coords) == 0:
        return gray
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
    pad = max(4, int((y_max - y_min) * pad_ratio))
    y_min = max(0, y_min - pad)
    x_min = max(0, x_min - pad)
    y_max = min(gray.shape[0] - 1, y_max + pad)
    x_max = min(gray.shape[1] - 1, x_max + pad)
    return gray[y_min:y_max+1, x_min:x_max+1]


def _aspect_resize(gray: np.ndarray, H: int, W: int) -> np.ndarray:
    """Resize preserving aspect ratio, pad with white to fill the canvas."""
    h, w = gray.shape
    if h == 0 or w == 0:
        return np.ones((H, W), dtype=np.uint8) * 255
    scale = H / h
    new_w = int(w * scale)
    new_h = H
    if new_w > W:
        scale = W / w
        new_h = int(h * scale)
        new_w = W
    resized = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
    canvas = np.ones((H, W), dtype=np.uint8) * 255
    y_off = (H - new_h) // 2
    x_off = (W - new_w) // 2
    canvas[y_off:y_off+new_h, x_off:x_off+new_w] = resized
    return canvas


def _detect_mode(gray: np.ndarray) -> str:
    """
    Auto-detect whether an image needs adaptive or simple normalization.

    Logic:
    - If >25% of pixels are dark, the text is very large/zoomed → adaptive.
    - If the image size is far from training size (512x64) → adaptive.
    - Otherwise → simple (matches the training pipeline).
    """
    h, w = gray.shape
    _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    dark_px = np.mean(bw == 0)

    # Text fills too much of the image → zoomed in (like shane.jpg)
    if dark_px > 0.25:
        return 'adaptive'

    # Image is far from expected training size (allow 0.5x-2x of 512x64)
    if not (256 <= w <= 1024 and 32 <= h <= 128):
        return 'adaptive'

    return 'simple'
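
# Worked examples for _detect_mode (a sketch in comments, not executed on import):
#   np.full((64, 512), 255, np.uint8)   → 'simple'   (training-sized, no dark pixels)
#   np.zeros((64, 512), np.uint8)       → 'adaptive' (100% dark pixels > 25% threshold)
#   np.full((320, 1600), 255, np.uint8) → 'adaptive' (1600 px wide, outside 256-1024)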


def _to_tensor(img: np.ndarray) -> torch.Tensor:
    return torch.FloatTensor(
        img.astype(np.float32) / 255.0
    ).unsqueeze(0).unsqueeze(0)


# ─────────────────────────────────────────────────────────────────────────────
# SIMPLE NORMALIZER → for PIL-rendered / training-matched images
# ─────────────────────────────────────────────────────────────────────────────

class SimpleNormalizer:
    """
    Matches the fix_data.py training pipeline exactly:
        grayscale → resize → binarize
    Best for test images created by create_test_images.py.
    """
    def __init__(self, H=64, W=512):
        self.H, self.W = H, W

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        resized = cv2.resize(gray, (self.W, self.H), interpolation=cv2.INTER_LANCZOS4)
        return _binarize(resized)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        return self.normalize(img)


# ─────────────────────────────────────────────────────────────────────────────
# ADAPTIVE NORMALIZER → for real / physical / scanned images
# ─────────────────────────────────────────────────────────────────────────────

class AdaptiveNormalizer:
    """
    For physical documents or images with non-standard zoom/size:
        grayscale → denoise → crop text → aspect-ratio resize → binarize

    Crops to the actual text first, so a zoomed-in image like shane.jpg
    gets scaled down to training size instead of being squeezed/stretched.
    """
    def __init__(self, H=64, W=512):
        self.H, self.W = H, W

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        gray = _crop_to_text(gray)
        canvas = _aspect_resize(gray, self.H, self.W)
        return _binarize(canvas)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        return self.normalize(img)


# ─────────────────────────────────────────────────────────────────────────────
# AUTO NORMALIZER → detects which pipeline to use per image automatically
# ─────────────────────────────────────────────────────────────────────────────

class AutoNormalizer:
    """
    Automatically picks Simple or Adaptive based on image characteristics.

    Examples:
        demo.jpg  (clean 512x64 PIL)  → Simple (matches training)
        name1.jpg (clean 512x64 PIL)  → Simple
        shane.jpg (huge zoomed text)  → Adaptive (crop then resize)
        real scan (any size/zoom)    → Adaptive
    """
    def __init__(self, H=64, W=512, verbose=False):
        self.H, self.W = H, W
        self.verbose = verbose
        self._simple = SimpleNormalizer(H, W)
        self._adaptive = AdaptiveNormalizer(H, W)

    def normalize(self, img: np.ndarray) -> np.ndarray:
        gray = _to_gray(img)
        mode = _detect_mode(gray)
        if self.verbose:
            print(f"  auto → {mode}")
        return self._simple.normalize(img) if mode == 'simple' \
            else self._adaptive.normalize(img)

    def normalize_from_path(self, path: str) -> np.ndarray:
        img = cv2.imread(str(path))
        if img is None:
            raise ValueError(f"Cannot load: {path}")
        gray = _to_gray(img)
        mode = _detect_mode(gray)
        if self.verbose:
            print(f"  [{Path(path).name}] → {mode}")
        return self._simple.normalize(img) if mode == 'simple' \
            else self._adaptive.normalize(img)

    def to_tensor(self, img: np.ndarray) -> torch.Tensor:
        return _to_tensor(img)


# ─────────────────────────────────────────────────────────────────────────────
# MAIN OCR CLASS
# ─────────────────────────────────────────────────────────────────────────────

class CivilRegistryOCR:

    def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
        """
        Args:
            checkpoint_path : path to best_model_v6.pth
            device          : 'cuda' or 'cpu'
            mode            : 'auto'     → auto-detect per image (recommended)
                              'simple'   → always use the simple pipeline
                              'adaptive' → always use the adaptive pipeline
            verbose         : print which mode was chosen per image
        """
        if device == 'cuda' and not torch.cuda.is_available():
            device = 'cpu'

        self.device = torch.device(device)
        self.verbose = verbose
        print(f"Loading model from {checkpoint_path}...")

        checkpoint = torch.load(checkpoint_path, map_location=self.device,
                                weights_only=False)

        self.char_to_idx = checkpoint['char_to_idx']
        self.idx_to_char = checkpoint['idx_to_char']
        self.config = checkpoint.get('config', {})

        img_height = self.config.get('img_height', 64)
        img_width = self.config.get('img_width', 512)

        if mode == 'simple':
            self.normalizer = SimpleNormalizer(img_height, img_width)
        elif mode == 'adaptive':
            self.normalizer = AdaptiveNormalizer(img_height, img_width)
        else:
            self.normalizer = AutoNormalizer(img_height, img_width, verbose=verbose)

        self.model = get_crnn_model(
            model_type=self.config.get('model_type', 'standard'),
            img_height=img_height,
            num_chars=checkpoint['model_state_dict']['fc.weight'].shape[0],
            hidden_size=self.config.get('hidden_size', 128),
            num_lstm_layers=self.config.get('num_lstm_layers', 1)
        )
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()

        print("Model loaded successfully")
        # Support both key names: val_loss (fine-tuned) and val_cer (synthetic baseline)
        # FIXED Bug 5: removed incorrect `val_cer < 10` heuristic that mislabelled
        # the metric. The key name alone is the reliable indicator.
        val_loss = checkpoint.get('val_loss', None)
        val_cer = checkpoint.get('val_cer', None)
        if val_loss is not None and val_cer is not None:
            print(f"  Val Loss : {val_loss:.4f} | Val CER: {val_cer:.2f}%")
        elif val_loss is not None:
            print(f"  Val Loss : {val_loss:.4f} (fine-tuned checkpoint → run compare_live_cer.py for true CER)")
        elif val_cer is not None:
            print(f"  Val CER  : {val_cer:.2f}%")
        else:
            print("  Val CER  : N/A (run check_cer.py for true CER)")
        print(f"  Device   : {self.device}")
        print(f"  Mode     : {mode} ({img_height}x{img_width})")

    def _preprocess(self, image_path) -> torch.Tensor:
        normalized = self.normalizer.normalize_from_path(str(image_path))
        return _to_tensor(normalized)

    def predict(self, image_path, decode_method='greedy') -> str:
        img = self._preprocess(image_path).to(self.device)
        with torch.no_grad():
            outputs = self.model(img)
        decoded = decode_ctc_predictions(
            outputs.cpu(), self.idx_to_char, method=decode_method)
        return decoded[0]

    def predict_batch(self, image_paths, decode_method='greedy') -> List[Dict]:
        results = []
        for image_path in image_paths:
            try:
                text = self.predict(image_path, decode_method)
                results.append({'image_path': str(image_path),
                                'text': text, 'success': True})
            except Exception as e:
                results.append({'image_path': str(image_path),
                                'error': str(e), 'success': False})
        return results

    def process_form(self, form_image_path, form_type) -> Dict:
        text = self.predict(form_image_path)
        fields = extract_form_fields(text, form_type)
        fields['raw_text'] = text
        return fields


# ─────────────────────────────────────────────────────────────────────────────
# FORM FIELD EXTRACTOR
# ─────────────────────────────────────────────────────────────────────────────

class FormFieldExtractor:
    def __init__(self, ocr_model: CivilRegistryOCR):
        self.ocr = ocr_model

    def extract_form1a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 1A - Birth Certificate', 'raw_text': text}

    def extract_form2a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 2A - Death Certificate', 'raw_text': text}

    def extract_form3a_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 3A - Marriage Certificate', 'raw_text': text}

    def extract_form90_fields(self, path):
        text = self.ocr.predict(path)
        return {'form_type': 'Form 90 - Marriage License Application',
                'raw_text': text}


# ─────────────────────────────────────────────────────────────────────────────
# DEMO
# ─────────────────────────────────────────────────────────────────────────────

def demo_inference():
    print("=" * 70)
    print("Civil Registry OCR (auto-adaptive normalizer)")
    print("=" * 70)

    ocr = CivilRegistryOCR(
        checkpoint_path='checkpoints/best_model_v6.pth',
        device='cuda',
        mode='auto',
        verbose=True  # shows which mode each image triggers
    )

    print("\n1. Single Prediction:")
    try:
        result = ocr.predict('test_images/date1.jpg')
        print(f"   Recognized text: {result}")
    except Exception as e:
        print(f"   Error: {e}")

    print("\n2. Batch Prediction:")
    '''batch_results = ocr.predict_batch([
        'test_images/name1.jpg',
        'test_images/shane.jpg',
        'test_images/date1.jpg',
        'test_images/place1.jpg',
    ])
    for r in batch_results:
        status = r['text'] if r['success'] else f"ERROR - {r['error']}"
        print(f"   {r['image_path']}: {status}")'''

    print("\n3. Form Processing:")
    try:
        form_data = ocr.process_form('test_images/form1a_sample.jpg', 'form1a')
        print("   Form Type: Form 1A - Birth Certificate")
        print(f"   Raw Text: {form_data['raw_text']}")
    except Exception as e:
        print(f"   Error: {e}")


def create_inference_api():
    class OCR_API:
        def __init__(self, checkpoint_path, mode='auto'):
            self.ocr = CivilRegistryOCR(checkpoint_path, mode=mode)
            self.extractor = FormFieldExtractor(self.ocr)
        def recognize_text(self, p):
            return {'text': self.ocr.predict(p), 'success': True}
        def process_birth_certificate(self, p):
            return self.extractor.extract_form1a_fields(p)
        def process_death_certificate(self, p):
            return self.extractor.extract_form2a_fields(p)
        def process_marriage_certificate(self, p):
            return self.extractor.extract_form3a_fields(p)
        def process_marriage_license(self, p):
            return self.extractor.extract_form90_fields(p)
    return OCR_API


if __name__ == "__main__":
    demo_inference()
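A brief usage sketch for the `create_inference_api` factory above, assuming it runs from the CRNN+CTC directory so that `inference` is importable; the checkpoint and image paths reuse the ones from `demo_inference`:

from inference import create_inference_api

OCR_API = create_inference_api()
api = OCR_API('checkpoints/best_model_v6.pth', mode='auto')
print(api.recognize_text('test_images/date1.jpg'))
print(api.process_birth_certificate('test_images/form1a_sample.jpg'))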
CRNN+CTC/prepare_emnist.py
ADDED
@@ -0,0 +1,97 @@
| 1 |
+
ο»Ώimport torchvision
|
| 2 |
+
import torchvision.transforms as transforms
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
print("Preparing EMNIST data for CRNN training...")
|
| 9 |
+
print("Using 'balanced' split (47 classes β digits, uppercase, selected lowercase)")
|
| 10 |
+
|
| 11 |
+
# MAX_SAMPLES: how many EMNIST images to use out of 112,800 available.
|
| 12 |
+
# 50,000 chosen deliberately:
|
| 13 |
+
# - ~1,064 images per class (47 classes) β enough for solid character recognition
|
| 14 |
+
# - Keeps a healthy ~3:1 ratio vs synthetic data (16,000) in mixed training
|
| 15 |
+
# - Going higher (e.g. full 112,800) would drown out synthetic Filipino-specific
|
| 16 |
+
# patterns since EMNIST would be 88% of the mixed dataset
|
| 17 |
+
# - IAM fine-tuning and physical scans handle remaining handwriting gaps
|
| 18 |
+
MAX_SAMPLES = 50000
|
| 19 |
+
VAL_RATIO = 0.10 # 90% train, 10% val β proper percentage split
|
| 20 |
+
|
| 21 |
+
train_data = torchvision.datasets.EMNIST(
|
| 22 |
+
root='datasets/emnist',
|
| 23 |
+
split='balanced', # balanced split β already downloaded
|
| 24 |
+
train=True,
|
| 25 |
+
download=False, # files already exist, skip download
|
| 26 |
+
transform=transforms.ToTensor()
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# balanced split has 47 classes:
|
| 30 |
+
# 0-9 digits, A-Z uppercase, and selected lowercase
|
| 31 |
+
# mapping follows EMNIST balanced label order
|
| 32 |
+
LABELS = [
|
| 33 |
+
'0','1','2','3','4','5','6','7','8','9',
|
| 34 |
+
'A','B','C','D','E','F','G','H','I','J','K','L','M',
|
| 35 |
+
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
|
| 36 |
+
'a','b','d','e','f','g','h','n','q','r','t',
|
| 37 |
+
] # 47 classes exactly matching balanced split label indices
|
| 38 |
+
|
| 39 |
+
os.makedirs('data/train/emnist', exist_ok=True)
|
| 40 |
+
os.makedirs('data/val/emnist', exist_ok=True)
|
| 41 |
+
|
| 42 |
+
annotations_train = []
|
| 43 |
+
annotations_val = []
|
| 44 |
+
|
| 45 |
+
val_cutoff = int(MAX_SAMPLES * (1 - VAL_RATIO)) # 45,000 train / 5,000 val
|
| 46 |
+
|
| 47 |
+
print(f"Dataset size : {len(train_data)} images available")
|
| 48 |
+
print(f"Using : {MAX_SAMPLES} ({MAX_SAMPLES/len(train_data)*100:.1f}% of full dataset)")
|
| 49 |
+
print(f"Train / Val : {val_cutoff} / {MAX_SAMPLES - val_cutoff} (90/10 split)")
|
| 50 |
+
print("Saving images...")
|
| 51 |
+
|
| 52 |
+
saved = 0 # count of successfully saved images (skips bad label indices)
|
| 53 |
+
for i, (img_tensor, label_idx) in enumerate(train_data):
|
| 54 |
+
if saved >= MAX_SAMPLES:
|
| 55 |
+
break
|
| 56 |
+
|
| 57 |
+
# Safety check β skip if label index is out of range for our LABELS list
|
| 58 |
+
if label_idx >= len(LABELS):
|
| 59 |
+
continue
|
| 60 |
+
|
| 61 |
+
char = LABELS[label_idx]
|
| 62 |
+
img = img_tensor.squeeze().numpy()
|
| 63 |
+
img = (img * 255).astype(np.uint8)
|
| 64 |
+
|
| 65 |
+
# EMNIST images are transposed β rotate and flip to correct orientation
|
| 66 |
+
img = np.rot90(img, k=3)
|
| 67 |
+
img = np.fliplr(img)
|
| 68 |
+
|
| 69 |
+
pil_img = Image.fromarray(img).convert('RGB')
|
| 70 |
+
pil_img = pil_img.resize((512, 64)) # must match IMG_WIDTH=512
|
| 71 |
+
|
| 72 |
+
fname = f'emnist_{saved:05d}.jpg' # sequential filenames based on saved count
|
| 73 |
+
|
| 74 |
+
# FIXED: proper percentage-based split (was hardcoded `if i < 5000`)
|
| 75 |
+
if saved < val_cutoff:
|
| 76 |
+
pil_img.save(f'data/train/emnist/{fname}')
|
| 77 |
+
annotations_train.append({'image_path': f'emnist/{fname}', 'text': char})
|
| 78 |
+
else:
|
| 79 |
+
pil_img.save(f'data/val/emnist/{fname}')
|
| 80 |
+
annotations_val.append({'image_path': f'emnist/{fname}', 'text': char})
|
| 81 |
+
|
| 82 |
+
saved += 1
|
| 83 |
+
if saved % 5000 == 0:
|
| 84 |
+
print(f" Processed {saved}/{MAX_SAMPLES} images...")
|
| 85 |
+
|
| 86 |
+
with open('data/emnist_train_annotations.json', 'w') as f:
|
| 87 |
+
json.dump(annotations_train, f, indent=2)
|
| 88 |
+
with open('data/emnist_val_annotations.json', 'w') as f:
|
| 89 |
+
json.dump(annotations_val, f, indent=2)
|
| 90 |
+
|
| 91 |
+
print(f"\nDone!")
|
| 92 |
+
print(f" Train : {len(annotations_train)} images (~{len(annotations_train)//47} per class)")
|
| 93 |
+
print(f" Val : {len(annotations_val)} images")
|
| 94 |
+
print(f" Total : {len(annotations_train) + len(annotations_val)} / {len(train_data)} used")
|
| 95 |
+
print(f" Labels: {sorted(set(a['text'] for a in annotations_train))}")
|
| 96 |
+
print(f"\nClass coverage: {len(set(a['text'] for a in annotations_train))}/47 classes in train")
|
| 97 |
+
print("\nNext step: python train_with_emnist.py")
|
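A quick check of the orientation fix above: rotating by 270 degrees (`k=3`) and then flipping horizontally is exactly a transpose, which undoes EMNIST's transposed storage. This snippet is illustrative and not part of the repo:

import numpy as np

a = np.arange(6).reshape(2, 3)
fixed = np.fliplr(np.rot90(a, k=3))
assert np.array_equal(fixed, a.T)  # the two-step EMNIST fix equals a transpose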
CRNN+CTC/requirements.txt
ADDED
@@ -0,0 +1,61 @@
# Core Deep Learning
torch>=2.0.0
torchvision>=0.15.0

# Image Processing
opencv-python>=4.8.0
Pillow>=10.0.0
albumentations>=1.3.0
pdf2image>=1.17.0
pytesseract>=0.3.13

# Data Processing
numpy>=1.24.0
pandas>=2.0.0

# Metrics
editdistance>=0.6.2

# Progress Bars
tqdm>=4.65.0

# Web Framework (for deployment)
flask>=3.0.0
flask-cors>=4.0.0
fastapi>=0.104.0
uvicorn>=0.24.0
python-multipart>=0.0.6

# Database
pymysql>=1.1.0
sqlalchemy>=2.0.0

# NLP for Named Entity Recognition
spacy>=3.7.0
# Download model: python -m spacy download en_core_web_sm

# Document Classification
scikit-learn>=1.3.0

# Visualization
matplotlib>=3.7.0
seaborn>=0.12.0

# Configuration
pyyaml>=6.0

# Utilities
python-dotenv>=1.0.0
requests>=2.31.0

# Document Processing
python-docx>=1.1.0

# Optional: For production deployment
gunicorn>=21.2.0
celery>=5.3.0
redis>=5.0.0

# Testing
pytest>=7.4.0
pytest-cov>=4.1.0
CRNN+CTC/train.py
ADDED
@@ -0,0 +1,438 @@
# Training Script for CRNN+CTC Civil Registry OCR.
# Includes CTC loss, learning rate scheduling, and model checkpointing.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
import numpy as np
from pathlib import Path
import json

from crnn_model import get_crnn_model, initialize_weights
from dataset import CivilRegistryDataset, collate_fn
from utils import (
    decode_ctc_predictions,
    calculate_cer,
    calculate_wer,
    EarlyStopping
)


class CRNNTrainer:
    """
    Trainer class for the CRNN+CTC model.
    """

    def __init__(self, config):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Create directories
        self.checkpoint_dir = Path(config['checkpoint_dir'])
        self.log_dir = Path(config['log_dir'])
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # Initialize datasets
        print("Loading datasets...")
        self.train_dataset = CivilRegistryDataset(
            data_dir=config['train_data_dir'],
            annotations_file=config['train_annotations'],
            img_height=config['img_height'],
            img_width=config['img_width'],
            augment=True,
            form_type=config.get('form_type', 'all')
        )

        self.val_dataset = CivilRegistryDataset(
            data_dir=config['val_data_dir'],
            annotations_file=config['val_annotations'],
            img_height=config['img_height'],
            img_width=config['img_width'],
            augment=False,
            form_type=config.get('form_type', 'all')
        )

        # Create data loaders
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=config['batch_size'],
            shuffle=True,
            num_workers=config['num_workers'],
            collate_fn=collate_fn,
            pin_memory=False
        )

        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=config['batch_size'],
            shuffle=False,
            num_workers=config['num_workers'],
            collate_fn=collate_fn,
            pin_memory=False
        )

        # Initialize model
        print(f"Initializing model on {self.device}...")
        self.model = get_crnn_model(
            model_type=config.get('model_type', 'standard'),
            img_height=config['img_height'],
            num_chars=self.train_dataset.num_chars,
            hidden_size=config['hidden_size'],
            num_lstm_layers=config['num_lstm_layers']
        )

        self.model = self.model.to(self.device)

        # Loss function - CTC Loss
        self.criterion = nn.CTCLoss(blank=0, zero_infinity=True)

        # Optimizer → a lower LR prevents CTC collapse on epoch 1
        self.optimizer = optim.Adam(
            self.model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config.get('weight_decay', 1e-4)  # FIXED: fallback was 1e-5
        )

        # Warmup scheduler: ramp LR from near-zero to target over the first N epochs,
        # then hand off to ReduceLROnPlateau.
        # This is the single most effective fix for CTC blank collapse.
        warmup_epochs = config.get('warmup_epochs', 5)

        def warmup_lambda(epoch):
            if epoch < warmup_epochs:
                return (epoch + 1) / warmup_epochs  # gradual: 0.2 → 0.4 → 0.6 → 0.8 → 1.0
            return 1.0

        self.warmup_scheduler = optim.lr_scheduler.LambdaLR(
            self.optimizer, lr_lambda=warmup_lambda)
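        # Worked trace (a sketch) with the defaults from main() below
        # (learning_rate=1e-4, warmup_epochs=5): the lambda returns 0.2, 0.4,
        # 0.6, 0.8, 1.0 across epochs 1-5, i.e. the effective LR climbs
        # 2e-5 → 4e-5 → 6e-5 → 8e-5 → 1e-4, after which the factor stays at
        # 1.0 and ReduceLROnPlateau (below) owns any further decay.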

        # ReduceLROnPlateau kicks in after warmup
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=config.get('lr_patience', 5),
            min_lr=1e-6
        )
        self._warmup_epochs = warmup_epochs

        # Early stopping
        self.early_stopping = EarlyStopping(
            patience=config.get('early_stopping_patience', 10),
            min_delta=config.get('min_delta', 0.001)
        )

        # Training history
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_cer': [],
            'val_wer': [],
            'learning_rates': []
        }

        # ── Resume from checkpoint if available ──────────────
        self.start_epoch = 1
        self.best_val_loss = float('inf')
        resume_path = self.checkpoint_dir / 'latest_checkpoint.pth'

        if resume_path.exists():
            print(f"\n  Found checkpoint: {resume_path}")
            print("  Resuming training from the last saved epoch...")
            ckpt = torch.load(resume_path, map_location=self.device, weights_only=False)
            self.model.load_state_dict(ckpt['model_state_dict'])
            self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            self.scheduler.load_state_dict(ckpt['scheduler_state_dict'])
            if 'warmup_scheduler_state_dict' in ckpt:
                self.warmup_scheduler.load_state_dict(ckpt['warmup_scheduler_state_dict'])
            self.start_epoch = ckpt['epoch'] + 1
            self.best_val_loss = ckpt.get('val_loss', float('inf'))
            self.history = ckpt.get('history', self.history)
            print(f"  ✓ Resumed from Epoch {ckpt['epoch']} "
                  f"(Val Loss: {ckpt['val_loss']:.4f}, CER: {ckpt['val_cer']:.2f}%)")
        else:
            print("  No checkpoint found → starting fresh.")
            initialize_weights(self.model)

        print(f"✓ Model ready with {sum(p.numel() for p in self.model.parameters()):,} parameters")

    def train_epoch(self, epoch):
        """Train for one epoch."""
        self.model.train()
        total_loss = 0

        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch}/{self.config['epochs']}")

        nan_count = 0
        for batch_idx, (images, targets, target_lengths, _) in enumerate(pbar):
            images = images.to(self.device)
            targets = targets.to(self.device)

            # FIXED: zero_grad before the forward pass (was incorrectly placed after loss)
            self.optimizer.zero_grad()

            # Forward pass
            outputs = self.model(images)  # [seq_len, batch, num_chars]

            # Apply log_softmax for CTC
            log_probs = nn.functional.log_softmax(outputs, dim=2)

            # Calculate sequence lengths
            batch_size = images.size(0)
            input_lengths = torch.full(
                size=(batch_size,),
                fill_value=outputs.size(0),
                dtype=torch.long
            ).to(self.device)

            # CTC loss
            loss = self.criterion(
                log_probs,
                targets,
                input_lengths,
                target_lengths
            )

            # FIXED: skip NaN/Inf batches → accumulating them corrupts gradients
            if torch.isnan(loss) or torch.isinf(loss):
                nan_count += 1
                continue

            # Backward pass
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)

            self.optimizer.step()

            total_loss += loss.item()

            # Update progress bar
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'avg_loss': f'{total_loss / (batch_idx + 1):.4f}'
            })
        if nan_count > 0:
            print(f"  [WARNING] {nan_count} NaN/Inf batches skipped this epoch.")

        avg_loss = total_loss / len(self.train_loader)
        return avg_loss

    def validate(self):
        """Validate the model."""
        self.model.eval()
        total_loss = 0
        all_predictions = []
        all_ground_truths = []

        with torch.no_grad():
            for images, targets, target_lengths, texts in tqdm(self.val_loader, desc="Validating"):
                images = images.to(self.device)
                targets_gpu = targets.to(self.device)

                # Forward pass
                outputs = self.model(images)
                log_probs = nn.functional.log_softmax(outputs, dim=2)

                # CTC loss
                batch_size = images.size(0)
                input_lengths = torch.full(
                    size=(batch_size,),
                    fill_value=outputs.size(0),
                    dtype=torch.long
                ).to(self.device)

                loss = self.criterion(log_probs, targets_gpu, input_lengths, target_lengths)
                total_loss += loss.item()

                # Decode predictions
                predictions = decode_ctc_predictions(
                    outputs.cpu(),
                    self.train_dataset.idx_to_char
                )

                all_predictions.extend(predictions)
                all_ground_truths.extend(texts)

        avg_loss = total_loss / len(self.val_loader)

        # Calculate metrics
        cer = calculate_cer(all_predictions, all_ground_truths)
        wer = calculate_wer(all_predictions, all_ground_truths)

        return avg_loss, cer, wer, all_predictions, all_ground_truths

    def train(self):
        """Main training loop."""
        print("\n" + "=" * 70)
        print("Starting Training")
        print("=" * 70)

        best_val_loss = self.best_val_loss

        for epoch in range(self.start_epoch, self.config['epochs'] + 1):
            print(f"\nEpoch {epoch}/{self.config['epochs']}")
            print("-" * 70)

            # Train
            train_loss = self.train_epoch(epoch)

            # Validate
            val_loss, val_cer, val_wer, predictions, ground_truths = self.validate()

            # Learning rate scheduling:
            # use warmup for the first N epochs, then ReduceLROnPlateau
            if epoch <= self._warmup_epochs:
                self.warmup_scheduler.step()
            else:
                self.scheduler.step(val_loss)
            current_lr = self.optimizer.param_groups[0]['lr']

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['val_cer'].append(val_cer)
            self.history['val_wer'].append(val_wer)
            self.history['learning_rates'].append(current_lr)

            # Print metrics
            print("\nMetrics:")
            print(f"  Train Loss: {train_loss:.4f}")
            print(f"  Val Loss:   {val_loss:.4f}")
            print(f"  Val CER:    {val_cer:.2f}%")
            print(f"  Val WER:    {val_wer:.2f}%")
            print(f"  LR:         {current_lr:.6f}")

            # Print sample predictions
            print("\nSample Predictions:")
            for i in range(min(3, len(predictions))):
                print(f"  GT:   {ground_truths[i]}")
                print(f"  Pred: {predictions[i]}")
                print()

            # Show raw model output for one validation sample
            with torch.no_grad():
                sample_img = self.val_dataset[0][0].unsqueeze(0).to(self.device)
                raw_out = self.model(sample_img)
                probs = torch.softmax(raw_out, dim=2)
                best_idx = probs[:, 0, :].argmax(dim=1)
                best_prob = probs[:, 0, :].max(dim=1).values
                blank_pct = (best_idx == 0).float().mean().item() * 100
                avg_conf = best_prob.mean().item()
                non_blank = [self.train_dataset.idx_to_char.get(i.item(), '?')
                             for i in best_idx if i.item() != 0]
                print(f"  blank={blank_pct:.0f}%  conf={avg_conf:.3f}  "
                      f"chars={''.join(non_blank[:20])!r}")

            # Save checkpoint
            is_best = val_loss < best_val_loss
            if is_best:
                best_val_loss = val_loss

            self.save_checkpoint(epoch, val_loss, val_cer, is_best)

            # Early stopping
            if self.early_stopping(val_loss):
                print(f"\nEarly stopping triggered at epoch {epoch}")
                break

        print("\n" + "=" * 70)
        print("Training Complete!")
        print(f"Best validation loss: {best_val_loss:.4f}")
        print("=" * 70)

        # Save final training history
        self.save_history()

    def save_checkpoint(self, epoch, val_loss, val_cer, is_best=False):
        """Save model checkpoint."""
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'warmup_scheduler_state_dict': self.warmup_scheduler.state_dict(),
            'val_loss': val_loss,
            'val_cer': val_cer,
            'char_to_idx': self.train_dataset.char_to_idx,
            'idx_to_char': self.train_dataset.idx_to_char,
            'config': self.config,
            'history': self.history
        }

        # Save latest checkpoint
        checkpoint_path = self.checkpoint_dir / 'latest_checkpoint.pth'
        torch.save(checkpoint, checkpoint_path)

        # Save best checkpoint
        if is_best:
            best_path = self.checkpoint_dir / 'best_model.pth'
            torch.save(checkpoint, best_path)
            print(f"  ✓ Best model saved (Val Loss: {val_loss:.4f}, CER: {val_cer:.2f}%)")

        # Save epoch checkpoint (history omitted to save disk space → it's in latest_checkpoint.pth)
        if epoch % self.config.get('save_freq', 10) == 0:
            epoch_path = self.checkpoint_dir / f'checkpoint_epoch_{epoch}.pth'
            epoch_ckpt = {k: v for k, v in checkpoint.items() if k != 'history'}
            torch.save(epoch_ckpt, epoch_path)

    def save_history(self):
        """Save training history."""
        history_path = self.log_dir / 'training_history.json'
        with open(history_path, 'w') as f:
            json.dump(self.history, f, indent=2)
        print(f"\n✓ Training history saved to {history_path}")


def main():
    """Main training function."""

    # Configuration
    config = {
        # Data
        'train_data_dir': 'data/train',
        'train_annotations': 'data/train_annotations.json',
        'val_data_dir': 'data/val',
        'val_annotations': 'data/val_annotations.json',
        'form_type': 'all',  # 'all', 'form1a', 'form2a', 'form3a', 'form90'

        # Model
        'model_type': 'standard',  # 'standard', 'ensemble', 'lightweight'
        'img_height': 64,
        'img_width': 512,
        'hidden_size': 128,
        'num_lstm_layers': 1,

        # Training
        'batch_size': 32,
        'epochs': 100,
        'learning_rate': 0.0001,
        'weight_decay': 1e-4,  # FIXED: was 1e-5 → stronger L2 regularisation to reduce overfitting
        'num_workers': 0,
        'warmup_epochs': 5,  # ramp LR gradually for the first 5 epochs

        # Scheduling & Early Stopping
        'lr_patience': 5,  # FIXED: was 3 → give the model more time before halving LR
        'early_stopping_patience': 20,  # FIXED: was 10 → more patience during zoom training
        'min_delta': 0.001,

        # Saving
        'checkpoint_dir': 'checkpoints',
        'log_dir': 'logs',
        'save_freq': 10,
    }

    # Initialize trainer
    trainer = CRNNTrainer(config)

    # Start training
    trainer.train()


if __name__ == "__main__":
    main()
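One operational note on the auto-resume block in `__init__`: rerunning `python train.py` silently continues from `checkpoints/latest_checkpoint.pth`. A minimal reset sketch (the path matches the trainer's defaults above) for deliberately starting fresh:

from pathlib import Path

# Delete the auto-resume checkpoint so CRNNTrainer starts from epoch 1 again.
Path('checkpoints/latest_checkpoint.pth').unlink(missing_ok=True)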
CRNN+CTC/train_emnist.py
ADDED
@@ -0,0 +1,15 @@
import torchvision
import torchvision.transforms as transforms

print("Loading EMNIST dataset...")

train_data = torchvision.datasets.EMNIST(
    root='datasets/emnist',
    split='byclass',
    train=True,
    download=False,
    transform=transforms.ToTensor()
)

print(f"Training samples: {len(train_data)}")
print("EMNIST loaded successfully!")
CRNN+CTC/train_mnist.py
ADDED
@@ -0,0 +1,42 @@
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models

# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize pixel values to 0-1
x_train = x_train / 255.0
x_test = x_test / 255.0

# Add channel dimension (28, 28) -> (28, 28, 1)
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

# Build simple CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(2, 2),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax')  # 10 digits (0-9)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Train
model.fit(x_train, y_train, epochs=5, validation_split=0.1)

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"\nTest accuracy: {test_acc:.4f}")

# Save model
model.save("mnist_model.h5")
print("Model saved as mnist_model.h5")
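A brief follow-up sketch: reloading the saved mnist_model.h5 and classifying one held-out digit. This is illustrative only and not part of the repo:

import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model("mnist_model.h5")
(_, _), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Same preprocessing as training: scale to 0-1 and add batch + channel dims.
sample = (x_test[0] / 255.0)[np.newaxis, ..., np.newaxis]
pred = int(np.argmax(model.predict(sample, verbose=0)))
print(f"predicted={pred}  actual={y_test[0]}")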
CRNN+CTC/train_with_emnist.py
ADDED
@@ -0,0 +1,169 @@
+"""
+train_with_emnist.py
+====================
+Fine-tune the CRNN model with EMNIST character data.
+
+FIXES vs old version:
+- Phase 1: CNN FROZEN → only RNN+FC trained (prevents catastrophic forgetting)
+- Phase 2: Full model at a 100x lower LR (1e-6) for final polish
+- log_softmax applied before CTCLoss (was missing → caused garbage loss)
+- Loads from best_model.pth (synthetic, 0.12% CER baseline)
+- Saves best_model_emnist.pth only when val improves
+"""
+
+import os
+import sys
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, ConcatDataset
+
+sys.path.append('.')
+from crnn_model import get_crnn_model
+from dataset import CivilRegistryDataset, collate_fn
+
+print("=" * 55)
+print("Fine-tuning CRNN with EMNIST dataset")
+print("=" * 55)
+
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Device: {DEVICE}")
+
+emnist_dataset = CivilRegistryDataset(
+    data_dir='data/train',
+    annotations_file='data/emnist_train_annotations.json',
+    img_height=64, img_width=512, augment=True
+)
+# FIXED: mix synthetic data in so the model never forgets multi-word sequences
+synth_dataset = CivilRegistryDataset(
+    data_dir='data/train',
+    annotations_file='data/train_annotations.json',
+    img_height=64, img_width=512, augment=True
+)
+train_dataset = emnist_dataset  # keep reference for char_to_idx / num_chars
+mixed_train = ConcatDataset([emnist_dataset, synth_dataset])
+val_dataset = CivilRegistryDataset(
+    data_dir='data/val',
+    annotations_file='data/val_annotations.json',  # FIXED: was emnist_val → must match real task
+    img_height=64, img_width=512, augment=False
+)
+print(f"EMNIST train   : {len(emnist_dataset)}")
+print(f"Synthetic train: {len(synth_dataset)}")
+print(f"Mixed train    : {len(mixed_train)}")
+print(f"Val            : {len(val_dataset)}")
+
+train_loader = DataLoader(mixed_train, batch_size=32, shuffle=True,
+                          num_workers=0, collate_fn=collate_fn)
+val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
+                        num_workers=0, collate_fn=collate_fn)
+
+# ── Load best synthetic checkpoint ───────────────────────────
+BASE = 'checkpoints/best_model.pth'
+if not os.path.exists(BASE):
+    print(f"ERROR: {BASE} not found. Run: python train.py")
+    sys.exit(1)
+
+ckpt = torch.load(BASE, map_location=DEVICE, weights_only=False)
+config = ckpt.get('config', {})
+
+model = get_crnn_model(
+    model_type      = config.get('model_type', 'standard'),
+    img_height      = config.get('img_height', 64),
+    num_chars       = train_dataset.num_chars,
+    hidden_size     = config.get('hidden_size', 128),
+    num_lstm_layers = config.get('num_lstm_layers', 1),
+).to(DEVICE)
+
+missing, _ = model.load_state_dict(ckpt['model_state_dict'], strict=False)
+if missing:
+    print(f"  Note: {len(missing)} layers re-initialized (expected for fc layer)")
+print(f"  Loaded epoch {ckpt.get('epoch')} "
+      f"(val_loss={ckpt.get('val_loss', ckpt.get('val_cer', 0)):.4f})")
+
+criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
+
+
+def run_epoch(loader, training, optimizer=None):
+    model.train() if training else model.eval()
+    total, n = 0, 0
+    ctx = torch.enable_grad() if training else torch.no_grad()
+    with ctx:
+        for images, targets, target_lengths, _ in loader:
+            images = images.to(DEVICE)
+            batch_size = images.size(0)
+            if training:
+                optimizer.zero_grad()
+            # CRITICAL: log_softmax before CTCLoss
+            outputs = F.log_softmax(model(images), dim=2)
+            seq_len = outputs.size(0)
+            input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
+            loss = criterion(outputs, targets, input_lengths, target_lengths)
+            if not torch.isnan(loss) and not torch.isinf(loss):
+                if training:
+                    loss.backward()
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+                    optimizer.step()
+                total += loss.item()
+                n += 1
+    return total / max(n, 1)
+
+
+def run_phase(num, epochs, lr, freeze_cnn, patience):
+    print(f"\n{'='*55}")
+    print(f" PHASE {num} → "
+          f"{'CNN FROZEN (RNN+FC only)' if freeze_cnn else 'FULL MODEL (all layers)'}"
+          f"  LR={lr}")
+    print(f"{'='*55}")
+
+    # Freeze or unfreeze CNN
+    for name, param in model.named_parameters():
+        param.requires_grad = not (freeze_cnn and 'cnn' in name)
+
+    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"  Trainable params : {trainable:,}")
+
+    opt = optim.Adam(
+        filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
+    sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
+    best = float('inf')
+    counter = 0
+
+    for epoch in range(1, epochs + 1):
+        tr = run_epoch(train_loader, True, opt)
+        vl = run_epoch(val_loader, False, None)
+        sched.step(vl)
+
+        if vl < best:
+            best = vl
+            counter = 0
+            torch.save({
+                'model_state_dict': model.state_dict(),
+                'config': config,
+                'char_to_idx': train_dataset.char_to_idx,
+                'idx_to_char': train_dataset.idx_to_char,
+                'epoch': epoch,
+                'val_loss': vl,  # FIXED: renamed from val_cer → this is val loss, not CER%
+            }, 'checkpoints/best_model_emnist.pth')
+            print(f"  Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}  <- saved")
+        else:
+            counter += 1
+            print(f"  Epoch {epoch:02d}/{epochs}  Train={tr:.4f}  Val={vl:.4f}"
+                  f"  (patience {counter}/{patience})")
+            if counter >= patience:
+                print(f"  Early stopping at epoch {epoch}.")
+                break
+    return best
+
+
+# ── Phase 1: Freeze CNN → teach RNN+FC to handle EMNIST chars ─
+p1_best = run_phase(1, epochs=30, lr=1e-4, freeze_cnn=True, patience=7)
+
+# ── Phase 2: Unfreeze all → gentle full-model polish ──────────
+p2_best = run_phase(2, epochs=20, lr=1e-6, freeze_cnn=False, patience=5)
+
+print(f"\n{'='*55}")
+print("EMNIST fine-tuning complete!")
+print(f"  Phase 1 best val loss : {p1_best:.4f}")
+print(f"  Phase 2 best val loss : {p2_best:.4f}")
+print("  Saved : checkpoints/best_model_emnist.pth")
+print("\nNext step: python IAM_train.py --prepare --train")
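
For reference, the log_softmax fix called out in the header is a shape-and-scale contract of torch.nn.CTCLoss: the loss expects log-probabilities of shape [seq_len, batch, num_chars], a 1-D concatenated target tensor, and per-sample input/target lengths. A minimal standalone sketch (the sizes 128/2/40 and the target indices are illustrative, not the repo's real dimensions):

import torch
import torch.nn.functional as F

logits = torch.randn(128, 2, 40)              # fake CRNN output: [seq_len, batch, num_chars]
log_probs = F.log_softmax(logits, dim=2)      # CTCLoss expects log-probs, not raw logits

targets = torch.tensor([5, 12, 7, 3, 9])      # both labels concatenated: [5,12,7] and [3,9]
target_lengths = torch.tensor([3, 2])         # length of each label
input_lengths = torch.full((2,), 128)         # every sample uses all 128 timesteps

criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
loss = criterion(log_probs, targets, input_lengths, target_lengths)
print(loss.item())                            # finite and well-scaled; raw logits give garbage values
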
CRNN+CTC/utils.py
ADDED
@@ -0,0 +1,397 @@
+"""
+Utility Functions for CRNN+CTC Civil Registry OCR
+Includes CTC decoding, metrics calculation, and helper functions
+"""
+
+import torch
+import numpy as np
+from typing import List, Dict, Tuple
+
+
+def _editdistance(a, b):
+    """Pure-Python Levenshtein distance (replaces the editdistance C extension)."""
+    m, n = len(a), len(b)
+    dp = list(range(n + 1))
+    for i in range(1, m + 1):
+        prev, dp[0] = dp[0], i
+        for j in range(1, n + 1):
+            prev, dp[j] = dp[j], prev if a[i-1] == b[j-1] else 1 + min(prev, dp[j], dp[j-1])
+    return dp[n]
+
+
+def decode_ctc_predictions(outputs, idx_to_char, method='greedy'):
+    """
+    Decode CTC predictions to text
+
+    Args:
+        outputs: Model outputs [seq_len, batch, num_chars]
+        idx_to_char: Dictionary mapping indices to characters
+        method: 'greedy' or 'beam_search'
+
+    Returns:
+        List of decoded strings
+    """
+    if method == 'greedy':
+        return greedy_decode(outputs, idx_to_char)
+    elif method == 'beam_search':
+        return beam_search_decode(outputs, idx_to_char)
+    else:
+        raise ValueError(f"Unknown decoding method: {method}")
+
+
+def greedy_decode(outputs, idx_to_char):
+    """
+    Greedy CTC decoding - fast but less accurate
+    """
+    # Get most probable characters
+    pred_indices = torch.argmax(outputs, dim=2)  # [seq_len, batch]
+    pred_indices = pred_indices.permute(1, 0)    # [batch, seq_len]
+
+    decoded_texts = []
+
+    for sequence in pred_indices:
+        chars = []
+        prev_idx = -1
+
+        for idx in sequence:
+            idx = idx.item()
+            # Skip blank (0) and consecutive duplicates
+            if idx != 0 and idx != prev_idx:
+                if idx in idx_to_char:
+                    chars.append(idx_to_char[idx])
+            prev_idx = idx
+
+        decoded_texts.append(''.join(chars))
+
+    return decoded_texts
+
+
+def beam_search_decode(outputs, idx_to_char, beam_width=10):
+    """
+    Beam search CTC decoding - slower but more accurate.
+
+    FIXED Bug 6: the previous code mixed list-of-chars and string representations.
+    After sorting new_beams (a dict keyed by strings), it called `list(seq)` on the
+    string key, splitting a string like "AB" into ['A','B']; this accidentally
+    works for ASCII but is fragile and confusing. Rewritten to use strings
+    throughout: beams are now List[Tuple[str, float]] with the sequence always
+    kept as a plain string, eliminating the list/string ambiguity entirely.
+
+    Note: this simplified search does not track blank-ended vs non-blank-ended
+    prefixes separately, so genuine doubled letters (e.g. "oo") cannot be
+    produced; a full CTC prefix beam search keeps both states.
+    """
+    outputs = torch.nn.functional.softmax(outputs, dim=2)
+    outputs = outputs.permute(1, 0, 2).cpu().numpy()  # [batch, seq_len, num_chars]
+
+    decoded_texts = []
+
+    for output in outputs:
+        # Each beam is (sequence_string, cumulative_probability)
+        beams: list = [('', 1.0)]
+
+        for timestep in output:
+            new_beams: dict = {}
+
+            for sequence, prob in beams:
+                for idx, char_prob in enumerate(timestep):
+                    if idx == 0:  # blank token: sequence unchanged
+                        new_seq = sequence
+                    elif idx in idx_to_char:
+                        char = idx_to_char[idx]
+                        # CTC rule: merge consecutive duplicate characters
+                        if sequence and sequence[-1] == char:
+                            new_seq = sequence  # duplicate: stay the same
+                        else:
+                            new_seq = sequence + char  # append directly to string
+                    else:
+                        continue
+
+                    new_prob = prob * char_prob
+                    # Merge beams that produce the same string
+                    if new_seq in new_beams:
+                        new_beams[new_seq] = max(new_beams[new_seq], new_prob)
+                    else:
+                        new_beams[new_seq] = new_prob
+
+            # Keep top-k beams; keys are already strings, so no list() conversion needed
+            beams = sorted(new_beams.items(), key=lambda x: x[1], reverse=True)[:beam_width]
+
+        # Best sequence is the string with highest probability
+        best_sequence = max(beams, key=lambda x: x[1])[0]
+        decoded_texts.append(best_sequence)
+
+    return decoded_texts
+
+
+def calculate_cer(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate Character Error Rate (CER)
+
+    CER = (Substitutions + Deletions + Insertions) / Total Characters
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    total_distance = 0
+    total_length = 0
+
+    for pred, gt in zip(predictions, ground_truths):
+        distance = _editdistance(pred, gt)
+        total_distance += distance
+        total_length += len(gt)
+
+    cer = (total_distance / total_length * 100) if total_length > 0 else 0
+    return cer
+
+
+def calculate_wer(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate Word Error Rate (WER)
+
+    WER = (Substitutions + Deletions + Insertions) / Total Words
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    total_distance = 0
+    total_length = 0
+
+    for pred, gt in zip(predictions, ground_truths):
+        pred_words = pred.split()
+        gt_words = gt.split()
+
+        distance = _editdistance(pred_words, gt_words)
+        total_distance += distance
+        total_length += len(gt_words)
+
+    wer = (total_distance / total_length * 100) if total_length > 0 else 0
+    return wer
+
+
+def calculate_accuracy(predictions: List[str], ground_truths: List[str]) -> float:
+    """
+    Calculate exact match accuracy
+    """
+    if len(predictions) != len(ground_truths):
+        raise ValueError("Predictions and ground truths must have same length")
+
+    correct = sum(1 for pred, gt in zip(predictions, ground_truths) if pred == gt)
+    accuracy = (correct / len(predictions) * 100) if len(predictions) > 0 else 0
+
+    return accuracy
+
+
+class EarlyStopping:
+    """
+    Early stopping to stop training when validation loss stops improving
+    """
+
+    def __init__(self, patience=10, min_delta=0.001):
+        self.patience = patience
+        self.min_delta = min_delta
+        self.counter = 0
+        self.best_loss = None
+        self.early_stop = False
+
+    def __call__(self, val_loss):
+        if self.best_loss is None:
+            self.best_loss = val_loss
+        elif val_loss > self.best_loss - self.min_delta:
+            self.counter += 1
+            if self.counter >= self.patience:
+                self.early_stop = True
+        else:
+            self.best_loss = val_loss
+            self.counter = 0
+
+        return self.early_stop
+
+
+class AverageMeter:
+    """
+    Computes and stores the average and current value
+    """
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def calculate_confusion_matrix(predictions: List[str], ground_truths: List[str], char_set: List[str]) -> np.ndarray:
+    """
+    Calculate character-level confusion matrix
+
+    Args:
+        predictions: List of predicted strings
+        ground_truths: List of ground truth strings
+        char_set: List of all possible characters
+
+    Returns:
+        Confusion matrix [num_chars, num_chars]
+    """
+    char_to_idx = {char: idx for idx, char in enumerate(char_set)}
+    n_chars = len(char_set)
+
+    confusion = np.zeros((n_chars, n_chars), dtype=np.int64)
+
+    for pred, gt in zip(predictions, ground_truths):
+        # Align sequences (simple positional alignment, padded with spaces)
+        max_len = max(len(pred), len(gt))
+        pred_padded = pred + ' ' * (max_len - len(pred))
+        gt_padded = gt + ' ' * (max_len - len(gt))
+
+        for p_char, g_char in zip(pred_padded, gt_padded):
+            if p_char in char_to_idx and g_char in char_to_idx:
+                confusion[char_to_idx[g_char], char_to_idx[p_char]] += 1
+
+    return confusion
+
+
+def extract_form_fields(text: str, form_type: str) -> Dict[str, str]:
+    """
+    Extract specific fields from recognized text based on form type
+
+    Args:
+        text: Recognized text
+        form_type: 'form1a', 'form2a', 'form3a', 'form90'
+
+    Returns:
+        Dictionary of extracted fields
+    """
+    fields = {}
+
+    if form_type == 'form1a':  # Birth Certificate
+        # Extract common fields (simplified)
+        # In practice, use NER or regex patterns
+        fields['type'] = 'Birth Certificate'
+        # Add more field extraction logic
+
+    elif form_type == 'form2a':  # Death Certificate
+        fields['type'] = 'Death Certificate'
+
+    elif form_type == 'form3a':  # Marriage Certificate
+        fields['type'] = 'Marriage Certificate'
+
+    elif form_type == 'form90':  # Marriage License Application
+        fields['type'] = 'Marriage License Application'
+
+    return fields
+
+
+def validate_extracted_data(data: Dict[str, str], form_type: str) -> Tuple[bool, List[str]]:
+    """
+    Validate extracted data for completeness and format
+
+    Args:
+        data: Extracted data dictionary
+        form_type: Form type
+
+    Returns:
+        (is_valid, list_of_errors)
+    """
+    errors = []
+
+    # Define required fields per form type
+    required_fields = {
+        'form1a': ['name', 'date_of_birth', 'place_of_birth'],
+        'form2a': ['name', 'date_of_death', 'place_of_death'],
+        'form3a': ['husband_name', 'wife_name', 'date_of_marriage'],
+        'form90': ['husband_name', 'wife_name', 'date_of_application']
+    }
+
+    # Check required fields
+    for field in required_fields.get(form_type, []):
+        if field not in data or not data[field]:
+            errors.append(f"Missing required field: {field}")
+
+    # Additional validation can be added here
+    # - Date format validation
+    # - Name format validation
+    # - etc.
+
+    is_valid = len(errors) == 0
+    return is_valid, errors
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None, device='cpu'):
+    """
+    Load model checkpoint
+
+    Args:
+        checkpoint_path: Path to checkpoint file
+        model: Model instance
+        optimizer: Optimizer instance (optional)
+        device: Device to load to
+
+    Returns:
+        (model, optimizer, checkpoint_dict)
+    """
+    checkpoint = torch.load(checkpoint_path, map_location=device)
+
+    model.load_state_dict(checkpoint['model_state_dict'])
+
+    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    print(f"✓ Loaded checkpoint from {checkpoint_path}")
+    print(f"  Epoch: {checkpoint.get('epoch', 'N/A')}")
+    if 'val_cer' in checkpoint:
+        print(f"  Val CER : {checkpoint['val_cer']:.4f}%")
+    elif 'val_loss' in checkpoint:
+        print(f"  Val Loss : {checkpoint['val_loss']:.4f} (run compare_live_cer.py for true CER)")
+    else:
+        print("  Val CER : N/A (run compare_live_cer.py for true CER)")
+
+    return model, optimizer, checkpoint
+
+
+def save_predictions_to_file(predictions: List[str], ground_truths: List[str], output_file: str):
+    """
+    Save predictions and ground truths to file for analysis
+    """
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("Ground Truth\tPrediction\tMatch\n")
+        f.write("=" * 80 + "\n")
+
+        for gt, pred in zip(ground_truths, predictions):
+            match = "✓" if gt == pred else "✗"
+            f.write(f"{gt}\t{pred}\t{match}\n")
+
+    print(f"✓ Predictions saved to {output_file}")
+
+
+if __name__ == "__main__":
+    # Test utility functions
+    print("=" * 60)
+    print("Testing Utility Functions")
+    print("=" * 60)
+
+    # Test CER calculation
+    predictions = ["Hello World", "Test", "Sample Text"]
+    ground_truths = ["Hello World", "Tset", "Sample Txt"]
+
+    cer = calculate_cer(predictions, ground_truths)
+    wer = calculate_wer(predictions, ground_truths)
+    accuracy = calculate_accuracy(predictions, ground_truths)
+
+    print("\nMetrics:")
+    print(f"  CER: {cer:.2f}%")
+    print(f"  WER: {wer:.2f}%")
+    print(f"  Accuracy: {accuracy:.2f}%")
+
+    # Test early stopping
+    print("\nTesting Early Stopping:")
+    early_stopping = EarlyStopping(patience=3, min_delta=0.001)
+
+    val_losses = [1.0, 0.9, 0.85, 0.84, 0.84, 0.84, 0.84]
+    for epoch, loss in enumerate(val_losses, 1):
+        should_stop = early_stopping(loss)
+        print(f"  Epoch {epoch}: Loss = {loss:.2f}, Stop = {should_stop}")
+        if should_stop:
+            break
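
The intended flow of these helpers is decode first, then score. A small self-contained check using the greedy_decode and calculate_cer defined above (the three-letter charset and hand-built tensor are made up for illustration; the `from utils import` line assumes you run from the CRNN+CTC directory):

import torch
from utils import greedy_decode, calculate_cer

idx_to_char = {1: 'C', 2: 'A', 3: 'T'}            # index 0 is reserved for the CTC blank

# Hand-built output whose argmax path is C C <blank> A T T, which decodes to "CAT"
outputs = torch.full((6, 1, 4), -5.0)             # [seq_len=6, batch=1, num_chars=4]
for t, idx in enumerate([1, 1, 0, 2, 3, 3]):
    outputs[t, 0, idx] = 5.0

pred = greedy_decode(outputs, idx_to_char)        # ['CAT']: repeats collapsed, blanks dropped
print(pred)
print(calculate_cer(pred, ['CAT']))               # 0.0 (CER is returned as a percentage)
print(calculate_cer(pred, ['COT']))               # 33.33...: one substitution out of 3 chars
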
MNB/__init__.py
ADDED
@@ -0,0 +1,4 @@
+# mnb/__init__.py
+from .classifier import MNBClassifier
+
+__all__ = ["MNBClassifier"]
MNB/classifier.py
ADDED
@@ -0,0 +1,292 @@
+# mnb/classifier.py
+# ============================================================
+# MNB CLASSIFIER - wraps the trained DocumentClassifier
+#
+# TWO SEPARATE CONCERNS:
+#
+#   PATH A - Certifications Page
+#     User uploads a certification scan.
+#     MNB identifies which form it is:
+#       form102 → Form 102 (Certificate of Live Birth)
+#       form103 → Form 103 (Certificate of Death)
+#       form97  → Form 97 (Certificate of Marriage)
+#
+#   PATH B - Application for Marriage License Page (Form 90)
+#     User uploads TWO birth certificates:
+#       - Groom's Birth Cert (PSA/NSO sealed)
+#       - Bride's Birth Cert (PSA/NSO sealed)
+#     MNB is NOT used for form type here; the upload page
+#     already tells us it's a birth cert.
+#     classify_sex() reads the SEX field → GROOM (Male) or BRIDE (Female)
+#     and routes each cert to the correct Form 90 slot.
+#
+# Files needed:
+#   form_classifier.py → training + DocumentClassifier
+#   models/mnb_classifier.pkl
+#   models/tfidf_vectorizer.pkl
+#   models/mnb_metadata.json
+# ============================================================
+
+import sys
+import os
+
+_mnb_dir = os.path.dirname(os.path.abspath(__file__))
+if _mnb_dir not in sys.path:
+    sys.path.insert(0, _mnb_dir)
+
+_root_dir = os.path.dirname(_mnb_dir)
+if _root_dir not in sys.path:
+    sys.path.insert(0, _root_dir)
+
+try:
+    from form_classifier import DocumentClassifier
+    _HAVE_DOC_CLASSIFIER = True
+except ImportError:
+    _HAVE_DOC_CLASSIFIER = False
+
+
+# ── Keyword fallback (used if .pkl files not found) ─────────
+# Uses exact Philippine civil registry form headers
+_FORM_KEYWORDS = {
+    "form102": [
+        "Municipal Form No. 102",
+        "Municipal Form No.102",
+        "Certificate of Live Birth",
+        "live birth",
+        "name of child",
+        "date of birth",
+        "place of birth",
+        "birth certificate",
+        "mother", "father",
+        "infant", "newborn",
+        "attendant at birth",
+    ],
+    "form103": [
+        "Municipal Form No. 103",
+        "Municipal Form No.103",
+        "Certificate of Death",
+        "death certificate",
+        "name of deceased",
+        "date of death",
+        "place of death",
+        "cause of death",
+        "burial", "deceased",
+        "immediate cause",
+    ],
+    "form97": [
+        "Municipal Form No. 97",
+        "Municipal Form No.97",
+        "Certificate of Marriage",
+        "marriage certificate",
+        "name of husband",
+        "name of wife",
+        "date of marriage",
+        "place of marriage",
+        "solemnizing officer",
+        "contracting parties",
+        "witnesses",
+    ],
+}
+
+# Sex keywords for Form 90 routing (Groom/Bride)
+_SEX_KEYWORDS = {
+    "GROOM": [
+        "sex: male",
+        "sex male",
+        "2. sex: male",
+        " male",   # leading space so "female" never matches
+        "sex m",
+    ],
+    "BRIDE": [
+        "sex: female",
+        "sex female",
+        "2. sex: female",
+        " female",
+        "sex f",
+    ],
+}
+
+def _keyword_classify_form(text: str) -> str:
+    """Keyword fallback for Certifications page classification."""
+    t = text.lower()
+    scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _FORM_KEYWORDS.items()}
+    return max(scores, key=scores.get)
+
+def _keyword_classify_sex(text: str) -> str:
+    """Keyword-based sex classifier for Form 90 routing."""
+    t = text.lower()
+    scores = {k: sum(1 for kw in v if kw.lower() in t) for k, v in _SEX_KEYWORDS.items()}
+    return max(scores, key=scores.get)
+
+
+# ── Form code → NER hint map ────────────────────────────────
+_FORM_CODE_TO_HINT = {
+    "form102": "birth",
+    "form103": "death",
+    "form97": "marriage",
+    # Form 90 is handled by classify_sex(), not this map
+}
+
+
+class MNBClassifier:
+    """
+    MNB Classifier for the Civil Registry Digitization System.
+
+    PATH A - Certifications Page:
+        mnb = MNBClassifier()
+        form_code = mnb.classify_form_type(ocr_text)
+        # → 'form102' | 'form103' | 'form97'
+
+        hint = mnb.get_ner_hint(ocr_text)
+        # → 'birth' | 'death' | 'marriage'
+
+        result = mnb.classify_full(ocr_text)
+        # → {'label': 'Form 102 - Certificate of Live Birth',
+        #    'form_code': 'form102', 'confidence': 0.97, 'probabilities': {...}}
+
+    PATH B - Application for Marriage License Page (Form 90):
+        sex_role = mnb.classify_sex(ocr_text)
+        # → 'GROOM' (Male birth cert) | 'BRIDE' (Female birth cert)
+    """
+
+    def __init__(self, model_dir: str = "models"):
+        # NOTE: model_dir is resolved relative to the current working directory
+        self._doc_clf = None
+        if _HAVE_DOC_CLASSIFIER:
+            try:
+                self._doc_clf = DocumentClassifier(model_dir=model_dir)
+                print(f" [MNB] Loaded DocumentClassifier from {model_dir}/")
+            except FileNotFoundError as e:
+                print(f" [MNB] {e}")
+                print(" [MNB] Using keyword fallback; run: python mnb/form_classifier.py")
+        else:
+            print(" [MNB] form_classifier.py not found; using keyword fallback")
+
+    # ── PATH A: Certifications Page ─────────────────────────
+
+    def classify_form_type(self, ocr_text: str) -> str:
+        """
+        Certifications page: identify which form was uploaded.
+        Returns: 'form102' | 'form103' | 'form97'
+        """
+        if self._doc_clf is not None:
+            return self._doc_clf.predict(ocr_text)["form_code"]
+        return _keyword_classify_form(ocr_text)
+
+    def classify_full(self, ocr_text: str) -> dict:
+        """
+        Certifications page: full result with confidence scores.
+        Returns:
+            {
+              'label': 'Form 102 - Certificate of Live Birth',
+              'form_code': 'form102',
+              'confidence': 0.97,
+              'probabilities': { ... }
+            }
+        """
+        if self._doc_clf is not None:
+            return self._doc_clf.predict(ocr_text)
+        winner = _keyword_classify_form(ocr_text)
+        return {
+            "label": winner,
+            "form_code": winner,
+            "confidence": 1.0,
+            "probabilities": {k: (1.0 if k == winner else 0.0) for k in _FORM_KEYWORDS},
+        }
+
+    def get_ner_hint(self, ocr_text: str) -> str:
+        """
+        Returns NER hint string for bridge.py:
+            'birth' | 'death' | 'marriage'
+        """
+        code = self.classify_form_type(ocr_text)
+        return _FORM_CODE_TO_HINT.get(code, "birth")
+
+    # ── PATH B: Marriage License Page (Form 90) ─────────────
+
+    def classify_sex(self, ocr_text: str) -> str:
+        """
+        Form 90 upload page only.
+        Reads the SEX field on a PSA/NSO birth certificate.
+        Returns: 'GROOM' (Male) | 'BRIDE' (Female)
+        """
+        return _keyword_classify_sex(ocr_text)
+
+    def classify_sex_proba(self, ocr_text: str) -> dict:
+        """
+        Returns confidence scores for sex classification.
+        The keyword classifier is hard: e.g. {'GROOM': 1.0, 'BRIDE': 0.0}
+        """
+        winner = _keyword_classify_sex(ocr_text)
+        return {k: (1.0 if k == winner else 0.0) for k in _SEX_KEYWORDS}
+
+
+# ── Quick test ───────────────────────────────────────────────
+if __name__ == "__main__":
+    mnb = MNBClassifier()
+
+    print("\n ── PATH A: Certifications Page Tests ──")
+    cert_tests = [
+        (
+            "Municipal Form No. 102 Certificate of Live Birth "
+            "Name of child Maria Santos Date of birth 01/15/1990 "
+            "Place of birth Brgy. San Jose Tarlac City "
+            "Name of mother Lani Santos Name of father Jose Santos "
+            "Sex Female birth certificate infant",
+            "form102"
+        ),
+        (
+            "Municipal Form No.102 Certificate of Live Birth "
+            "PSA Child Juan Dela Cruz born 03/22/1985 Capas Tarlac "
+            "mother Rosa father Pedro Sex Male",
+            "form102"
+        ),
+        (
+            "Municipal Form No. 103 Certificate of Death "
+            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
+            "Cause of death Cardiac Arrest death certificate burial",
+            "form103"
+        ),
+        (
+            "Municipal Form No.103 Certificate of Death "
+            "Deceased Ana Torres died 07/04/2000 Pneumonia burial permit",
+            "form103"
+        ),
+        (
+            "Municipal Form No. 97 Certificate of Marriage "
+            "Name of husband Carlos Bautista Name of wife Ana Torres "
+            "Date of marriage 07/04/2005 solemnizing officer witnesses",
+            "form97"
+        ),
+        (
+            "Municipal Form No.97 Certificate of Marriage "
+            "Husband Jose Santos wife Maria Reyes married 11/30/1995 "
+            "contracting parties",
+            "form97"
+        ),
+    ]
+
+    for text, expected in cert_tests:
+        result = mnb.classify_full(text)
+        mark = "✓" if result["form_code"] == expected else "✗"
+        print(f" {mark} Expected={expected:<8} Got={result['form_code']:<8} "
+              f"Confidence={result['confidence']:.1%} ({result['label']})")
+
+    print("\n ── PATH B: Form 90 Marriage License Sex-Routing Tests ──")
+    sex_tests = [
+        (
+            "Municipal Form No.102 Certificate of Live Birth PSA "
+            "CHILD (First): Juan Dela Cruz SEX: Male "
+            "Date of Birth March 15 1990 Mother Maria Dela Cruz",
+            "GROOM"
+        ),
+        (
+            "Municipal Form No.102 Certificate of Live Birth NSO "
+            "CHILD (First): Ana Santos SEX: Female "
+            "Date of Birth August 21 1995 Mother Gloria Santos",
+            "BRIDE"
+        ),
+    ]
+    for text, expected in sex_tests:
+        pred = mnb.classify_sex(text)
+        mark = "✓" if pred == expected else "✗"
+        print(f" {mark} Expected={expected} Got={pred}")
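
Wiring-wise, the two paths attach to different upload pages. A minimal sketch of a caller (the model_dir value and the placeholder OCR strings are assumptions; only the method names come from the class above):

from MNB.classifier import MNBClassifier

mnb = MNBClassifier(model_dir="MNB/models")   # assumed path; keyword fallback kicks in if .pkl missing

# PATH A: Certifications page upload
cert_text = "Municipal Form No. 103 Certificate of Death Name of deceased Pedro Reyes ..."
print(mnb.classify_form_type(cert_text))      # 'form103'
print(mnb.get_ner_hint(cert_text))            # 'death' (passed on to the NER stage)

# PATH B: Form 90 page upload (two PSA birth certs)
groom_cert = "Certificate of Live Birth CHILD: Juan Dela Cruz SEX: Male ..."
print(mnb.classify_sex(groom_cert))           # 'GROOM': routed to the groom slot of Form 90
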
MNB/form_classifier.py
ADDED
@@ -0,0 +1,466 @@
| 1 |
+
"""
|
| 2 |
+
form_classifier.py
|
| 3 |
+
=======================
|
| 4 |
+
Multinomial Naive Bayes (MNB) Document Classifier
|
| 5 |
+
for Local Civil Registry Document Digitization System
|
| 6 |
+
|
| 7 |
+
Classifies extracted OCR text into:
|
| 8 |
+
- Form 102 (Certificate of Live Birth) β Certifications page
|
| 9 |
+
- Form 103 (Certificate of Death) β Certifications page
|
| 10 |
+
- Form 97 (Certificate of Marriage) β Certifications page
|
| 11 |
+
|
| 12 |
+
NOTE: Form 90 (Application for Marriage License) is NOT classified here.
|
| 13 |
+
Form 90 has its OWN upload page where the user uploads:
|
| 14 |
+
- Groom's Birth Certificate (PSA/NSO sealed)
|
| 15 |
+
- Bride's Birth Certificate (PSA/NSO sealed)
|
| 16 |
+
The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
|
| 17 |
+
See classify_sex() in classifier.py for that routing.
|
| 18 |
+
|
| 19 |
+
Usage:
|
| 20 |
+
python form_classifier.py # trains and saves model
|
| 21 |
+
python form_classifier.py --test # runs test predictions
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import json
|
| 26 |
+
import random
|
| 27 |
+
import argparse
|
| 28 |
+
import pickle
|
| 29 |
+
import numpy as np
|
| 30 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 31 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 32 |
+
from sklearn.model_selection import train_test_split
|
| 33 |
+
from sklearn.metrics import (
|
| 34 |
+
accuracy_score, classification_report, confusion_matrix
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
# 1. LABEL MAP (Certifications page only β NO Form 90 here)
|
| 39 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
LABEL_MAP = {
|
| 41 |
+
0: 'Form 102 - Certificate of Live Birth',
|
| 42 |
+
1: 'Form 103 - Certificate of Death',
|
| 43 |
+
2: 'Form 97 - Certificate of Marriage',
|
| 44 |
+
}
|
| 45 |
+
LABEL_NAMES = list(LABEL_MAP.values())
|
| 46 |
+
|
| 47 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
# 2. VOCABULARY POOLS (Filipino civil registry)
|
| 49 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
FIRST_NAMES = [
|
| 51 |
+
'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani',
|
| 52 |
+
'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo',
|
| 53 |
+
'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael',
|
| 54 |
+
'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary'
|
| 55 |
+
]
|
| 56 |
+
LAST_NAMES = [
|
| 57 |
+
'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores',
|
| 58 |
+
'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim',
|
| 59 |
+
'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez',
|
| 60 |
+
'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual'
|
| 61 |
+
]
|
| 62 |
+
MUNICIPALITIES = [
|
| 63 |
+
'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling',
|
| 64 |
+
'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia',
|
| 65 |
+
'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao'
|
| 66 |
+
]
|
| 67 |
+
PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales']
|
| 68 |
+
BARANGAYS = [
|
| 69 |
+
'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo',
|
| 70 |
+
'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas',
|
| 71 |
+
'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan'
|
| 72 |
+
]
|
| 73 |
+
DATES = [
|
| 74 |
+
'01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995',
|
| 75 |
+
'05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980',
|
| 76 |
+
'12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970',
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
def _name():
|
| 80 |
+
return f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}"
|
| 81 |
+
|
| 82 |
+
def _date():
|
| 83 |
+
return random.choice(DATES)
|
| 84 |
+
|
| 85 |
+
def _place():
|
| 86 |
+
return f"{random.choice(BARANGAYS)}, {random.choice(MUNICIPALITIES)}, {random.choice(PROVINCES)}"
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
# 3. SAMPLE GENERATORS
|
| 91 |
+
# Each generator uses the EXACT Philippine form header
|
| 92 |
+
# so MNB learns the real keywords from actual documents.
|
| 93 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
+
|
| 95 |
+
def generate_form102():
|
| 96 |
+
"""
|
| 97 |
+
Form 102 β Certificate of Live Birth
|
| 98 |
+
Header keywords: 'Municipal Form No. 102', 'Certificate of Live Birth'
|
| 99 |
+
"""
|
| 100 |
+
templates = [
|
| 101 |
+
# Template A: Exact header present
|
| 102 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 103 |
+
f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} "
|
| 104 |
+
f"Name of mother {_name()} Name of father {_name()} "
|
| 105 |
+
f"Sex {random.choice(['Male', 'Female'])} "
|
| 106 |
+
f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} "
|
| 107 |
+
f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} "
|
| 108 |
+
f"birth certificate registry birth registration infant newborn child",
|
| 109 |
+
|
| 110 |
+
# Template B: No. without space
|
| 111 |
+
f"Municipal Form No.102 Certificate of Live Birth "
|
| 112 |
+
f"Child {_name()} born {_date()} at {_place()} "
|
| 113 |
+
f"mother {_name()} father {_name()} "
|
| 114 |
+
f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg "
|
| 115 |
+
f"birth order {random.choice(['First', 'Second', 'Third'])} "
|
| 116 |
+
f"birth certificate Form 102",
|
| 117 |
+
|
| 118 |
+
# Template C: Registry number format
|
| 119 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 120 |
+
f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 121 |
+
f"name of child {_name()} date of birth {_date()} "
|
| 122 |
+
f"place of birth {_place()} birth certificate municipal civil registrar",
|
| 123 |
+
|
| 124 |
+
# Template D: PSA/NSO sealed copy (used when filing Form 90)
|
| 125 |
+
f"Municipal Form No. 102 Certificate of Live Birth "
|
| 126 |
+
f"PSA {_name()} born on {_date()} "
|
| 127 |
+
f"place of birth {_place()} "
|
| 128 |
+
f"mother maiden name {_name()} father {_name()} "
|
| 129 |
+
f"type of birth {random.choice(['Single', 'Twin'])} infant newborn",
|
| 130 |
+
|
| 131 |
+
# Template E: NSO variation
|
| 132 |
+
f"Municipal Form No.102 Certificate of Live Birth "
|
| 133 |
+
f"NSO birth registration {_name()} "
|
| 134 |
+
f"birth date {_date()} birthplace {_place()} "
|
| 135 |
+
f"parents mother {_name()} father {_name()} "
|
| 136 |
+
f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} "
|
| 137 |
+
f"sex {random.choice(['male', 'female'])}",
|
| 138 |
+
]
|
| 139 |
+
return random.choice(templates)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def generate_form103():
|
| 143 |
+
"""
|
| 144 |
+
Form 103 β Certificate of Death
|
| 145 |
+
Header keywords: 'Municipal Form No. 103', 'Certificate of Death'
|
| 146 |
+
"""
|
| 147 |
+
causes = [
|
| 148 |
+
'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
|
| 149 |
+
'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
|
| 150 |
+
'Septicemia', 'Renal Failure'
|
| 151 |
+
]
|
| 152 |
+
templates = [
|
| 153 |
+
# Template A: Exact header
|
| 154 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 155 |
+
f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} "
|
| 156 |
+
f"Cause of death {random.choice(causes)} Age at death {random.randint(1, 95)} "
|
| 157 |
+
f"Sex {random.choice(['Male', 'Female'])} "
|
| 158 |
+
f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} "
|
| 159 |
+
f"death certificate deceased burial interment",
|
| 160 |
+
|
| 161 |
+
# Template B: No space
|
| 162 |
+
f"Municipal Form No.103 Certificate of Death "
|
| 163 |
+
f"Deceased {_name()} died on {_date()} at {_place()} "
|
| 164 |
+
f"cause {random.choice(causes)} corpse informant {_name()} "
|
| 165 |
+
f"death certificate Form 103 municipal civil registrar",
|
| 166 |
+
|
| 167 |
+
# Template C: Registry format
|
| 168 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 169 |
+
f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 170 |
+
f"name of deceased {_name()} date of death {_date()} "
|
| 171 |
+
f"place of death {_place()} cause of death {random.choice(causes)} "
|
| 172 |
+
f"death certificate burial permit",
|
| 173 |
+
|
| 174 |
+
# Template D: Clinical format
|
| 175 |
+
f"Municipal Form No.103 Certificate of Death "
|
| 176 |
+
f"{_name()} died {_date()} "
|
| 177 |
+
f"place {_place()} cause of death {random.choice(causes)} "
|
| 178 |
+
f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} "
|
| 179 |
+
f"death deceased cadaver",
|
| 180 |
+
|
| 181 |
+
# Template E: Full form
|
| 182 |
+
f"Municipal Form No. 103 Certificate of Death "
|
| 183 |
+
f"Form 103 death registration {_name()} "
|
| 184 |
+
f"date of death {_date()} place of death {_place()} "
|
| 185 |
+
f"immediate cause {random.choice(causes)} "
|
| 186 |
+
f"attending physician {_name()} certificate of death",
|
| 187 |
+
]
|
| 188 |
+
return random.choice(templates)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def generate_form97():
|
| 192 |
+
"""
|
| 193 |
+
Form 97 β Certificate of Marriage
|
| 194 |
+
Header keywords: 'Municipal Form No. 97', 'Certificate of Marriage'
|
| 195 |
+
"""
|
| 196 |
+
officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.']
|
| 197 |
+
licenses = [f"{random.randint(10000, 99999)}", f"ML-{random.randint(1000, 9999)}"]
|
| 198 |
+
templates = [
|
| 199 |
+
# Template A: Exact header
|
| 200 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 201 |
+
f"Name of husband {_name()} Name of wife {_name()} "
|
| 202 |
+
f"Date of marriage {_date()} Place of marriage {_place()} "
|
| 203 |
+
f"Solemnizing officer {random.choice(officers)} {_name()} "
|
| 204 |
+
f"Marriage license number {random.choice(licenses)} witnesses {_name()} {_name()} "
|
| 205 |
+
f"marriage certificate contracting parties wedding",
|
| 206 |
+
|
| 207 |
+
# Template B: No space
|
| 208 |
+
f"Municipal Form No.97 Certificate of Marriage "
|
| 209 |
+
f"Husband {_name()} wife {_name()} "
|
| 210 |
+
f"married on {_date()} at {_place()} "
|
| 211 |
+
f"officiated by {random.choice(officers)} {_name()} "
|
| 212 |
+
f"marriage certificate Form 97 solemnizing officer",
|
| 213 |
+
|
| 214 |
+
# Template C: Registry format
|
| 215 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 216 |
+
f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} "
|
| 217 |
+
f"husband {_name()} wife {_name()} "
|
| 218 |
+
f"date of marriage {_date()} place {_place()} "
|
| 219 |
+
f"marriage license {random.choice(licenses)} issued at {_place()} "
|
| 220 |
+
f"marriage certificate civil registrar",
|
| 221 |
+
|
| 222 |
+
# Template D: Ceremony format
|
| 223 |
+
f"Municipal Form No.97 Certificate of Marriage "
|
| 224 |
+
f"{_name()} and {_name()} "
|
| 225 |
+
f"solemnized {_date()} at {_place()} "
|
| 226 |
+
f"solemnizing officer {random.choice(officers)} {_name()} "
|
| 227 |
+
f"witnesses {_name()} {_name()} "
|
| 228 |
+
f"marriage contracting parties husband wife ceremony",
|
| 229 |
+
|
| 230 |
+
# Template E: Full form
|
| 231 |
+
f"Municipal Form No. 97 Certificate of Marriage "
|
| 232 |
+
f"Form 97 marriage registration husband {_name()} "
|
| 233 |
+
f"wife {_name()} date of marriage {_date()} "
|
| 234 |
+
f"place of marriage {_place()} "
|
| 235 |
+
f"license number {random.choice(licenses)} marriage nuptial wed",
|
| 236 |
+
]
|
| 237 |
+
return random.choice(templates)
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 241 |
+
# 4. DATASET GENERATOR (3 classes only β no Form 90)
|
| 242 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
+
def generate_dataset(samples_per_class=150):
|
| 244 |
+
generators = [generate_form102, generate_form103, generate_form97]
|
| 245 |
+
labels_map = [0, 1, 2] # 0=Form102, 1=Form103, 2=Form97
|
| 246 |
+
|
| 247 |
+
texts, labels = [], []
|
| 248 |
+
for gen, label in zip(generators, labels_map):
|
| 249 |
+
for _ in range(samples_per_class):
|
| 250 |
+
texts.append(gen())
|
| 251 |
+
labels.append(label)
|
| 252 |
+
|
| 253 |
+
combined = list(zip(texts, labels))
|
| 254 |
+
random.shuffle(combined)
|
| 255 |
+
texts, labels = zip(*combined)
|
| 256 |
+
return list(texts), list(labels)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 260 |
+
# 5. TRAIN & SAVE
|
| 261 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 262 |
+
def train(samples_per_class=150, save_dir='models'):
|
| 263 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 264 |
+
|
| 265 |
+
print("=" * 60)
|
| 266 |
+
print(" MNB Document Classifier | Filipino Civil Registry")
|
| 267 |
+
print(" Certifications Page: Form 102 / 103 / 97 ONLY")
|
| 268 |
+
print(" (Form 90 routing is handled separately via SEX field)")
|
| 269 |
+
print("=" * 60)
|
| 270 |
+
|
| 271 |
+
print(f"\n Generating dataset ({samples_per_class} samples Γ 3 forms = {samples_per_class * 3} total)...")
|
| 272 |
+
texts, labels = generate_dataset(samples_per_class)
|
| 273 |
+
|
| 274 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 275 |
+
texts, labels, test_size=0.2, random_state=42, stratify=labels
|
| 276 |
+
)
|
| 277 |
+
print(f" Train: {len(X_train)} | Test: {len(X_test)}")
|
| 278 |
+
|
| 279 |
+
# TF-IDF vectorizer
|
| 280 |
+
vectorizer = TfidfVectorizer(
|
| 281 |
+
ngram_range=(1, 2),
|
| 282 |
+
max_features=5000,
|
| 283 |
+
sublinear_tf=True,
|
| 284 |
+
min_df=1,
|
| 285 |
+
)
|
| 286 |
+
X_train_vec = vectorizer.fit_transform(X_train)
|
| 287 |
+
X_test_vec = vectorizer.transform(X_test)
|
| 288 |
+
|
| 289 |
+
# Train MNB
|
| 290 |
+
clf = MultinomialNB(alpha=0.1)
|
| 291 |
+
clf.fit(X_train_vec, y_train)
|
| 292 |
+
|
| 293 |
+
# Evaluate
|
| 294 |
+
y_pred = clf.predict(X_test_vec)
|
| 295 |
+
acc = accuracy_score(y_test, y_pred)
|
| 296 |
+
|
| 297 |
+
print(f"\n Accuracy : {acc * 100:.2f}%")
|
| 298 |
+
print("\n Classification Report:")
|
| 299 |
+
print(classification_report(y_test, y_pred, target_names=LABEL_NAMES))
|
| 300 |
+
|
| 301 |
+
print(" Confusion Matrix:")
|
| 302 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 303 |
+
headers = ['Form102', 'Form103', 'Form97']
|
| 304 |
+
print(f" {'':30s} " + " ".join(headers))
|
| 305 |
+
for i, row in enumerate(cm):
|
| 306 |
+
print(f" Actual {headers[i]}: {str(row)}")
|
| 307 |
+
|
| 308 |
+
# Save
|
| 309 |
+
model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
|
| 310 |
+
vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
|
| 311 |
+
with open(model_path, 'wb') as f:
|
| 312 |
+
pickle.dump(clf, f)
|
| 313 |
+
with open(vec_path, 'wb') as f:
|
| 314 |
+
pickle.dump(vectorizer, f)
|
| 315 |
+
|
| 316 |
+
meta = {
|
| 317 |
+
'accuracy': round(acc * 100, 2),
|
| 318 |
+
'samples_per_class': samples_per_class,
|
| 319 |
+
'total_samples': samples_per_class * 3,
|
| 320 |
+
'labels': LABEL_MAP,
|
| 321 |
+
        'note': 'Form 90 routing is handled by classify_sex() - not this model',
        'model_path': model_path,
        'vectorizer_path': vec_path,
    }
    with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w') as f:
        json.dump(meta, f, indent=2)

    print(f"\n Model saved     : {model_path}")
    print(f" Vectorizer saved: {vec_path}")
    print(f"\n Target accuracy : >90%")
    print(f" Achieved        : {acc * 100:.2f}% {'✓' if acc >= 0.90 else '✗ (try increasing samples_per_class)'}")
    print("=" * 60)

    return clf, vectorizer, acc


# ─────────────────────────────────────────────────────────────
# 6. DOCUMENT CLASSIFIER CLASS
# ─────────────────────────────────────────────────────────────
class DocumentClassifier:
    """Load trained MNB model and classify OCR text from Certifications page."""

    def __init__(self, model_dir='models'):
        model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
        vec_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl')

        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Model not found at {model_path}. Run: python form_classifier.py"
            )

        with open(model_path, 'rb') as f:
            self.clf = pickle.load(f)
        with open(vec_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def predict(self, text: str) -> dict:
        """
        Classify OCR text from Certifications page.

        Returns:
            {
                'label': 'Form 102 - Certificate of Live Birth',
                'form_code': 'form102',
                'confidence': 0.95,
                'probabilities': { ... }
            }
        """
        vec = self.vectorizer.transform([text])
        probs = self.clf.predict_proba(vec)[0]
        idx = int(np.argmax(probs))

        form_codes = ['form102', 'form103', 'form97']
        return {
            'label': LABEL_MAP[idx],
            'form_code': form_codes[idx],
            'confidence': round(float(probs[idx]), 4),
            'probabilities': {
                LABEL_MAP[i]: round(float(p), 4)
                for i, p in enumerate(probs)
            }
        }


# ─────────────────────────────────────────────────────────────
# 7. TEST DEMO
# ─────────────────────────────────────────────────────────────
def run_test():
    print("\n" + "=" * 60)
    print(" Testing DocumentClassifier - Certifications Page")
    print("=" * 60)

    classifier = DocumentClassifier()

    test_cases = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 "
            "Place of birth Capas Tarlac mother Rosa Dela Cruz "
            "father Pedro Dela Cruz Sex Male",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
            "Age at death 75 death certificate deceased burial",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 "
            "cause Pneumonia burial permit interment",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
            "Solemnizing officer Rev. Santos witnesses marriage certificate",
            "Form 97 - Certificate of Marriage"
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes "
            "married 11/30/1995 contracting parties solemnizing officer",
            "Form 97 - Certificate of Marriage"
        ),
    ]

    correct = 0
    for text, expected in test_cases:
        result = classifier.predict(text)
        status = '✓' if expected in result['label'] else '✗'
        if expected in result['label']:
            correct += 1
        print(f"\n {status} Expected : {expected}")
        print(f"   Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)")

    print(f"\n Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)")
    print("=" * 60)


# ─────────────────────────────────────────────────────────────
# 8. MAIN
# ─────────────────────────────────────────────────────────────
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test', action='store_true', help='Run test predictions only')
    parser.add_argument('--samples', type=int, default=150, help='Samples per class (default: 150)')
    args = parser.parse_args()

    if args.test:
        run_test()
    else:
        train(samples_per_class=args.samples)
        print("\nTo test predictions, run:")
        print("  python form_classifier.py --test")
MNB/keywords.py
ADDED
@@ -0,0 +1,127 @@
# mnb/keywords.py
# ============================================================
# Keyword fallback lists used by classifier.py when the
# trained .pkl models are not available.
#
# Uses EXACT Philippine civil registry form headers:
#   Form 102 → "Municipal Form No. 102 / Certificate of Live Birth"
#   Form 103 → "Municipal Form No. 103 / Certificate of Death"
#   Form 97  → "Municipal Form No. 97 / Certificate of Marriage"
#
# NOTE: Form 90 is NOT classified here.
# Form 90 has its own upload page (Application for Marriage License).
# The SEX field on the uploaded birth cert determines routing:
#   Male   → GROOM slot in Form 90
#   Female → BRIDE slot in Form 90
# ============================================================

# ── PATH A: Certifications Page ──────────────────────────────
FORM_KEYWORDS = {

    "form102": [
        # Exact header variants
        "Municipal Form No. 102",
        "Municipal Form No.102",
        "Certificate of Live Birth",
        # Field-level keywords
        "name of child",
        "date of birth",
        "place of birth",
        "birth certificate",
        "name of mother",
        "name of father",
        "attendant at birth",
        "type of birth",
        "birth order",
        "legitimacy",
        "infant",
        "newborn",
        # PSA/NSO sealed copy keywords
        "PSA",
        "NSO",
        "bc registry",
    ],

    "form103": [
        # Exact header variants
        "Municipal Form No. 103",
        "Municipal Form No.103",
        "Certificate of Death",
        # Field-level keywords
        "name of deceased",
        "date of death",
        "place of death",
        "cause of death",
        "death certificate",
        "immediate cause",
        "antecedent cause",
        "underlying cause",
        "burial",
        "deceased",
        "died",
        "burial permit",
        "interment",
    ],

    "form97": [
        # Exact header variants
        "Municipal Form No. 97",
        "Municipal Form No.97",
        "Certificate of Marriage",
        # Field-level keywords
        "name of husband",
        "name of wife",
        "date of marriage",
        "place of marriage",
        "marriage certificate",
        "solemnizing officer",
        "contracting parties",
        "witnesses",
        "marriage license number",
        "mc registry",
        "nuptial",
        "wed",
    ],

    "form90": [
        # Exact header variants
        "Municipal Form 90",
        "Municipal Form No. 90",
        "Municipal Form No.90",
        "Application for Marriage License",
        "APPLICATION FOR MARRIAGE LICENSE",
        "Form No. 2",
        # Field-level keywords
        "name of applicant",
        "marriage license no",
        "marriage license number",
        "date of issuance",
        "date of issuance of marriage license",
        "groom",
        "bride",
        "may i apply for a license",
        "accountable form no. 54",
    ],
}

# ── PATH B: Form 90 Marriage License Page ────────────────────
# Used ONLY on the Marriage License upload page.
# Reads the SEX field from the uploaded PSA/NSO birth certificate.
#   Male   → GROOM (routed to Groom slot in Form 90)
#   Female → BRIDE (routed to Bride slot in Form 90)
SEX_KEYWORDS = {
    "GROOM": [
        "sex: male",
        "sex male",
        "2. sex: male",
        " male",
        "sex m",
    ],
    "BRIDE": [
        "sex: female",
        "sex female",
        "2. sex: female",
        " female",
        "sex f",
    ],
}
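Since the header comment positions these lists as a keyword fallback for classifier.py, here is a minimal sketch of how such a fallback vote could work; the function name keyword_fallback is illustrative and may not match classifier.py's actual logic:

# Illustrative fallback sketch: count case-insensitive keyword hits per form
# and return the best-scoring form code, or 'unknown' when nothing matches.
from MNB.keywords import FORM_KEYWORDS

def keyword_fallback(ocr_text: str) -> str:
    text = ocr_text.lower()
    scores = {
        form: sum(1 for kw in kws if kw.lower() in text)
        for form, kws in FORM_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else 'unknown'

# e.g. keyword_fallback("Municipal Form No. 103 ... cause of death ...") -> 'form103'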
MNB/mnb_metadata.json
ADDED
@@ -0,0 +1,17 @@
{
  "accuracy": 100.0,
  "samples_per_class": 150,
  "total_samples": 450,
  "labels": {
    "0": "Form 102 - Certificate of Live Birth",
    "1": "Form 103 - Certificate of Death",
    "2": "Form 97 - Certificate of Marriage"
  },
  "note": "Form 90 routing is handled separately by classify_sex() using the SEX field on uploaded PSA/NSO birth certificates. Male = GROOM, Female = BRIDE.",
  "pages": {
    "certifications": "Classifies Form 102 / 103 / 97 from uploaded certification scan",
    "marriage_license": "classify_sex() routes birth cert to GROOM or BRIDE slot in Form 90"
  },
  "model_path": "models/mnb_classifier.pkl",
  "vectorizer_path": "models/tfidf_vectorizer.pkl"
}
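The note field above delegates Form 90 routing to classify_sex(); a hedged sketch of that routing built on SEX_KEYWORDS from keywords.py follows (the shipped classify_sex() in classifier.py may match differently):

# Illustrative routing sketch: scan the OCR'd birth certificate for sex cues
# and map Male -> GROOM slot, Female -> BRIDE slot in Form 90.
from MNB.keywords import SEX_KEYWORDS

def classify_sex(birth_cert_text: str) -> str:
    text = birth_cert_text.lower()
    for slot, cues in SEX_KEYWORDS.items():  # 'GROOM' checked before 'BRIDE'
        if any(cue in text for cue in cues):
            return slot
    return 'UNKNOWN'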
MNB/models/mnb_classifier.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d62d9cbdd7d76b60d17787b93bcc22f51c5602934ac60117e15279c3a22c519
size 200089
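The three lines above are a Git LFS pointer (spec v1) that stands in for the real binary until `git lfs pull` fetches it; a small sketch for checking whether a checkout still holds a pointer, with an illustrative helper name:

# Parse an LFS pointer file into its fields; raises UnicodeDecodeError (or
# yields no 'oid' key) when the path already holds the real binary payload.
def read_lfs_pointer(path: str) -> dict:
    with open(path, encoding='utf-8') as f:
        lines = f.read().splitlines()
    return dict(line.split(' ', 1) for line in lines if line)

# read_lfs_pointer('MNB/models/mnb_classifier.pkl')
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '200089'}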
MNB/models/mnb_metadata.json
ADDED
@@ -0,0 +1,13 @@
{
  "accuracy": 100.0,
  "samples_per_class": 150,
  "total_samples": 600,
  "labels": {
    "0": "Form 1A - Birth Certificate",
    "1": "Form 2A - Death Certificate",
    "2": "Form 3A - Marriage Certificate",
    "3": "Form 90 - Application for Marriage License"
  },
  "model_path": "models\\mnb_classifier.pkl",
  "vectorizer_path": "models\\tfidf_vectorizer.pkl"
}
MNB/models/tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:217cd506c7d9d7bfcfef73fc107273c129d4d55ab7dfddc1190e2863ee381ec4
size 129497
references/12
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d9e1c47ea7a15f7ff1e14a3b34db3f2eb690c15c45c2a5b8174d964633d0f6f
size 1924369
references/321
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc0159e24fa6735aeed7153ecf0092ba6d7bec510c57c8ec52a28328083d2e61
size 957650
references/321321
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3e4cf4da9290a262997067fda2640da298979a5f3dc069b88177104d1a629ce
size 3225794
references/old.jpg
ADDED
Git LFS Details
references/reference-102.png
ADDED
Git LFS Details
references/reference-103.png
ADDED
Git LFS Details