File size: 6,394 Bytes
7111e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
extract_actual_data.py
======================
Extract field crops from actual scanned civil registry forms and
auto-label them with EasyOCR as a starting point for CRNN fine-tuning.

Reads images from:
    actual_images/{form_type}/*.{png,jpg,jpeg}

For each image:
  1. Aligns to reference using ORB + ECC + corner fallback
  2. Crops every field defined in TEMPLATES
  3. Applies CLAHE per-crop before auto-labeling
  4. Saves crop to data/actual_crops/
  5. Auto-labels with EasyOCR + field-type post-processing

Output:
    data/actual_crops/             -- field crop images
    data/actual_annotations.json   -- labels for fine-tuning

After running:
  - Open actual_annotations.json
  - Fix any wrong 'text' values
  - Run finetune.py to train

Usage:
    cd python/CRNN+CTC
    python extract_actual_data.py

    # or point to a different images folder:
    python extract_actual_data.py --images /path/to/actual_images
"""

import os
import sys
import json
import argparse
import numpy as np
from PIL import Image

# ── Paths ─────────────────────────────────────────────────────
THIS_DIR   = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR   = os.path.dirname(os.path.dirname(THIS_DIR))   # project root
PYTHON_DIR = os.path.dirname(THIS_DIR)                    # python/

sys.path.insert(0, PYTHON_DIR)

from template_matcher import (
    TEMPLATES, REFERENCE_IMAGES,
    align_to_reference, _preprocess, _crop_field,
    _get_easyocr, _postprocess,
)

try:
    import cv2 as _cv2
    _CV2_OK = True
except ImportError:
    _CV2_OK = False

# Output locations for extracted crops and their auto-generated labels.
CROPS_DIR  = os.path.join(THIS_DIR, 'data', 'actual_crops')
ANN_PATH   = os.path.join(THIS_DIR, 'data', 'actual_annotations.json')
# Crops smaller than this (in pixels) are too degenerate to OCR and are skipped.
MIN_CROP_W = 10
MIN_CROP_H = 6

# Substrings that indicate a file is a debug/test output, not a real scan
_SKIP_SUBSTRINGS = ('debug', 'aligned', 'crops_aligned')
_SKIP_PREFIXES   = ('test_', 'father_', 'father2_')


def _is_scan(fname: str) -> bool:
    base = fname.lower()
    ext  = os.path.splitext(base)[1]
    if ext not in ('.png', '.jpg', '.jpeg', '.tiff', '.bmp'):
        return False
    if any(s in base for s in _SKIP_SUBSTRINGS):
        return False
    if any(base.startswith(p) for p in _SKIP_PREFIXES):
        return False
    return True


def _ocr_crop(arr: np.ndarray, reader) -> str:
    """Run EasyOCR on a uint8 RGB numpy array."""
    try:
        results = reader.readtext(arr, detail=0, paragraph=True)
        return ' '.join(results).strip()
    except Exception as e:
        return ''


def process_image(img_path: str, form_type: str, reader, crops_dir: str) -> list:
    """
    Align one scan, crop every template field, save crops, auto-label.

    Pipeline:
      1. Open the image and convert to RGB (unopenable files are skipped).
      2. Align it to the form's reference via ``align_to_reference``.
      3. Grayscale + deskew via ``_preprocess``.
      4. For each field in ``TEMPLATES[form_type]``: crop, CLAHE-enhance a
         copy for OCR, auto-label with EasyOCR + ``_postprocess``, and save
         the original (un-enhanced) crop as a PNG into *crops_dir*.

    Args:
        img_path:  Path to the scanned form image.
        form_type: Key into TEMPLATES (and REFERENCE_IMAGES).
        reader:    Initialized EasyOCR reader.
        crops_dir: Existing directory to write crop PNGs into.

    Returns:
        List of annotation dicts (image_path, text, form_type, field,
        source_img); empty if the image cannot be opened.
    """
    template = TEMPLATES[form_type]
    fname    = os.path.basename(img_path)
    stem     = os.path.splitext(fname)[0]

    try:
        img = Image.open(img_path).convert('RGB')
    except Exception as e:
        print(f'    [skip] Cannot open: {e}')
        return []

    w, h = img.size
    print(f'  Processing {fname} ({w}x{h})...')

    # Align (ORB β†’ ECC β†’ corner β†’ resize)
    img, orb_inliers = align_to_reference(img, form_type)
    print(f'    ORB inliers: {orb_inliers}')

    # Grayscale + deskew
    processed = _preprocess(img)

    annotations = []

    for field_name, coords in template.items():
        # coords: (x1r, y1r, x2r, y2r, <extra>) — the 'r' suffix suggests
        # reference-relative coordinates; fifth element unused here.
        # TODO(review): confirm against template_matcher.
        x1r, y1r, x2r, y2r, _ = coords
        crop = _crop_field(processed, x1r, y1r, x2r, y2r)

        # Skip failed or degenerate crops (crop.size is a PIL (w, h) tuple).
        if crop is None or crop.size[0] < MIN_CROP_W or crop.size[1] < MIN_CROP_H:
            continue

        # CLAHE per-crop before OCR (same as extract_fields in template_matcher)
        gray = np.array(crop.convert('L'))
        if _CV2_OK:
            clahe = _cv2.createCLAHE(clipLimit=1.5, tileGridSize=(2, 2))
            gray  = clahe.apply(gray)
        # EasyOCR expects 3 channels; replicate the enhanced gray plane.
        arr = np.stack([gray, gray, gray], axis=-1)

        raw   = _ocr_crop(arr, reader)
        label = _postprocess(raw, field_name)

        # Note: the saved crop is the pre-CLAHE image — enhancement is
        # applied only for the auto-labeling pass.
        crop_fname = f'{form_type}_{stem}_{field_name}.png'
        crop.save(os.path.join(crops_dir, crop_fname))

        annotations.append({
            'image_path': os.path.join('data', 'actual_crops', crop_fname),
            'text':       label,
            'form_type':  form_type,
            'field':      field_name,
            'source_img': fname,
        })

    print(f'    Saved {len(annotations)} crops')
    return annotations


def main(images_root: str):
    """Extract and auto-label field crops for every form type.

    Walks ``images_root/<form_type>/`` for each template, processes every
    scan image found, and writes the combined annotations to ANN_PATH.
    Exits with status 1 if EasyOCR cannot be loaded.
    """
    os.makedirs(CROPS_DIR, exist_ok=True)

    print('[extract] Loading EasyOCR...')
    reader = _get_easyocr()
    if reader is None:
        print('[extract] ERROR: EasyOCR failed to load.')
        sys.exit(1)
    print('[extract] EasyOCR ready.')

    collected = []

    for form_type in sorted(TEMPLATES):
        folder = os.path.join(images_root, form_type)
        if not os.path.isdir(folder):
            print(f'\n[extract] No images in {folder}, skipping.')
            continue

        scans = sorted(filter(_is_scan, os.listdir(folder)))
        if not scans:
            print(f'\n[extract] No scan images in {folder}, skipping.')
            continue

        # Without a reference image, align_to_reference degrades to a resize.
        if not os.path.exists(REFERENCE_IMAGES.get(form_type, '')):
            print(f'\n[extract] WARNING: No reference image for form {form_type} β€” alignment will be resize-only')

        print(f'\n[extract] Form {form_type} β€” {len(scans)} image(s)')

        for scan_name in scans:
            collected += process_image(
                os.path.join(folder, scan_name), form_type, reader, CROPS_DIR)

    with open(ANN_PATH, 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)

    print('\n[extract] Done.')
    print(f'  Crops saved : {len(collected)}')
    print(f'  Annotations : {ANN_PATH}')
    print()
    print('Review actual_annotations.json and correct any wrong labels,')
    print('then run finetune.py to train on this data.')

if __name__ == '__main__':
    # CLI: single optional flag pointing at the folder of scanned images.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--images',
        default=os.path.join(ROOT_DIR, 'actual_images'),
        help='Path to actual_images/ folder (default: <project_root>/actual_images)',
    )
    main(cli.parse_args().images)