| """ |
| extract_actual_data.py |
| ====================== |
| Extract field crops from actual scanned civil registry forms and |
| auto-label them with EasyOCR as a starting point for CRNN fine-tuning. |
| |
| Reads images from: |
| actual_images/{form_type}/*.{png,jpg,jpeg} |
| |
| For each image: |
| 1. Aligns to reference using ORB + ECC + corner fallback |
| 2. Crops every field defined in TEMPLATES |
| 3. Applies CLAHE per-crop before auto-labeling |
| 4. Saves crop to data/actual_crops/ |
| 5. Auto-labels with EasyOCR + field-type post-processing |
| |
| Output: |
| data/actual_crops/ -- field crop images |
| data/actual_annotations.json -- labels for fine-tuning |
| |
| After running: |
| - Open actual_annotations.json |
| - Fix any wrong 'text' values |
| - Run finetune.py to train |
| |
| Usage: |
| cd python/CRNN+CTC |
| python extract_actual_data.py |
| |
| # or point to a different images folder: |
| python extract_actual_data.py --images /path/to/actual_images |
| """ |
|
|
| import os |
| import sys |
| import json |
| import argparse |
| import numpy as np |
| from PIL import Image |
|
|
| |
# Directory layout: this file lives in python/CRNN+CTC/, so:
THIS_DIR = os.path.dirname(os.path.abspath(__file__))    # .../python/CRNN+CTC
ROOT_DIR = os.path.dirname(os.path.dirname(THIS_DIR))    # project root, two levels up
PYTHON_DIR = os.path.dirname(THIS_DIR)                   # .../python


# Make template_matcher (which lives in PYTHON_DIR) importable below.
sys.path.insert(0, PYTHON_DIR)
|
|
| from template_matcher import ( |
| TEMPLATES, REFERENCE_IMAGES, |
| align_to_reference, _preprocess, _crop_field, |
| _get_easyocr, _postprocess, |
| ) |
|
|
# OpenCV is optional: when present, each crop gets CLAHE contrast
# enhancement before auto-labeling; otherwise crops are OCR'd as-is.
try:
    import cv2 as _cv2
    _CV2_OK = True
except ImportError:
    _CV2_OK = False


# Output locations, plus the minimum crop size (pixels) below which a
# field crop is considered degenerate and skipped.
CROPS_DIR = os.path.join(THIS_DIR, 'data', 'actual_crops')
ANN_PATH = os.path.join(THIS_DIR, 'data', 'actual_annotations.json')
MIN_CROP_W = 10
MIN_CROP_H = 6
|
|
| |
| _SKIP_SUBSTRINGS = ('debug', 'aligned', 'crops_aligned') |
| _SKIP_PREFIXES = ('test_', 'father_', 'father2_') |
|
|
|
|
| def _is_scan(fname: str) -> bool: |
| base = fname.lower() |
| ext = os.path.splitext(base)[1] |
| if ext not in ('.png', '.jpg', '.jpeg', '.tiff', '.bmp'): |
| return False |
| if any(s in base for s in _SKIP_SUBSTRINGS): |
| return False |
| if any(base.startswith(p) for p in _SKIP_PREFIXES): |
| return False |
| return True |
|
|
|
|
| def _ocr_crop(arr: np.ndarray, reader) -> str: |
| """Run EasyOCR on a uint8 RGB numpy array.""" |
| try: |
| results = reader.readtext(arr, detail=0, paragraph=True) |
| return ' '.join(results).strip() |
| except Exception as e: |
| return '' |
|
|
|
|
def process_image(img_path: str, form_type: str, reader, crops_dir: str) -> list:
    """
    Align one scan, crop every template field, save crops, auto-label.

    Parameters:
        img_path: Path to one scanned form image.
        form_type: Key into TEMPLATES selecting the form's field layout.
        reader: EasyOCR reader used to auto-label each crop.
        crops_dir: Directory where field crop PNGs are written.

    Returns list of annotation dicts for this image (empty if the image
    cannot be opened).
    """
    template = TEMPLATES[form_type]
    fname = os.path.basename(img_path)
    stem = os.path.splitext(fname)[0]

    try:
        img = Image.open(img_path).convert('RGB')
    except Exception as e:
        # Unreadable/corrupt file: skip this image, keep the run going.
        print(f' [skip] Cannot open: {e}')
        return []

    # Original scan size, used only for the log line below.
    w, h = img.size
    print(f' Processing {fname} ({w}x{h})...')

    # Warp the scan onto the reference layout so template coordinates apply
    # (ORB + ECC + corner fallback, per the module docstring).
    img, orb_inliers = align_to_reference(img, form_type)
    print(f' ORB inliers: {orb_inliers}')

    # Shared preprocessing step from template_matcher, applied before cropping.
    processed = _preprocess(img)

    annotations = []

    for field_name, coords in template.items():
        # Template entries carry 5 values; the 5th is unused here.
        # NOTE(review): presumably (x1, y1, x2, y2) are relative/ratio
        # coordinates given the 'r' suffix -- confirm in template_matcher.
        x1r, y1r, x2r, y2r, _ = coords
        crop = _crop_field(processed, x1r, y1r, x2r, y2r)

        # Skip degenerate crops too small to contain legible text.
        if crop is None or crop.size[0] < MIN_CROP_W or crop.size[1] < MIN_CROP_H:
            continue

        # CLAHE on the grayscale crop boosts faint strokes before OCR.
        # The enhanced array is used only for labeling; the saved crop
        # below keeps the original (un-enhanced) pixels.
        gray = np.array(crop.convert('L'))
        if _CV2_OK:
            clahe = _cv2.createCLAHE(clipLimit=1.5, tileGridSize=(2, 2))
            gray = clahe.apply(gray)
        arr = np.stack([gray, gray, gray], axis=-1)  # replicate to 3-channel RGB

        raw = _ocr_crop(arr, reader)
        label = _postprocess(raw, field_name)  # field-type-aware cleanup

        crop_fname = f'{form_type}_{stem}_{field_name}.png'
        crop.save(os.path.join(crops_dir, crop_fname))

        # image_path is stored relative to this script's dir so the
        # annotations file stays portable across machines.
        annotations.append({
            'image_path': os.path.join('data', 'actual_crops', crop_fname),
            'text': label,
            'form_type': form_type,
            'field': field_name,
            'source_img': fname,
        })

    print(f' Saved {len(annotations)} crops')
    return annotations
|
|
|
|
def main(images_root: str) -> None:
    """Extract and auto-label field crops from every scan under *images_root*.

    Walks images_root/{form_type}/ for each template, processes every scan
    image, and writes the combined annotation list to ANN_PATH.
    Exits with status 1 when EasyOCR cannot be loaded.
    """
    os.makedirs(CROPS_DIR, exist_ok=True)

    print('[extract] Loading EasyOCR...')
    reader = _get_easyocr()
    if reader is None:
        print('[extract] ERROR: EasyOCR failed to load.')
        sys.exit(1)
    print('[extract] EasyOCR ready.')

    all_annotations = []

    # Sorted for a deterministic processing order across runs.
    for form_type in sorted(TEMPLATES.keys()):
        folder = os.path.join(images_root, form_type)
        if not os.path.isdir(folder):
            print(f'\n[extract] No images in {folder}, skipping.')
            continue

        scans = sorted(f for f in os.listdir(folder) if _is_scan(f))
        if not scans:
            print(f'\n[extract] No scan images in {folder}, skipping.')
            continue

        # A missing reference image is not fatal; alignment degrades to
        # resize-only. NOTE(review): the stray 'β' in the two messages
        # below looks like mojibake for an arrow/dash glyph -- confirm the
        # intended character before changing the output text.
        ref = REFERENCE_IMAGES.get(form_type, '')
        if not os.path.exists(ref):
            print(f'\n[extract] WARNING: No reference image for form {form_type} β alignment will be resize-only')

        print(f'\n[extract] Form {form_type} β {len(scans)} image(s)')

        for fname in scans:
            anns = process_image(os.path.join(folder, fname), form_type, reader, CROPS_DIR)
            all_annotations.extend(anns)

    # ensure_ascii=False keeps non-Latin label text human-readable, which
    # matters because the JSON is meant to be reviewed/corrected by hand.
    with open(ANN_PATH, 'w', encoding='utf-8') as f:
        json.dump(all_annotations, f, indent=2, ensure_ascii=False)

    total = len(all_annotations)
    print(f'\n[extract] Done.')
    print(f' Crops saved : {total}')
    print(f' Annotations : {ANN_PATH}')
    print()
    print('Review actual_annotations.json and correct any wrong labels,')
    print('then run finetune.py to train on this data.')
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: the only option is the root folder of scanned images,
    # defaulting to <project_root>/actual_images.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--images',
        default=os.path.join(ROOT_DIR, 'actual_images'),
        help='Path to actual_images/ folder (default: <project_root>/actual_images)',
    )
    args = parser.parse_args()
    main(args.images)
|
|