"""
extract_actual_data.py
======================
Extract field crops from actual scanned civil registry forms and
auto-label them with EasyOCR as a starting point for CRNN fine-tuning.
Reads images from:
    actual_images/{form_type}/*.{png,jpg,jpeg,tiff,bmp}
For each image:
1. Aligns to reference using ORB + ECC + corner fallback
2. Crops every field defined in TEMPLATES
3. Applies CLAHE per-crop before auto-labeling
4. Saves crop to data/actual_crops/
5. Auto-labels with EasyOCR + field-type post-processing
Output:
data/actual_crops/ -- field crop images
data/actual_annotations.json -- labels for fine-tuning
After running:
- Open actual_annotations.json
- Fix any wrong 'text' values
- Run finetune.py to train
Usage:
cd python/CRNN+CTC
python extract_actual_data.py
# or point to a different images folder:
python extract_actual_data.py --images /path/to/actual_images
"""
import os
import sys
import json
import argparse
import numpy as np
from PIL import Image
# -- Paths ---------------------------------------------------------
# Every location is derived from this file's own path, not from the cwd.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PYTHON_DIR = os.path.dirname(THIS_DIR)   # python/
ROOT_DIR = os.path.dirname(PYTHON_DIR)   # project root
# python/ goes first on sys.path ahead of the template_matcher import that follows.
sys.path.insert(0, PYTHON_DIR)
from template_matcher import (
TEMPLATES, REFERENCE_IMAGES,
align_to_reference, _preprocess, _crop_field,
_get_easyocr, _postprocess,
)
# OpenCV is optional: remember whether the import worked so that the
# CLAHE step can be skipped when it is unavailable.
try:
    import cv2 as _cv2
except ImportError:
    _CV2_OK = False
else:
    _CV2_OK = True
# Output locations, both inside <this dir>/data.
_DATA_DIR = os.path.join(THIS_DIR, 'data')
CROPS_DIR = os.path.join(_DATA_DIR, 'actual_crops')
ANN_PATH = os.path.join(_DATA_DIR, 'actual_annotations.json')

# Crops whose width / height fall below these limits are not saved.
MIN_CROP_W, MIN_CROP_H = 10, 6

# File names matching any of these are treated as debug/test output,
# not as real scans (compared against the lower-cased file name).
_SKIP_SUBSTRINGS = ('debug', 'aligned', 'crops_aligned')
_SKIP_PREFIXES = ('test_', 'father_', 'father2_')
def _is_scan(fname: str) -> bool:
    """Return True when *fname* looks like a real scan, not a by-product.

    The comparison is case-insensitive.  A name qualifies when it ends in
    one of the accepted raster extensions, contains none of
    ``_SKIP_SUBSTRINGS`` and starts with none of ``_SKIP_PREFIXES``.
    """
    lowered = fname.lower()
    _, suffix = os.path.splitext(lowered)
    if suffix not in ('.png', '.jpg', '.jpeg', '.tiff', '.bmp'):
        return False
    has_marker = any(marker in lowered for marker in _SKIP_SUBSTRINGS)
    # str.startswith accepts a tuple, so one call covers every prefix.
    return not (has_marker or lowered.startswith(_SKIP_PREFIXES))
def _ocr_crop(arr: np.ndarray, reader) -> str:
    """Run EasyOCR on one crop and return the recognised text.

    Args:
        arr: Image array handed straight to ``reader.readtext``; the caller
            in this file passes a 3-channel array stacked from a grayscale
            crop.
        reader: Object exposing EasyOCR's ``readtext`` method.

    Returns:
        The recognised fragments joined with single spaces and stripped, or
        ``''`` when OCR fails.  Failure stays best-effort so that one bad
        crop cannot abort a whole run, but it is reported on stderr rather
        than being swallowed silently.
    """
    try:
        fragments = reader.readtext(arr, detail=0, paragraph=True)
        return ' '.join(fragments).strip()
    except Exception as exc:  # boundary around a third-party model call
        print(f' [warn] OCR failed: {exc}', file=sys.stderr)
        return ''
def process_image(img_path: str, form_type: str, reader, crops_dir: str) -> list:
    """Align one scan, crop every template field, save the crops, auto-label.

    Args:
        img_path: Path of the scanned form image.
        form_type: Key into ``TEMPLATES`` selecting the field layout; an
            unknown key propagates the lookup error (``KeyError`` for a dict).
        reader: EasyOCR reader, passed through to ``_ocr_crop``.
        crops_dir: Directory that receives the crop PNGs; created if missing.

    Returns:
        One annotation dict per saved crop with the keys ``image_path``,
        ``text``, ``form_type``, ``field`` and ``source_img``.  The list is
        empty when the image cannot be opened.

    NOTE(review): ``image_path`` is always recorded as
    ``data/actual_crops/<name>`` even when *crops_dir* points elsewhere --
    confirm that this is what finetune.py expects.
    """
    template = TEMPLATES[form_type]
    fname = os.path.basename(img_path)
    stem = os.path.splitext(fname)[0]

    try:
        # Close the source file deterministically; convert() returns an
        # independent in-memory copy that stays valid afterwards.
        with Image.open(img_path) as src:
            img = src.convert('RGB')
    except Exception as e:
        print(f' [skip] Cannot open: {e}')
        return []

    w, h = img.size
    print(f' Processing {fname} ({w}x{h})...')

    # Align (ORB -> ECC -> corner -> resize)
    img, orb_inliers = align_to_reference(img, form_type)
    print(f' ORB inliers: {orb_inliers}')

    # Grayscale + deskew
    processed = _preprocess(img)

    # The CLAHE settings are identical for every field, so build the
    # operator once per image instead of once per crop.
    clahe = None
    if _CV2_OK:
        clahe = _cv2.createCLAHE(clipLimit=1.5, tileGridSize=(2, 2))

    # Lets the function be used on its own, without main() preparing the folder.
    os.makedirs(crops_dir, exist_ok=True)

    annotations = []
    for field_name, coords in template.items():
        x1r, y1r, x2r, y2r, _ = coords
        crop = _crop_field(processed, x1r, y1r, x2r, y2r)
        if crop is None or crop.size[0] < MIN_CROP_W or crop.size[1] < MIN_CROP_H:
            continue

        # CLAHE per-crop before OCR (same as extract_fields in template_matcher).
        # Only the OCR input is enhanced; the saved crop stays untouched.
        gray = np.array(crop.convert('L'))
        if clahe is not None:
            gray = clahe.apply(gray)
        arr = np.stack([gray, gray, gray], axis=-1)

        raw = _ocr_crop(arr, reader)
        label = _postprocess(raw, field_name)

        crop_fname = f'{form_type}_{stem}_{field_name}.png'
        crop.save(os.path.join(crops_dir, crop_fname))
        annotations.append({
            'image_path': os.path.join('data', 'actual_crops', crop_fname),
            'text': label,
            'form_type': form_type,
            'field': field_name,
            'source_img': fname,
        })

    print(f' Saved {len(annotations)} crops')
    return annotations
def main(images_root: str) -> None:
    """Extract and auto-label field crops for every form type under *images_root*.

    For each key of ``TEMPLATES`` (in sorted order) the folder
    ``<images_root>/<form_type>`` is scanned, files accepted by ``_is_scan``
    are run through ``process_image``, and the collected annotations are
    written as JSON to ``ANN_PATH`` (replacing any existing file).

    Args:
        images_root: Folder holding one sub-folder of scans per form type.

    Exits with status 1 when *images_root* is not a directory or when the
    EasyOCR reader cannot be obtained.
    """
    # Fail fast: without this check a mistyped path would still load EasyOCR
    # and then overwrite the annotations file with an empty list.
    if not os.path.isdir(images_root):
        print(f'[extract] ERROR: images folder not found: {images_root}')
        sys.exit(1)

    os.makedirs(CROPS_DIR, exist_ok=True)

    print('[extract] Loading EasyOCR...')
    reader = _get_easyocr()
    if reader is None:
        print('[extract] ERROR: EasyOCR failed to load.')
        sys.exit(1)
    print('[extract] EasyOCR ready.')

    all_annotations = []
    for form_type in sorted(TEMPLATES.keys()):
        folder = os.path.join(images_root, form_type)
        if not os.path.isdir(folder):
            print(f'\n[extract] No images in {folder}, skipping.')
            continue

        scans = sorted(f for f in os.listdir(folder) if _is_scan(f))
        if not scans:
            print(f'\n[extract] No scan images in {folder}, skipping.')
            continue

        # "or ''" keeps os.path.exists from raising on a None entry.
        ref = REFERENCE_IMAGES.get(form_type) or ''
        if not os.path.exists(ref):
            print(f'\n[extract] WARNING: No reference image for form {form_type} — alignment will be resize-only')

        print(f'\n[extract] Form {form_type} — {len(scans)} image(s)')
        for fname in scans:
            anns = process_image(os.path.join(folder, fname), form_type, reader, CROPS_DIR)
            all_annotations.extend(anns)

    with open(ANN_PATH, 'w', encoding='utf-8') as f:
        json.dump(all_annotations, f, indent=2, ensure_ascii=False)

    total = len(all_annotations)
    print('\n[extract] Done.')
    print(f' Crops saved : {total}')
    print(f' Annotations : {ANN_PATH}')
    print()
    print('Review actual_annotations.json and correct any wrong labels,')
    print('then run finetune.py to train on this data.')
if __name__ == '__main__':
    # Command-line entry point: the only option is the scans folder.
    default_images = os.path.join(ROOT_DIR, 'actual_images')
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--images',
        default=default_images,
        help='Path to actual_images/ folder (default: <project_root>/actual_images)',
    )
    main(cli.parse_args().images)
|