File size: 6,394 Bytes
7111e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
extract_actual_data.py
======================
Extract field crops from actual scanned civil registry forms and
auto-label them with EasyOCR as a starting point for CRNN fine-tuning.

Reads images from:
    actual_images/{form_type}/*.{png,jpg,jpeg}

For each image:
  1. Aligns to reference using ORB + ECC + corner fallback
  2. Crops every field defined in TEMPLATES
  3. Applies CLAHE per-crop before auto-labeling
  4. Saves crop to data/actual_crops/
  5. Auto-labels with EasyOCR + field-type post-processing

Output:
    data/actual_crops/             -- field crop images
    data/actual_annotations.json   -- labels for fine-tuning

After running:
  - Open actual_annotations.json
  - Fix any wrong 'text' values
  - Run finetune.py to train

Usage:
    cd python/CRNN+CTC
    python extract_actual_data.py

    # or point to a different images folder:
    python extract_actual_data.py --images /path/to/actual_images
"""

import os
import sys
import json
import argparse
import numpy as np
from PIL import Image

# ── Paths ─────────────────────────────────────────────────────
THIS_DIR   = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR   = os.path.dirname(os.path.dirname(THIS_DIR))   # project root
PYTHON_DIR = os.path.dirname(THIS_DIR)                    # python/

sys.path.insert(0, PYTHON_DIR)

from template_matcher import (
    TEMPLATES, REFERENCE_IMAGES,
    align_to_reference, _preprocess, _crop_field,
    _get_easyocr, _postprocess,
)

try:
    import cv2 as _cv2
    _CV2_OK = True
except ImportError:
    _CV2_OK = False

# Output locations for extracted crops and their auto-generated labels.
CROPS_DIR  = os.path.join(THIS_DIR, 'data', 'actual_crops')
ANN_PATH   = os.path.join(THIS_DIR, 'data', 'actual_annotations.json')
# Crops smaller than this (in pixels) are too degenerate to OCR and are skipped.
MIN_CROP_W = 10
MIN_CROP_H = 6

# Substrings that indicate a file is a debug/test output, not a real scan
_SKIP_SUBSTRINGS = ('debug', 'aligned', 'crops_aligned')
_SKIP_PREFIXES   = ('test_', 'father_', 'father2_')


def _is_scan(fname: str) -> bool:
    base = fname.lower()
    ext  = os.path.splitext(base)[1]
    if ext not in ('.png', '.jpg', '.jpeg', '.tiff', '.bmp'):
        return False
    if any(s in base for s in _SKIP_SUBSTRINGS):
        return False
    if any(base.startswith(p) for p in _SKIP_PREFIXES):
        return False
    return True


def _ocr_crop(arr: np.ndarray, reader) -> str:
    """Run EasyOCR on a uint8 RGB numpy array."""
    try:
        results = reader.readtext(arr, detail=0, paragraph=True)
        return ' '.join(results).strip()
    except Exception as e:
        return ''


def process_image(img_path: str, form_type: str, reader, crops_dir: str) -> list:
    """
    Align one scan, crop every template field, save crops, auto-label.

    Pipeline:
      1. Open the image and convert to RGB (unopenable files are skipped).
      2. Align it to the form's reference via ``align_to_reference``.
      3. Grayscale + deskew via ``_preprocess``.
      4. For each field in ``TEMPLATES[form_type]``: crop, CLAHE-enhance a
         copy for OCR, auto-label with EasyOCR + ``_postprocess``, and save
         the original (un-enhanced) crop as a PNG into *crops_dir*.

    Args:
        img_path:  Path to the scanned form image.
        form_type: Key into TEMPLATES (and REFERENCE_IMAGES).
        reader:    Initialized EasyOCR reader.
        crops_dir: Existing directory to write crop PNGs into.

    Returns:
        List of annotation dicts (image_path, text, form_type, field,
        source_img); empty if the image cannot be opened.
    """
    template = TEMPLATES[form_type]
    fname    = os.path.basename(img_path)
    stem     = os.path.splitext(fname)[0]

    try:
        img = Image.open(img_path).convert('RGB')
    except Exception as e:
        print(f'    [skip] Cannot open: {e}')
        return []

    w, h = img.size
    print(f'  Processing {fname} ({w}x{h})...')

    # Align (ORB β†’ ECC β†’ corner β†’ resize)
    img, orb_inliers = align_to_reference(img, form_type)
    print(f'    ORB inliers: {orb_inliers}')

    # Grayscale + deskew
    processed = _preprocess(img)

    annotations = []

    for field_name, coords in template.items():
        # coords: (x1r, y1r, x2r, y2r, <extra>) — the 'r' suffix suggests
        # reference-relative coordinates; fifth element unused here.
        # TODO(review): confirm against template_matcher.
        x1r, y1r, x2r, y2r, _ = coords
        crop = _crop_field(processed, x1r, y1r, x2r, y2r)

        # Skip failed or degenerate crops (crop.size is a PIL (w, h) tuple).
        if crop is None or crop.size[0] < MIN_CROP_W or crop.size[1] < MIN_CROP_H:
            continue

        # CLAHE per-crop before OCR (same as extract_fields in template_matcher)
        gray = np.array(crop.convert('L'))
        if _CV2_OK:
            clahe = _cv2.createCLAHE(clipLimit=1.5, tileGridSize=(2, 2))
            gray  = clahe.apply(gray)
        # EasyOCR expects 3 channels; replicate the enhanced gray plane.
        arr = np.stack([gray, gray, gray], axis=-1)

        raw   = _ocr_crop(arr, reader)
        label = _postprocess(raw, field_name)

        # Note: the saved crop is the pre-CLAHE image — enhancement is
        # applied only for the auto-labeling pass.
        crop_fname = f'{form_type}_{stem}_{field_name}.png'
        crop.save(os.path.join(crops_dir, crop_fname))

        annotations.append({
            'image_path': os.path.join('data', 'actual_crops', crop_fname),
            'text':       label,
            'form_type':  form_type,
            'field':      field_name,
            'source_img': fname,
        })

    print(f'    Saved {len(annotations)} crops')
    return annotations


def main(images_root: str):
    """Extract and auto-label field crops for every form type.

    Walks ``images_root/<form_type>/`` for each template, processes every
    scan image found, and writes the combined annotations to ANN_PATH.
    Exits with status 1 if EasyOCR cannot be loaded.
    """
    os.makedirs(CROPS_DIR, exist_ok=True)

    print('[extract] Loading EasyOCR...')
    reader = _get_easyocr()
    if reader is None:
        print('[extract] ERROR: EasyOCR failed to load.')
        sys.exit(1)
    print('[extract] EasyOCR ready.')

    collected = []

    for form_type in sorted(TEMPLATES):
        folder = os.path.join(images_root, form_type)
        if not os.path.isdir(folder):
            print(f'\n[extract] No images in {folder}, skipping.')
            continue

        scans = sorted(filter(_is_scan, os.listdir(folder)))
        if not scans:
            print(f'\n[extract] No scan images in {folder}, skipping.')
            continue

        # Without a reference image, align_to_reference degrades to a resize.
        if not os.path.exists(REFERENCE_IMAGES.get(form_type, '')):
            print(f'\n[extract] WARNING: No reference image for form {form_type} β€” alignment will be resize-only')

        print(f'\n[extract] Form {form_type} β€” {len(scans)} image(s)')

        for scan_name in scans:
            collected += process_image(
                os.path.join(folder, scan_name), form_type, reader, CROPS_DIR)

    with open(ANN_PATH, 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)

    print('\n[extract] Done.')
    print(f'  Crops saved : {len(collected)}')
    print(f'  Annotations : {ANN_PATH}')
    print()
    print('Review actual_annotations.json and correct any wrong labels,')
    print('then run finetune.py to train on this data.')

if __name__ == '__main__':
    # CLI: single optional flag pointing at the folder of scanned images.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--images',
        default=os.path.join(ROOT_DIR, 'actual_images'),
        help='Path to actual_images/ folder (default: <project_root>/actual_images)',
    )
    main(cli.parse_args().images)