| """ |
| generate_form_samples.py |
| ======================== |
| Generates thousands of synthetic filled civil registry form images |
| using the blank PDF forms + template_matcher.py coordinates. |
| |
| Each form is filled with random Filipino names/dates in handwriting fonts. |
| Crops are saved with labels -- ready for CRNN+CTC fine-tuning. |
| |
| Usage: |
| python generate_form_samples.py |
| |
| Output: |
| data/train/real_forms/ -- cropped field images |
| data/real_annotations.json -- labels for fine-tuning |
| """ |
|
|
| import os |
| import sys |
| import json |
| import random |
| import datetime |
|
|
| from PIL import Image, ImageDraw, ImageFont |
|
|
| |
# Directory that contains this script; every other path is derived from it.
_SCRIPT_PATH = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_SCRIPT_PATH)

# Parent of the script directory. PYTHON_DIR is the same location and is what
# gets pushed onto sys.path before template_matcher is imported.
ROOT_DIR = os.path.dirname(BASE_DIR)
PYTHON_DIR = ROOT_DIR

# Inputs and outputs kept under <BASE_DIR>/data.
_DATA_DIR = os.path.join(BASE_DIR, 'data')
NAMES_FILE = os.path.join(_DATA_DIR, 'ph_names.json')
OUT_IMG_DIR = os.path.join(_DATA_DIR, 'train', 'real_forms')
OUT_ANN = os.path.join(_DATA_DIR, 'real_annotations.json')

# Folder where download_fonts() stores the handwriting fonts.
FONTS_DIR = os.path.join(ROOT_DIR, 'test_images', 'handwriting_fonts')
|
|
| |
# Common prefix of every font URL below (raw files of the google/fonts repo).
_OFL_RAW_BASE = 'https://github.com/google/fonts/raw/main/ofl'

# Maps the local file name a font is saved under to the URL it is fetched
# from. Each pair is (local file name, path below _OFL_RAW_BASE); insertion
# order is kept because FONT_PATHS is built by iterating this dict.
GOOGLE_FONTS = dict(
    (local_name, f'{_OFL_RAW_BASE}/{remote_path}')
    for local_name, remote_path in (
        ('Kalam-Regular.ttf', 'kalam/Kalam-Regular.ttf'),
        ('Kalam-Bold.ttf', 'kalam/Kalam-Bold.ttf'),
        ('Kalam-Light.ttf', 'kalam/Kalam-Light.ttf'),
        ('PatrickHand-Regular.ttf', 'patrickhand/PatrickHand-Regular.ttf'),
        ('IndieFlower-Regular.ttf', 'indieflower/IndieFlower-Regular.ttf'),
        ('Handlee-Regular.ttf', 'handlee/Handlee-Regular.ttf'),
        ('GochiHand-Regular.ttf', 'gochihand/GochiHand-Regular.ttf'),
        ('ArchitectsDaughter.ttf', 'architectsdaughter/ArchitectsDaughter-Regular.ttf'),
        ('ShadowsIntoLight.ttf', 'shadowsintolight/ShadowsIntoLight.ttf'),
        ('ShadowsIntoLightTwo.ttf', 'shadowsintolighttwo/ShadowsIntoLightTwo-Regular.ttf'),
        ('Kristi-Regular.ttf', 'kristi/Kristi-Regular.ttf'),
        ('AmaticSC-Regular.ttf', 'amaticsc/AmaticSC-Regular.ttf'),
        ('AmaticSC-Bold.ttf', 'amaticsc/AmaticSC-Bold.ttf'),
        ('BadScript-Regular.ttf', 'badscript/BadScript-Regular.ttf'),
        ('Sacramento-Regular.ttf', 'sacramento/Sacramento-Regular.ttf'),
        ('GreatVibes-Regular.ttf', 'greatvibes/GreatVibes-Regular.ttf'),
        ('Allura-Regular.ttf', 'allura/Allura-Regular.ttf'),
        ('AlexBrush-Regular.ttf', 'alexbrush/AlexBrush-Regular.ttf'),
        ('Parisienne-Regular.ttf', 'parisienne/Parisienne-Regular.ttf'),
        ('Tangerine-Regular.ttf', 'tangerine/Tangerine-Regular.ttf'),
        ('Tangerine-Bold.ttf', 'tangerine/Tangerine-Bold.ttf'),
        ('Courgette-Regular.ttf', 'courgette/Courgette-Regular.ttf'),
        ('Niconne-Regular.ttf', 'niconne/Niconne-Regular.ttf'),
        ('MarckScript-Regular.ttf', 'marckscript/MarckScript-Regular.ttf'),
        ('Norican-Regular.ttf', 'norican/Norican-Regular.ttf'),
        ('Damion-Regular.ttf', 'damion/Damion-Regular.ttf'),
        ('Satisfy-Regular.ttf', 'satisfy/Satisfy-Regular.ttf'),
        ('Pacifico-Regular.ttf', 'pacifico/Pacifico-Regular.ttf'),
        ('Italianno-Regular.ttf', 'italianno/Italianno-Regular.ttf'),
        ('Pompiere-Regular.ttf', 'pompiere/Pompiere-Regular.ttf'),
    )
)
|
|
# Candidate font files, in the order load_fonts() tries them. Paths that do
# not exist on the current machine are skipped there, so listing
# Windows-only fonts is harmless on other platforms.
FONT_PATHS = (
    # Fonts fetched by download_fonts().
    [os.path.join(FONTS_DIR, font_file) for font_file in GOOGLE_FONTS]
    # Font kept alongside the test images.
    + [os.path.join(ROOT_DIR, 'test_images', 'Caveat-Regular.ttf')]
    # Fonts from the Windows system font folder.
    + [
        r'C:\Windows\Fonts\segoepr.ttf',
        r'C:\Windows\Fonts\segoeprb.ttf',
        r'C:\Windows\Fonts\comic.ttf',
    ]
)
|
|
def download_fonts():
    """Fetch every font in GOOGLE_FONTS into FONTS_DIR unless already cached.

    A file already on disk is reused when it is larger than 10000 bytes.
    A fresh download smaller than 10000 bytes is deleted and reported as
    invalid. Any exception while downloading is reported and the partial
    file removed; the function never raises for a single failed font.
    Prints progress and a final count of usable fonts.
    """
    import urllib.request

    min_bytes = 10000  # size threshold below which a file counts as invalid
    os.makedirs(FONTS_DIR, exist_ok=True)

    ok = 0
    for fname, url in GOOGLE_FONTS.items():
        dest = os.path.join(FONTS_DIR, fname)

        already_cached = os.path.exists(dest) and os.path.getsize(dest) > min_bytes
        if already_cached:
            ok += 1
            continue

        try:
            print(f" Downloading {fname}...")
            with urllib.request.urlopen(url, timeout=10) as response:
                with open(dest, 'wb') as out:
                    out.write(response.read())

            if os.path.getsize(dest) >= min_bytes:
                ok += 1
            else:
                os.remove(dest)
                print(f" Skipped {fname} (invalid file)")
        except Exception as e:
            print(f" Failed {fname}: {e}")
            # Do not leave a truncated file behind for the next run.
            if os.path.exists(dest):
                os.remove(dest)

    print(f" {ok} fonts ready")
|
|
# Blank PDF for each form number; all four files sit in the same folder.
_PDF_DIR = os.path.join(ROOT_DIR, 'python', 'CRNN+CTC')
PDF_FORMS = {
    form_no: os.path.join(_PDF_DIR, pdf_name)
    for form_no, pdf_name in (
        ('97', 'FORM 97 (MARRIAGE CERTIFICATE).pdf'),
        ('102', 'FORM 102 (BIRTH CERTIFICATE).pdf'),
        ('103', 'FORM 103 (DEATH CERTIFICATE).pdf'),
        ('90', 'FORM 90-MARRIAGE-LICENCE-FORM.pdf'),
    )
}
|
|
# Number of synthetic filled forms rendered for each form type.
SAMPLES_PER_FORM = 1000

# NOTE(review): IMG_W / IMG_H are not referenced by the code visible in this
# file (crops are saved at their native size) -- confirm whether another
# module imports them, and whether width=64 / height=512 is intended.
IMG_W, IMG_H = 64, 512
|
|
| |
| sys.path.insert(0, PYTHON_DIR) |
| from template_matcher import TEMPLATES |
|
|
| |
def load_names():
    """Load the name pools from NAMES_FILE.

    Returns:
        The parsed JSON document. The rest of this file uses it as a mapping
        from a pool key ('first', 'middle', 'last') to a list of names.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when NAMES_FILE is missing.
    """
    if not os.path.exists(NAMES_FILE):
        print(f"ERROR: {NAMES_FILE} not found. Run generate_ph_names.py first.")
        sys.exit(1)
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to
    # the locale codec (e.g. cp1252 on Windows), which corrupts or rejects
    # non-ASCII names.
    with open(NAMES_FILE, encoding='utf-8') as f:
        return json.load(f)
|
|
| |
# Value pools the rand_* helpers draw from.

# English month names, January first (indexed as MONTHS[month - 1]).
MONTHS = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

RELIGIONS = [
    'Roman Catholic',
    'Islam',
    'Baptist',
    'Iglesia ni Cristo',
    'Seventh Day Adventist',
    'Born Again Christian',
]

CIVIL_STATUSES = [
    'Single',
    'Married',
    'Widowed',
    'Legally Separated',
]

CITIZENSHIPS = [
    'Filipino',
    'American',
    'Chinese',
    'Japanese',
]

PROVINCES = [
    'Cebu',
    'Davao del Sur',
    'Metro Manila',
    'Iloilo',
    'Pampanga',
    'Batangas',
    'Laguna',
    'Cavite',
    'Bulacan',
    'Quezon City',
]

CITIES = [
    'Cebu City',
    'Davao City',
    'Manila',
    'Iloilo City',
    'San Fernando',
    'Batangas City',
    'Santa Rosa',
    'Bacoor',
    'Malolos',
    'Quezon City',
]
|
|
def rand_name(names, key):
    """Return a random upper-cased name from the ``key`` pool of ``names``.

    Args:
        names: Mapping from pool key to a list of name strings.
        key: Pool to draw from, e.g. 'first', 'middle' or 'last'.

    Returns:
        One name from the pool, upper-cased. Falls back to 'JUAN' when the
        pool is missing, ``None`` or empty.
    """
    # ``or`` also covers a pool that exists but is empty, which would make
    # random.choice raise IndexError.
    pool = names.get(key) or ['Juan']
    return random.choice(pool).upper()
|
|
def rand_date():
    """Draw a random date and return its parts as display strings.

    Returns:
        A ``(day, month_name, year)`` tuple of strings: a two-digit day from
        01 to 28, an entry of MONTHS, and a year from 1950 to 2005.
    """
    year = random.randint(1950, 2005)
    month_no = random.randint(1, 12)
    # 28 is the largest day number that exists in every month.
    day = random.randint(1, 28)

    day_text = f"{day:02d}"
    month_name = MONTHS[month_no - 1]
    return day_text, month_name, str(year)
|
|
def rand_age():
    """Return a random age between 18 and 80 (inclusive) as a string."""
    age = random.randint(18, 80)
    return str(age)
|
|
def rand_province():
    """Return a random entry of PROVINCES, upper-cased."""
    province = random.choice(PROVINCES)
    return province.upper()
|
|
def rand_city():
    """Return a random entry of CITIES, upper-cased."""
    city = random.choice(CITIES)
    return city.upper()
|
|
def rand_religion():
    """Return a random entry of RELIGIONS, upper-cased."""
    religion = random.choice(RELIGIONS)
    return religion.upper()
|
|
def rand_civil_status():
    """Return a random entry of CIVIL_STATUSES, upper-cased."""
    status = random.choice(CIVIL_STATUSES)
    return status.upper()
|
|
def rand_citizenship():
    """Return a random entry of CITIZENSHIPS, upper-cased."""
    citizenship = random.choice(CITIZENSHIPS)
    return citizenship.upper()
|
|
def rand_registry_no():
    """Return a random registry number shaped like ``YYYY-NNNN``.

    The first part is drawn from 2000-2024 and the second from 1000-9999.
    """
    year = random.randint(2000, 2024)
    serial = random.randint(1000, 9999)
    return f"{year}-{serial}"
|
|
def rand_time():
    """Return a random quarter-hour time of day in 12-hour notation.

    The hour is drawn from 06:00 to 18:00 and the minute from 00/15/30/45.

    Returns:
        A string such as ``'09:15 AM'`` or ``'01:30 PM'``.
    """
    h = random.randint(6, 18)
    m = random.choice(['00', '15', '30', '45'])
    meridiem = 'AM' if h < 12 else 'PM'
    # Convert to the 12-hour clock so the AM/PM suffix is meaningful;
    # printing the raw hour produced values like '13:00 PM'.
    h12 = h % 12 or 12
    return f"{h12:02d}:{m} {meridiem}"
|
|
def _rand_date_text():
    """Return a random date as one 'DD Month YYYY' string."""
    d, m, y = rand_date()
    return f"{d} {m} {y}"


def generate_field_value(field_name, names):
    """Generate a plausible random value for a given field name.

    The lower-cased field name is matched against keyword substrings and the
    first matching rule wins, so the order of the checks is significant.

    Args:
        field_name: Template field name; matched case-insensitively.
        names: Name pools, passed through to rand_name().

    Returns:
        The generated text for the field. A random first name is returned
        when no rule matches.
    """
    f = field_name.lower()

    # Places and registry numbers.
    if 'province' in f:
        return rand_province()
    if 'registry' in f:
        return rand_registry_no()
    if 'city' in f or 'municipality' in f:
        return rand_city()

    # Name parts. A father/mother name without a 'first'/'middle'/'last'
    # qualifier gets a full three-part name.
    if 'first' in f and ('name' in f or 'father' in f or 'mother' in f):
        return rand_name(names, 'first')
    if 'middle' in f:
        return rand_name(names, 'middle')
    if 'last' in f:
        return rand_name(names, 'last')
    if '_name' in f and 'father' not in f and 'mother' not in f:
        return rand_name(names, 'first')
    if 'father_name' in f or 'mother_name' in f:
        return f"{rand_name(names,'first')} {rand_name(names,'middle')} {rand_name(names,'last')}"

    # Date components ('dob_day' etc. are covered by the plain substrings),
    # then a whole date of birth.
    if 'day' in f:
        return rand_date()[0]
    if 'month' in f:
        return rand_date()[1]
    if 'year' in f:
        return rand_date()[2]
    if 'dob' in f:
        return _rand_date_text()

    # 'marriage' itself ends in 'age'; ignore that occurrence so the
    # marriage rules further down stay reachable instead of yielding an age.
    if 'age' in f.replace('marriage', ''):
        return rand_age()

    # Personal details.
    if 'birth' in f and 'place' in f:
        return rand_city()
    if 'sex' in f:
        return random.choice(['MALE', 'FEMALE'])
    if 'citizenship' in f:
        return rand_citizenship()
    if 'residence' in f:
        return f"{rand_city()}, {rand_province()}"
    if 'religion' in f:
        return rand_religion()
    if 'civil_status' in f:
        return rand_civil_status()

    # Marriage details.
    if 'place_of_marriage' in f or 'marriage_place' in f:
        return rand_city()
    if 'date_of_marriage' in f or 'marriage_date' in f:
        return _rand_date_text()
    if 'time_of_marriage' in f:
        return rand_time()
    if 'marriage_license' in f:
        return rand_registry_no()
    if 'date_issued' in f:
        return _rand_date_text()

    # Remaining birth / death certificate fields.
    if 'occupation' in f:
        return random.choice(['FARMER', 'TEACHER', 'NURSE', 'ENGINEER', 'DRIVER', 'HOUSEWIFE'])
    if 'type_of_birth' in f:
        return random.choice(['SINGLE', 'TWIN', 'TRIPLET'])
    if 'birth_order' in f:
        return random.choice(['1ST', '2ND', '3RD', '4TH'])
    if 'weight' in f:
        return f"{random.randint(2,5)}.{random.randint(0,9)} KG"
    if 'cause' in f:
        return random.choice(['CARDIAC ARREST', 'PNEUMONIA', 'DIABETES', 'HYPERTENSION'])

    # Unknown field: fall back to a first name.
    return rand_name(names, 'first')
|
|
| |
def load_fonts():
    """Load every available font in FONT_PATHS at several point sizes.

    Paths that do not exist are skipped. Each remaining file is opened at
    14, 16, 18 and 20 pt. A font that cannot be opened is reported and
    skipped rather than aborting the run.

    Returns:
        A non-empty list of PIL font objects. Contains only PIL's built-in
        default font when no file could be loaded.
    """
    fonts = []
    for path in FONT_PATHS:
        if not os.path.exists(path):
            continue
        for size in (14, 16, 18, 20):
            try:
                fonts.append(ImageFont.truetype(path, size))
            except Exception as exc:
                # Still best effort, but no longer a bare ``except``: that
                # also swallowed KeyboardInterrupt/SystemExit and hid the
                # reason a font was dropped.
                print(f" Skipping font {path} ({size}pt): {exc}")
    if not fonts:
        fonts = [ImageFont.load_default()]
    print(f" Loaded {len(fonts)} font variants")
    return fonts
|
|
| |
def load_blank_form(form_type):
    """Return the blank form for ``form_type`` as an RGB image, or None.

    Sources are tried in this order:
      1. The first page of the PDF listed in PDF_FORMS, rendered at 150 dpi
         with pdf2image (skipped with a message if that fails in any way,
         including pdf2image not being installed).
      2. ``references/reference_<form_type>.<ext>`` under ROOT_DIR.
      3. ``references/reference-<form_type>.<ext>`` under ROOT_DIR.
    where ``<ext>`` is png, jpg, then jpeg.

    Args:
        form_type: Form number key, e.g. '97'.

    Returns:
        A PIL image in RGB mode, or None (after printing a warning) when no
        source is available.
    """
    pdf_path = PDF_FORMS.get(form_type)

    if pdf_path and os.path.exists(pdf_path):
        try:
            from pdf2image import convert_from_path
            pages = convert_from_path(pdf_path, dpi=150)
            if pages:
                return pages[0].convert('RGB')
        except Exception as e:
            print(f" pdf2image failed: {e}")

    # Underscore names are tried for all extensions before hyphen names.
    for sep in ('_', '-'):
        for ext in ('png', 'jpg', 'jpeg'):
            ref_path = os.path.join(
                ROOT_DIR, 'references', f'reference{sep}{form_type}.{ext}')
            if os.path.exists(ref_path):
                # convert() returns an independent image, so the file handle
                # can be closed deterministically on leaving the block.
                with Image.open(ref_path) as ref_img:
                    return ref_img.convert('RGB')

    print(f" WARNING: No blank form found for {form_type} -- skipping")
    return None
|
|
| |
def render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts):
    """Draw handwritten-style text in a field box.

    Args:
        draw: Drawing object exposing ``text(xy, text, fill=..., font=...)``.
        x1r, y1r, x2r, y2r: Field box corners as fractions of the image
            width / height.
        text: Text to write into the box.
        img_w, img_h: Pixel size of the image ``draw`` paints on.
        fonts: Non-empty list of font objects exposing ``getbbox(text)``.

    The font is chosen uniformly at random among the fonts whose rendered
    text fits the box (width <= 95 % of the box, height <= 120 %). When no
    font fits, a random font is used and the text may overflow the box.
    """
    x1 = int(x1r * img_w)
    y1 = int(y1r * img_h)
    x2 = int(x2r * img_w)
    y2 = int(y2r * img_h)

    box_w = max(x2 - x1, 1)
    box_h = max(y2 - y1, 1)

    # Walk the fonts in a random order and stop at the first that fits. That
    # is a uniform pick among all fitting fonts; scanning in list order
    # always selected the same leading font and gave no variety.
    candidates = random.sample(fonts, len(fonts))
    for candidate in candidates:
        bbox = candidate.getbbox(text)
        fits_width = (bbox[2] - bbox[0]) <= box_w * 0.95
        fits_height = (bbox[3] - bbox[1]) <= box_h * 1.2
        if fits_width and fits_height:
            font = candidate
            break
    else:
        # Nothing fits: fall back to a random font.
        font = candidates[0]
        bbox = font.getbbox(text)

    # Dark blue-ish ink with a little per-field variation.
    r = random.randint(0, 40)
    g = random.randint(0, 40)
    b = random.randint(60, 120)
    color = (r, g, b)

    # Centre the glyphs vertically. bbox[1] is the gap between the drawing
    # origin and the top of the rendered text, so it is subtracted to centre
    # the visible ink rather than the origin.
    fh = bbox[3] - bbox[1]
    ty = y1 + (box_h - fh) // 2 - bbox[1]

    # Small random left indent (2 px up to about 10 % of the box width).
    tx = x1 + random.randint(2, max(3, box_w // 10))

    draw.text((tx, ty), text, fill=color, font=font)
|
|
| |
def crop_field(img, x1r, y1r, x2r, y2r):
    """Cut a field box, grown by 4 px on every side, out of ``img``.

    Args:
        img: Image exposing ``size`` as ``(width, height)`` and ``crop(box)``.
        x1r, y1r, x2r, y2r: Box corners as fractions of the image size.

    Returns:
        Whatever ``img.crop`` returns for the padded box, clamped to the
        image bounds.
    """
    margin = 4
    width, height = img.size

    left = int(x1r * width) - margin
    top = int(y1r * height) - margin
    right = int(x2r * width) + margin
    bottom = int(y2r * height) + margin

    # Keep the padded box inside the image.
    box = (max(0, left), max(0, top), min(width, right), min(height, bottom))
    return img.crop(box)
|
|
| |
def main():
    """Render synthetic filled forms and export labelled field crops.

    Steps:
      1. Create OUT_IMG_DIR, download fonts, load the name pools and fonts.
      2. For each form type in TEMPLATES with a blank form available,
         render SAMPLES_PER_FORM filled copies. Every field gets a generated
         value drawn into its box; the first copy is also saved whole as a
         preview image.
      3. Save each field of each copy as a grayscale PNG crop and record
         ``{"image_path", "text"}`` for it.
      4. Write all records to OUT_ANN as JSON once every form is done.
    """
    print("=" * 60)
    print(" Form Sample Generator")
    print("=" * 60)

    os.makedirs(OUT_IMG_DIR, exist_ok=True)
    print("\n Downloading handwriting fonts...")
    download_fonts()
    names = load_names()
    fonts = load_fonts()

    annotations = []
    total = 0

    for form_type, template in TEMPLATES.items():
        print(f"\n Generating Form {form_type}...")

        background = load_blank_form(form_type)
        if background is None:
            continue

        for i in range(SAMPLES_PER_FORM):
            # Work on a fresh copy so earlier samples never show through.
            page = background.copy()
            pen = ImageDraw.Draw(page)
            img_w, img_h = page.size

            # Fill every field, remembering the text used as its label.
            field_values = {}
            for field_name, (x1r, y1r, x2r, y2r, _) in template.items():
                text = generate_field_value(field_name, names)
                field_values[field_name] = text
                render_field(pen, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts)

            # Keep one fully rendered page per form type for inspection.
            if i == 0:
                preview_path = os.path.join(OUT_IMG_DIR, f'form{form_type}_preview.png')
                page.save(preview_path)
                print(f" Preview saved: {preview_path}")

            # Export one grayscale crop plus annotation per field.
            for field_name, (x1r, y1r, x2r, y2r, _) in template.items():
                patch = crop_field(page, x1r, y1r, x2r, y2r).convert('L')

                fname = f"form{form_type}_{i:05d}_{field_name}.png"
                patch.save(os.path.join(OUT_IMG_DIR, fname))

                annotations.append({
                    "image_path": f"real_forms/{fname}",
                    "text": field_values[field_name]
                })
                total += 1

            if (i + 1) % 100 == 0:
                print(f" {i+1}/{SAMPLES_PER_FORM} forms done ({total} crops so far)")

        print(f" Form {form_type} done.")

    with open(OUT_ANN, 'w') as f:
        json.dump(annotations, f, indent=2)

    print(f"\n{'='*60}")
    print(f" DONE!")
    print(f" Total crops : {total}")
    print(f" Annotations : {OUT_ANN}")
    print(f" Next step : upload to Kaggle and run fine-tune")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()
|
|