ocr / generate_form_samples.py
hanz245's picture
set up
7111e1a
"""
generate_form_samples.py
========================
Generates thousands of synthetic filled civil registry form images
using the blank PDF forms + template_matcher.py coordinates.
Each form is filled with random Filipino names/dates in handwriting fonts.
Crops are saved with labels → ready for CRNN+CTC fine-tuning.
Usage:
python generate_form_samples.py
Output:
data/train/real_forms/ -- cropped field images
data/real_annotations.json -- labels for fine-tuning
"""
import os
import sys
import json
import random
import datetime
from PIL import Image, ImageDraw, ImageFont
# ── Paths ─────────────────────────────────────────────────────
# Every location is derived from the absolute path of this script, so the
# values do not depend on the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(BASE_DIR)
PYTHON_DIR = ROOT_DIR  # template_matcher.py is here

# Input name pools and generated outputs all live under <BASE_DIR>/data.
_DATA_DIR = os.path.join(BASE_DIR, 'data')
NAMES_FILE = os.path.join(_DATA_DIR, 'ph_names.json')
OUT_IMG_DIR = os.path.join(_DATA_DIR, 'train', 'real_forms')
OUT_ANN = os.path.join(_DATA_DIR, 'real_annotations.json')

# Handwriting fonts are kept one level up, next to the test images.
FONTS_DIR = os.path.join(ROOT_DIR, 'test_images', 'handwriting_fonts')
# Only verified-working Google Fonts URLs
# Maps the local file name (as stored under FONTS_DIR) to its download URL in
# the google/fonts GitHub repository. download_fonts() iterates this mapping,
# and FONT_PATHS builds its candidate paths from the keys.
# NOTE(review): "verified-working" is the original author's statement; nothing
# in this file re-checks the URLs, and download_fonts() simply skips failures.
GOOGLE_FONTS = {
'Kalam-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Regular.ttf',
'Kalam-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Bold.ttf',
'Kalam-Light.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Light.ttf',
'PatrickHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/patrickhand/PatrickHand-Regular.ttf',
'IndieFlower-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/indieflower/IndieFlower-Regular.ttf',
'Handlee-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/handlee/Handlee-Regular.ttf',
'GochiHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/gochihand/GochiHand-Regular.ttf',
'ArchitectsDaughter.ttf': 'https://github.com/google/fonts/raw/main/ofl/architectsdaughter/ArchitectsDaughter-Regular.ttf',
'ShadowsIntoLight.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolight/ShadowsIntoLight.ttf',
'ShadowsIntoLightTwo.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolighttwo/ShadowsIntoLightTwo-Regular.ttf',
'Kristi-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kristi/Kristi-Regular.ttf',
'AmaticSC-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Regular.ttf',
'AmaticSC-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Bold.ttf',
'BadScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/badscript/BadScript-Regular.ttf',
'Sacramento-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/sacramento/Sacramento-Regular.ttf',
'GreatVibes-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/greatvibes/GreatVibes-Regular.ttf',
'Allura-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/allura/Allura-Regular.ttf',
'AlexBrush-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/alexbrush/AlexBrush-Regular.ttf',
'Parisienne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/parisienne/Parisienne-Regular.ttf',
'Tangerine-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Regular.ttf',
'Tangerine-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Bold.ttf',
'Courgette-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/courgette/Courgette-Regular.ttf',
'Niconne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/niconne/Niconne-Regular.ttf',
'MarckScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/marckscript/MarckScript-Regular.ttf',
'Norican-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/norican/Norican-Regular.ttf',
'Damion-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/damion/Damion-Regular.ttf',
'Satisfy-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/satisfy/Satisfy-Regular.ttf',
'Pacifico-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pacifico/Pacifico-Regular.ttf',
'Italianno-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/italianno/Italianno-Regular.ttf',
'Pompiere-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pompiere/Pompiere-Regular.ttf',
}
# Candidate font files in lookup order. load_fonts() checks each path for
# existence, so entries that are absent on this machine are simply ignored.
FONT_PATHS = []
# Downloaded handwriting fonts
FONT_PATHS.extend(os.path.join(FONTS_DIR, font_file) for font_file in GOOGLE_FONTS)
# Already available
FONT_PATHS.append(os.path.join(ROOT_DIR, 'test_images', 'Caveat-Regular.ttf'))
# Windows fallbacks
FONT_PATHS.extend([
    r'C:\Windows\Fonts\segoepr.ttf',
    r'C:\Windows\Fonts\segoeprb.ttf',
    r'C:\Windows\Fonts\comic.ttf',
])
def download_fonts():
    """Download handwriting fonts from Google Fonts if not present.

    Every entry of ``GOOGLE_FONTS`` is fetched into ``FONTS_DIR`` unless a
    plausible copy (at least 10 000 bytes) is already there. The download is
    best-effort: a font that cannot be fetched is reported on stdout and
    skipped, never raised to the caller.

    Each file is first written under a temporary ``.part`` name and renamed to
    its final name only once it is complete and passes the size check. An
    interrupted run therefore cannot leave a truncated file that a later run
    would mistake for a valid font.
    """
    import urllib.request

    # Real TTF files are larger than this; anything smaller is treated as an
    # error page or a truncated response.
    min_font_bytes = 10000

    os.makedirs(FONTS_DIR, exist_ok=True)
    ok = 0
    for fname, url in GOOGLE_FONTS.items():
        dest = os.path.join(FONTS_DIR, fname)
        if os.path.exists(dest) and os.path.getsize(dest) >= min_font_bytes:
            ok += 1
            continue

        tmp = dest + '.part'
        try:
            # An existing but undersized file is a stale leftover; drop it so
            # a failed retry does not leave it behind.
            if os.path.exists(dest):
                os.remove(dest)
            print(f" Downloading {fname}...")
            with urllib.request.urlopen(url, timeout=10) as r, open(tmp, 'wb') as f:
                f.write(r.read())
            # Validate: real TTF files are > 10KB
            if os.path.getsize(tmp) < min_font_bytes:
                print(f" Skipped {fname} (invalid file)")
            else:
                os.replace(tmp, dest)  # atomic publish of the finished file
                ok += 1
        except Exception as e:
            print(f" Failed {fname}: {e}")
        finally:
            # Runs on success (no-op after the rename), on failure, and on
            # Ctrl-C, so no partial download survives.
            if os.path.exists(tmp):
                os.remove(tmp)
    print(f" {ok} fonts ready")
# Blank PDF form for each supported form number; all files share one folder.
_PDF_DIR = os.path.join(ROOT_DIR, 'python', 'CRNN+CTC')
PDF_FORMS = {
    form_no: os.path.join(_PDF_DIR, pdf_name)
    for form_no, pdf_name in (
        ('97', 'FORM 97 (MARRIAGE CERTIFICATE).pdf'),
        ('102', 'FORM 102 (BIRTH CERTIFICATE).pdf'),
        ('103', 'FORM 103 (DEATH CERTIFICATE).pdf'),
        ('90', 'FORM 90-MARRIAGE-LICENCE-FORM.pdf'),
    )
}
SAMPLES_PER_FORM = 1000  # forms to generate per type
# NOTE(review): these two are not referenced in the visible part of this file.
# A 64-wide by 512-tall size looks swapped for text-line crops — confirm
# against the training code before relying on them.
IMG_W, IMG_H = 64, 512
# ── Load TEMPLATES from template_matcher ─────────────────────
sys.path.insert(0, PYTHON_DIR)
from template_matcher import TEMPLATES
# ── Load Filipino names ───────────────────────────────────────
def load_names():
    """Load the name pools used to fill the forms.

    Returns:
        The parsed JSON content of ``NAMES_FILE``. The rest of this file looks
        up the keys ``'first'``, ``'middle'`` and ``'last'`` and expects each
        to hold a list of names.

    Exits the process with status 1, after printing a hint, when
    ``NAMES_FILE`` does not exist.
    """
    if not os.path.exists(NAMES_FILE):
        print(f"ERROR: {NAMES_FILE} not found. Run generate_ph_names.py first.")
        sys.exit(1)
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) would corrupt non-ASCII letters such as "ñ" in names.
    with open(NAMES_FILE, encoding='utf-8') as f:
        return json.load(f)
# ── Random data generators ────────────────────────────────────
# Value pools sampled by the rand_* helpers defined below.
# Month names in calendar order; rand_date() indexes this list with
# (month number - 1) and returns the name unchanged.
MONTHS = ['January','February','March','April','May','June',
'July','August','September','October','November','December']
# The remaining pools are sampled with random.choice() and upper-cased by the
# matching helper (rand_religion, rand_civil_status, rand_citizenship,
# rand_province, rand_city).
RELIGIONS = ['Roman Catholic','Islam','Baptist','Iglesia ni Cristo',
'Seventh Day Adventist','Born Again Christian']
CIVIL_STATUSES = ['Single','Married','Widowed','Legally Separated']
CITIZENSHIPS = ['Filipino','American','Chinese','Japanese']
# NOTE(review): 'Quezon City' appears in both PROVINCES and CITIES.
PROVINCES = ['Cebu','Davao del Sur','Metro Manila','Iloilo','Pampanga',
'Batangas','Laguna','Cavite','Bulacan','Quezon City']
CITIES = ['Cebu City','Davao City','Manila','Iloilo City','San Fernando',
'Batangas City','Santa Rosa','Bacoor','Malolos','Quezon City']
def rand_name(names, key):
    """Return a random upper-cased name from the ``key`` pool of ``names``.

    Args:
        names: Mapping of pool name (e.g. ``'first'``) to a list of names.
        key: Which pool to draw from.

    Returns:
        One name from the pool, upper-cased. Falls back to ``'JUAN'`` when the
        pool is missing *or empty*, so an incomplete names file cannot abort
        generation with an ``IndexError`` from ``random.choice``.
    """
    pool = names.get(key) or ['Juan']
    return random.choice(pool).upper()
def rand_date():
    """Return a random date as a ``(day, month_name, year)`` tuple of strings.

    The year is drawn from 1950-2005. The day is zero-padded to two digits and
    drawn from 1-28, a range that exists in every month. The month is returned
    as its name from ``MONTHS``.
    """
    year = random.randint(1950, 2005)
    month_index = random.randint(1, 12) - 1
    day = random.randint(1, 28)
    return (format(day, '02d'), MONTHS[month_index], str(year))
def rand_age():
    """Return a random age between 18 and 80 (inclusive) as a string."""
    age = random.randint(18, 80)
    return f"{age}"
def _pick_upper(pool):
    """Return one random entry of ``pool``, upper-cased."""
    picked = random.choice(pool)
    return picked.upper()


def rand_province():
    """Return a random entry of ``PROVINCES`` in upper case."""
    return _pick_upper(PROVINCES)


def rand_city():
    """Return a random entry of ``CITIES`` in upper case."""
    return _pick_upper(CITIES)


def rand_religion():
    """Return a random entry of ``RELIGIONS`` in upper case."""
    return _pick_upper(RELIGIONS)


def rand_civil_status():
    """Return a random entry of ``CIVIL_STATUSES`` in upper case."""
    return _pick_upper(CIVIL_STATUSES)


def rand_citizenship():
    """Return a random entry of ``CITIZENSHIPS`` in upper case."""
    return _pick_upper(CITIZENSHIPS)
def rand_registry_no():
    """Return a random registry number shaped like ``'2017-4821'``.

    The part before the dash is drawn from 2000-2024 and the part after it
    from 1000-9999, in that order.
    """
    prefix = random.randint(2000, 2024)
    serial = random.randint(1000, 9999)
    return "{}-{}".format(prefix, serial)
def rand_time():
    """Return a random daytime clock time such as ``'01:30 PM'``.

    The hour is drawn from 06-18 on the 24-hour clock and the minute from the
    quarter hours. The result is rendered on the 12-hour clock, zero-padded,
    with an ``AM``/``PM`` suffix.
    """
    hour24 = random.randint(6, 18)
    minute = random.choice(['00', '15', '30', '45'])
    suffix = 'AM' if hour24 < 12 else 'PM'
    # Convert to the 12-hour clock. Previously afternoon hours were emitted as
    # e.g. "13:00 PM", mixing 24-hour digits with a 12-hour suffix.
    hour12 = hour24 % 12 or 12
    return f"{hour12:02d}:{minute} {suffix}"
def _rand_full_date():
    """Return one random date formatted as ``'DD Month YYYY'``."""
    day, month, year = rand_date()
    return f"{day} {month} {year}"


def generate_field_value(field_name, names):
    """Generate a plausible random value for a given field name.

    The lower-cased field name is tested against the substring rules below
    from top to bottom, and the first matching rule decides the value. The
    order of the rules is therefore significant.

    Args:
        field_name: Template field identifier, e.g. ``'date_of_marriage'``.
        names: Name pools passed on to ``rand_name``.

    Returns:
        The generated text. When no rule matches, a random first name.
    """
    f = field_name.lower()

    # Places and registry numbers.
    if 'province' in f:
        return rand_province()
    if 'registry' in f:
        return rand_registry_no()
    if 'city' in f or 'municipality' in f:
        return rand_city()

    # Person names.
    if 'first' in f and ('name' in f or 'father' in f or 'mother' in f):
        return rand_name(names, 'first')
    if 'middle' in f:
        return rand_name(names, 'middle')
    if 'last' in f:
        return rand_name(names, 'last')
    if '_name' in f and 'father' not in f and 'mother' not in f:
        return rand_name(names, 'first')
    if 'father_name' in f or 'mother_name' in f:
        return f"{rand_name(names,'first')} {rand_name(names,'middle')} {rand_name(names,'last')}"

    # Date parts, then whole dates of birth.
    if 'day' in f:
        return rand_date()[0]
    if 'month' in f:
        return rand_date()[1]
    if 'year' in f:
        return rand_date()[2]
    if 'dob' in f:
        return _rand_full_date()

    # "marriage" itself contains the letters "age". Without masking it out,
    # every marriage field (place/date/time of marriage, marriage licence, ...)
    # was answered with an age and the marriage rules below never ran.
    if 'age' in f.replace('marriage', ''):
        return rand_age()

    if 'birth' in f and 'place' in f:
        return rand_city()
    if 'sex' in f:
        return random.choice(['MALE','FEMALE'])
    if 'citizenship' in f:
        return rand_citizenship()
    if 'residence' in f:
        return f"{rand_city()}, {rand_province()}"
    if 'religion' in f:
        return rand_religion()
    if 'civil_status' in f:
        return rand_civil_status()

    # Marriage details.
    if 'place_of_marriage' in f:
        return rand_city()
    if 'date_of_marriage' in f:
        return _rand_full_date()
    if 'time_of_marriage' in f:
        return rand_time()
    if 'marriage_date' in f:
        return _rand_full_date()
    if 'marriage_place' in f:
        return rand_city()
    if 'marriage_license' in f:
        return rand_registry_no()
    if 'date_issued' in f:
        return _rand_full_date()

    # Miscellaneous birth/death certificate fields.
    if 'occupation' in f:
        return random.choice(['FARMER','TEACHER','NURSE','ENGINEER','DRIVER','HOUSEWIFE'])
    if 'type_of_birth' in f:
        return random.choice(['SINGLE','TWIN','TRIPLET'])
    if 'birth_order' in f:
        return random.choice(['1ST','2ND','3RD','4TH'])
    if 'weight' in f:
        return f"{random.randint(2,5)}.{random.randint(0,9)} KG"
    if 'cause' in f:
        return random.choice(['CARDIAC ARREST','PNEUMONIA','DIABETES','HYPERTENSION'])

    # Anything unrecognised is filled with a first name.
    return rand_name(names, 'first')
# ── Load fonts ────────────────────────────────────────────────
def load_fonts():
    """Load every available font in ``FONT_PATHS`` at several sizes.

    Returns:
        A non-empty list of Pillow font objects: one per (existing font file,
        size) pair that loads, for sizes 14, 16, 18 and 20. When nothing
        loads, the list holds only Pillow's built-in default font.
    """
    fonts = []
    for path in FONT_PATHS:
        if not os.path.exists(path):
            continue
        for size in (14, 16, 18, 20):
            try:
                fonts.append(ImageFont.truetype(path, size))
            except OSError as e:
                # truetype() raises OSError for an unreadable or invalid font
                # file. Report it once and move on to the next file, since the
                # other sizes would fail the same way. Other exceptions
                # (including Ctrl-C) are no longer silently swallowed.
                print(f" Skipping font {path}: {e}")
                break
    if not fonts:
        fonts = [ImageFont.load_default()]
    print(f" Loaded {len(fonts)} font variants")
    return fonts
# ── Load blank form image ─────────────────────────────────────
def load_blank_form(form_type):
    """Return the blank form for ``form_type`` as an RGB image, or ``None``.

    Sources are tried in this order and the first one found wins:

    1. The first page of the PDF listed in ``PDF_FORMS``, rasterised with
       ``pdf2image`` at 150 dpi. Any failure, including ``pdf2image`` not
       being installed, is printed and the fallbacks are tried.
    2. ``references/reference_<form_type>.<ext>`` under ``ROOT_DIR``.
    3. ``references/reference-<form_type>.<ext>`` (hyphen variant).

    ``<ext>`` is tried as ``png``, ``jpg``, ``jpeg``. When nothing is found a
    warning is printed and ``None`` is returned.
    """
    pdf_path = PDF_FORMS.get(form_type)
    # Try pdf2image first
    if pdf_path and os.path.exists(pdf_path):
        try:
            from pdf2image import convert_from_path
            pages = convert_from_path(pdf_path, dpi=150)
            if pages:
                return pages[0].convert('RGB')
        except Exception as e:
            print(f" pdf2image failed: {e}")

    # Fallback: reference scans. All underscore names are tried before any
    # hyphen name (e.g. reference_90.png ... then reference-90.jpg).
    ref_dir = os.path.join(ROOT_DIR, 'references')
    for sep in ('_', '-'):
        for ext in ('png', 'jpg', 'jpeg'):
            ref_path = os.path.join(ref_dir, f'reference{sep}{form_type}.{ext}')
            if os.path.exists(ref_path):
                # convert() returns an independent in-memory image, so the
                # file handle can be closed right away instead of leaking.
                with Image.open(ref_path) as ref_img:
                    return ref_img.convert('RGB')

    print(f" WARNING: No blank form found for {form_type} — skipping")
    return None
# ── Render text on form ───────────────────────────────────────
def render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts):
    """Draw handwritten-style text in a field box.

    Args:
        draw: Drawing object; only ``draw.text(xy, text, fill=, font=)`` is used.
        x1r, y1r, x2r, y2r: Box corners as fractions of the image width/height.
        text: The string to write into the box.
        img_w, img_h: Image size in pixels, used to scale the fractions.
        fonts: Non-empty list of font objects exposing ``getbbox(text)``.

    The font is chosen at random among those whose rendered text fits the box
    (at most 95 % of its width and 120 % of its height). When none fits, the
    font producing the narrowest text is used to keep the overflow minimal.
    The pen colour is a random dark blue/black.
    """
    x1 = int(x1r * img_w)
    y1 = int(y1r * img_h)
    x2 = int(x2r * img_w)
    y2 = int(y2r * img_h)
    box_w = max(x2 - x1, 1)
    box_h = max(y2 - y1, 1)

    def _text_size(candidate):
        left, top, right, bottom = candidate.getbbox(text)
        return right - left, bottom - top

    def _fits(candidate):
        fw, fh = _text_size(candidate)
        return fw <= box_w * 0.95 and fh <= box_h * 1.2

    # Pick a font that fits. The candidates are visited in random order;
    # scanning `fonts` in list order always selected the same first fitting
    # font, so the samples had almost no font variety.
    candidates = random.sample(fonts, len(fonts))
    font = next((c for c in candidates if _fits(c)), None)
    if font is None:
        font = min(candidates, key=lambda c: _text_size(c)[0])

    # Random pen color (dark blue/black like ballpen)
    r = random.randint(0, 40)
    g = random.randint(0, 40)
    b = random.randint(60, 120)
    color = (r, g, b)

    # Center text vertically in box. getbbox() is relative to the drawing
    # origin and the ink starts `top` pixels below it, so that offset must be
    # subtracted or the text sits too low in the box.
    _, top, _, bottom = font.getbbox(text)
    fh = bottom - top
    ty = y1 + (box_h - fh) // 2 - top
    # Slight random x offset
    tx = x1 + random.randint(2, max(3, box_w // 10))
    draw.text((tx, ty), text, fill=color, font=font)
# ── Crop a field ──────────────────────────────────────────────
def crop_field(img, x1r, y1r, x2r, y2r, pad=4):
    """Crop one field out of ``img`` with a small margin around it.

    Args:
        img: Image exposing ``size`` as ``(width, height)`` and ``crop(box)``.
        x1r, y1r, x2r, y2r: Box corners as fractions of the image width/height.
        pad: Margin in pixels added on every side (default 4, the value that
            used to be hard-coded).

    Returns:
        Whatever ``img.crop`` returns for the padded box, with the box clamped
        to the image bounds.
    """
    w, h = img.size
    box = (
        max(0, int(x1r * w) - pad),
        max(0, int(y1r * h) - pad),
        min(w, int(x2r * w) + pad),
        min(h, int(y2r * h) + pad),
    )
    return img.crop(box)
# ── Main ──────────────────────────────────────────────────────
def main():
    """Generate filled forms, save per-field crops and write the annotations.

    For every form type in ``TEMPLATES`` with a blank form available,
    ``SAMPLES_PER_FORM`` copies are filled with random values. Each template
    field is then cropped, converted to grayscale and saved to ``OUT_IMG_DIR``.
    The first filled form of each type is also saved whole as a preview.
    Finally all ``{"image_path", "text"}`` records are written to ``OUT_ANN``.
    """
    banner = "=" * 60
    print(banner)
    print(" Form Sample Generator")
    print(banner)

    os.makedirs(OUT_IMG_DIR, exist_ok=True)
    print("\n Downloading handwriting fonts...")
    download_fonts()
    names = load_names()
    fonts = load_fonts()

    annotations = []
    crop_count = 0
    for form_type, template in TEMPLATES.items():
        print(f"\n Generating Form {form_type}...")
        blank = load_blank_form(form_type)
        if blank is None:
            continue

        for sample_idx in range(SAMPLES_PER_FORM):
            # Fresh copy of blank form
            form_img = blank.copy()
            draw = ImageDraw.Draw(form_img)
            img_w, img_h = form_img.size

            # Write every field first; cropping starts only once the whole
            # form has been filled in.
            field_values = {}
            for field_name, (x1r, y1r, x2r, y2r, _) in template.items():
                value = generate_field_value(field_name, names)
                field_values[field_name] = value
                render_field(draw, x1r, y1r, x2r, y2r, value, img_w, img_h, fonts)

            # Save full form preview (first sample only)
            if sample_idx == 0:
                preview_path = os.path.join(OUT_IMG_DIR, f'form{form_type}_preview.png')
                form_img.save(preview_path)
                print(f" Preview saved: {preview_path}")

            # Crop each field and save
            for field_name, (x1r, y1r, x2r, y2r, _) in template.items():
                field_crop = crop_field(form_img, x1r, y1r, x2r, y2r).convert('L')  # grayscale
                fname = f"form{form_type}_{sample_idx:05d}_{field_name}.png"
                field_crop.save(os.path.join(OUT_IMG_DIR, fname))
                record = {
                    "image_path": f"real_forms/{fname}",
                    "text": field_values[field_name],
                }
                annotations.append(record)
                crop_count += 1

            done = sample_idx + 1
            if done % 100 == 0:
                print(f" {done}/{SAMPLES_PER_FORM} forms done ({crop_count} crops so far)")
        print(f" Form {form_type} done.")

    # Save annotations
    with open(OUT_ANN, 'w') as f:
        json.dump(annotations, f, indent=2)

    print(f"\n{banner}")
    print(" DONE!")
    print(f" Total crops : {crop_count}")
    print(f" Annotations : {OUT_ANN}")
    print(" Next step : upload to Kaggle and run fine-tune")
    print(banner)


if __name__ == '__main__':
    main()