Hanz Pillerva committed on
Commit ·
25a1178
1
Parent(s): 94ec9c8
Updated files
Browse files- CRNN+CTC/finetune.py +42 -31
- CRNN+CTC/fix_annotations.py +0 -40
- CRNN+CTC/fix_data.py +0 -770
- CRNN+CTC/generate_form_samples.py +0 -389
- CRNN+CTC/inference.py +1 -1
- debug_and_retrain.py +20 -0
- finetune.py +50 -29
- inference.py +3 -3
- spacyNER/debug_and_retrain.py +0 -316
- spacyNER/models/phase1_funsd/model-last/vocab/strings.json +0 -0
- spacyNER/models/phase1_funsd/model-last/vocab/vectors.cfg +3 -3
- template_matcher.py +1 -1
CRNN+CTC/finetune.py
CHANGED
|
@@ -3,14 +3,14 @@ finetune.py
|
|
| 3 |
===========
|
| 4 |
Fine-tune CRNN+CTC on generated civil registry form crops.
|
| 5 |
|
| 6 |
-
Loads
|
| 7 |
-
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python finetune.py
|
| 11 |
|
| 12 |
Output:
|
| 13 |
-
checkpoints/
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
|
@@ -25,12 +25,12 @@ from crnn_model import get_crnn_model
|
|
| 25 |
from dataset import CivilRegistryDataset, collate_fn
|
| 26 |
|
| 27 |
# ── Config ────────────────────────────────────────────────────
|
| 28 |
-
CHECKPOINT_IN = "checkpoints/
|
| 29 |
-
CHECKPOINT_OUT = "checkpoints/
|
| 30 |
|
| 31 |
-
|
| 32 |
-
SYNTH_ANN = "data/train_annotations.json"
|
| 33 |
-
VAL_ANN = "data/val_annotations.json"
|
| 34 |
|
| 35 |
IMG_HEIGHT = 64
|
| 36 |
IMG_WIDTH = 512
|
|
@@ -53,35 +53,45 @@ def main():
|
|
| 53 |
print(f" Device : {DEVICE}")
|
| 54 |
print(f" Checkpoint : {CHECKPOINT_IN}")
|
| 55 |
|
| 56 |
-
# ββ Check files ββββββββββββββββββββββββββββββββββ
|
| 57 |
-
for f in [CHECKPOINT_IN,
|
| 58 |
if not os.path.exists(f):
|
| 59 |
print(f"ERROR: {f} not found.")
|
| 60 |
sys.exit(1)
|
| 61 |
|
| 62 |
# ββ Datasets ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
)
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
train_dataset = real_dataset
|
| 74 |
if os.path.exists(SYNTH_ANN):
|
| 75 |
synth_dataset = CivilRegistryDataset(
|
| 76 |
data_dir="data/train", annotations_file=SYNTH_ANN,
|
| 77 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 78 |
)
|
| 79 |
-
|
| 80 |
-
print(f"
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
|
|
|
| 85 |
print(f" Total train : {len(train_dataset)}")
|
| 86 |
print(f" Val : {len(val_dataset)}")
|
| 87 |
|
|
@@ -95,10 +105,11 @@ def main():
|
|
| 95 |
ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
|
| 96 |
config = ckpt.get('config', {})
|
| 97 |
|
|
|
|
| 98 |
model = get_crnn_model(
|
| 99 |
model_type = config.get('model_type', 'standard'),
|
| 100 |
img_height = config.get('img_height', 64),
|
| 101 |
-
num_chars =
|
| 102 |
hidden_size = config.get('hidden_size', 128),
|
| 103 |
num_lstm_layers = config.get('num_lstm_layers', 1),
|
| 104 |
).to(DEVICE)
|
|
@@ -123,8 +134,8 @@ def main():
|
|
| 123 |
batch_size = images.size(0)
|
| 124 |
if training:
|
| 125 |
optimizer.zero_grad()
|
| 126 |
-
outputs
|
| 127 |
-
seq_len
|
| 128 |
input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
|
| 129 |
loss = criterion(outputs, targets, input_lengths, target_lengths)
|
| 130 |
if not torch.isnan(loss) and not torch.isinf(loss):
|
|
@@ -167,8 +178,8 @@ def main():
|
|
| 167 |
torch.save({
|
| 168 |
'model_state_dict': model.state_dict(),
|
| 169 |
'config': config,
|
| 170 |
-
'char_to_idx':
|
| 171 |
-
'idx_to_char':
|
| 172 |
'epoch': epoch,
|
| 173 |
'val_loss': vl,
|
| 174 |
}, CHECKPOINT_OUT)
|
|
@@ -188,4 +199,4 @@ def main():
|
|
| 188 |
|
| 189 |
|
| 190 |
if __name__ == '__main__':
|
| 191 |
-
main()
|
|
|
|
| 3 |
===========
|
| 4 |
Fine-tune CRNN+CTC on generated civil registry form crops.
|
| 5 |
|
| 6 |
+
Loads best_model_final.pth (pretrained), continues training on
|
| 7 |
+
actual_annotations.json + train_annotations.json.
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python finetune.py
|
| 11 |
|
| 12 |
Output:
|
| 13 |
+
checkpoints/best_model_v2.pth
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
|
|
|
| 25 |
from dataset import CivilRegistryDataset, collate_fn
|
| 26 |
|
| 27 |
# ── Config ────────────────────────────────────────────────────
|
| 28 |
+
CHECKPOINT_IN = "checkpoints/best_model_final.pth"
|
| 29 |
+
CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"
|
| 30 |
|
| 31 |
+
ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
|
| 32 |
+
SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
|
| 33 |
+
VAL_ANN = "data/val_annotations.json" # validation set
|
| 34 |
|
| 35 |
IMG_HEIGHT = 64
|
| 36 |
IMG_WIDTH = 512
|
|
|
|
| 53 |
print(f" Device : {DEVICE}")
|
| 54 |
print(f" Checkpoint : {CHECKPOINT_IN}")
|
| 55 |
|
| 56 |
+
# ── Check required files ──────────────────────────────────
|
| 57 |
+
for f in [CHECKPOINT_IN, VAL_ANN]:
|
| 58 |
if not os.path.exists(f):
|
| 59 |
print(f"ERROR: {f} not found.")
|
| 60 |
sys.exit(1)
|
| 61 |
|
| 62 |
# ── Datasets ──────────────────────────────────────────────
|
| 63 |
+
datasets_to_merge = []
|
| 64 |
+
|
| 65 |
+
# 1. Actual scanned forms (highest priority — real data)
|
| 66 |
+
if os.path.exists(ACTUAL_ANN):
|
| 67 |
+
actual_dataset = CivilRegistryDataset(
|
| 68 |
+
data_dir=".", annotations_file=ACTUAL_ANN,
|
| 69 |
+
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 70 |
+
)
|
| 71 |
+
datasets_to_merge.append(actual_dataset)
|
| 72 |
+
print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
|
| 73 |
+
else:
|
| 74 |
+
print(f" [!] {ACTUAL_ANN} not found β run extract_actual_data.py first")
|
| 75 |
|
| 76 |
+
# 2. Fully synthetic — keep so model doesn't forget basic characters
|
|
|
|
| 77 |
if os.path.exists(SYNTH_ANN):
|
| 78 |
synth_dataset = CivilRegistryDataset(
|
| 79 |
data_dir="data/train", annotations_file=SYNTH_ANN,
|
| 80 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 81 |
)
|
| 82 |
+
datasets_to_merge.append(synth_dataset)
|
| 83 |
+
print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
|
| 84 |
+
|
| 85 |
+
if not datasets_to_merge:
|
| 86 |
+
print("ERROR: No training data found. Run extract_actual_data.py first.")
|
| 87 |
+
sys.exit(1)
|
| 88 |
+
|
| 89 |
+
val_dataset = CivilRegistryDataset(
|
| 90 |
+
data_dir="data/val", annotations_file=VAL_ANN,
|
| 91 |
+
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
|
| 92 |
+
)
|
| 93 |
|
| 94 |
+
train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
|
| 95 |
print(f" Total train : {len(train_dataset)}")
|
| 96 |
print(f" Val : {len(val_dataset)}")
|
| 97 |
|
|
|
|
| 105 |
ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
|
| 106 |
config = ckpt.get('config', {})
|
| 107 |
|
| 108 |
+
ref_dataset = datasets_to_merge[0]
|
| 109 |
model = get_crnn_model(
|
| 110 |
model_type = config.get('model_type', 'standard'),
|
| 111 |
img_height = config.get('img_height', 64),
|
| 112 |
+
num_chars = ref_dataset.num_chars,
|
| 113 |
hidden_size = config.get('hidden_size', 128),
|
| 114 |
num_lstm_layers = config.get('num_lstm_layers', 1),
|
| 115 |
).to(DEVICE)
|
|
|
|
| 134 |
batch_size = images.size(0)
|
| 135 |
if training:
|
| 136 |
optimizer.zero_grad()
|
| 137 |
+
outputs = F.log_softmax(model(images), dim=2)
|
| 138 |
+
seq_len = outputs.size(0)
|
| 139 |
input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
|
| 140 |
loss = criterion(outputs, targets, input_lengths, target_lengths)
|
| 141 |
if not torch.isnan(loss) and not torch.isinf(loss):
|
|
|
|
| 178 |
torch.save({
|
| 179 |
'model_state_dict': model.state_dict(),
|
| 180 |
'config': config,
|
| 181 |
+
'char_to_idx': ref_dataset.char_to_idx,
|
| 182 |
+
'idx_to_char': ref_dataset.idx_to_char,
|
| 183 |
'epoch': epoch,
|
| 184 |
'val_loss': vl,
|
| 185 |
}, CHECKPOINT_OUT)
|
|
|
|
| 199 |
|
| 200 |
|
| 201 |
if __name__ == '__main__':
|
| 202 |
+
main()
|
CRNN+CTC/fix_annotations.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import json, os
|
| 2 |
-
|
| 3 |
-
# Maps any image path to its correct form subfolder.
|
| 4 |
-
# FIXED: was only handling form1a/form2a — missed form3a and form90.
|
| 5 |
-
def detect_folder(image_path):
|
| 6 |
-
for form in ['form1a', 'form2a', 'form3a', 'form90']:
|
| 7 |
-
if form in image_path:
|
| 8 |
-
return form
|
| 9 |
-
return 'form1a' # safe fallback
|
| 10 |
-
|
| 11 |
-
for split in ['train', 'val']:
|
| 12 |
-
ann_file = f'data/{split}_annotations.json'
|
| 13 |
-
if not os.path.exists(ann_file):
|
| 14 |
-
print(f'SKIP: {ann_file} not found')
|
| 15 |
-
continue
|
| 16 |
-
|
| 17 |
-
with open(ann_file) as f:
|
| 18 |
-
data = json.load(f)
|
| 19 |
-
|
| 20 |
-
fixed = []
|
| 21 |
-
skipped = 0
|
| 22 |
-
for d in data:
|
| 23 |
-
# Support both old key names ('image'/'label') and new ('image_path'/'text')
|
| 24 |
-
image_val = d.get('image') or d.get('image_path', '')
|
| 25 |
-
text_val = d.get('label') or d.get('text', '')
|
| 26 |
-
|
| 27 |
-
if not image_val or not text_val:
|
| 28 |
-
skipped += 1
|
| 29 |
-
continue
|
| 30 |
-
|
| 31 |
-
filename = os.path.basename(image_val)
|
| 32 |
-
folder = detect_folder(image_val)
|
| 33 |
-
fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})
|
| 34 |
-
|
| 35 |
-
with open(ann_file, 'w') as f:
|
| 36 |
-
json.dump(fixed, f, indent=2)
|
| 37 |
-
|
| 38 |
-
print(f'{split}: {len(fixed)} fixed, {skipped} skipped')
|
| 39 |
-
|
| 40 |
-
print('Done!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CRNN+CTC/fix_data.py
DELETED
|
@@ -1,770 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
fix_data.py
|
| 3 |
-
===========
|
| 4 |
-
Generates synthetic training images for the Civil Registry OCR system.
|
| 5 |
-
|
| 6 |
-
Run this ONCE before training to create your dataset.
|
| 7 |
-
|
| 8 |
-
STEP ORDER:
|
| 9 |
-
1. python generate_ph_names.py <- generates data/ph_names.json
|
| 10 |
-
2. python fix_data.py <- generates all training images (THIS FILE)
|
| 11 |
-
3. python train.py <- trains the CRNN model
|
| 12 |
-
|
| 13 |
-
WHAT IT GENERATES:
|
| 14 |
-
- Printed text images of names, dates, places, and other form fields
|
| 15 |
-
- Covers all 4 form types: birth, death, marriage, marriage license
|
| 16 |
-
- Splits into train (90%) and val (10%)
|
| 17 |
-
- Writes data/train_annotations.json and data/val_annotations.json
|
| 18 |
-
|
| 19 |
-
OUTPUT STRUCTURE:
|
| 20 |
-
data/
|
| 21 |
-
train/
|
| 22 |
-
form1a/ <- birth certificate fields
|
| 23 |
-
form2a/ <- death certificate fields
|
| 24 |
-
form3a/ <- marriage certificate fields
|
| 25 |
-
form90/ <- marriage license fields
|
| 26 |
-
val/
|
| 27 |
-
form1a/
|
| 28 |
-
form2a/
|
| 29 |
-
form3a/
|
| 30 |
-
form90/
|
| 31 |
-
train_annotations.json
|
| 32 |
-
val_annotations.json
|
| 33 |
-
"""
|
| 34 |
-
|
| 35 |
-
import os
|
| 36 |
-
import json
|
| 37 |
-
import random
|
| 38 |
-
import numpy as np
|
| 39 |
-
from pathlib import Path
|
| 40 |
-
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
-
# CONFIG
|
| 45 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
-
|
| 47 |
-
IMG_WIDTH = 512
|
| 48 |
-
IMG_HEIGHT = 64
|
| 49 |
-
FONT_SIZE = 22
|
| 50 |
-
VAL_SPLIT = 0.10
|
| 51 |
-
RANDOM_SEED = 42
|
| 52 |
-
|
| 53 |
-
SAMPLES_PER_FORM = {
|
| 54 |
-
'form1a': 6000,
|
| 55 |
-
'form2a': 4000,
|
| 56 |
-
'form3a': 4000,
|
| 57 |
-
'form90': 2000,
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
PH_NAMES_FILE = 'data/ph_names.json'
|
| 61 |
-
|
| 62 |
-
random.seed(RANDOM_SEED)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 66 |
-
# FONT LOADER
|
| 67 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
-
|
| 69 |
-
def load_font(size=FONT_SIZE):
|
| 70 |
-
"""Load a single font β used as fallback. Prefer load_font_pool()."""
|
| 71 |
-
for fp in [
|
| 72 |
-
'arial.ttf', 'Arial.ttf',
|
| 73 |
-
'/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
|
| 74 |
-
'/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
|
| 75 |
-
'/System/Library/Fonts/Helvetica.ttc',
|
| 76 |
-
'C:/Windows/Fonts/arial.ttf',
|
| 77 |
-
'C:/Windows/Fonts/calibri.ttf',
|
| 78 |
-
]:
|
| 79 |
-
try:
|
| 80 |
-
return ImageFont.truetype(fp, size)
|
| 81 |
-
except Exception:
|
| 82 |
-
continue
|
| 83 |
-
print("WARNING: Could not load a TrueType font. Using default bitmap font.")
|
| 84 |
-
print(" Prediction accuracy may be lower.")
|
| 85 |
-
return ImageFont.load_default()
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
def load_font_pool(size=FONT_SIZE):
|
| 89 |
-
"""
|
| 90 |
-
Load a pool of diverse fonts so the model trains on varied typefaces.
|
| 91 |
-
Using only one font causes the model to overfit to that font's style and
|
| 92 |
-
fail on real civil registry documents which use mixed fonts.
|
| 93 |
-
Returns a list of at least 1 font; caller picks randomly per image.
|
| 94 |
-
"""
|
| 95 |
-
candidates = [
|
| 96 |
-
# Sans-serif (most common in PH civil registry printed forms)
|
| 97 |
-
'arial.ttf', 'Arial.ttf',
|
| 98 |
-
'/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
|
| 99 |
-
'/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
|
| 100 |
-
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
| 101 |
-
'C:/Windows/Fonts/arial.ttf',
|
| 102 |
-
'C:/Windows/Fonts/arialbd.ttf',
|
| 103 |
-
'C:/Windows/Fonts/calibri.ttf',
|
| 104 |
-
'C:/Windows/Fonts/calibrib.ttf',
|
| 105 |
-
# Serif (used in older typewriter-style registry entries)
|
| 106 |
-
'/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf',
|
| 107 |
-
'/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf',
|
| 108 |
-
'C:/Windows/Fonts/times.ttf',
|
| 109 |
-
'C:/Windows/Fonts/Georgia.ttf',
|
| 110 |
-
'/System/Library/Fonts/Times.ttc',
|
| 111 |
-
# Mono (typewriter — common in pre-2000 civil registry forms)
|
| 112 |
-
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
|
| 113 |
-
'/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
|
| 114 |
-
'C:/Windows/Fonts/cour.ttf',
|
| 115 |
-
# Condensed / narrow (space-saving fonts used in registry tables)
|
| 116 |
-
'C:/Windows/Fonts/arialn.ttf',
|
| 117 |
-
'/usr/share/fonts/truetype/ubuntu/UbuntuCondensed-Regular.ttf',
|
| 118 |
-
]
|
| 119 |
-
pool = []
|
| 120 |
-
for fp in candidates:
|
| 121 |
-
try:
|
| 122 |
-
pool.append(ImageFont.truetype(fp, size))
|
| 123 |
-
except Exception:
|
| 124 |
-
continue
|
| 125 |
-
if not pool:
|
| 126 |
-
print("WARNING: No TrueType fonts found. Using default bitmap font.")
|
| 127 |
-
pool.append(ImageFont.load_default())
|
| 128 |
-
else:
|
| 129 |
-
print(f" β Font pool loaded: {len(pool)} font(s) available")
|
| 130 |
-
return pool
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
-
# IMAGE RENDERER
|
| 135 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 136 |
-
|
| 137 |
-
def render_text_image(text: str, font, width=IMG_WIDTH, height=IMG_HEIGHT,
|
| 138 |
-
handwriting=False) -> Image.Image:
|
| 139 |
-
"""
|
| 140 |
-
Render text on a white background, centered.
|
| 141 |
-
handwriting=True applies handwriting-style augmentations.
|
| 142 |
-
"""
|
| 143 |
-
img = Image.new('RGB', (width, height), color=(255, 255, 255))
|
| 144 |
-
draw = ImageDraw.Draw(img)
|
| 145 |
-
|
| 146 |
-
bbox = draw.textbbox((0, 0), text, font=font)
|
| 147 |
-
tw = bbox[2] - bbox[0]
|
| 148 |
-
th = bbox[3] - bbox[1]
|
| 149 |
-
x = max(4, (width - tw) // 2)
|
| 150 |
-
y = max(4, (height - th) // 2)
|
| 151 |
-
|
| 152 |
-
if not handwriting:
|
| 153 |
-
# ββ PRINTED mode ββββββββββββββββββββββββββββββββββββββ
|
| 154 |
-
shade = random.randint(0, 40)
|
| 155 |
-
draw.text((x, y), text, fill=(shade, shade, shade), font=font)
|
| 156 |
-
|
| 157 |
-
else:
|
| 158 |
-
# ββ HANDWRITING simulation mode βββββββββββββββββββββββ
|
| 159 |
-
# 1. Pen color — blue-black ballpen
|
| 160 |
-
r = random.randint(0, 60)
|
| 161 |
-
g = random.randint(0, 60)
|
| 162 |
-
b = random.randint(0, 120)
|
| 163 |
-
ink_color = (r, g, b)
|
| 164 |
-
|
| 165 |
-
# 2. Per-character y-wobble (unsteady hand)
|
| 166 |
-
if random.choice([True, False]) and len(text) > 1:
|
| 167 |
-
char_x = x
|
| 168 |
-
for ch in text:
|
| 169 |
-
y_offset = random.randint(-2, 2)
|
| 170 |
-
draw.text((char_x, y + y_offset), ch, fill=ink_color, font=font)
|
| 171 |
-
ch_bbox = draw.textbbox((0, 0), ch, font=font)
|
| 172 |
-
char_x += (ch_bbox[2] - ch_bbox[0]) + random.randint(-1, 1)
|
| 173 |
-
else:
|
| 174 |
-
draw.text((x, y), text, fill=ink_color, font=font)
|
| 175 |
-
|
| 176 |
-
# 3. Pixel-level augmentation
|
| 177 |
-
arr = np.array(img).astype(np.float32)
|
| 178 |
-
|
| 179 |
-
# 4. Ink bleed
|
| 180 |
-
if random.random() < 0.5:
|
| 181 |
-
img_pil = Image.fromarray(arr.astype(np.uint8))
|
| 182 |
-
img_pil = img_pil.filter(
|
| 183 |
-
ImageFilter.GaussianBlur(radius=random.uniform(0.3, 0.7)))
|
| 184 |
-
arr = np.array(img_pil).astype(np.float32)
|
| 185 |
-
|
| 186 |
-
# 5. Paper texture noise
|
| 187 |
-
noise_map = np.random.normal(0, random.uniform(3, 10), arr.shape)
|
| 188 |
-
arr = np.clip(arr + noise_map, 0, 255)
|
| 189 |
-
|
| 190 |
-
# 6. Scan shadow patch
|
| 191 |
-
if random.random() < 0.3:
|
| 192 |
-
patch_x = random.randint(0, width - 20)
|
| 193 |
-
patch_w = random.randint(10, 60)
|
| 194 |
-
arr[:, patch_x:patch_x + patch_w] *= random.uniform(0.88, 0.97)
|
| 195 |
-
arr = np.clip(arr, 0, 255)
|
| 196 |
-
|
| 197 |
-
img = Image.fromarray(arr.astype(np.uint8))
|
| 198 |
-
|
| 199 |
-
# 7. Pen tilt rotation (+-3 degrees)
|
| 200 |
-
if random.random() < 0.6:
|
| 201 |
-
angle = random.uniform(-3, 3)
|
| 202 |
-
img = img.rotate(angle, fillcolor=(255, 255, 255), expand=False)
|
| 203 |
-
|
| 204 |
-
return img
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 208 |
-
# NAME / DATA POOLS
|
| 209 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 210 |
-
|
| 211 |
-
# Populated at runtime from ph_names.json via load_ph_names()
|
| 212 |
-
MIDDLE_NAMES = []
|
| 213 |
-
|
| 214 |
-
SUFFIXES = ['Jr.', 'Sr.', 'II', 'III', '']
|
| 215 |
-
|
| 216 |
-
MONTHS = [
|
| 217 |
-
'January', 'February', 'March', 'April', 'May', 'June',
|
| 218 |
-
'July', 'August', 'September', 'October', 'November', 'December',
|
| 219 |
-
]
|
| 220 |
-
|
| 221 |
-
CITIES = [
|
| 222 |
-
# NCR
|
| 223 |
-
'Manila', 'Quezon City', 'Caloocan', 'Pasig', 'Makati',
|
| 224 |
-
'Taguig', 'Paranaque', 'Pasay', 'Las Pinas', 'Muntinlupa',
|
| 225 |
-
'Marikina', 'Valenzuela', 'Malabon', 'Navotas', 'Mandaluyong',
|
| 226 |
-
'San Juan', 'Pateros',
|
| 227 |
-
# Luzon
|
| 228 |
-
'Tarlac City', 'Angeles City', 'San Fernando', 'Olongapo',
|
| 229 |
-
'Cabanatuan', 'San Jose del Monte', 'Bacoor', 'Imus', 'Dasmarinas',
|
| 230 |
-
'Antipolo', 'Binangonan', 'Taytay', 'Santa Rosa', 'Calamba',
|
| 231 |
-
'San Pablo', 'Lucena', 'Batangas City', 'Lipa', 'Naga City',
|
| 232 |
-
'Legazpi', 'Sorsogon City', 'Tuguegarao', 'Ilagan', 'Santiago City',
|
| 233 |
-
'Cauayan', 'San Fernando (La Union)', 'Vigan', 'Laoag',
|
| 234 |
-
'Dagupan', 'San Carlos', 'Urdaneta', 'Baguio City',
|
| 235 |
-
# Visayas
|
| 236 |
-
'Cebu City', 'Mandaue', 'Lapu-Lapu', 'Talisay', 'Danao',
|
| 237 |
-
'Toledo', 'Carcar', 'Bacolod', 'Bago', 'Sagay', 'Victorias',
|
| 238 |
-
'Iloilo City', 'Passi', 'Roxas City', 'Kalibo',
|
| 239 |
-
'Tacloban', 'Ormoc', 'Palo', 'Catbalogan', 'Calbayog',
|
| 240 |
-
'Tagbilaran', 'Dumaguete', 'Tanjay', 'Bayawan', 'Kabankalan',
|
| 241 |
-
# Mindanao
|
| 242 |
-
'Davao City', 'Tagum', 'Panabo', 'Digos', 'Mati',
|
| 243 |
-
'General Santos', 'Koronadal', 'Kidapawan', 'Cotabato City',
|
| 244 |
-
'Cagayan de Oro', 'Iligan', 'Ozamiz', 'Oroquieta', 'Tangub',
|
| 245 |
-
'Butuan', 'Cabadbaran', 'Surigao City', 'Bislig', 'Bayugan',
|
| 246 |
-
'Zamboanga City', 'Pagadian', 'Dipolog', 'Dapitan',
|
| 247 |
-
'Marawi', 'Malaybalay', 'Valencia',
|
| 248 |
-
]
|
| 249 |
-
|
| 250 |
-
PROVINCES = [
|
| 251 |
-
# Luzon
|
| 252 |
-
'Tarlac', 'Pampanga', 'Bulacan', 'Nueva Ecija', 'Bataan',
|
| 253 |
-
'Zambales', 'Aurora', 'Rizal', 'Cavite', 'Laguna',
|
| 254 |
-
'Batangas', 'Quezon', 'Marinduque', 'Occidental Mindoro',
|
| 255 |
-
'Oriental Mindoro', 'Palawan', 'Romblon',
|
| 256 |
-
'Camarines Norte', 'Camarines Sur', 'Albay', 'Sorsogon',
|
| 257 |
-
'Catanduanes', 'Masbate',
|
| 258 |
-
'Pangasinan', 'La Union', 'Benguet', 'Ifugao', 'Mountain Province',
|
| 259 |
-
'Kalinga', 'Apayao', 'Abra', 'Ilocos Norte', 'Ilocos Sur',
|
| 260 |
-
'Cagayan', 'Isabela', 'Nueva Vizcaya', 'Quirino',
|
| 261 |
-
'Metro Manila',
|
| 262 |
-
# Visayas
|
| 263 |
-
'Cebu', 'Bohol', 'Negros Oriental', 'Siquijor',
|
| 264 |
-
'Negros Occidental', 'Iloilo', 'Capiz', 'Aklan', 'Antique',
|
| 265 |
-
'Guimaras', 'Leyte', 'Southern Leyte', 'Samar', 'Eastern Samar',
|
| 266 |
-
'Northern Samar', 'Biliran',
|
| 267 |
-
# Mindanao
|
| 268 |
-
'Davao del Sur', 'Davao del Norte', 'Davao Oriental',
|
| 269 |
-
'Davao Occidental', 'Davao de Oro',
|
| 270 |
-
'South Cotabato', 'Sarangani', 'Sultan Kudarat', 'North Cotabato',
|
| 271 |
-
'Misamis Oriental', 'Misamis Occidental', 'Camiguin',
|
| 272 |
-
'Bukidnon', 'Lanao del Norte', 'Lanao del Sur',
|
| 273 |
-
'Maguindanao', 'Basilan', 'Sulu', 'Tawi-Tawi',
|
| 274 |
-
'Zamboanga del Sur', 'Zamboanga del Norte', 'Zamboanga Sibugay',
|
| 275 |
-
'Agusan del Norte', 'Agusan del Sur', 'Surigao del Norte',
|
| 276 |
-
'Surigao del Sur', 'Dinagat Islands',
|
| 277 |
-
]
|
| 278 |
-
|
| 279 |
-
BARANGAYS = [
|
| 280 |
-
'Brgy. San Jose', 'Brgy. Sta. Maria', 'Brgy. San Antonio',
|
| 281 |
-
'Brgy. Santo Nino', 'Brgy. Poblacion', 'Brgy. San Isidro',
|
| 282 |
-
'Brgy. San Pedro', 'Brgy. San Miguel', 'Brgy. Mabini',
|
| 283 |
-
'Brgy. Rizal', 'Brgy. Magsaysay', 'Brgy. Quezon',
|
| 284 |
-
'Brgy. Bagong Silang', 'Brgy. Bagumbayan', 'Brgy. Batasan Hills',
|
| 285 |
-
'Brgy. Commonwealth', 'Brgy. Culiat', 'Brgy. Fairview',
|
| 286 |
-
'Brgy. Holy Spirit', 'Brgy. Kamuning', 'Brgy. Laging Handa',
|
| 287 |
-
'Brgy. Malaya', 'Brgy. Masagana', 'Brgy. Pinyahan',
|
| 288 |
-
'Brgy. Roxas', 'Brgy. Sacred Heart', 'Brgy. San Roque',
|
| 289 |
-
'Brgy. Santa Cruz', 'Brgy. Santa Teresita', 'Brgy. Santo Domingo',
|
| 290 |
-
'Brgy. Silangan', 'Brgy. South Triangle', 'Brgy. Tagumpay',
|
| 291 |
-
'Brgy. Tandang Sora', 'Brgy. Vasra', 'Brgy. White Plains',
|
| 292 |
-
]
|
| 293 |
-
|
| 294 |
-
STREETS = [
|
| 295 |
-
'Mabini St.', 'Rizal Ave.', 'MacArthur Hwy.', 'Quezon Blvd.',
|
| 296 |
-
'Gen. Luna St.', 'Bonifacio St.', 'Aguinaldo St.', 'Burgos St.',
|
| 297 |
-
'Del Pilar St.', 'Gomez St.', 'Jacinto St.', 'Lapu-Lapu St.',
|
| 298 |
-
'Lopez Jaena St.', 'Luna St.', 'Osmena Blvd.', 'Padre Faura St.',
|
| 299 |
-
'Palma St.', 'Plaridel St.', 'Recto Ave.', 'Roxas Blvd.',
|
| 300 |
-
'San Andres St.', 'Shaw Blvd.', 'Taft Ave.', 'Tandang Sora Ave.',
|
| 301 |
-
'Timog Ave.', 'Tuazon Blvd.', 'Visayas Ave.', 'Aurora Blvd.',
|
| 302 |
-
'EDSA', 'Espana Blvd.', 'Katipunan Ave.', 'Marcos Hwy.',
|
| 303 |
-
'Ortigas Ave.', 'Quirino Ave.',
|
| 304 |
-
]
|
| 305 |
-
|
| 306 |
-
RELIGIONS = [
|
| 307 |
-
'Roman Catholic', 'Catholic', 'Islam', 'Muslim',
|
| 308 |
-
'Iglesia ni Cristo', 'INC', 'Baptist', 'Methodist',
|
| 309 |
-
'Seventh Day Adventist', 'Born Again Christian', 'Aglipayan',
|
| 310 |
-
]
|
| 311 |
-
|
| 312 |
-
OCCUPATIONS = [
|
| 313 |
-
'Farmer', 'Teacher', 'Engineer', 'Nurse', 'Doctor',
|
| 314 |
-
'Laborer', 'Housewife', 'Driver', 'Carpenter', 'Vendor',
|
| 315 |
-
'Student', 'OFW', 'Fisherman', 'Mechanic', 'Electrician',
|
| 316 |
-
'Police Officer', 'Military', 'Government Employee',
|
| 317 |
-
'Business Owner', 'Retired',
|
| 318 |
-
]
|
| 319 |
-
|
| 320 |
-
CIVIL_STATUSES = ['Single', 'Married', 'Widowed', 'Legally Separated']
|
| 321 |
-
|
| 322 |
-
CITIZENSHIPS = ['Filipino', 'Filipino', 'Filipino', 'American',
|
| 323 |
-
'Chinese', 'Japanese', 'Korean']
|
| 324 |
-
|
| 325 |
-
DEATH_CAUSES = [
|
| 326 |
-
'Cardio-Respiratory Arrest', 'Hypertensive Cardiovascular Disease',
|
| 327 |
-
'Acute Myocardial Infarction', 'Cerebrovascular Accident',
|
| 328 |
-
'Pneumonia', 'Septicemia', 'Renal Failure', 'Diabetes Mellitus',
|
| 329 |
-
'Pulmonary Tuberculosis', 'Cancer of the Lung',
|
| 330 |
-
'Chronic Obstructive Pulmonary Disease', 'Liver Cirrhosis',
|
| 331 |
-
'Dengue Hemorrhagic Fever', 'Acute Gastroenteritis',
|
| 332 |
-
'Congestive Heart Failure',
|
| 333 |
-
]
|
| 334 |
-
|
| 335 |
-
ATTENDANT_TYPES = [
|
| 336 |
-
'Private Physician', 'Public Health Officer',
|
| 337 |
-
'Hospital Authority', 'Hilot', 'None',
|
| 338 |
-
]
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 342 |
-
# NAME LOADER
|
| 343 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 344 |
-
|
| 345 |
-
def load_ph_names():
|
| 346 |
-
"""
|
| 347 |
-
Load Filipino names from ph_names.json.
|
| 348 |
-
Returns (first_names, last_names, middle_names).
|
| 349 |
-
Falls back to built-in lists if JSON not found.
|
| 350 |
-
"""
|
| 351 |
-
if os.path.exists(PH_NAMES_FILE):
|
| 352 |
-
with open(PH_NAMES_FILE, 'r', encoding='utf-8') as f:
|
| 353 |
-
data = json.load(f)
|
| 354 |
-
first_names = data['first_names']['all']
|
| 355 |
-
last_names = data['last_names']
|
| 356 |
-
# Load middle_names from JSON (added by updated generate_ph_names.py)
|
| 357 |
-
# Falls back to last_names if key missing (older ph_names.json)
|
| 358 |
-
middle_names = data.get('middle_names', last_names)
|
| 359 |
-
print(f" Loaded ph_names.json: "
|
| 360 |
-
f"{len(first_names)} first, "
|
| 361 |
-
f"{len(last_names)} last, "
|
| 362 |
-
f"{len(middle_names)} middle names")
|
| 363 |
-
else:
|
| 364 |
-
print(f" WARNING: {PH_NAMES_FILE} not found.")
|
| 365 |
-
print(f" Using built-in fallback names.")
|
| 366 |
-
print(f" For better results run: python generate_ph_names.py first.")
|
| 367 |
-
first_names = [
|
| 368 |
-
'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos',
|
| 369 |
-
'Elena', 'Ramon', 'Lucia', 'Eduardo', 'Carmen', 'Antonio',
|
| 370 |
-
'Isabel', 'Francisco', 'Gloria', 'Roberto', 'Corazon',
|
| 371 |
-
'Ricardo', 'Remedios', 'Manuel', 'Teresita', 'Andres',
|
| 372 |
-
'Lourdes', 'Fernando', 'Maricel', 'Rolando', 'Rowena',
|
| 373 |
-
'Danilo', 'Cristina', 'Ernesto', 'Marilou', 'Renato',
|
| 374 |
-
'Felicidad', 'Alfredo', 'Natividad', 'Domingo', 'Milagros',
|
| 375 |
-
]
|
| 376 |
-
last_names = [
|
| 377 |
-
'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo', 'Garcia',
|
| 378 |
-
'Mendoza', 'Torres', 'Flores', 'Aquino', 'Dela Cruz',
|
| 379 |
-
'Del Rosario', 'San Jose', 'De Guzman', 'Villanueva',
|
| 380 |
-
'Gonzales', 'Ramos', 'Diaz', 'Castro', 'Morales',
|
| 381 |
-
'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong',
|
| 382 |
-
'Macaraeg', 'Pascual', 'Buenaventura', 'Concepcion',
|
| 383 |
-
'Manalo', 'Soriano', 'Evangelista', 'Salazar', 'Tolentino',
|
| 384 |
-
]
|
| 385 |
-
middle_names = last_names
|
| 386 |
-
return first_names, last_names, middle_names
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 390 |
-
# TEXT GENERATORS
|
| 391 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 392 |
-
|
| 393 |
-
def gen_full_name(first_names, last_names, with_suffix=True):
|
| 394 |
-
first = random.choice(first_names)
|
| 395 |
-
middle = random.choice(MIDDLE_NAMES) if MIDDLE_NAMES else random.choice(last_names)
|
| 396 |
-
last = random.choice(last_names)
|
| 397 |
-
suffix = random.choice(SUFFIXES) if with_suffix else ''
|
| 398 |
-
name = f"{first} {middle} {last}"
|
| 399 |
-
if suffix:
|
| 400 |
-
name += f" {suffix}"
|
| 401 |
-
return name
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
def gen_first_name(first_names):
|
| 405 |
-
return random.choice(first_names)
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
def gen_last_name(last_names):
|
| 409 |
-
return random.choice(last_names)
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
def gen_middle_name(last_names):
|
| 413 |
-
# Always draw from MIDDLE_NAMES (700+ entries from ph_names.json)
|
| 414 |
-
pool = MIDDLE_NAMES if MIDDLE_NAMES else last_names
|
| 415 |
-
return random.choice(pool)
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
def gen_date_slash():
|
| 419 |
-
month = random.randint(1, 12)
|
| 420 |
-
day = random.randint(1, 28)
|
| 421 |
-
year = random.randint(1930, 2024)
|
| 422 |
-
return f"{month:02d}/{day:02d}/{year}"
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
def gen_date_long():
|
| 426 |
-
month = random.choice(MONTHS)
|
| 427 |
-
day = random.randint(1, 28)
|
| 428 |
-
year = random.randint(1930, 2024)
|
| 429 |
-
return f"{month} {day}, {year}"
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
def gen_date_day():
|
| 433 |
-
return str(random.randint(1, 28))
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
def gen_date_month():
|
| 437 |
-
return random.choice(MONTHS)
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
def gen_date_year():
|
| 441 |
-
return str(random.randint(1930, 2024))
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
def gen_age():
|
| 445 |
-
return str(random.randint(1, 95))
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
def gen_place_full():
|
| 449 |
-
return (f"{random.choice(BARANGAYS)}, "
|
| 450 |
-
f"{random.choice(CITIES)}, "
|
| 451 |
-
f"{random.choice(PROVINCES)}")
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
def gen_place_city():
|
| 455 |
-
return random.choice(CITIES)
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
def gen_place_province():
|
| 459 |
-
return random.choice(PROVINCES)
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
def gen_address():
|
| 463 |
-
num = random.randint(1, 999)
|
| 464 |
-
st = random.choice(STREETS)
|
| 465 |
-
return f"{num} {st}, {random.choice(CITIES)}"
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
def gen_registry_no():
|
| 469 |
-
year = random.randint(2000, 2024)
|
| 470 |
-
seq = random.randint(1, 9999)
|
| 471 |
-
return f"{year}-{seq:04d}"
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
def gen_sex():
|
| 475 |
-
return random.choice(['Male', 'Female'])
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
def gen_religion():
|
| 479 |
-
return random.choice(RELIGIONS)
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
def gen_occupation():
|
| 483 |
-
return random.choice(OCCUPATIONS)
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
def gen_civil_status():
|
| 487 |
-
return random.choice(CIVIL_STATUSES)
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
def gen_citizenship():
|
| 491 |
-
return random.choice(CITIZENSHIPS)
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
def gen_weight():
|
| 495 |
-
return f"{random.randint(1500, 4500)} grams"
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
def gen_death_cause():
|
| 499 |
-
return random.choice(DEATH_CAUSES)
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
def gen_attendant():
|
| 503 |
-
return random.choice(ATTENDANT_TYPES)
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 507 |
-
# FORM FIELD DEFINITIONS
|
| 508 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 509 |
-
|
| 510 |
-
def get_form_fields(form_type, first_names, last_names):
    """Return the ordered ``(field_name, generator)`` pairs for a form type.

    Args:
        form_type: One of ``'form1a'`` (birth), ``'form2a'`` (death),
            ``'form3a'`` (marriage certificate) or ``'form90'`` (marriage
            licence application).
        first_names: Pool passed to ``gen_first_name`` / ``gen_full_name``.
        last_names: Pool passed to ``gen_last_name``, ``gen_middle_name`` and
            ``gen_full_name``.

    Returns:
        A list of ``(field_name, callable)`` tuples; each callable takes no
        arguments and produces one value for that field. An unknown
        ``form_type`` yields an empty list.
    """
    # Generators bound to the supplied name pools. Defined once instead of
    # repeating an identical lambda for every person on every form.
    def first():
        return gen_first_name(first_names)

    def middle():
        # The middle-name generator is called with the last-name pool.
        return gen_middle_name(last_names)

    def last():
        return gen_last_name(last_names)

    def informant():
        return gen_full_name(first_names, last_names, False)

    if form_type == 'form1a':  # Birth Certificate
        return [
            ('province', gen_place_province),
            ('registry_no', gen_registry_no),
            ('city_municipality', gen_place_city),
            ('child_first_name', first),
            ('child_middle_name', middle),
            ('child_last_name', last),
            ('sex', gen_sex),
            ('dob_day', gen_date_day),
            ('dob_month', gen_date_month),
            ('dob_year', gen_date_year),
            ('place_birth_hospital', lambda: f"Ospital ng {gen_place_city()}"),
            ('place_birth_city', gen_place_city),
            ('place_birth_province', gen_place_province),
            ('weight_at_birth', gen_weight),
            ('type_of_birth', lambda: random.choice(['Single', 'Twin', 'Triplet'])),
            ('mother_first_name', first),
            ('mother_middle_name', middle),
            ('mother_last_name', last),
            ('mother_citizenship', gen_citizenship),
            ('mother_religion', gen_religion),
            ('mother_occupation', gen_occupation),
            ('mother_age_at_birth', lambda: str(random.randint(16, 45))),
            ('mother_residence_house', gen_address),
            ('mother_residence_city', gen_place_city),
            ('mother_residence_province', gen_place_province),
            ('father_first_name', first),
            ('father_middle_name', middle),
            ('father_last_name', last),
            ('father_citizenship', gen_citizenship),
            ('father_religion', gen_religion),
            ('father_occupation', gen_occupation),
            ('father_age_at_birth', lambda: str(random.randint(18, 55))),
            ('parents_marriage_month', gen_date_month),
            ('parents_marriage_day', gen_date_day),
            ('parents_marriage_year', gen_date_year),
            ('parents_marriage_city', gen_place_city),
            ('informant_name', informant),
            ('informant_address', gen_address),
            ('informant_date', gen_date_slash),
        ]

    if form_type == 'form2a':  # Death Certificate
        return [
            ('province', gen_place_province),
            ('registry_no', gen_registry_no),
            ('city_municipality', gen_place_city),
            ('deceased_first_name', first),
            ('deceased_middle_name', middle),
            ('deceased_last_name', last),
            ('sex', gen_sex),
            ('religion', gen_religion),
            ('age_years', gen_age),
            ('place_death_full', lambda: f"{gen_place_city()}, {gen_place_province()}"),
            ('dod_day', gen_date_day),
            ('dod_month', gen_date_month),
            ('dod_year', gen_date_year),
            ('citizenship', gen_citizenship),
            ('residence_full', gen_address),
            ('civil_status', gen_civil_status),
            ('occupation', gen_occupation),
            ('cause_immediate', gen_death_cause),
            ('cause_antecedent', gen_death_cause),
            ('cause_underlying', gen_death_cause),
            ('cause_other', gen_death_cause),
            ('informant_name', informant),
            ('informant_address', gen_address),
            ('informant_date', gen_date_slash),
        ]

    if form_type == 'form3a':  # Marriage Certificate
        return [
            ('province', gen_place_province),
            ('city_municipality', gen_place_city),
            ('registry_no', gen_registry_no),
            ('husband_first_name', first),
            ('husband_middle_name', middle),
            ('husband_last_name', last),
            ('wife_first_name', first),
            ('wife_middle_name', middle),
            ('wife_last_name', last),
            ('husband_dob_day', gen_date_day),
            ('husband_dob_month', gen_date_month),
            ('husband_dob_year', gen_date_year),
            ('husband_age', gen_age),
            ('wife_dob_day', gen_date_day),
            ('wife_dob_month', gen_date_month),
            ('wife_dob_year', gen_date_year),
            ('wife_age', gen_age),
            ('husband_place_birth_city', gen_place_city),
            ('husband_place_birth_province', gen_place_province),
            ('wife_place_birth_city', gen_place_city),
            ('wife_place_birth_province', gen_place_province),
            ('husband_citizenship', gen_citizenship),
            ('wife_citizenship', gen_citizenship),
            ('husband_religion', gen_religion),
            ('wife_religion', gen_religion),
            ('husband_civil_status', gen_civil_status),
            ('wife_civil_status', gen_civil_status),
            ('husband_father_first', first),
            ('husband_father_last', last),
            ('wife_father_first', first),
            ('wife_father_last', last),
            ('husband_mother_first', first),
            ('husband_mother_last', last),
            ('wife_mother_first', first),
            ('wife_mother_last', last),
            ('place_marriage_city', gen_place_city),
            ('place_marriage_province', gen_place_province),
            ('date_marriage_day', gen_date_day),
            ('date_marriage_month', gen_date_month),
            ('date_marriage_year', gen_date_year),
        ]

    if form_type == 'form90':  # Marriage License Application
        return [
            ('province', gen_place_province),
            ('city_municipality', gen_place_city),
            ('registry_no', gen_registry_no),
            ('husband_first_name', first),
            ('husband_middle_name', middle),
            ('husband_last_name', last),
            ('wife_first_name', first),
            ('wife_middle_name', middle),
            ('wife_last_name', last),
            ('husband_age', gen_age),
            ('wife_age', gen_age),
            ('husband_citizenship', gen_citizenship),
            ('wife_citizenship', gen_citizenship),
            ('husband_residence', gen_address),
            ('wife_residence', gen_address),
            ('application_date', gen_date_slash),
        ]

    # Unknown form type: no fields.
    return []
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 653 |
-
# MAIN GENERATOR
|
| 654 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 655 |
-
|
| 656 |
-
def generate_dataset():
    """Render the synthetic train/val crops and write their annotation files.

    Steps: load the name pools, refill ``MIDDLE_NAMES`` in place, create the
    ``data/<split>/<form>`` directories, load the font pool, then for every
    entry of ``SAMPLES_PER_FORM`` render ``max(1, n // len(fields))`` images
    per field. A random ``VAL_SPLIT`` fraction (at least one image) of each
    form goes to ``data/val``; the rest goes to ``data/train``. Annotations
    are saved to ``data/train_annotations.json`` and
    ``data/val_annotations.json``.

    Form types for which ``get_form_fields`` returns no fields are skipped
    with a message instead of failing with ``ZeroDivisionError``.
    """
    print("=" * 65)
    print(" fix_data.py β Synthetic Training Data Generator")
    print("=" * 65)

    # Load Filipino names
    print("\n[1/4] Loading Filipino names...")
    first_names, last_names, middle_names = load_ph_names()

    # Refill the shared MIDDLE_NAMES list in place so every generator that
    # reads it sees the full pool (in-place mutation needs no `global`).
    MIDDLE_NAMES.clear()
    MIDDLE_NAMES.extend(middle_names)
    print(f" Middle names pool active: {len(MIDDLE_NAMES)} entries")

    # Create output directories: the four standard forms plus any extra form
    # configured in SAMPLES_PER_FORM, so img.save() never hits a missing dir.
    print("\n[2/4] Creating output directories...")
    form_dirs = ['form1a', 'form2a', 'form3a', 'form90']
    form_dirs += [form for form in SAMPLES_PER_FORM if form not in form_dirs]
    for split in ['train', 'val']:
        for form in form_dirs:
            Path(f'data/{split}/{form}').mkdir(parents=True, exist_ok=True)
    print(" β Directories ready")

    # Load font pool - multiple typefaces so the model generalises across fonts
    print("\n[3/4] Loading fonts...")
    font_pool = load_font_pool(FONT_SIZE)
    print(f" β {len(font_pool)} font(s) loaded")

    # Generate images
    print("\n[4/4] Generating images...")
    print(f" {'Form':<10} {'Total':>7} {'Train':>7} {'Val':>7}")
    print(f" {'-'*35}")

    train_annotations = []
    val_annotations = []
    total_generated = 0

    for form_type, n_samples in SAMPLES_PER_FORM.items():
        fields = get_form_fields(form_type, first_names, last_names)
        if not fields:
            # get_form_fields() returns [] for unknown form types; dividing
            # by len(fields) below would raise ZeroDivisionError.
            print(f" {form_type:<10} skipped (no field definitions)")
            continue

        samples_per_field = max(1, n_samples // len(fields))
        total_this_form = samples_per_field * len(fields)

        # Choose the validation slots up front so the split is spread
        # randomly over all fields rather than biased to the last ones.
        val_slots = set(random.sample(
            range(total_this_form),
            max(1, int(total_this_form * VAL_SPLIT)),
        ))

        form_train = 0
        form_val = 0
        img_idx = 0
        for field_name, generator in fields:
            for _ in range(samples_per_field):
                text = generator()
                if not text or not text.strip():
                    # Still consume the slot so numbering matches val_slots.
                    img_idx += 1
                    continue

                # 70% handwriting / 30% printed
                use_handwriting = random.random() < 0.70
                # A random font per image forces the model to generalise
                # across typefaces instead of memorising one.
                font = random.choice(font_pool)
                img = render_text_image(text, font, handwriting=use_handwriting)
                fname = f"{field_name}_{img_idx:06d}.jpg"

                is_val = img_idx in val_slots
                split = 'val' if is_val else 'train'
                # Save first so an annotation is only recorded for an image
                # that actually exists on disk.
                img.save(f"data/{split}/{form_type}/{fname}", quality=95)

                record = {
                    'image_path': f"{form_type}/{fname}",
                    'text': text,
                }
                if is_val:
                    val_annotations.append(record)
                    form_val += 1
                else:
                    train_annotations.append(record)
                    form_train += 1

                img_idx += 1

        total_generated += form_train + form_val
        print(f" {form_type:<10} {form_train + form_val:>7,} "
              f"{form_train:>7,} {form_val:>7,}")

    # Save annotation files
    with open('data/train_annotations.json', 'w', encoding='utf-8') as f:
        json.dump(train_annotations, f, indent=2, ensure_ascii=False)

    with open('data/val_annotations.json', 'w', encoding='utf-8') as f:
        json.dump(val_annotations, f, indent=2, ensure_ascii=False)

    # Summary
    print(f"\n{'=' * 65}")
    print(" DONE!")
    print(f"{'=' * 65}")
    print(f" Total images generated : {total_generated:,}")
    print(f" Train images : {len(train_annotations):,}")
    print(f" Val images : {len(val_annotations):,}")
    print("\n Saved:")
    print(f" data/train_annotations.json ({len(train_annotations)} entries)")
    print(f" data/val_annotations.json ({len(val_annotations)} entries)")
    print("\n Next step: python train.py")
    print(f"{'=' * 65}")
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
if __name__ == "__main__":
    # Script entry point: build the synthetic dataset when run directly.
    generate_dataset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CRNN+CTC/generate_form_samples.py
DELETED
|
@@ -1,389 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
generate_form_samples.py
|
| 3 |
-
========================
|
| 4 |
-
Generates thousands of synthetic filled civil registry form images
|
| 5 |
-
using the blank PDF forms + template_matcher.py coordinates.
|
| 6 |
-
|
| 7 |
-
Each form is filled with random Filipino names/dates in handwriting fonts.
|
| 8 |
-
Crops are saved with labels — ready for CRNN+CTC fine-tuning.
|
| 9 |
-
|
| 10 |
-
Usage:
|
| 11 |
-
python generate_form_samples.py
|
| 12 |
-
|
| 13 |
-
Output:
|
| 14 |
-
data/train/real_forms/ -- cropped field images
|
| 15 |
-
data/real_annotations.json -- labels for fine-tuning
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
import sys
|
| 20 |
-
import json
|
| 21 |
-
import random
|
| 22 |
-
import datetime
|
| 23 |
-
|
| 24 |
-
from PIL import Image, ImageDraw, ImageFont
|
| 25 |
-
|
| 26 |
-
# ── Paths ─────────────────────────────────────────────────────
|
| 27 |
-
# Directory of this script, and its parent (which holds the shared modules).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(BASE_DIR)
PYTHON_DIR = ROOT_DIR  # template_matcher.py is here

# Inputs and outputs under <BASE_DIR>/data.
_DATA_DIR = os.path.join(BASE_DIR, 'data')
NAMES_FILE = os.path.join(_DATA_DIR, 'ph_names.json')
OUT_IMG_DIR = os.path.join(_DATA_DIR, 'train', 'real_forms')
OUT_ANN = os.path.join(_DATA_DIR, 'real_annotations.json')

FONTS_DIR = os.path.join(ROOT_DIR, 'test_images', 'handwriting_fonts')

# Only verified-working Google Fonts URLs.
# Maps the local file name to its download URL; every URL shares _OFL.
_OFL = 'https://github.com/google/fonts/raw/main/ofl'
GOOGLE_FONTS = {
    'Kalam-Regular.ttf': f'{_OFL}/kalam/Kalam-Regular.ttf',
    'Kalam-Bold.ttf': f'{_OFL}/kalam/Kalam-Bold.ttf',
    'Kalam-Light.ttf': f'{_OFL}/kalam/Kalam-Light.ttf',
    'PatrickHand-Regular.ttf': f'{_OFL}/patrickhand/PatrickHand-Regular.ttf',
    'IndieFlower-Regular.ttf': f'{_OFL}/indieflower/IndieFlower-Regular.ttf',
    'Handlee-Regular.ttf': f'{_OFL}/handlee/Handlee-Regular.ttf',
    'GochiHand-Regular.ttf': f'{_OFL}/gochihand/GochiHand-Regular.ttf',
    'ArchitectsDaughter.ttf': f'{_OFL}/architectsdaughter/ArchitectsDaughter-Regular.ttf',
    'ShadowsIntoLight.ttf': f'{_OFL}/shadowsintolight/ShadowsIntoLight.ttf',
    'ShadowsIntoLightTwo.ttf': f'{_OFL}/shadowsintolighttwo/ShadowsIntoLightTwo-Regular.ttf',
    'Kristi-Regular.ttf': f'{_OFL}/kristi/Kristi-Regular.ttf',
    'AmaticSC-Regular.ttf': f'{_OFL}/amaticsc/AmaticSC-Regular.ttf',
    'AmaticSC-Bold.ttf': f'{_OFL}/amaticsc/AmaticSC-Bold.ttf',
    'BadScript-Regular.ttf': f'{_OFL}/badscript/BadScript-Regular.ttf',
    'Sacramento-Regular.ttf': f'{_OFL}/sacramento/Sacramento-Regular.ttf',
    'GreatVibes-Regular.ttf': f'{_OFL}/greatvibes/GreatVibes-Regular.ttf',
    'Allura-Regular.ttf': f'{_OFL}/allura/Allura-Regular.ttf',
    'AlexBrush-Regular.ttf': f'{_OFL}/alexbrush/AlexBrush-Regular.ttf',
    'Parisienne-Regular.ttf': f'{_OFL}/parisienne/Parisienne-Regular.ttf',
    'Tangerine-Regular.ttf': f'{_OFL}/tangerine/Tangerine-Regular.ttf',
    'Tangerine-Bold.ttf': f'{_OFL}/tangerine/Tangerine-Bold.ttf',
    'Courgette-Regular.ttf': f'{_OFL}/courgette/Courgette-Regular.ttf',
    'Niconne-Regular.ttf': f'{_OFL}/niconne/Niconne-Regular.ttf',
    'MarckScript-Regular.ttf': f'{_OFL}/marckscript/MarckScript-Regular.ttf',
    'Norican-Regular.ttf': f'{_OFL}/norican/Norican-Regular.ttf',
    'Damion-Regular.ttf': f'{_OFL}/damion/Damion-Regular.ttf',
    'Satisfy-Regular.ttf': f'{_OFL}/satisfy/Satisfy-Regular.ttf',
    'Pacifico-Regular.ttf': f'{_OFL}/pacifico/Pacifico-Regular.ttf',
    'Italianno-Regular.ttf': f'{_OFL}/italianno/Italianno-Regular.ttf',
    'Pompiere-Regular.ttf': f'{_OFL}/pompiere/Pompiere-Regular.ttf',
}

# Candidate font files in priority order: downloaded handwriting fonts,
# the already-available Caveat font, then Windows fallbacks.
FONT_PATHS = (
    [os.path.join(FONTS_DIR, font_file) for font_file in GOOGLE_FONTS]
    + [os.path.join(ROOT_DIR, 'test_images', 'Caveat-Regular.ttf')]
    + [
        r'C:\Windows\Fonts\segoepr.ttf',
        r'C:\Windows\Fonts\segoeprb.ttf',
        r'C:\Windows\Fonts\comic.ttf',
    ]
)
|
| 81 |
-
|
| 82 |
-
def download_fonts():
    """Fetch every GOOGLE_FONTS entry that is not yet cached in FONTS_DIR.

    A cached file larger than 10000 bytes is treated as valid and not fetched
    again. A download that ends up under 10000 bytes is deleted as invalid.
    Any failure is reported and the partial file removed; the function never
    raises for a single bad font. Prints how many fonts are ready at the end.
    """
    import urllib.request

    os.makedirs(FONTS_DIR, exist_ok=True)
    ready = 0
    for font_file, font_url in GOOGLE_FONTS.items():
        target = os.path.join(FONTS_DIR, font_file)
        cached = os.path.exists(target) and os.path.getsize(target) > 10000
        if cached:
            ready += 1
            continue
        try:
            print(f" Downloading {font_file}...")
            with urllib.request.urlopen(font_url, timeout=10) as response, \
                    open(target, 'wb') as out:
                out.write(response.read())
            # Validate: real TTF files are > 10KB
            if os.path.getsize(target) >= 10000:
                ready += 1
            else:
                os.remove(target)
                print(f" Skipped {font_file} (invalid file)")
        except Exception as err:
            # Best effort: report, clean up the partial file, move on.
            print(f" Failed {font_file}: {err}")
            if os.path.exists(target):
                os.remove(target)
    print(f" {ready} fonts ready")
|
| 107 |
-
|
| 108 |
-
# Blank PDF form for each form-type key.
# NOTE(review): ROOT_DIR is already the parent of this script's directory, so
# ROOT_DIR/python/CRNN+CTC may not exist; load_blank_form() falls back to
# reference images when a PDF is missing. Confirm the intended location.
_PDF_DIR = os.path.join(ROOT_DIR, 'python', 'CRNN+CTC')
PDF_FORMS = {
    '97': os.path.join(_PDF_DIR, 'FORM 97 (MARRIAGE CERTIFICATE).pdf'),
    '102': os.path.join(_PDF_DIR, 'FORM 102 (BIRTH CERTIFICATE).pdf'),
    '103': os.path.join(_PDF_DIR, 'FORM 103 (DEATH CERTIFICATE).pdf'),
    '90': os.path.join(_PDF_DIR, 'FORM 90-MARRIAGE-LICENCE-FORM.pdf'),
}

SAMPLES_PER_FORM = 1000  # forms to generate per type
# NOTE(review): width 64 / height 512 looks transposed for a text-line crop
# (finetune.py uses IMG_HEIGHT = 64, IMG_WIDTH = 512) - confirm before use.
IMG_W = 64
IMG_H = 512
|
| 118 |
-
|
| 119 |
-
# ── Load TEMPLATES from template_matcher ──────────────────────
|
| 120 |
-
sys.path.insert(0, PYTHON_DIR)
|
| 121 |
-
from template_matcher import TEMPLATES
|
| 122 |
-
|
| 123 |
-
# ── Load Filipino names ───────────────────────────────────────
|
| 124 |
-
def load_names():
    """Load and return the parsed JSON content of NAMES_FILE.

    Exits the process with status 1 (after printing an error) when the file
    does not exist. The file is read as UTF-8 explicitly so that names with
    non-ASCII letters decode the same way on every platform instead of
    depending on the locale's default encoding.
    """
    if not os.path.exists(NAMES_FILE):
        print(f"ERROR: {NAMES_FILE} not found. Run generate_ph_names.py first.")
        sys.exit(1)
    with open(NAMES_FILE, encoding='utf-8') as f:
        return json.load(f)
|
| 131 |
-
|
| 132 |
-
# ── Random data generators ────────────────────────────────────
|
| 133 |
-
# Value pools sampled by the rand_* helpers below.
MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
RELIGIONS = [
    "Roman Catholic",
    "Islam",
    "Baptist",
    "Iglesia ni Cristo",
    "Seventh Day Adventist",
    "Born Again Christian",
]
CIVIL_STATUSES = ["Single", "Married", "Widowed", "Legally Separated"]
CITIZENSHIPS = ["Filipino", "American", "Chinese", "Japanese"]
PROVINCES = [
    "Cebu",
    "Davao del Sur",
    "Metro Manila",
    "Iloilo",
    "Pampanga",
    "Batangas",
    "Laguna",
    "Cavite",
    "Bulacan",
    "Quezon City",
]
CITIES = [
    "Cebu City",
    "Davao City",
    "Manila",
    "Iloilo City",
    "San Fernando",
    "Batangas City",
    "Santa Rosa",
    "Bacoor",
    "Malolos",
    "Quezon City",
]
|
| 143 |
-
|
| 144 |
-
def rand_name(names, key):
    """Return a random upper-cased name from ``names[key]``.

    Args:
        names: Mapping of pool name to a list of name strings.
        key: Which pool to draw from.

    Returns:
        The chosen name in upper case. Falls back to ``'JUAN'`` when the key
        is missing or its pool is empty (an empty pool would otherwise make
        ``random.choice`` raise ``IndexError``).
    """
    pool = names.get(key) or ['Juan']
    return random.choice(pool).upper()
|
| 147 |
-
|
| 148 |
-
def rand_date():
    """Return a random date as ``(day, month_name, year)`` strings.

    Year is 1950-2005, day is 01-28 (zero-padded), month is a MONTHS entry.
    """
    # Draw year, month, day in that order to keep the RNG sequence unchanged.
    year = random.randint(1950, 2005)
    month_index = random.randint(1, 12) - 1
    day = random.randint(1, 28)
    return (str(day).zfill(2), MONTHS[month_index], str(year))
|
| 153 |
-
|
| 154 |
-
def rand_age():
    """Return a random age between 18 and 80 (inclusive) as a string."""
    age = random.randint(18, 80)
    return f"{age}"
|
| 156 |
-
|
| 157 |
-
def rand_province():
    """Return a random PROVINCES entry in upper case."""
    province = random.choice(PROVINCES)
    return province.upper()
|
| 159 |
-
|
| 160 |
-
def rand_city():
    """Return a random CITIES entry in upper case."""
    city = random.choice(CITIES)
    return city.upper()
|
| 162 |
-
|
| 163 |
-
def rand_religion():
    """Return a random RELIGIONS entry in upper case."""
    religion = random.choice(RELIGIONS)
    return religion.upper()
|
| 165 |
-
|
| 166 |
-
def rand_civil_status():
    """Return a random CIVIL_STATUSES entry in upper case."""
    status = random.choice(CIVIL_STATUSES)
    return status.upper()
|
| 168 |
-
|
| 169 |
-
def rand_citizenship():
    """Return a random CITIZENSHIPS entry in upper case."""
    citizenship = random.choice(CITIZENSHIPS)
    return citizenship.upper()
|
| 171 |
-
|
| 172 |
-
def rand_registry_no():
    """Return a random registry number "<year>-<seq>".

    The year is drawn from 2000-2024 and the sequence from 1000-9999.
    """
    year = random.randint(2000, 2024)
    sequence = random.randint(1000, 9999)
    return "{}-{}".format(year, sequence)
|
| 174 |
-
|
| 175 |
-
def rand_time():
    """Return a random 12-hour clock time such as ``"09:15 AM"``.

    The hour is drawn from 06:00-18:59 on a 24-hour clock with minutes in
    quarter-hour steps, then converted to 12-hour form. Previously the
    24-hour value was printed next to AM/PM, producing invalid labels such
    as "13:15 PM".
    """
    hour24 = random.randint(6, 18)
    minute = random.choice(['00', '15', '30', '45'])
    meridiem = 'AM' if hour24 < 12 else 'PM'
    # 13..18 -> 1..6; 12 stays 12 (hour 0 cannot occur in this range).
    hour12 = hour24 % 12 or 12
    return f"{hour12:02d}:{minute} {meridiem}"
|
| 179 |
-
|
| 180 |
-
def generate_field_value(field_name, names):
    """Generate a plausible random value for a given field name.

    The lower-cased field name is matched against keyword substrings in a
    fixed priority order; the first match decides the kind of value.

    Args:
        field_name: Template field name, e.g. ``'husband_first_name'``.
        names: Name pools passed on to ``rand_name``.

    Returns:
        An upper-case string value. Fields that match nothing get a random
        first name.
    """
    f = field_name.lower()

    # Places and registry number
    if 'province' in f:
        return rand_province()
    if 'registry' in f:
        return rand_registry_no()
    if 'city' in f or 'municipality' in f:
        return rand_city()

    # Names
    if 'first' in f and ('name' in f or 'father' in f or 'mother' in f):
        return rand_name(names, 'first')
    if 'middle' in f:
        return rand_name(names, 'middle')
    if 'last' in f:
        return rand_name(names, 'last')
    if '_name' in f and 'father' not in f and 'mother' not in f:
        return rand_name(names, 'first')
    if 'father_name' in f or 'mother_name' in f:
        return (f"{rand_name(names, 'first')} {rand_name(names, 'middle')} "
                f"{rand_name(names, 'last')}")

    # Date parts, then a whole date of birth
    if 'day' in f:
        return rand_date()[0]
    if 'month' in f:
        return rand_date()[1]
    if 'year' in f:
        return rand_date()[2]
    if 'dob' in f:
        d, m, y = rand_date()
        return f"{d} {m} {y}"

    # Marriage fields must be tested before the generic 'age' check:
    # "marriage" itself contains "age", which used to shadow all of these.
    if 'place_of_marriage' in f or 'marriage_place' in f:
        return rand_city()
    if 'date_of_marriage' in f or 'marriage_date' in f:
        d, m, y = rand_date()
        return f"{d} {m} {y}"
    if 'time_of_marriage' in f:
        return rand_time()
    if 'marriage_license' in f:
        return rand_registry_no()

    # Ignore the "age" inside "marriage" so only real age fields match.
    if 'age' in f.replace('marriage', ''):
        return rand_age()

    if 'birth' in f and 'place' in f:
        return rand_city()
    if 'sex' in f:
        return random.choice(['MALE', 'FEMALE'])
    if 'citizenship' in f:
        return rand_citizenship()
    if 'residence' in f:
        return f"{rand_city()}, {rand_province()}"
    if 'religion' in f:
        return rand_religion()
    if 'civil_status' in f:
        return rand_civil_status()
    if 'date_issued' in f:
        d, m, y = rand_date()
        return f"{d} {m} {y}"
    if 'occupation' in f:
        return random.choice(
            ['FARMER', 'TEACHER', 'NURSE', 'ENGINEER', 'DRIVER', 'HOUSEWIFE'])
    if 'type_of_birth' in f:
        return random.choice(['SINGLE', 'TWIN', 'TRIPLET'])
    if 'birth_order' in f:
        return random.choice(['1ST', '2ND', '3RD', '4TH'])
    if 'weight' in f:
        return f"{random.randint(2, 5)}.{random.randint(0, 9)} KG"
    if 'cause' in f:
        return random.choice(
            ['CARDIAC ARREST', 'PNEUMONIA', 'DIABETES', 'HYPERTENSION'])

    # Nothing matched: fall back to a first name.
    return rand_name(names, 'first')
|
| 225 |
-
|
| 226 |
-
# ── Load fonts ────────────────────────────────────────────────
|
| 227 |
-
def load_fonts():
    """Load every available font in FONT_PATHS at sizes 14, 16, 18 and 20.

    Paths that do not exist are ignored. A font file that exists but cannot
    be read is reported and skipped (only ``OSError`` is caught, instead of
    a bare ``except`` that would also hide unrelated errors). When nothing
    loads, the PIL default font is used as the single entry.

    Returns:
        A non-empty list of font objects.
    """
    fonts = []
    for path in FONT_PATHS:
        if not os.path.exists(path):
            continue
        for size in (14, 16, 18, 20):
            try:
                fonts.append(ImageFont.truetype(path, size))
            except OSError as err:
                # An unreadable file fails at every size; report it once.
                print(f" Skipping unreadable font {path}: {err}")
                break
    if not fonts:
        fonts = [ImageFont.load_default()]
    print(f" Loaded {len(fonts)} font variants")
    return fonts
|
| 240 |
-
|
| 241 |
-
# ── Load blank form image ─────────────────────────────────────
|
| 242 |
-
def load_blank_form(form_type):
    """Return the blank form background as an RGB image, or None.

    Tries, in order:
      1. the first page of ``PDF_FORMS[form_type]`` rendered by pdf2image
         at 150 dpi;
      2. ``references/reference_<form_type>.<ext>``;
      3. ``references/reference-<form_type>.<ext>`` (hyphen variant),
    with ``ext`` in png, jpg, jpeg. Prints a warning and returns None when
    nothing is found.
    """
    pdf_path = PDF_FORMS.get(form_type)

    # Try pdf2image first. It is optional, so any failure (including the
    # import itself) only logs a message and falls through to the images.
    if pdf_path and os.path.exists(pdf_path):
        try:
            from pdf2image import convert_from_path
            pages = convert_from_path(pdf_path, dpi=150)
            if pages:
                return pages[0].convert('RGB')
        except Exception as e:
            print(f" pdf2image failed: {e}")

    # Fallback: reference scan, underscore names first, then hyphen names.
    ref_dir = os.path.join(ROOT_DIR, 'references')
    for sep in ('_', '-'):
        for ext in ('png', 'jpg', 'jpeg'):
            ref_path = os.path.join(ref_dir, f'reference{sep}{form_type}.{ext}')
            if os.path.exists(ref_path):
                # convert() returns an independent copy, so the source file
                # can be closed right away instead of leaking the handle.
                with Image.open(ref_path) as ref_img:
                    return ref_img.convert('RGB')

    print(f" WARNING: No blank form found for {form_type} β skipping")
    return None
|
| 269 |
-
|
| 270 |
-
# ── Render text on form ───────────────────────────────────────
|
| 271 |
-
def render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts):
    """Draw handwritten-style text inside a field box.

    Args:
        draw: object exposing ``text(xy, text, fill=..., font=...)``.
        x1r, y1r, x2r, y2r: field box as fractions of the image size.
        text: string to render.
        img_w, img_h: image size in pixels.
        fonts: non-empty sequence of font objects exposing ``getbbox(text)``.

    A font is chosen at random among those whose rendered text fits the box
    (width <= 95% of the box, height <= 120% of the box). If none fits, any
    font is chosen at random. Previously the first fitting font always won,
    so nearly every field was rendered in the same font.
    """
    x1 = int(x1r * img_w)
    y1 = int(y1r * img_h)
    x2 = int(x2r * img_w)
    y2 = int(y2r * img_h)

    # Guard against degenerate (zero/negative) boxes.
    box_w = max(x2 - x1, 1)
    box_h = max(y2 - y1, 1)

    def _fits(candidate):
        bbox = candidate.getbbox(text)
        text_w = bbox[2] - bbox[0]
        text_h = bbox[3] - bbox[1]
        return text_w <= box_w * 0.95 and text_h <= box_h * 1.2

    # Pick a font that fits, keeping variety across fields.
    fitting = [candidate for candidate in fonts if _fits(candidate)]
    font = random.choice(fitting if fitting else fonts)

    # Random pen color (dark blue/black like ballpen)
    color = (
        random.randint(0, 40),
        random.randint(0, 40),
        random.randint(60, 120),
    )

    # Center text vertically in box
    bbox = font.getbbox(text)
    text_h = bbox[3] - bbox[1]
    ty = y1 + (box_h - text_h) // 2

    # Slight random x offset
    tx = x1 + random.randint(2, max(3, box_w // 10))

    draw.text((tx, ty), text, fill=color, font=font)
|
| 306 |
-
|
| 307 |
-
# ── Crop a field ──────────────────────────────────────────────
|
| 308 |
-
def crop_field(img, x1r, y1r, x2r, y2r, pad=4):
    """Crop one field from ``img`` with a small margin around it.

    Args:
        img: image exposing ``size`` as (width, height) and ``crop(box)``.
        x1r, y1r, x2r, y2r: field box as fractions of the image size.
        pad: margin in pixels added on every side (default 4, the previous
            hard-coded value). The padded box is clamped to the image bounds.

    Returns:
        Whatever ``img.crop`` returns for the padded, clamped box.
    """
    w, h = img.size
    left = max(0, int(x1r * w) - pad)
    top = max(0, int(y1r * h) - pad)
    right = min(w, int(x2r * w) + pad)
    bottom = min(h, int(y2r * h) + pad)
    return img.crop((left, top, right, bottom))
|
| 315 |
-
|
| 316 |
-
# ── Main ──────────────────────────────────────────────────────
|
| 317 |
-
def main():
    """Generate filled-in form samples, crop every field, and save annotations.

    For each form type in TEMPLATES, SAMPLES_PER_FORM copies of the blank form
    are filled with generated values. Every field is then cropped to a
    grayscale PNG in OUT_IMG_DIR, and one annotation (image path + text) is
    recorded per crop. All annotations are written to OUT_ANN as JSON.
    """
    banner = "=" * 60
    print(banner)
    print(" Form Sample Generator")
    print(banner)

    os.makedirs(OUT_IMG_DIR, exist_ok=True)
    print("\n Downloading handwriting fonts...")
    download_fonts()
    names = load_names()
    fonts = load_fonts()
    annotations = []
    total = 0

    for form_type, template in TEMPLATES.items():
        print(f"\n Generating Form {form_type}...")

        blank = load_blank_form(form_type)
        if blank is not None:
            for sample_idx in range(SAMPLES_PER_FORM):
                # Work on a fresh copy so samples never accumulate ink.
                canvas = blank.copy()
                pen = ImageDraw.Draw(canvas)
                canvas_w, canvas_h = canvas.size

                # Pass 1: fill in every field on the full form.
                written = {}
                for field_name, coords in template.items():
                    left, top, right, bottom, _ = coords
                    value = generate_field_value(field_name, names)
                    written[field_name] = value
                    render_field(pen, left, top, right, bottom, value,
                                 canvas_w, canvas_h, fonts)

                # Keep one full-form preview per form type (first sample only).
                if sample_idx == 0:
                    preview_path = os.path.join(
                        OUT_IMG_DIR, f'form{form_type}_preview.png'
                    )
                    canvas.save(preview_path)
                    print(f" Preview saved: {preview_path}")

                # Pass 2: crop each field from the fully rendered form.
                for field_name, coords in template.items():
                    left, top, right, bottom, _ = coords
                    patch = crop_field(canvas, left, top, right, bottom).convert('L')

                    fname = f"form{form_type}_{sample_idx:05d}_{field_name}.png"
                    patch.save(os.path.join(OUT_IMG_DIR, fname))

                    annotations.append({
                        "image_path": f"real_forms/{fname}",
                        "text": written[field_name]
                    })
                    total += 1

                done = sample_idx + 1
                if done % 100 == 0:
                    print(f" {done}/{SAMPLES_PER_FORM} forms done ({total} crops so far)")

            print(f" Form {form_type} done.")

    # Save annotations
    with open(OUT_ANN, 'w') as f:
        json.dump(annotations, f, indent=2)

    print("\n" + banner)
    print(" DONE!")
    print(f" Total crops : {total}")
    print(f" Annotations : {OUT_ANN}")
    print(" Next step : upload to Kaggle and run fine-tune")
    print(banner)


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CRNN+CTC/inference.py
CHANGED
|
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
|
|
| 214 |
def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
|
| 215 |
"""
|
| 216 |
Args:
|
| 217 |
-
checkpoint_path : path to
|
| 218 |
device : 'cuda' or 'cpu'
|
| 219 |
mode : 'auto' β auto-detect per image (recommended)
|
| 220 |
'simple' β always use simple pipeline
|
|
|
|
| 214 |
def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
|
| 215 |
"""
|
| 216 |
Args:
|
| 217 |
+
checkpoint_path : path to best_model_v4.pth
|
| 218 |
device : 'cuda' or 'cpu'
|
| 219 |
mode : 'auto' β auto-detect per image (recommended)
|
| 220 |
'simple' β always use simple pipeline
|
debug_and_retrain.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick visual OCR sanity check: show an image, show its Otsu-thresholded
# version, then print what Tesseract reads from the thresholded image.
import cv2
import matplotlib.pyplot as plt
import pytesseract

IMAGE_PATH = 'your_image.png'

# Load and show the image
img = cv2.imread(IMAGE_PATH)
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of inside cvtColor.
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title('Original Image')
plt.show()

# Preprocess and show
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# With THRESH_OTSU the threshold is computed automatically, so the 128 passed
# here is not the value that ends up being applied.
thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
plt.imshow(thresh, cmap='gray')
plt.title('Thresholded Image')
plt.show()

# Run OCR and print output
text = pytesseract.image_to_string(thresh)
print("OCR Output:", text)
|
finetune.py
CHANGED
|
@@ -3,18 +3,20 @@ finetune.py
|
|
| 3 |
===========
|
| 4 |
Fine-tune CRNN+CTC on generated civil registry form crops.
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python finetune.py
|
| 11 |
|
| 12 |
Output:
|
| 13 |
-
checkpoints/
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
| 17 |
import sys
|
|
|
|
|
|
|
| 18 |
import torch
|
| 19 |
import torch.nn.functional as F
|
| 20 |
import torch.optim as optim
|
|
@@ -25,13 +27,14 @@ from crnn_model import get_crnn_model
|
|
| 25 |
from dataset import CivilRegistryDataset, collate_fn
|
| 26 |
|
| 27 |
# ββ Config ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
-
CHECKPOINT_IN = "checkpoints/
|
| 29 |
-
CHECKPOINT_OUT = "checkpoints/
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
| 35 |
|
| 36 |
IMG_HEIGHT = 64
|
| 37 |
IMG_WIDTH = 512
|
|
@@ -42,10 +45,26 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 42 |
# ββ Phase settings ββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
PHASES = [
|
| 44 |
# (name, epochs, lr, freeze_cnn, patience)
|
| 45 |
-
("Phase 1 β CNN frozen,
|
| 46 |
-
("Phase 2 β Full model,
|
|
|
|
|
|
|
| 47 |
]
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
def main():
|
| 51 |
print("=" * 60)
|
|
@@ -60,6 +79,11 @@ def main():
|
|
| 60 |
print(f"ERROR: {f} not found.")
|
| 61 |
sys.exit(1)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# ββ Datasets ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
datasets_to_merge = []
|
| 65 |
|
|
@@ -72,32 +96,23 @@ def main():
|
|
| 72 |
datasets_to_merge.append(actual_dataset)
|
| 73 |
print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
|
| 74 |
else:
|
| 75 |
-
print(f" [!] {ACTUAL_ANN} not found
|
| 76 |
|
| 77 |
-
# 2.
|
| 78 |
-
if os.path.exists(REAL_ANN):
|
| 79 |
-
real_dataset = CivilRegistryDataset(
|
| 80 |
-
data_dir="data/train", annotations_file=REAL_ANN,
|
| 81 |
-
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 82 |
-
)
|
| 83 |
-
datasets_to_merge.append(real_dataset)
|
| 84 |
-
print(f" Real crops : {len(real_dataset)} (synthetic on real backgrounds)")
|
| 85 |
-
|
| 86 |
-
# 3. Fully synthetic β keep so model doesn't forget basic characters
|
| 87 |
if os.path.exists(SYNTH_ANN):
|
| 88 |
synth_dataset = CivilRegistryDataset(
|
| 89 |
-
data_dir="
|
| 90 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 91 |
)
|
| 92 |
datasets_to_merge.append(synth_dataset)
|
| 93 |
print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
|
| 94 |
|
| 95 |
if not datasets_to_merge:
|
| 96 |
-
print("ERROR: No training data found.
|
| 97 |
sys.exit(1)
|
| 98 |
|
| 99 |
val_dataset = CivilRegistryDataset(
|
| 100 |
-
data_dir="
|
| 101 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
|
| 102 |
)
|
| 103 |
|
|
@@ -115,7 +130,7 @@ def main():
|
|
| 115 |
ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
|
| 116 |
config = ckpt.get('config', {})
|
| 117 |
|
| 118 |
-
ref_dataset = datasets_to_merge[0]
|
| 119 |
model = get_crnn_model(
|
| 120 |
model_type = config.get('model_type', 'standard'),
|
| 121 |
img_height = config.get('img_height', 64),
|
|
@@ -144,8 +159,8 @@ def main():
|
|
| 144 |
batch_size = images.size(0)
|
| 145 |
if training:
|
| 146 |
optimizer.zero_grad()
|
| 147 |
-
outputs
|
| 148 |
-
seq_len
|
| 149 |
input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
|
| 150 |
loss = criterion(outputs, targets, input_lengths, target_lengths)
|
| 151 |
if not torch.isnan(loss) and not torch.isinf(loss):
|
|
@@ -186,6 +201,7 @@ def main():
|
|
| 186 |
if vl < best_overall:
|
| 187 |
best_overall = vl
|
| 188 |
torch.save({
|
|
|
|
| 189 |
'model_state_dict': model.state_dict(),
|
| 190 |
'config': config,
|
| 191 |
'char_to_idx': ref_dataset.char_to_idx,
|
|
@@ -201,6 +217,11 @@ def main():
|
|
| 201 |
print(f" Early stopping.")
|
| 202 |
break
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
print(f"\n{'='*60}")
|
| 205 |
print(f" Fine-tuning complete!")
|
| 206 |
print(f" Best val loss : {best_overall:.4f}")
|
|
@@ -209,4 +230,4 @@ def main():
|
|
| 209 |
|
| 210 |
|
| 211 |
if __name__ == '__main__':
|
| 212 |
-
main()
|
|
|
|
| 3 |
===========
|
| 4 |
Fine-tune CRNN+CTC on generated civil registry form crops.
|
| 5 |
|
| 6 |
+
Continues from best_model_v2.pth, trains on actual_annotations.json
|
| 7 |
+
+ train_annotations.json, saves to best_model_v4.pth.
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python finetune.py
|
| 11 |
|
| 12 |
Output:
|
| 13 |
+
checkpoints/best_model_v4.pth
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
| 17 |
import sys
|
| 18 |
+
import json
|
| 19 |
+
import shutil
|
| 20 |
import torch
|
| 21 |
import torch.nn.functional as F
|
| 22 |
import torch.optim as optim
|
|
|
|
| 27 |
from dataset import CivilRegistryDataset, collate_fn
|
| 28 |
|
| 29 |
# ββ Config ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
CHECKPOINT_IN = "checkpoints/best_model_v3.pth"
|
| 31 |
+
CHECKPOINT_OUT = "checkpoints/best_model_v4.pth"
|
| 32 |
|
| 33 |
+
ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
|
| 34 |
+
SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
|
| 35 |
+
VAL_ANN = "data/val_annotations.json" # validation set
|
| 36 |
+
|
| 37 |
+
DRIVE_BACKUP = "/content/drive/MyDrive/crnn_finetune/CRNN+CTC/checkpoints/best_model_v4.pth"
|
| 38 |
|
| 39 |
IMG_HEIGHT = 64
|
| 40 |
IMG_WIDTH = 512
|
|
|
|
| 45 |
# ββ Phase settings ββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
PHASES = [
|
| 47 |
# (name, epochs, lr, freeze_cnn, patience)
|
| 48 |
+
("Phase 1 β CNN frozen, warm up on actual crops", 20, 1e-4, True, 5),
|
| 49 |
+
("Phase 2 β Full model, main training", 30, 1e-5, False, 6),
|
| 50 |
+
("Phase 3 β Full model, slow burn", 30, 5e-6, False, 6),
|
| 51 |
+
("Phase 4 β Full model, final polish", 20, 1e-6, False, 5),
|
| 52 |
]
|
| 53 |
|
| 54 |
+
# ββ Fix Windows backslash paths βββββββββββββββββββββββββββββββ
|
| 55 |
+
def fix_paths(json_path):
    """Rewrite Windows-style backslashes in ``image_path`` entries to '/'.

    The annotation file is a JSON list of dicts. Any dict whose
    ``image_path`` is a string containing a backslash is normalised, and the
    file is rewritten in place only if something changed.

    Args:
        json_path: path to the annotation JSON file.

    Returns:
        bool: True if the file was rewritten, False if it was already clean.
    """
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can fail
    # on, or corrupt, non-ASCII annotation text.
    with open(json_path, encoding="utf-8") as f:
        ann = json.load(f)

    changed = False
    for entry in ann:
        if not isinstance(entry, dict):
            continue
        path = entry.get('image_path')
        # Skip missing or non-string values instead of raising TypeError.
        if isinstance(path, str) and '\\' in path:
            entry['image_path'] = path.replace('\\', '/')
            changed = True

    if changed:
        with open(json_path, 'w', encoding="utf-8") as f:
            json.dump(ann, f)
        print(f" Fixed backslash paths in {json_path}")
    return changed
|
| 67 |
+
|
| 68 |
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
def main():
|
| 70 |
print("=" * 60)
|
|
|
|
| 79 |
print(f"ERROR: {f} not found.")
|
| 80 |
sys.exit(1)
|
| 81 |
|
| 82 |
+
# ββ Fix backslash paths βββββββββββββββββββββββββββββββββββ
|
| 83 |
+
for ann_file in [ACTUAL_ANN, SYNTH_ANN, VAL_ANN]:
|
| 84 |
+
if os.path.exists(ann_file):
|
| 85 |
+
fix_paths(ann_file)
|
| 86 |
+
|
| 87 |
# ββ Datasets ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
datasets_to_merge = []
|
| 89 |
|
|
|
|
| 96 |
datasets_to_merge.append(actual_dataset)
|
| 97 |
print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
|
| 98 |
else:
|
| 99 |
+
print(f" [!] {ACTUAL_ANN} not found")
|
| 100 |
|
| 101 |
+
# 2. Fully synthetic β keep so model doesn't forget basic characters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
if os.path.exists(SYNTH_ANN):
|
| 103 |
synth_dataset = CivilRegistryDataset(
|
| 104 |
+
data_dir=".", annotations_file=SYNTH_ANN,
|
| 105 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
|
| 106 |
)
|
| 107 |
datasets_to_merge.append(synth_dataset)
|
| 108 |
print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
|
| 109 |
|
| 110 |
if not datasets_to_merge:
|
| 111 |
+
print("ERROR: No training data found.")
|
| 112 |
sys.exit(1)
|
| 113 |
|
| 114 |
val_dataset = CivilRegistryDataset(
|
| 115 |
+
data_dir=".", annotations_file=VAL_ANN,
|
| 116 |
img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
|
| 117 |
)
|
| 118 |
|
|
|
|
| 130 |
ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
|
| 131 |
config = ckpt.get('config', {})
|
| 132 |
|
| 133 |
+
ref_dataset = datasets_to_merge[0]
|
| 134 |
model = get_crnn_model(
|
| 135 |
model_type = config.get('model_type', 'standard'),
|
| 136 |
img_height = config.get('img_height', 64),
|
|
|
|
| 159 |
batch_size = images.size(0)
|
| 160 |
if training:
|
| 161 |
optimizer.zero_grad()
|
| 162 |
+
outputs = F.log_softmax(model(images), dim=2)
|
| 163 |
+
seq_len = outputs.size(0)
|
| 164 |
input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
|
| 165 |
loss = criterion(outputs, targets, input_lengths, target_lengths)
|
| 166 |
if not torch.isnan(loss) and not torch.isinf(loss):
|
|
|
|
| 201 |
if vl < best_overall:
|
| 202 |
best_overall = vl
|
| 203 |
torch.save({
|
| 204 |
+
**ckpt,
|
| 205 |
'model_state_dict': model.state_dict(),
|
| 206 |
'config': config,
|
| 207 |
'char_to_idx': ref_dataset.char_to_idx,
|
|
|
|
| 217 |
print(f" Early stopping.")
|
| 218 |
break
|
| 219 |
|
| 220 |
+
# ββ Drive backup ββββββββββββββββββββββββββββββββββββββββββ
|
| 221 |
+
if os.path.exists(CHECKPOINT_OUT) and os.path.exists(os.path.dirname(DRIVE_BACKUP)):
|
| 222 |
+
shutil.copy(CHECKPOINT_OUT, DRIVE_BACKUP)
|
| 223 |
+
print(f"\n Backed up to Drive: {DRIVE_BACKUP}")
|
| 224 |
+
|
| 225 |
print(f"\n{'='*60}")
|
| 226 |
print(f" Fine-tuning complete!")
|
| 227 |
print(f" Best val loss : {best_overall:.4f}")
|
|
|
|
| 230 |
|
| 231 |
|
| 232 |
if __name__ == '__main__':
|
| 233 |
+
main()
|
inference.py
CHANGED
|
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
|
|
| 214 |
def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
|
| 215 |
"""
|
| 216 |
Args:
|
| 217 |
-
checkpoint_path : path to
|
| 218 |
device : 'cuda' or 'cpu'
|
| 219 |
mode : 'auto' β auto-detect per image (recommended)
|
| 220 |
'simple' β always use simple pipeline
|
|
@@ -340,9 +340,9 @@ def demo_inference():
|
|
| 340 |
print("=" * 70)
|
| 341 |
|
| 342 |
ocr = CivilRegistryOCR(
|
| 343 |
-
checkpoint_path='checkpoints/
|
| 344 |
device='cuda',
|
| 345 |
-
mode='
|
| 346 |
verbose=True # shows which mode each image triggers
|
| 347 |
)
|
| 348 |
|
|
|
|
| 214 |
def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
|
| 215 |
"""
|
| 216 |
Args:
|
| 217 |
+
checkpoint_path : path to best_model_v4.pth
|
| 218 |
device : 'cuda' or 'cpu'
|
| 219 |
mode : 'auto' β auto-detect per image (recommended)
|
| 220 |
'simple' β always use simple pipeline
|
|
|
|
| 340 |
print("=" * 70)
|
| 341 |
|
| 342 |
ocr = CivilRegistryOCR(
|
| 343 |
+
checkpoint_path='checkpoints/best_model_v4.pth',
|
| 344 |
device='cuda',
|
| 345 |
+
mode='adaptive', # force adaptive for demo images (many are zoomed/physical)
|
| 346 |
verbose=True # shows which mode each image triggers
|
| 347 |
)
|
| 348 |
|
spacyNER/debug_and_retrain.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# debug_and_retrain.py
|
| 3 |
-
# ============================================================
|
| 4 |
-
# USE THIS WHEN: training crashes with E024 or any span error
|
| 5 |
-
#
|
| 6 |
-
# WHAT IT DOES (in order):
|
| 7 |
-
# 1. Checks all .spacy files for bad spans (whitespace, empty)
|
| 8 |
-
# 2. Runs spaCy's official debug data command
|
| 9 |
-
# 3. Deletes corrupted .spacy files so they get rebuilt clean
|
| 10 |
-
# 4. Rebuilds: prepare_data → funsd_integration → train
|
| 11 |
-
#
|
| 12 |
-
# USAGE:
|
| 13 |
-
# python debug_and_retrain.py β full check + retrain
|
| 14 |
-
# python debug_and_retrain.py --check β check only, no retrain
|
| 15 |
-
# python debug_and_retrain.py --retrain β skip check, just retrain
|
| 16 |
-
# ============================================================
|
| 17 |
-
|
| 18 |
-
import subprocess
|
| 19 |
-
import sys
|
| 20 |
-
import argparse
|
| 21 |
-
from pathlib import Path
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# ── All .spacy files to check ─────────────────────────────
|
| 25 |
-
# Every generated .spacy file lives in one directory; the display name is the
# file name itself.
_TRAINING_DIR = "data/training"
_SPACY_NAMES = (
    "train.spacy",
    "dev.spacy",
    "funsd_train.spacy",
    "funsd_dev.spacy",
    "merged_train.spacy",
    "merged_dev.spacy",
)

# Display name -> path of each .spacy file to check.
SPACY_FILES = {name: f"{_TRAINING_DIR}/{name}" for name in _SPACY_NAMES}

# Files that get REBUILT (delete these before retraining)
REBUILT_FILES = [SPACY_FILES[name] for name in _SPACY_NAMES]

# spaCy training config passed to `spacy debug data`.
CFG = "training/config.cfg"
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 41 |
-
# STEP 1 β INSPECT .spacy FILES FOR BAD SPANS
|
| 42 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
-
|
| 44 |
-
def inspect_spacy_file(path: str):
    """Load a .spacy file and scan every entity span for problems.

    A span is reported when its text is:
      - empty or whitespace-only,
      - padded with leading and/or trailing whitespace,
      - a single non-alphanumeric character (e.g. '.' or ',').

    Args:
        path: location of the serialized DocBin (.spacy) file.

    Returns:
        tuple: ``(total_docs, total_ents, bad_spans)``, where ``bad_spans`` is
        a list of dicts with keys ``doc`` (document index), ``label``,
        ``text`` (repr of the span text) and ``reason``.
    """
    # Imported lazily so the module can be loaded without spaCy installed.
    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    db = DocBin().from_disk(path)
    docs = list(db.get_docs(nlp.vocab))

    total_ents = 0
    bad_spans = []

    for i, doc in enumerate(docs):
        for ent in doc.ents:
            total_ents += 1
            t = ent.text

            if not t.strip():
                reason = "EMPTY or whitespace-only"
            elif t != t.strip():
                reason = f"WHITESPACE — leading={t[0]!r} trailing={t[-1]!r}"
            elif len(t) == 1 and not t.isalnum():
                reason = "SINGLE PUNCTUATION CHAR"
            else:
                continue

            bad_spans.append({
                "doc": i, "label": ent.label_, "text": repr(t),
                "reason": reason
            })

    return len(docs), total_ents, bad_spans
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def check_all_spacy_files():
    """Check every file in SPACY_FILES and report problem spans.

    Returns:
        bool: True if any problem was found, including a file that could not
        be read and spaCy not being installed. In the last case nothing can be
        verified, so the result must not read as "all clean" (the function
        previously returned False there).
    """
    try:
        import spacy  # noqa: F401  (availability check only)
    except ImportError:
        print(" ❌ spaCy not installed. Run: pip install spacy")
        return True

    print("\n" + "=" * 62)
    print(" STEP 1 — SCANNING .spacy FILES FOR BAD SPANS")
    print("=" * 62)

    any_problems = False

    for name, path in SPACY_FILES.items():
        if not Path(path).exists():
            print(f"\n ⚪ {name:30s} not found — will be created")
            continue

        print(f"\n 📄 {name}")
        try:
            n_docs, n_ents, bad = inspect_spacy_file(path)
        except Exception as e:
            # An unreadable/corrupt file counts as a problem; keep scanning.
            print(f" ❌ Could not read file: {e}")
            any_problems = True
            continue

        print(f" docs: {n_docs} entities: {n_ents} bad spans: {len(bad)}")

        if bad:
            any_problems = True
            print(f" ❌ {len(bad)} PROBLEM SPAN(S):")
            for b in bad[:10]:  # show first 10
                print(f" doc {b['doc']:>3} [{b['label']}] {b['text']:30s} → {b['reason']}")
            if len(bad) > 10:
                print(f" ... and {len(bad) - 10} more")
        else:
            print(" ✅ All spans clean")

    return any_problems
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
-
# STEP 2 β spaCy OFFICIAL DEBUG DATA
|
| 132 |
-
# βββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββ
|
| 133 |
-
|
| 134 |
-
def run_spacy_debug():
    """Run spaCy's built-in ``debug data`` command on the training data.

    Uses the merged train/dev files when the merged train file exists,
    otherwise the civil-registry-only files. Returns early with a message if
    no training file or no config (CFG) is present. The command's output goes
    straight to the console; only its exit status is summarised here.
    """
    print("\n" + "=" * 62)
    print(" STEP 2 — spaCy OFFICIAL DEBUG DATA")
    print("=" * 62)

    train = "data/training/merged_train.spacy"
    dev = "data/training/merged_dev.spacy"

    # Fall back to civil-only if merged doesn't exist
    if not Path(train).exists():
        train = "data/training/train.spacy"
        dev = "data/training/dev.spacy"

    if not Path(train).exists():
        print("\n ⚪ No training data found yet — skipping debug.")
        print(" → Run: python training/prepare_data.py first")
        return

    if not Path(CFG).exists():
        print(f"\n ⚪ Config not found: {CFG} — skipping debug.")
        return

    print(f"\n Checking: {train}")
    print(f" Dev: {dev}\n")

    # Same interpreter as this script, so the same spaCy install is used.
    result = subprocess.run([
        sys.executable, "-m", "spacy", "debug", "data", CFG,
        "--paths.train", train,
        "--paths.dev", dev,
    ])

    if result.returncode != 0:
        print("\n ⚠️ debug data found issues — see above.")
    else:
        print("\n ✅ debug data passed — no issues found.")
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
-
# STEP 3 β DELETE OLD .spacy FILES
|
| 177 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 178 |
-
|
| 179 |
-
def delete_spacy_files():
    """Delete all generated .spacy files (REBUILT_FILES) so they get rebuilt clean."""
    print("\n" + "=" * 62)
    print(" STEP 3 — DELETING OLD .spacy FILES")
    print("=" * 62)

    deleted = 0
    for path in REBUILT_FILES:
        try:
            Path(path).unlink()
        except FileNotFoundError:
            # Already absent (or removed between check and delete): nothing to do.
            continue
        print(f" 🗑️ Deleted: {path}")
        deleted += 1

    if deleted == 0:
        print(" ⚪ Nothing to delete.")
    else:
        print(f"\n ✅ Deleted {deleted} file(s) — will be rebuilt clean.")
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 200 |
-
# STEP 4 β REBUILD + RETRAIN
|
| 201 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 202 |
-
|
| 203 |
-
def run_script(script: str, label: str) -> bool:
    """Run a training script with the current interpreter.

    Args:
        script: path to the Python script to execute.
        label: human-readable step name used in the console output.

    Returns:
        True if the script exists and exits with status 0, otherwise False.
    """
    rule = '─' * 62
    print(f"\n{rule}")
    print(f" ▶ {label}")
    print(f" Script: {script}")
    print(f"{rule}\n")

    if not Path(script).exists():
        print(f" ❌ Script not found: {script}")
        return False

    # List form (no shell), so the script path is never shell-interpreted.
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        print(f"\n ❌ {label} failed.")
        return False

    print(f"\n ✅ {label} complete.")
    return True
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
def retrain():
    """Run the full rebuild pipeline: prepare data, merge FUNSD, train.

    The steps run in order via ``run_script``. On the first failure the
    process exits with status 1 after printing how to resume.
    """
    print("\n" + "=" * 62)
    print(" STEP 4 — REBUILD + RETRAIN")
    print("=" * 62)

    steps = [
        ("training/prepare_data.py", "Step 1/3: Build civil registry data"),
        ("training/funsd_integration.py", "Step 2/3: Merge FUNSD + civil registry"),
        ("training/train.py", "Step 3/3: Train NER model"),
    ]

    for script, label in steps:
        if not run_script(script, label):
            print(f"\n ❌ Pipeline stopped at: {script}")
            print(" Fix the error above, then re-run:")
            print(" python debug_and_retrain.py --retrain")
            sys.exit(1)

    print("\n" + "=" * 62)
    print(" ✅ RETRAIN COMPLETE")
    print("=" * 62)
    print("\n Best model → models/civil_registry_model/model-best/")
    print("\n NEXT: python training/evaluate.py")
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 251 |
-
# MAIN
|
| 252 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 253 |
-
|
| 254 |
-
def main():
    """Command-line entry point: check the data, then optionally retrain.

    Flags:
        --check    Run ``check_all_spacy_files()`` and ``run_spacy_debug()``,
                   print a summary, and return without retraining.
        --retrain  Skip the checks; call ``delete_spacy_files()`` and
                   ``retrain()`` immediately. Takes precedence over
                   ``--check`` if both are given.

    With no flags, the checks run first and the user is asked to press
    ENTER before ``delete_spacy_files()`` and ``retrain()`` are called.
    Ctrl+C or end-of-input at that prompt cancels without deleting anything.
    """
    # NOTE(review): non-ASCII glyphs in the messages below were restored
    # from mis-decoded text; confirm them against the original file.
    parser = argparse.ArgumentParser(
        description="Debug FUNSD/civil data and retrain NER model"
    )
    parser.add_argument("--check", action="store_true",
                        help="Check for bad spans only — don't retrain")
    parser.add_argument("--retrain", action="store_true",
                        help="Skip check — delete old files and retrain immediately")
    args = parser.parse_args()

    print("\n" + "=" * 62)
    print(" CIVIL REGISTRY NER — DEBUG & RETRAIN")
    print("=" * 62)
    print("\n This script fixes the E024 'bad span' training error.")
    print(" Root causes: whitespace in spans, wrong alignment_mode,")
    print(" offset shift from text.strip() after build.")

    if args.retrain:
        # Skip checking — just delete and rebuild
        delete_spacy_files()
        retrain()
        return

    # ── Always run checks ─────────────────────────────────
    has_problems = check_all_spacy_files()
    run_spacy_debug()

    if args.check:
        # Check-only mode — stop here
        print("\n" + "=" * 62)
        if has_problems:
            print(" ⚠️ Problems found — run without --check to fix:")
            print(" python debug_and_retrain.py")
        else:
            print(" ✅ No problems found — safe to train:")
            print(" python training/train.py")
        print("=" * 62)
        return

    # ── Ask before deleting ───────────────────────────────
    print("\n" + "=" * 62)
    if has_problems:
        print(" ⚠️ Bad spans detected in .spacy files.")
        print(" The fixed funsd_integration.py will rebuild them cleanly.")
    else:
        print(" ✅ No bad spans detected in existing files.")

    print("\n Proceeding to delete old .spacy files and retrain...")
    print(" (Ctrl+C now to cancel)")
    print("=" * 62)

    try:
        input("\n Press ENTER to continue, Ctrl+C to cancel...\n")
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin is closed or not interactive. Treat it as a
        # cancel instead of crashing right before a destructive step.
        print("\n Cancelled.")
        return

    delete_spacy_files()
    retrain()
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
spacyNER/models/phase1_funsd/model-last/vocab/strings.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
spacyNER/models/phase1_funsd/model-last/vocab/vectors.cfg
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff4359091952c8cd16f1f0482f5770fb82d1707368d5cca3c46aa501f552e3c5
|
| 3 |
+
size 22
|
template_matcher.py
CHANGED
|
@@ -41,7 +41,7 @@ _CRNN_DIR = os.path.join(os.path.dirname(__file__), 'CRNN+CTC')
|
|
| 41 |
if _CRNN_DIR not in _sys.path:
|
| 42 |
_sys.path.insert(0, _CRNN_DIR)
|
| 43 |
|
| 44 |
-
_CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', '
|
| 45 |
_crnn_ocr = None
|
| 46 |
_crnn_decode = None # reference to decode_ctc_predictions
|
| 47 |
|
|
|
|
| 41 |
if _CRNN_DIR not in _sys.path:
|
| 42 |
_sys.path.insert(0, _CRNN_DIR)
|
| 43 |
|
| 44 |
+
_CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'best_model_v4.pth')
|
| 45 |
_crnn_ocr = None
|
| 46 |
_crnn_decode = None # reference to decode_ctc_predictions
|
| 47 |
|