Hanz Pillerva committed on
Commit
25a1178
·
1 Parent(s): 94ec9c8

Updated files

Browse files
CRNN+CTC/finetune.py CHANGED
@@ -3,14 +3,14 @@ finetune.py
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
- Loads best_model_iam.pth (already knows real handwriting from IAM),
7
- then trains on real_annotations.json (Filipino names on real form backgrounds).
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
- checkpoints/best_model_final.pth
14
  """
15
 
16
  import os
@@ -25,12 +25,12 @@ from crnn_model import get_crnn_model
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
- CHECKPOINT_IN = "checkpoints/best_model_iam.pth"
29
- CHECKPOINT_OUT = "checkpoints/best_model_final.pth"
30
 
31
- REAL_ANN = "data/real_annotations.json" # generated by generate_form_samples.py
32
- SYNTH_ANN = "data/train_annotations.json" # original synthetic data
33
- VAL_ANN = "data/val_annotations.json" # validation set
34
 
35
  IMG_HEIGHT = 64
36
  IMG_WIDTH = 512
@@ -53,35 +53,45 @@ def main():
53
  print(f" Device : {DEVICE}")
54
  print(f" Checkpoint : {CHECKPOINT_IN}")
55
 
56
- # ── Check files ───────────────────────────────────────────
57
- for f in [CHECKPOINT_IN, REAL_ANN, VAL_ANN]:
58
  if not os.path.exists(f):
59
  print(f"ERROR: {f} not found.")
60
  sys.exit(1)
61
 
62
  # ── Datasets ──────────────────────────────────────────────
63
- real_dataset = CivilRegistryDataset(
64
- data_dir="data/train", annotations_file=REAL_ANN,
65
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
66
- )
67
- val_dataset = CivilRegistryDataset(
68
- data_dir="data/val", annotations_file=VAL_ANN,
69
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
70
- )
 
 
 
 
71
 
72
- # Mix in original synthetic data so model doesn't forget
73
- train_dataset = real_dataset
74
  if os.path.exists(SYNTH_ANN):
75
  synth_dataset = CivilRegistryDataset(
76
  data_dir="data/train", annotations_file=SYNTH_ANN,
77
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
78
  )
79
- train_dataset = ConcatDataset([real_dataset, synth_dataset])
80
- print(f" Real crops : {len(real_dataset)}")
81
- print(f" Synth crops : {len(synth_dataset)}")
82
- else:
83
- print(f" Real crops : {len(real_dataset)}")
 
 
 
 
 
 
84
 
 
85
  print(f" Total train : {len(train_dataset)}")
86
  print(f" Val : {len(val_dataset)}")
87
 
@@ -95,10 +105,11 @@ def main():
95
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
96
  config = ckpt.get('config', {})
97
 
 
98
  model = get_crnn_model(
99
  model_type = config.get('model_type', 'standard'),
100
  img_height = config.get('img_height', 64),
101
- num_chars = real_dataset.num_chars,
102
  hidden_size = config.get('hidden_size', 128),
103
  num_lstm_layers = config.get('num_lstm_layers', 1),
104
  ).to(DEVICE)
@@ -123,8 +134,8 @@ def main():
123
  batch_size = images.size(0)
124
  if training:
125
  optimizer.zero_grad()
126
- outputs = F.log_softmax(model(images), dim=2)
127
- seq_len = outputs.size(0)
128
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
129
  loss = criterion(outputs, targets, input_lengths, target_lengths)
130
  if not torch.isnan(loss) and not torch.isinf(loss):
@@ -167,8 +178,8 @@ def main():
167
  torch.save({
168
  'model_state_dict': model.state_dict(),
169
  'config': config,
170
- 'char_to_idx': real_dataset.char_to_idx,
171
- 'idx_to_char': real_dataset.idx_to_char,
172
  'epoch': epoch,
173
  'val_loss': vl,
174
  }, CHECKPOINT_OUT)
@@ -188,4 +199,4 @@ def main():
188
 
189
 
190
  if __name__ == '__main__':
191
- main()
 
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
+ Loads best_model_final.pth (pretrained), continues training on
7
+ actual_annotations.json + train_annotations.json.
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
+ checkpoints/best_model_v2.pth
14
  """
15
 
16
  import os
 
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
+ CHECKPOINT_IN = "checkpoints/best_model_final.pth"
29
+ CHECKPOINT_OUT = "checkpoints/best_model_v2.pth"
30
 
31
+ ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
32
+ SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
33
+ VAL_ANN = "data/val_annotations.json" # validation set
34
 
35
  IMG_HEIGHT = 64
36
  IMG_WIDTH = 512
 
53
  print(f" Device : {DEVICE}")
54
  print(f" Checkpoint : {CHECKPOINT_IN}")
55
 
56
+ # ── Check required files ──────────────────────────────────
57
+ for f in [CHECKPOINT_IN, VAL_ANN]:
58
  if not os.path.exists(f):
59
  print(f"ERROR: {f} not found.")
60
  sys.exit(1)
61
 
62
  # ── Datasets ──────────────────────────────────────────────
63
+ datasets_to_merge = []
64
+
65
+ # 1. Actual scanned forms (highest priority — real data)
66
+ if os.path.exists(ACTUAL_ANN):
67
+ actual_dataset = CivilRegistryDataset(
68
+ data_dir=".", annotations_file=ACTUAL_ANN,
69
+ img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
70
+ )
71
+ datasets_to_merge.append(actual_dataset)
72
+ print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
73
+ else:
74
+ print(f" [!] {ACTUAL_ANN} not found — run extract_actual_data.py first")
75
 
76
+ # 2. Fully synthetic — keep so model doesn't forget basic characters
 
77
  if os.path.exists(SYNTH_ANN):
78
  synth_dataset = CivilRegistryDataset(
79
  data_dir="data/train", annotations_file=SYNTH_ANN,
80
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
81
  )
82
+ datasets_to_merge.append(synth_dataset)
83
+ print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
84
+
85
+ if not datasets_to_merge:
86
+ print("ERROR: No training data found. Run extract_actual_data.py first.")
87
+ sys.exit(1)
88
+
89
+ val_dataset = CivilRegistryDataset(
90
+ data_dir="data/val", annotations_file=VAL_ANN,
91
+ img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
92
+ )
93
 
94
+ train_dataset = ConcatDataset(datasets_to_merge) if len(datasets_to_merge) > 1 else datasets_to_merge[0]
95
  print(f" Total train : {len(train_dataset)}")
96
  print(f" Val : {len(val_dataset)}")
97
 
 
105
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
106
  config = ckpt.get('config', {})
107
 
108
+ ref_dataset = datasets_to_merge[0]
109
  model = get_crnn_model(
110
  model_type = config.get('model_type', 'standard'),
111
  img_height = config.get('img_height', 64),
112
+ num_chars = ref_dataset.num_chars,
113
  hidden_size = config.get('hidden_size', 128),
114
  num_lstm_layers = config.get('num_lstm_layers', 1),
115
  ).to(DEVICE)
 
134
  batch_size = images.size(0)
135
  if training:
136
  optimizer.zero_grad()
137
+ outputs = F.log_softmax(model(images), dim=2)
138
+ seq_len = outputs.size(0)
139
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
140
  loss = criterion(outputs, targets, input_lengths, target_lengths)
141
  if not torch.isnan(loss) and not torch.isinf(loss):
 
178
  torch.save({
179
  'model_state_dict': model.state_dict(),
180
  'config': config,
181
+ 'char_to_idx': ref_dataset.char_to_idx,
182
+ 'idx_to_char': ref_dataset.idx_to_char,
183
  'epoch': epoch,
184
  'val_loss': vl,
185
  }, CHECKPOINT_OUT)
 
199
 
200
 
201
  if __name__ == '__main__':
202
+ main()
CRNN+CTC/fix_annotations.py DELETED
@@ -1,40 +0,0 @@
1
- import json, os
2
-
3
- # Maps any image path to its correct form subfolder.
4
- # FIXED: was only handling form1a/form2a — missed form3a and form90.
5
- def detect_folder(image_path):
6
- for form in ['form1a', 'form2a', 'form3a', 'form90']:
7
- if form in image_path:
8
- return form
9
- return 'form1a' # safe fallback
10
-
11
- for split in ['train', 'val']:
12
- ann_file = f'data/{split}_annotations.json'
13
- if not os.path.exists(ann_file):
14
- print(f'SKIP: {ann_file} not found')
15
- continue
16
-
17
- with open(ann_file) as f:
18
- data = json.load(f)
19
-
20
- fixed = []
21
- skipped = 0
22
- for d in data:
23
- # Support both old key names ('image'/'label') and new ('image_path'/'text')
24
- image_val = d.get('image') or d.get('image_path', '')
25
- text_val = d.get('label') or d.get('text', '')
26
-
27
- if not image_val or not text_val:
28
- skipped += 1
29
- continue
30
-
31
- filename = os.path.basename(image_val)
32
- folder = detect_folder(image_val)
33
- fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})
34
-
35
- with open(ann_file, 'w') as f:
36
- json.dump(fixed, f, indent=2)
37
-
38
- print(f'{split}: {len(fixed)} fixed, {skipped} skipped')
39
-
40
- print('Done!')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/fix_data.py DELETED
@@ -1,770 +0,0 @@
1
- """
2
- fix_data.py
3
- ===========
4
- Generates synthetic training images for the Civil Registry OCR system.
5
-
6
- Run this ONCE before training to create your dataset.
7
-
8
- STEP ORDER:
9
- 1. python generate_ph_names.py <- generates data/ph_names.json
10
- 2. python fix_data.py <- generates all training images (THIS FILE)
11
- 3. python train.py <- trains the CRNN model
12
-
13
- WHAT IT GENERATES:
14
- - Printed text images of names, dates, places, and other form fields
15
- - Covers all 4 form types: birth, death, marriage, marriage license
16
- - Splits into train (90%) and val (10%)
17
- - Writes data/train_annotations.json and data/val_annotations.json
18
-
19
- OUTPUT STRUCTURE:
20
- data/
21
- train/
22
- form1a/ <- birth certificate fields
23
- form2a/ <- death certificate fields
24
- form3a/ <- marriage certificate fields
25
- form90/ <- marriage license fields
26
- val/
27
- form1a/
28
- form2a/
29
- form3a/
30
- form90/
31
- train_annotations.json
32
- val_annotations.json
33
- """
34
-
35
- import os
36
- import json
37
- import random
38
- import numpy as np
39
- from pathlib import Path
40
- from PIL import Image, ImageDraw, ImageFont, ImageFilter
41
-
42
-
43
- # ─────────────────────────────────────────────────────────────────────────────
44
- # CONFIG
45
- # ─────────────────────────────────────────────────────────────────────────────
46
-
47
- IMG_WIDTH = 512
48
- IMG_HEIGHT = 64
49
- FONT_SIZE = 22
50
- VAL_SPLIT = 0.10
51
- RANDOM_SEED = 42
52
-
53
- SAMPLES_PER_FORM = {
54
- 'form1a': 6000,
55
- 'form2a': 4000,
56
- 'form3a': 4000,
57
- 'form90': 2000,
58
- }
59
-
60
- PH_NAMES_FILE = 'data/ph_names.json'
61
-
62
- random.seed(RANDOM_SEED)
63
-
64
-
65
- # ─────────────────────────────────────────────────────────────────────────────
66
- # FONT LOADER
67
- # ─────────────────────────────────────────────────────────────────────────────
68
-
69
- def load_font(size=FONT_SIZE):
70
- """Load a single font β€” used as fallback. Prefer load_font_pool()."""
71
- for fp in [
72
- 'arial.ttf', 'Arial.ttf',
73
- '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
74
- '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
75
- '/System/Library/Fonts/Helvetica.ttc',
76
- 'C:/Windows/Fonts/arial.ttf',
77
- 'C:/Windows/Fonts/calibri.ttf',
78
- ]:
79
- try:
80
- return ImageFont.truetype(fp, size)
81
- except Exception:
82
- continue
83
- print("WARNING: Could not load a TrueType font. Using default bitmap font.")
84
- print(" Prediction accuracy may be lower.")
85
- return ImageFont.load_default()
86
-
87
-
88
- def load_font_pool(size=FONT_SIZE):
89
- """
90
- Load a pool of diverse fonts so the model trains on varied typefaces.
91
- Using only one font causes the model to overfit to that font's style and
92
- fail on real civil registry documents which use mixed fonts.
93
- Returns a list of at least 1 font; caller picks randomly per image.
94
- """
95
- candidates = [
96
- # Sans-serif (most common in PH civil registry printed forms)
97
- 'arial.ttf', 'Arial.ttf',
98
- '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
99
- '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
100
- '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
101
- 'C:/Windows/Fonts/arial.ttf',
102
- 'C:/Windows/Fonts/arialbd.ttf',
103
- 'C:/Windows/Fonts/calibri.ttf',
104
- 'C:/Windows/Fonts/calibrib.ttf',
105
- # Serif (used in older typewriter-style registry entries)
106
- '/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf',
107
- '/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf',
108
- 'C:/Windows/Fonts/times.ttf',
109
- 'C:/Windows/Fonts/Georgia.ttf',
110
- '/System/Library/Fonts/Times.ttc',
111
- # Mono (typewriter β€” common in pre-2000 civil registry forms)
112
- '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
113
- '/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
114
- 'C:/Windows/Fonts/cour.ttf',
115
- # Condensed / narrow (space-saving fonts used in registry tables)
116
- 'C:/Windows/Fonts/arialn.ttf',
117
- '/usr/share/fonts/truetype/ubuntu/UbuntuCondensed-Regular.ttf',
118
- ]
119
- pool = []
120
- for fp in candidates:
121
- try:
122
- pool.append(ImageFont.truetype(fp, size))
123
- except Exception:
124
- continue
125
- if not pool:
126
- print("WARNING: No TrueType fonts found. Using default bitmap font.")
127
- pool.append(ImageFont.load_default())
128
- else:
129
- print(f" ✓ Font pool loaded: {len(pool)} font(s) available")
130
- return pool
131
-
132
-
133
- # ─────────────────────────────────────────────────────────────────────────────
134
- # IMAGE RENDERER
135
- # ─────────────────────────────────────────────────────────────────────────────
136
-
137
- def render_text_image(text: str, font, width=IMG_WIDTH, height=IMG_HEIGHT,
138
- handwriting=False) -> Image.Image:
139
- """
140
- Render text on a white background, centered.
141
- handwriting=True applies handwriting-style augmentations.
142
- """
143
- img = Image.new('RGB', (width, height), color=(255, 255, 255))
144
- draw = ImageDraw.Draw(img)
145
-
146
- bbox = draw.textbbox((0, 0), text, font=font)
147
- tw = bbox[2] - bbox[0]
148
- th = bbox[3] - bbox[1]
149
- x = max(4, (width - tw) // 2)
150
- y = max(4, (height - th) // 2)
151
-
152
- if not handwriting:
153
- # ── PRINTED mode ──────────────────────────────────────
154
- shade = random.randint(0, 40)
155
- draw.text((x, y), text, fill=(shade, shade, shade), font=font)
156
-
157
- else:
158
- # ── HANDWRITING simulation mode ───────────────────────
159
- # 1. Pen color β€” blue-black ballpen
160
- r = random.randint(0, 60)
161
- g = random.randint(0, 60)
162
- b = random.randint(0, 120)
163
- ink_color = (r, g, b)
164
-
165
- # 2. Per-character y-wobble (unsteady hand)
166
- if random.choice([True, False]) and len(text) > 1:
167
- char_x = x
168
- for ch in text:
169
- y_offset = random.randint(-2, 2)
170
- draw.text((char_x, y + y_offset), ch, fill=ink_color, font=font)
171
- ch_bbox = draw.textbbox((0, 0), ch, font=font)
172
- char_x += (ch_bbox[2] - ch_bbox[0]) + random.randint(-1, 1)
173
- else:
174
- draw.text((x, y), text, fill=ink_color, font=font)
175
-
176
- # 3. Pixel-level augmentation
177
- arr = np.array(img).astype(np.float32)
178
-
179
- # 4. Ink bleed
180
- if random.random() < 0.5:
181
- img_pil = Image.fromarray(arr.astype(np.uint8))
182
- img_pil = img_pil.filter(
183
- ImageFilter.GaussianBlur(radius=random.uniform(0.3, 0.7)))
184
- arr = np.array(img_pil).astype(np.float32)
185
-
186
- # 5. Paper texture noise
187
- noise_map = np.random.normal(0, random.uniform(3, 10), arr.shape)
188
- arr = np.clip(arr + noise_map, 0, 255)
189
-
190
- # 6. Scan shadow patch
191
- if random.random() < 0.3:
192
- patch_x = random.randint(0, width - 20)
193
- patch_w = random.randint(10, 60)
194
- arr[:, patch_x:patch_x + patch_w] *= random.uniform(0.88, 0.97)
195
- arr = np.clip(arr, 0, 255)
196
-
197
- img = Image.fromarray(arr.astype(np.uint8))
198
-
199
- # 7. Pen tilt rotation (+-3 degrees)
200
- if random.random() < 0.6:
201
- angle = random.uniform(-3, 3)
202
- img = img.rotate(angle, fillcolor=(255, 255, 255), expand=False)
203
-
204
- return img
205
-
206
-
207
- # ─────────────────────────────────────────────────────────────────────────────
208
- # NAME / DATA POOLS
209
- # ─────────────────────────────────────────────────────────────────────────────
210
-
211
- # Populated at runtime from ph_names.json via load_ph_names()
212
- MIDDLE_NAMES = []
213
-
214
- SUFFIXES = ['Jr.', 'Sr.', 'II', 'III', '']
215
-
216
- MONTHS = [
217
- 'January', 'February', 'March', 'April', 'May', 'June',
218
- 'July', 'August', 'September', 'October', 'November', 'December',
219
- ]
220
-
221
- CITIES = [
222
- # NCR
223
- 'Manila', 'Quezon City', 'Caloocan', 'Pasig', 'Makati',
224
- 'Taguig', 'Paranaque', 'Pasay', 'Las Pinas', 'Muntinlupa',
225
- 'Marikina', 'Valenzuela', 'Malabon', 'Navotas', 'Mandaluyong',
226
- 'San Juan', 'Pateros',
227
- # Luzon
228
- 'Tarlac City', 'Angeles City', 'San Fernando', 'Olongapo',
229
- 'Cabanatuan', 'San Jose del Monte', 'Bacoor', 'Imus', 'Dasmarinas',
230
- 'Antipolo', 'Binangonan', 'Taytay', 'Santa Rosa', 'Calamba',
231
- 'San Pablo', 'Lucena', 'Batangas City', 'Lipa', 'Naga City',
232
- 'Legazpi', 'Sorsogon City', 'Tuguegarao', 'Ilagan', 'Santiago City',
233
- 'Cauayan', 'San Fernando (La Union)', 'Vigan', 'Laoag',
234
- 'Dagupan', 'San Carlos', 'Urdaneta', 'Baguio City',
235
- # Visayas
236
- 'Cebu City', 'Mandaue', 'Lapu-Lapu', 'Talisay', 'Danao',
237
- 'Toledo', 'Carcar', 'Bacolod', 'Bago', 'Sagay', 'Victorias',
238
- 'Iloilo City', 'Passi', 'Roxas City', 'Kalibo',
239
- 'Tacloban', 'Ormoc', 'Palo', 'Catbalogan', 'Calbayog',
240
- 'Tagbilaran', 'Dumaguete', 'Tanjay', 'Bayawan', 'Kabankalan',
241
- # Mindanao
242
- 'Davao City', 'Tagum', 'Panabo', 'Digos', 'Mati',
243
- 'General Santos', 'Koronadal', 'Kidapawan', 'Cotabato City',
244
- 'Cagayan de Oro', 'Iligan', 'Ozamiz', 'Oroquieta', 'Tangub',
245
- 'Butuan', 'Cabadbaran', 'Surigao City', 'Bislig', 'Bayugan',
246
- 'Zamboanga City', 'Pagadian', 'Dipolog', 'Dapitan',
247
- 'Marawi', 'Malaybalay', 'Valencia',
248
- ]
249
-
250
- PROVINCES = [
251
- # Luzon
252
- 'Tarlac', 'Pampanga', 'Bulacan', 'Nueva Ecija', 'Bataan',
253
- 'Zambales', 'Aurora', 'Rizal', 'Cavite', 'Laguna',
254
- 'Batangas', 'Quezon', 'Marinduque', 'Occidental Mindoro',
255
- 'Oriental Mindoro', 'Palawan', 'Romblon',
256
- 'Camarines Norte', 'Camarines Sur', 'Albay', 'Sorsogon',
257
- 'Catanduanes', 'Masbate',
258
- 'Pangasinan', 'La Union', 'Benguet', 'Ifugao', 'Mountain Province',
259
- 'Kalinga', 'Apayao', 'Abra', 'Ilocos Norte', 'Ilocos Sur',
260
- 'Cagayan', 'Isabela', 'Nueva Vizcaya', 'Quirino',
261
- 'Metro Manila',
262
- # Visayas
263
- 'Cebu', 'Bohol', 'Negros Oriental', 'Siquijor',
264
- 'Negros Occidental', 'Iloilo', 'Capiz', 'Aklan', 'Antique',
265
- 'Guimaras', 'Leyte', 'Southern Leyte', 'Samar', 'Eastern Samar',
266
- 'Northern Samar', 'Biliran',
267
- # Mindanao
268
- 'Davao del Sur', 'Davao del Norte', 'Davao Oriental',
269
- 'Davao Occidental', 'Davao de Oro',
270
- 'South Cotabato', 'Sarangani', 'Sultan Kudarat', 'North Cotabato',
271
- 'Misamis Oriental', 'Misamis Occidental', 'Camiguin',
272
- 'Bukidnon', 'Lanao del Norte', 'Lanao del Sur',
273
- 'Maguindanao', 'Basilan', 'Sulu', 'Tawi-Tawi',
274
- 'Zamboanga del Sur', 'Zamboanga del Norte', 'Zamboanga Sibugay',
275
- 'Agusan del Norte', 'Agusan del Sur', 'Surigao del Norte',
276
- 'Surigao del Sur', 'Dinagat Islands',
277
- ]
278
-
279
- BARANGAYS = [
280
- 'Brgy. San Jose', 'Brgy. Sta. Maria', 'Brgy. San Antonio',
281
- 'Brgy. Santo Nino', 'Brgy. Poblacion', 'Brgy. San Isidro',
282
- 'Brgy. San Pedro', 'Brgy. San Miguel', 'Brgy. Mabini',
283
- 'Brgy. Rizal', 'Brgy. Magsaysay', 'Brgy. Quezon',
284
- 'Brgy. Bagong Silang', 'Brgy. Bagumbayan', 'Brgy. Batasan Hills',
285
- 'Brgy. Commonwealth', 'Brgy. Culiat', 'Brgy. Fairview',
286
- 'Brgy. Holy Spirit', 'Brgy. Kamuning', 'Brgy. Laging Handa',
287
- 'Brgy. Malaya', 'Brgy. Masagana', 'Brgy. Pinyahan',
288
- 'Brgy. Roxas', 'Brgy. Sacred Heart', 'Brgy. San Roque',
289
- 'Brgy. Santa Cruz', 'Brgy. Santa Teresita', 'Brgy. Santo Domingo',
290
- 'Brgy. Silangan', 'Brgy. South Triangle', 'Brgy. Tagumpay',
291
- 'Brgy. Tandang Sora', 'Brgy. Vasra', 'Brgy. White Plains',
292
- ]
293
-
294
- STREETS = [
295
- 'Mabini St.', 'Rizal Ave.', 'MacArthur Hwy.', 'Quezon Blvd.',
296
- 'Gen. Luna St.', 'Bonifacio St.', 'Aguinaldo St.', 'Burgos St.',
297
- 'Del Pilar St.', 'Gomez St.', 'Jacinto St.', 'Lapu-Lapu St.',
298
- 'Lopez Jaena St.', 'Luna St.', 'Osmena Blvd.', 'Padre Faura St.',
299
- 'Palma St.', 'Plaridel St.', 'Recto Ave.', 'Roxas Blvd.',
300
- 'San Andres St.', 'Shaw Blvd.', 'Taft Ave.', 'Tandang Sora Ave.',
301
- 'Timog Ave.', 'Tuazon Blvd.', 'Visayas Ave.', 'Aurora Blvd.',
302
- 'EDSA', 'Espana Blvd.', 'Katipunan Ave.', 'Marcos Hwy.',
303
- 'Ortigas Ave.', 'Quirino Ave.',
304
- ]
305
-
306
- RELIGIONS = [
307
- 'Roman Catholic', 'Catholic', 'Islam', 'Muslim',
308
- 'Iglesia ni Cristo', 'INC', 'Baptist', 'Methodist',
309
- 'Seventh Day Adventist', 'Born Again Christian', 'Aglipayan',
310
- ]
311
-
312
- OCCUPATIONS = [
313
- 'Farmer', 'Teacher', 'Engineer', 'Nurse', 'Doctor',
314
- 'Laborer', 'Housewife', 'Driver', 'Carpenter', 'Vendor',
315
- 'Student', 'OFW', 'Fisherman', 'Mechanic', 'Electrician',
316
- 'Police Officer', 'Military', 'Government Employee',
317
- 'Business Owner', 'Retired',
318
- ]
319
-
320
- CIVIL_STATUSES = ['Single', 'Married', 'Widowed', 'Legally Separated']
321
-
322
- CITIZENSHIPS = ['Filipino', 'Filipino', 'Filipino', 'American',
323
- 'Chinese', 'Japanese', 'Korean']
324
-
325
- DEATH_CAUSES = [
326
- 'Cardio-Respiratory Arrest', 'Hypertensive Cardiovascular Disease',
327
- 'Acute Myocardial Infarction', 'Cerebrovascular Accident',
328
- 'Pneumonia', 'Septicemia', 'Renal Failure', 'Diabetes Mellitus',
329
- 'Pulmonary Tuberculosis', 'Cancer of the Lung',
330
- 'Chronic Obstructive Pulmonary Disease', 'Liver Cirrhosis',
331
- 'Dengue Hemorrhagic Fever', 'Acute Gastroenteritis',
332
- 'Congestive Heart Failure',
333
- ]
334
-
335
- ATTENDANT_TYPES = [
336
- 'Private Physician', 'Public Health Officer',
337
- 'Hospital Authority', 'Hilot', 'None',
338
- ]
339
-
340
-
341
- # ─────────────────────────────────────────────────────────────────────────────
342
- # NAME LOADER
343
- # ─────────────────────────────────────────────────────────────────────────────
344
-
345
- def load_ph_names():
346
- """
347
- Load Filipino names from ph_names.json.
348
- Returns (first_names, last_names, middle_names).
349
- Falls back to built-in lists if JSON not found.
350
- """
351
- if os.path.exists(PH_NAMES_FILE):
352
- with open(PH_NAMES_FILE, 'r', encoding='utf-8') as f:
353
- data = json.load(f)
354
- first_names = data['first_names']['all']
355
- last_names = data['last_names']
356
- # Load middle_names from JSON (added by updated generate_ph_names.py)
357
- # Falls back to last_names if key missing (older ph_names.json)
358
- middle_names = data.get('middle_names', last_names)
359
- print(f" Loaded ph_names.json: "
360
- f"{len(first_names)} first, "
361
- f"{len(last_names)} last, "
362
- f"{len(middle_names)} middle names")
363
- else:
364
- print(f" WARNING: {PH_NAMES_FILE} not found.")
365
- print(f" Using built-in fallback names.")
366
- print(f" For better results run: python generate_ph_names.py first.")
367
- first_names = [
368
- 'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos',
369
- 'Elena', 'Ramon', 'Lucia', 'Eduardo', 'Carmen', 'Antonio',
370
- 'Isabel', 'Francisco', 'Gloria', 'Roberto', 'Corazon',
371
- 'Ricardo', 'Remedios', 'Manuel', 'Teresita', 'Andres',
372
- 'Lourdes', 'Fernando', 'Maricel', 'Rolando', 'Rowena',
373
- 'Danilo', 'Cristina', 'Ernesto', 'Marilou', 'Renato',
374
- 'Felicidad', 'Alfredo', 'Natividad', 'Domingo', 'Milagros',
375
- ]
376
- last_names = [
377
- 'Santos', 'Reyes', 'Cruz', 'Bautista', 'Ocampo', 'Garcia',
378
- 'Mendoza', 'Torres', 'Flores', 'Aquino', 'Dela Cruz',
379
- 'Del Rosario', 'San Jose', 'De Guzman', 'Villanueva',
380
- 'Gonzales', 'Ramos', 'Diaz', 'Castro', 'Morales',
381
- 'Lim', 'Tan', 'Go', 'Chua', 'Sy', 'Ong',
382
- 'Macaraeg', 'Pascual', 'Buenaventura', 'Concepcion',
383
- 'Manalo', 'Soriano', 'Evangelista', 'Salazar', 'Tolentino',
384
- ]
385
- middle_names = last_names
386
- return first_names, last_names, middle_names
387
-
388
-
389
- # ─────────────────────────────────────────────────────────────────────────────
390
- # TEXT GENERATORS
391
- # ─────────────────────────────────────────────────────────────────────────────
392
-
393
- def gen_full_name(first_names, last_names, with_suffix=True):
394
- first = random.choice(first_names)
395
- middle = random.choice(MIDDLE_NAMES) if MIDDLE_NAMES else random.choice(last_names)
396
- last = random.choice(last_names)
397
- suffix = random.choice(SUFFIXES) if with_suffix else ''
398
- name = f"{first} {middle} {last}"
399
- if suffix:
400
- name += f" {suffix}"
401
- return name
402
-
403
-
404
- def gen_first_name(first_names):
405
- return random.choice(first_names)
406
-
407
-
408
- def gen_last_name(last_names):
409
- return random.choice(last_names)
410
-
411
-
412
- def gen_middle_name(last_names):
413
- # Always draw from MIDDLE_NAMES (700+ entries from ph_names.json)
414
- pool = MIDDLE_NAMES if MIDDLE_NAMES else last_names
415
- return random.choice(pool)
416
-
417
-
418
- def gen_date_slash():
419
- month = random.randint(1, 12)
420
- day = random.randint(1, 28)
421
- year = random.randint(1930, 2024)
422
- return f"{month:02d}/{day:02d}/{year}"
423
-
424
-
425
- def gen_date_long():
426
- month = random.choice(MONTHS)
427
- day = random.randint(1, 28)
428
- year = random.randint(1930, 2024)
429
- return f"{month} {day}, {year}"
430
-
431
-
432
- def gen_date_day():
433
- return str(random.randint(1, 28))
434
-
435
-
436
- def gen_date_month():
437
- return random.choice(MONTHS)
438
-
439
-
440
- def gen_date_year():
441
- return str(random.randint(1930, 2024))
442
-
443
-
444
- def gen_age():
445
- return str(random.randint(1, 95))
446
-
447
-
448
- def gen_place_full():
449
- return (f"{random.choice(BARANGAYS)}, "
450
- f"{random.choice(CITIES)}, "
451
- f"{random.choice(PROVINCES)}")
452
-
453
-
454
- def gen_place_city():
455
- return random.choice(CITIES)
456
-
457
-
458
- def gen_place_province():
459
- return random.choice(PROVINCES)
460
-
461
-
462
- def gen_address():
463
- num = random.randint(1, 999)
464
- st = random.choice(STREETS)
465
- return f"{num} {st}, {random.choice(CITIES)}"
466
-
467
-
468
- def gen_registry_no():
469
- year = random.randint(2000, 2024)
470
- seq = random.randint(1, 9999)
471
- return f"{year}-{seq:04d}"
472
-
473
-
474
- def gen_sex():
475
- return random.choice(['Male', 'Female'])
476
-
477
-
478
- def gen_religion():
479
- return random.choice(RELIGIONS)
480
-
481
-
482
- def gen_occupation():
483
- return random.choice(OCCUPATIONS)
484
-
485
-
486
- def gen_civil_status():
487
- return random.choice(CIVIL_STATUSES)
488
-
489
-
490
- def gen_citizenship():
491
- return random.choice(CITIZENSHIPS)
492
-
493
-
494
- def gen_weight():
495
- return f"{random.randint(1500, 4500)} grams"
496
-
497
-
498
- def gen_death_cause():
499
- return random.choice(DEATH_CAUSES)
500
-
501
-
502
- def gen_attendant():
503
- return random.choice(ATTENDANT_TYPES)
504
-
505
-
506
- # ─────────────────────────────────────────────────────────────────────────────
507
- # FORM FIELD DEFINITIONS
508
- # ─────────────────────────────────────────────────────────────────────────────
509
-
510
- def get_form_fields(form_type, first_names, last_names):
511
- fn = first_names
512
- ln = last_names
513
-
514
- if form_type == 'form1a': # Birth Certificate
515
- return [
516
- ('province', lambda: gen_place_province()),
517
- ('registry_no', lambda: gen_registry_no()),
518
- ('city_municipality', lambda: gen_place_city()),
519
- ('child_first_name', lambda: gen_first_name(fn)),
520
- ('child_middle_name', lambda: gen_middle_name(ln)),
521
- ('child_last_name', lambda: gen_last_name(ln)),
522
- ('sex', lambda: gen_sex()),
523
- ('dob_day', lambda: gen_date_day()),
524
- ('dob_month', lambda: gen_date_month()),
525
- ('dob_year', lambda: gen_date_year()),
526
- ('place_birth_hospital', lambda: f"Ospital ng {gen_place_city()}"),
527
- ('place_birth_city', lambda: gen_place_city()),
528
- ('place_birth_province', lambda: gen_place_province()),
529
- ('weight_at_birth', lambda: gen_weight()),
530
- ('type_of_birth', lambda: random.choice(['Single', 'Twin', 'Triplet'])),
531
- ('mother_first_name', lambda: gen_first_name(fn)),
532
- ('mother_middle_name', lambda: gen_middle_name(ln)),
533
- ('mother_last_name', lambda: gen_last_name(ln)),
534
- ('mother_citizenship', lambda: gen_citizenship()),
535
- ('mother_religion', lambda: gen_religion()),
536
- ('mother_occupation', lambda: gen_occupation()),
537
- ('mother_age_at_birth', lambda: str(random.randint(16, 45))),
538
- ('mother_residence_house', lambda: gen_address()),
539
- ('mother_residence_city', lambda: gen_place_city()),
540
- ('mother_residence_province', lambda: gen_place_province()),
541
- ('father_first_name', lambda: gen_first_name(fn)),
542
- ('father_middle_name', lambda: gen_middle_name(ln)),
543
- ('father_last_name', lambda: gen_last_name(ln)),
544
- ('father_citizenship', lambda: gen_citizenship()),
545
- ('father_religion', lambda: gen_religion()),
546
- ('father_occupation', lambda: gen_occupation()),
547
- ('father_age_at_birth', lambda: str(random.randint(18, 55))),
548
- ('parents_marriage_month', lambda: gen_date_month()),
549
- ('parents_marriage_day', lambda: gen_date_day()),
550
- ('parents_marriage_year', lambda: gen_date_year()),
551
- ('parents_marriage_city', lambda: gen_place_city()),
552
- ('informant_name', lambda: gen_full_name(fn, ln, False)),
553
- ('informant_address', lambda: gen_address()),
554
- ('informant_date', lambda: gen_date_slash()),
555
- ]
556
-
557
- elif form_type == 'form2a': # Death Certificate
558
- return [
559
- ('province', lambda: gen_place_province()),
560
- ('registry_no', lambda: gen_registry_no()),
561
- ('city_municipality', lambda: gen_place_city()),
562
- ('deceased_first_name', lambda: gen_first_name(fn)),
563
- ('deceased_middle_name', lambda: gen_middle_name(ln)),
564
- ('deceased_last_name', lambda: gen_last_name(ln)),
565
- ('sex', lambda: gen_sex()),
566
- ('religion', lambda: gen_religion()),
567
- ('age_years', lambda: gen_age()),
568
- ('place_death_full', lambda: f"{gen_place_city()}, {gen_place_province()}"),
569
- ('dod_day', lambda: gen_date_day()),
570
- ('dod_month', lambda: gen_date_month()),
571
- ('dod_year', lambda: gen_date_year()),
572
- ('citizenship', lambda: gen_citizenship()),
573
- ('residence_full', lambda: gen_address()),
574
- ('civil_status', lambda: gen_civil_status()),
575
- ('occupation', lambda: gen_occupation()),
576
- ('cause_immediate', lambda: gen_death_cause()),
577
- ('cause_antecedent', lambda: gen_death_cause()),
578
- ('cause_underlying', lambda: gen_death_cause()),
579
- ('cause_other', lambda: gen_death_cause()),
580
- ('informant_name', lambda: gen_full_name(fn, ln, False)),
581
- ('informant_address', lambda: gen_address()),
582
- ('informant_date', lambda: gen_date_slash()),
583
- ]
584
-
585
- elif form_type == 'form3a': # Marriage Certificate
586
- return [
587
- ('province', lambda: gen_place_province()),
588
- ('city_municipality', lambda: gen_place_city()),
589
- ('registry_no', lambda: gen_registry_no()),
590
- ('husband_first_name', lambda: gen_first_name(fn)),
591
- ('husband_middle_name', lambda: gen_middle_name(ln)),
592
- ('husband_last_name', lambda: gen_last_name(ln)),
593
- ('wife_first_name', lambda: gen_first_name(fn)),
594
- ('wife_middle_name', lambda: gen_middle_name(ln)),
595
- ('wife_last_name', lambda: gen_last_name(ln)),
596
- ('husband_dob_day', lambda: gen_date_day()),
597
- ('husband_dob_month', lambda: gen_date_month()),
598
- ('husband_dob_year', lambda: gen_date_year()),
599
- ('husband_age', lambda: gen_age()),
600
- ('wife_dob_day', lambda: gen_date_day()),
601
- ('wife_dob_month', lambda: gen_date_month()),
602
- ('wife_dob_year', lambda: gen_date_year()),
603
- ('wife_age', lambda: gen_age()),
604
- ('husband_place_birth_city', lambda: gen_place_city()),
605
- ('husband_place_birth_province', lambda: gen_place_province()),
606
- ('wife_place_birth_city', lambda: gen_place_city()),
607
- ('wife_place_birth_province', lambda: gen_place_province()),
608
- ('husband_citizenship', lambda: gen_citizenship()),
609
- ('wife_citizenship', lambda: gen_citizenship()),
610
- ('husband_religion', lambda: gen_religion()),
611
- ('wife_religion', lambda: gen_religion()),
612
- ('husband_civil_status', lambda: gen_civil_status()),
613
- ('wife_civil_status', lambda: gen_civil_status()),
614
- ('husband_father_first', lambda: gen_first_name(fn)),
615
- ('husband_father_last', lambda: gen_last_name(ln)),
616
- ('wife_father_first', lambda: gen_first_name(fn)),
617
- ('wife_father_last', lambda: gen_last_name(ln)),
618
- ('husband_mother_first', lambda: gen_first_name(fn)),
619
- ('husband_mother_last', lambda: gen_last_name(ln)),
620
- ('wife_mother_first', lambda: gen_first_name(fn)),
621
- ('wife_mother_last', lambda: gen_last_name(ln)),
622
- ('place_marriage_city', lambda: gen_place_city()),
623
- ('place_marriage_province', lambda: gen_place_province()),
624
- ('date_marriage_day', lambda: gen_date_day()),
625
- ('date_marriage_month', lambda: gen_date_month()),
626
- ('date_marriage_year', lambda: gen_date_year()),
627
- ]
628
-
629
- elif form_type == 'form90': # Marriage License Application
630
- return [
631
- ('province', lambda: gen_place_province()),
632
- ('city_municipality', lambda: gen_place_city()),
633
- ('registry_no', lambda: gen_registry_no()),
634
- ('husband_first_name', lambda: gen_first_name(fn)),
635
- ('husband_middle_name', lambda: gen_middle_name(ln)),
636
- ('husband_last_name', lambda: gen_last_name(ln)),
637
- ('wife_first_name', lambda: gen_first_name(fn)),
638
- ('wife_middle_name', lambda: gen_middle_name(ln)),
639
- ('wife_last_name', lambda: gen_last_name(ln)),
640
- ('husband_age', lambda: gen_age()),
641
- ('wife_age', lambda: gen_age()),
642
- ('husband_citizenship', lambda: gen_citizenship()),
643
- ('wife_citizenship', lambda: gen_citizenship()),
644
- ('husband_residence', lambda: gen_address()),
645
- ('wife_residence', lambda: gen_address()),
646
- ('application_date', lambda: gen_date_slash()),
647
- ]
648
-
649
- return []
650
-
651
-
652
- # ─────────────────────────────────────────────────────────────────────────────
653
- # MAIN GENERATOR
654
- # ─────────────────────────────────────────────────────────────────────────────
655
-
656
- def generate_dataset():
657
- print("=" * 65)
658
- print(" fix_data.py β€” Synthetic Training Data Generator")
659
- print("=" * 65)
660
-
661
- # Load Filipino names
662
- print("\n[1/4] Loading Filipino names...")
663
- first_names, last_names, middle_names = load_ph_names()
664
-
665
- # Populate global MIDDLE_NAMES so all generators use the full 700+ pool
666
- global MIDDLE_NAMES
667
- MIDDLE_NAMES.clear()
668
- MIDDLE_NAMES.extend(middle_names)
669
- print(f" Middle names pool active: {len(MIDDLE_NAMES)} entries")
670
-
671
- # Create output directories
672
- print("\n[2/4] Creating output directories...")
673
- for split in ['train', 'val']:
674
- for form in ['form1a', 'form2a', 'form3a', 'form90']:
675
- Path(f'data/{split}/{form}').mkdir(parents=True, exist_ok=True)
676
- print(" βœ“ Directories ready")
677
-
678
- # Load font pool β€” multiple typefaces so model generalises across fonts
679
- print("\n[3/4] Loading fonts...")
680
- font_pool = load_font_pool(FONT_SIZE)
681
- print(f" βœ“ {len(font_pool)} font(s) loaded")
682
-
683
- # Generate images
684
- print("\n[4/4] Generating images...")
685
- print(f" {'Form':<10} {'Total':>7} {'Train':>7} {'Val':>7}")
686
- print(f" {'-'*35}")
687
-
688
- train_annotations = []
689
- val_annotations = []
690
- total_generated = 0
691
-
692
- for form_type, n_samples in SAMPLES_PER_FORM.items():
693
- fields = get_form_fields(form_type, first_names, last_names)
694
- samples_per_field = max(1, n_samples // len(fields))
695
- form_train = 0
696
- form_val = 0
697
-
698
- # Pre-build shuffled val assignment for unbiased 10% split
699
- total_this_form = samples_per_field * len(fields)
700
- val_flags = [False] * total_this_form
701
- val_indices = random.sample(
702
- range(total_this_form),
703
- max(1, int(total_this_form * VAL_SPLIT))
704
- )
705
- for vi in val_indices:
706
- val_flags[vi] = True
707
-
708
- img_idx = 0
709
- for field_name, generator in fields:
710
- for _ in range(samples_per_field):
711
- text = generator()
712
- if not text or not text.strip():
713
- img_idx += 1
714
- continue
715
-
716
- # 70% handwriting / 30% printed
717
- use_handwriting = random.random() < 0.70
718
- # Pick a random font from the pool each image β€” forces
719
- # the model to generalise across typefaces, not memorise one font
720
- font = random.choice(font_pool)
721
- img = render_text_image(text, font, handwriting=use_handwriting)
722
- fname = f"{field_name}_{img_idx:06d}.jpg"
723
-
724
- is_val = val_flags[img_idx] if img_idx < len(val_flags) else False
725
-
726
- if is_val:
727
- out_path = f"data/val/{form_type}/{fname}"
728
- val_annotations.append({
729
- 'image_path': f"{form_type}/{fname}",
730
- 'text': text,
731
- })
732
- form_val += 1
733
- else:
734
- out_path = f"data/train/{form_type}/{fname}"
735
- train_annotations.append({
736
- 'image_path': f"{form_type}/{fname}",
737
- 'text': text,
738
- })
739
- form_train += 1
740
-
741
- img.save(out_path, quality=95)
742
- img_idx += 1
743
-
744
- total_generated += form_train + form_val
745
- print(f" {form_type:<10} {form_train + form_val:>7,} "
746
- f"{form_train:>7,} {form_val:>7,}")
747
-
748
- # Save annotation files
749
- with open('data/train_annotations.json', 'w', encoding='utf-8') as f:
750
- json.dump(train_annotations, f, indent=2, ensure_ascii=False)
751
-
752
- with open('data/val_annotations.json', 'w', encoding='utf-8') as f:
753
- json.dump(val_annotations, f, indent=2, ensure_ascii=False)
754
-
755
- # Summary
756
- print(f"\n{'=' * 65}")
757
- print(f" DONE!")
758
- print(f"{'=' * 65}")
759
- print(f" Total images generated : {total_generated:,}")
760
- print(f" Train images : {len(train_annotations):,}")
761
- print(f" Val images : {len(val_annotations):,}")
762
- print(f"\n Saved:")
763
- print(f" data/train_annotations.json ({len(train_annotations)} entries)")
764
- print(f" data/val_annotations.json ({len(val_annotations)} entries)")
765
- print(f"\n Next step: python train.py")
766
- print(f"{'=' * 65}")
767
-
768
-
769
- if __name__ == '__main__':
770
- generate_dataset()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/generate_form_samples.py DELETED
@@ -1,389 +0,0 @@
1
- """
2
- generate_form_samples.py
3
- ========================
4
- Generates thousands of synthetic filled civil registry form images
5
- using the blank PDF forms + template_matcher.py coordinates.
6
-
7
- Each form is filled with random Filipino names/dates in handwriting fonts.
8
- Crops are saved with labels β†’ ready for CRNN+CTC fine-tuning.
9
-
10
- Usage:
11
- python generate_form_samples.py
12
-
13
- Output:
14
- data/train/real_forms/ -- cropped field images
15
- data/real_annotations.json -- labels for fine-tuning
16
- """
17
-
18
- import os
19
- import sys
20
- import json
21
- import random
22
- import datetime
23
-
24
- from PIL import Image, ImageDraw, ImageFont
25
-
26
- # ── Paths ─────────────────────────────────────────────────────
27
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
- ROOT_DIR = os.path.dirname(BASE_DIR)
29
- PYTHON_DIR = ROOT_DIR # template_matcher.py is here
30
-
31
- NAMES_FILE = os.path.join(BASE_DIR, 'data', 'ph_names.json')
32
- OUT_IMG_DIR = os.path.join(BASE_DIR, 'data', 'train', 'real_forms')
33
- OUT_ANN = os.path.join(BASE_DIR, 'data', 'real_annotations.json')
34
-
35
- FONTS_DIR = os.path.join(ROOT_DIR, 'test_images', 'handwriting_fonts')
36
-
37
- # Only verified-working Google Fonts URLs
38
- GOOGLE_FONTS = {
39
- 'Kalam-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Regular.ttf',
40
- 'Kalam-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Bold.ttf',
41
- 'Kalam-Light.ttf': 'https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Light.ttf',
42
- 'PatrickHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/patrickhand/PatrickHand-Regular.ttf',
43
- 'IndieFlower-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/indieflower/IndieFlower-Regular.ttf',
44
- 'Handlee-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/handlee/Handlee-Regular.ttf',
45
- 'GochiHand-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/gochihand/GochiHand-Regular.ttf',
46
- 'ArchitectsDaughter.ttf': 'https://github.com/google/fonts/raw/main/ofl/architectsdaughter/ArchitectsDaughter-Regular.ttf',
47
- 'ShadowsIntoLight.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolight/ShadowsIntoLight.ttf',
48
- 'ShadowsIntoLightTwo.ttf': 'https://github.com/google/fonts/raw/main/ofl/shadowsintolighttwo/ShadowsIntoLightTwo-Regular.ttf',
49
- 'Kristi-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/kristi/Kristi-Regular.ttf',
50
- 'AmaticSC-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Regular.ttf',
51
- 'AmaticSC-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/amaticsc/AmaticSC-Bold.ttf',
52
- 'BadScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/badscript/BadScript-Regular.ttf',
53
- 'Sacramento-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/sacramento/Sacramento-Regular.ttf',
54
- 'GreatVibes-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/greatvibes/GreatVibes-Regular.ttf',
55
- 'Allura-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/allura/Allura-Regular.ttf',
56
- 'AlexBrush-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/alexbrush/AlexBrush-Regular.ttf',
57
- 'Parisienne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/parisienne/Parisienne-Regular.ttf',
58
- 'Tangerine-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Regular.ttf',
59
- 'Tangerine-Bold.ttf': 'https://github.com/google/fonts/raw/main/ofl/tangerine/Tangerine-Bold.ttf',
60
- 'Courgette-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/courgette/Courgette-Regular.ttf',
61
- 'Niconne-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/niconne/Niconne-Regular.ttf',
62
- 'MarckScript-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/marckscript/MarckScript-Regular.ttf',
63
- 'Norican-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/norican/Norican-Regular.ttf',
64
- 'Damion-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/damion/Damion-Regular.ttf',
65
- 'Satisfy-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/satisfy/Satisfy-Regular.ttf',
66
- 'Pacifico-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pacifico/Pacifico-Regular.ttf',
67
- 'Italianno-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/italianno/Italianno-Regular.ttf',
68
- 'Pompiere-Regular.ttf': 'https://github.com/google/fonts/raw/main/ofl/pompiere/Pompiere-Regular.ttf',
69
- }
70
-
71
- FONT_PATHS = [
72
- # Downloaded handwriting fonts
73
- *[os.path.join(FONTS_DIR, name) for name in GOOGLE_FONTS],
74
- # Already available
75
- os.path.join(ROOT_DIR, 'test_images', 'Caveat-Regular.ttf'),
76
- # Windows fallbacks
77
- r'C:\Windows\Fonts\segoepr.ttf',
78
- r'C:\Windows\Fonts\segoeprb.ttf',
79
- r'C:\Windows\Fonts\comic.ttf',
80
- ]
81
-
82
- def download_fonts():
83
- """Download handwriting fonts from Google Fonts if not present."""
84
- import urllib.request
85
- os.makedirs(FONTS_DIR, exist_ok=True)
86
- ok = 0
87
- for fname, url in GOOGLE_FONTS.items():
88
- dest = os.path.join(FONTS_DIR, fname)
89
- if os.path.exists(dest) and os.path.getsize(dest) > 10000:
90
- ok += 1
91
- continue
92
- try:
93
- print(f" Downloading {fname}...")
94
- with urllib.request.urlopen(url, timeout=10) as r, open(dest, 'wb') as f:
95
- f.write(r.read())
96
- # Validate: real TTF files are > 10KB
97
- if os.path.getsize(dest) < 10000:
98
- os.remove(dest)
99
- print(f" Skipped {fname} (invalid file)")
100
- else:
101
- ok += 1
102
- except Exception as e:
103
- print(f" Failed {fname}: {e}")
104
- if os.path.exists(dest):
105
- os.remove(dest)
106
- print(f" {ok} fonts ready")
107
-
108
- PDF_FORMS = {
109
- '97': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 97 (MARRIAGE CERTIFICATE).pdf'),
110
- '102': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 102 (BIRTH CERTIFICATE).pdf'),
111
- '103': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 103 (DEATH CERTIFICATE).pdf'),
112
- '90': os.path.join(ROOT_DIR, 'python', 'CRNN+CTC', 'FORM 90-MARRIAGE-LICENCE-FORM.pdf'),
113
- }
114
-
115
- SAMPLES_PER_FORM = 1000 # forms to generate per type
116
- IMG_W = 64
117
- IMG_H = 512
118
-
119
- # ── Load TEMPLATES from template_matcher ─────────────────────
120
- sys.path.insert(0, PYTHON_DIR)
121
- from template_matcher import TEMPLATES
122
-
123
- # ── Load Filipino names ───────────────────────────────────────
124
- def load_names():
125
- if not os.path.exists(NAMES_FILE):
126
- print(f"ERROR: {NAMES_FILE} not found. Run generate_ph_names.py first.")
127
- sys.exit(1)
128
- with open(NAMES_FILE) as f:
129
- data = json.load(f)
130
- return data
131
-
132
- # ── Random data generators ────────────────────────────────────
133
- MONTHS = ['January','February','March','April','May','June',
134
- 'July','August','September','October','November','December']
135
- RELIGIONS = ['Roman Catholic','Islam','Baptist','Iglesia ni Cristo',
136
- 'Seventh Day Adventist','Born Again Christian']
137
- CIVIL_STATUSES = ['Single','Married','Widowed','Legally Separated']
138
- CITIZENSHIPS = ['Filipino','American','Chinese','Japanese']
139
- PROVINCES = ['Cebu','Davao del Sur','Metro Manila','Iloilo','Pampanga',
140
- 'Batangas','Laguna','Cavite','Bulacan','Quezon City']
141
- CITIES = ['Cebu City','Davao City','Manila','Iloilo City','San Fernando',
142
- 'Batangas City','Santa Rosa','Bacoor','Malolos','Quezon City']
143
-
144
- def rand_name(names, key):
145
- pool = names.get(key, ['Juan'])
146
- return random.choice(pool).upper()
147
-
148
- def rand_date():
149
- y = random.randint(1950, 2005)
150
- m = random.randint(1, 12)
151
- d = random.randint(1, 28)
152
- return f"{d:02d}", MONTHS[m-1], str(y)
153
-
154
- def rand_age():
155
- return str(random.randint(18, 80))
156
-
157
- def rand_province():
158
- return random.choice(PROVINCES).upper()
159
-
160
- def rand_city():
161
- return random.choice(CITIES).upper()
162
-
163
- def rand_religion():
164
- return random.choice(RELIGIONS).upper()
165
-
166
- def rand_civil_status():
167
- return random.choice(CIVIL_STATUSES).upper()
168
-
169
- def rand_citizenship():
170
- return random.choice(CITIZENSHIPS).upper()
171
-
172
- def rand_registry_no():
173
- return f"{random.randint(2000,2024)}-{random.randint(1000,9999)}"
174
-
175
- def rand_time():
176
- h = random.randint(6, 18)
177
- m = random.choice(['00','15','30','45'])
178
- return f"{h:02d}:{m} {'AM' if h < 12 else 'PM'}"
179
-
180
- def generate_field_value(field_name, names):
181
- """Generate a plausible random value for a given field name."""
182
- f = field_name.lower()
183
- if 'province' in f: return rand_province()
184
- if 'registry' in f: return rand_registry_no()
185
- if 'city' in f or 'municipality' in f: return rand_city()
186
- if 'first' in f and ('name' in f or 'father' in f or 'mother' in f):
187
- return rand_name(names, 'first')
188
- if 'middle' in f: return rand_name(names, 'middle')
189
- if 'last' in f: return rand_name(names, 'last')
190
- if '_name' in f and 'father' not in f and 'mother' not in f:
191
- return rand_name(names, 'first')
192
- if 'father_name' in f or 'mother_name' in f:
193
- return f"{rand_name(names,'first')} {rand_name(names,'middle')} {rand_name(names,'last')}"
194
- if 'dob_day' in f or 'day' in f: return rand_date()[0]
195
- if 'dob_month' in f or 'month' in f: return rand_date()[1]
196
- if 'dob_year' in f or 'year' in f: return rand_date()[2]
197
- if 'dob' in f and 'day' not in f and 'month' not in f and 'year' not in f:
198
- d,m,y = rand_date(); return f"{d} {m} {y}"
199
- if 'age' in f: return rand_age()
200
- if 'birth' in f and 'place' in f: return rand_city()
201
- if 'place_of_birth' in f: return rand_city()
202
- if 'sex' in f: return random.choice(['MALE','FEMALE'])
203
- if 'citizenship' in f: return rand_citizenship()
204
- if 'residence' in f: return f"{rand_city()}, {rand_province()}"
205
- if 'religion' in f: return rand_religion()
206
- if 'civil_status' in f: return rand_civil_status()
207
- if 'place_of_marriage' in f: return rand_city()
208
- if 'date_of_marriage' in f:
209
- d,m,y = rand_date(); return f"{d} {m} {y}"
210
- if 'time_of_marriage' in f: return rand_time()
211
- if 'marriage_date' in f:
212
- d,m,y = rand_date(); return f"{d} {m} {y}"
213
- if 'marriage_place' in f: return rand_city()
214
- if 'marriage_license' in f: return rand_registry_no()
215
- if 'date_issued' in f:
216
- d,m,y = rand_date(); return f"{d} {m} {y}"
217
- if 'occupation' in f: return random.choice(['FARMER','TEACHER','NURSE','ENGINEER','DRIVER','HOUSEWIFE'])
218
- if 'type_of_birth' in f: return random.choice(['SINGLE','TWIN','TRIPLET'])
219
- if 'birth_order' in f: return random.choice(['1ST','2ND','3RD','4TH'])
220
- if 'weight' in f: return f"{random.randint(2,5)}.{random.randint(0,9)} KG"
221
- if 'cause' in f: return random.choice(['CARDIAC ARREST','PNEUMONIA','DIABETES','HYPERTENSION'])
222
- if 'father_name' in f: return f"{rand_name(names,'first')} {rand_name(names,'last')}"
223
- if 'mother_name' in f: return f"{rand_name(names,'first')} {rand_name(names,'last')}"
224
- return rand_name(names, 'first')
225
-
226
- # ── Load fonts ────────────────────────────────────────────────
227
- def load_fonts():
228
- fonts = []
229
- for path in FONT_PATHS:
230
- if os.path.exists(path):
231
- for size in [14, 16, 18, 20]:
232
- try:
233
- fonts.append(ImageFont.truetype(path, size))
234
- except:
235
- pass
236
- if not fonts:
237
- fonts = [ImageFont.load_default()]
238
- print(f" Loaded {len(fonts)} font variants")
239
- return fonts
240
-
241
- # ── Load blank form image ─────────────────────────────────────
242
- def load_blank_form(form_type):
243
- """Convert PDF to image or use a reference scan as background."""
244
- pdf_path = PDF_FORMS.get(form_type)
245
-
246
- # Try pdf2image first
247
- if pdf_path and os.path.exists(pdf_path):
248
- try:
249
- from pdf2image import convert_from_path
250
- pages = convert_from_path(pdf_path, dpi=150)
251
- if pages:
252
- return pages[0].convert('RGB')
253
- except Exception as e:
254
- print(f" pdf2image failed: {e}")
255
-
256
- # Fallback: use reference image (try png, jpg, jpeg)
257
- for ext in ['png', 'jpg', 'jpeg']:
258
- ref_path = os.path.join(ROOT_DIR, 'references', f'reference_{form_type}.{ext}')
259
- if os.path.exists(ref_path):
260
- return Image.open(ref_path).convert('RGB')
261
- # Also try hyphen variant (e.g. reference-90.jpg)
262
- for ext in ['png', 'jpg', 'jpeg']:
263
- ref_path = os.path.join(ROOT_DIR, 'references', f'reference-{form_type}.{ext}')
264
- if os.path.exists(ref_path):
265
- return Image.open(ref_path).convert('RGB')
266
-
267
- print(f" WARNING: No blank form found for {form_type} β€” skipping")
268
- return None
269
-
270
- # ── Render text on form ───────────────────────────────────────
271
- def render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts):
272
- """Draw handwritten-style text in a field box."""
273
- x1 = int(x1r * img_w)
274
- y1 = int(y1r * img_h)
275
- x2 = int(x2r * img_w)
276
- y2 = int(y2r * img_h)
277
-
278
- box_w = max(x2 - x1, 1)
279
- box_h = max(y2 - y1, 1)
280
-
281
- # Pick a font that fits
282
- font = random.choice(fonts)
283
- for f in fonts:
284
- bbox = f.getbbox(text)
285
- fw = bbox[2] - bbox[0]
286
- fh = bbox[3] - bbox[1]
287
- if fw <= box_w * 0.95 and fh <= box_h * 1.2:
288
- font = f
289
- break
290
-
291
- # Random pen color (dark blue/black like ballpen)
292
- r = random.randint(0, 40)
293
- g = random.randint(0, 40)
294
- b = random.randint(60, 120)
295
- color = (r, g, b)
296
-
297
- # Center text vertically in box
298
- bbox = font.getbbox(text)
299
- fh = bbox[3] - bbox[1]
300
- ty = y1 + (box_h - fh) // 2
301
-
302
- # Slight random x offset
303
- tx = x1 + random.randint(2, max(3, box_w // 10))
304
-
305
- draw.text((tx, ty), text, fill=color, font=font)
306
-
307
- # ── Crop a field ──────────────────────────────────────────────
308
- def crop_field(img, x1r, y1r, x2r, y2r):
309
- w, h = img.size
310
- x1 = max(0, int(x1r * w) - 4)
311
- y1 = max(0, int(y1r * h) - 4)
312
- x2 = min(w, int(x2r * w) + 4)
313
- y2 = min(h, int(y2r * h) + 4)
314
- return img.crop((x1, y1, x2, y2))
315
-
316
- # ── Main ──────────────────────────────────────────────────────
317
- def main():
318
- print("=" * 60)
319
- print(" Form Sample Generator")
320
- print("=" * 60)
321
-
322
- os.makedirs(OUT_IMG_DIR, exist_ok=True)
323
- print("\n Downloading handwriting fonts...")
324
- download_fonts()
325
- names = load_names()
326
- fonts = load_fonts()
327
- annotations = []
328
- total = 0
329
-
330
- for form_type, template in TEMPLATES.items():
331
- print(f"\n Generating Form {form_type}...")
332
-
333
- blank = load_blank_form(form_type)
334
- if blank is None:
335
- continue
336
-
337
- for i in range(SAMPLES_PER_FORM):
338
- # Fresh copy of blank form
339
- form_img = blank.copy()
340
- draw = ImageDraw.Draw(form_img)
341
- img_w, img_h = form_img.size
342
-
343
- field_values = {}
344
- for field_name, coords in template.items():
345
- x1r, y1r, x2r, y2r, _ = coords
346
- text = generate_field_value(field_name, names)
347
- field_values[field_name] = text
348
- render_field(draw, x1r, y1r, x2r, y2r, text, img_w, img_h, fonts)
349
-
350
- # Save full form preview (first sample only)
351
- if i == 0:
352
- preview_path = os.path.join(OUT_IMG_DIR, f'form{form_type}_preview.png')
353
- form_img.save(preview_path)
354
- print(f" Preview saved: {preview_path}")
355
-
356
- # Crop each field and save
357
- for field_name, coords in template.items():
358
- x1r, y1r, x2r, y2r, _ = coords
359
- crop = crop_field(form_img, x1r, y1r, x2r, y2r)
360
- crop = crop.convert('L') # grayscale
361
-
362
- fname = f"form{form_type}_{i:05d}_{field_name}.png"
363
- fpath = os.path.join(OUT_IMG_DIR, fname)
364
- crop.save(fpath)
365
-
366
- annotations.append({
367
- "image_path": f"real_forms/{fname}",
368
- "text": field_values[field_name]
369
- })
370
- total += 1
371
-
372
- if (i + 1) % 100 == 0:
373
- print(f" {i+1}/{SAMPLES_PER_FORM} forms done ({total} crops so far)")
374
-
375
- print(f" Form {form_type} done.")
376
-
377
- # Save annotations
378
- with open(OUT_ANN, 'w') as f:
379
- json.dump(annotations, f, indent=2)
380
-
381
- print(f"\n{'='*60}")
382
- print(f" DONE!")
383
- print(f" Total crops : {total}")
384
- print(f" Annotations : {OUT_ANN}")
385
- print(f" Next step : upload to Kaggle and run fine-tune")
386
- print(f"{'='*60}")
387
-
388
- if __name__ == '__main__':
389
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN+CTC/inference.py CHANGED
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
- checkpoint_path : path to best_model.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
 
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
+ checkpoint_path : path to best_model_v4.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
debug_and_retrain.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import matplotlib.pyplot as plt
3
+
4
+ # Load and show the image
5
+ img = cv2.imread('your_image.png')
6
+ plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
7
+ plt.title('Original Image')
8
+ plt.show()
9
+
10
+ # Preprocess and show
11
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
12
+ thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
13
+ plt.imshow(thresh, cmap='gray')
14
+ plt.title('Thresholded Image')
15
+ plt.show()
16
+
17
+ # Run OCR and print output
18
+ import pytesseract
19
+ text = pytesseract.image_to_string(thresh)
20
+ print("OCR Output:", text)
finetune.py CHANGED
@@ -3,18 +3,20 @@ finetune.py
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
- Loads best_model_iam.pth (already knows real handwriting from IAM),
7
- then trains on real_annotations.json (Filipino names on real form backgrounds).
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
- checkpoints/best_model_final.pth
14
  """
15
 
16
  import os
17
  import sys
 
 
18
  import torch
19
  import torch.nn.functional as F
20
  import torch.optim as optim
@@ -25,13 +27,14 @@ from crnn_model import get_crnn_model
25
  from dataset import CivilRegistryDataset, collate_fn
26
 
27
  # ── Config ────────────────────────────────────────────────────
28
- CHECKPOINT_IN = "checkpoints/best_model_iam.pth"
29
- CHECKPOINT_OUT = "checkpoints/best_model_final.pth"
30
 
31
- REAL_ANN = "data/real_annotations.json" # generated by generate_form_samples.py
32
- ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms (extract_actual_data.py)
33
- SYNTH_ANN = "data/train_annotations.json" # original synthetic data
34
- VAL_ANN = "data/val_annotations.json" # validation set
 
35
 
36
  IMG_HEIGHT = 64
37
  IMG_WIDTH = 512
@@ -42,10 +45,26 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
  # ── Phase settings ────────────────────────────────────────────
43
  PHASES = [
44
  # (name, epochs, lr, freeze_cnn, patience)
45
- ("Phase 1 β€” CNN frozen, adapt to form crops", 20, 1e-4, True, 5),
46
- ("Phase 2 β€” Full model, low LR polish", 15, 1e-5, False, 4),
 
 
47
  ]
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ── Main ──────────────────────────────────────────────────────
50
  def main():
51
  print("=" * 60)
@@ -60,6 +79,11 @@ def main():
60
  print(f"ERROR: {f} not found.")
61
  sys.exit(1)
62
 
 
 
 
 
 
63
  # ── Datasets ──────────────────────────────────────────────
64
  datasets_to_merge = []
65
 
@@ -72,32 +96,23 @@ def main():
72
  datasets_to_merge.append(actual_dataset)
73
  print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
74
  else:
75
- print(f" [!] {ACTUAL_ANN} not found β€” run extract_actual_data.py first")
76
 
77
- # 2. Synthetic on real form backgrounds
78
- if os.path.exists(REAL_ANN):
79
- real_dataset = CivilRegistryDataset(
80
- data_dir="data/train", annotations_file=REAL_ANN,
81
- img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
82
- )
83
- datasets_to_merge.append(real_dataset)
84
- print(f" Real crops : {len(real_dataset)} (synthetic on real backgrounds)")
85
-
86
- # 3. Fully synthetic β€” keep so model doesn't forget basic characters
87
  if os.path.exists(SYNTH_ANN):
88
  synth_dataset = CivilRegistryDataset(
89
- data_dir="data/train", annotations_file=SYNTH_ANN,
90
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
91
  )
92
  datasets_to_merge.append(synth_dataset)
93
  print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
94
 
95
  if not datasets_to_merge:
96
- print("ERROR: No training data found. Run extract_actual_data.py first.")
97
  sys.exit(1)
98
 
99
  val_dataset = CivilRegistryDataset(
100
- data_dir="data/val", annotations_file=VAL_ANN,
101
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
102
  )
103
 
@@ -115,7 +130,7 @@ def main():
115
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
116
  config = ckpt.get('config', {})
117
 
118
- ref_dataset = datasets_to_merge[0] # use whichever dataset was loaded first
119
  model = get_crnn_model(
120
  model_type = config.get('model_type', 'standard'),
121
  img_height = config.get('img_height', 64),
@@ -144,8 +159,8 @@ def main():
144
  batch_size = images.size(0)
145
  if training:
146
  optimizer.zero_grad()
147
- outputs = F.log_softmax(model(images), dim=2)
148
- seq_len = outputs.size(0)
149
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
150
  loss = criterion(outputs, targets, input_lengths, target_lengths)
151
  if not torch.isnan(loss) and not torch.isinf(loss):
@@ -186,6 +201,7 @@ def main():
186
  if vl < best_overall:
187
  best_overall = vl
188
  torch.save({
 
189
  'model_state_dict': model.state_dict(),
190
  'config': config,
191
  'char_to_idx': ref_dataset.char_to_idx,
@@ -201,6 +217,11 @@ def main():
201
  print(f" Early stopping.")
202
  break
203
 
 
 
 
 
 
204
  print(f"\n{'='*60}")
205
  print(f" Fine-tuning complete!")
206
  print(f" Best val loss : {best_overall:.4f}")
@@ -209,4 +230,4 @@ def main():
209
 
210
 
211
  if __name__ == '__main__':
212
- main()
 
3
  ===========
4
  Fine-tune CRNN+CTC on generated civil registry form crops.
5
 
6
+ Continues from best_model_v2.pth, trains on actual_annotations.json
7
+ + train_annotations.json, saves to best_model_v4.pth.
8
 
9
  Usage:
10
  python finetune.py
11
 
12
  Output:
13
+ checkpoints/best_model_v4.pth
14
  """
15
 
16
  import os
17
  import sys
18
+ import json
19
+ import shutil
20
  import torch
21
  import torch.nn.functional as F
22
  import torch.optim as optim
 
27
  from dataset import CivilRegistryDataset, collate_fn
28
 
29
  # ── Config ────────────────────────────────────────────────────
30
+ CHECKPOINT_IN = "checkpoints/best_model_v3.pth"
31
+ CHECKPOINT_OUT = "checkpoints/best_model_v4.pth"
32
 
33
+ ACTUAL_ANN = "data/actual_annotations.json" # real scanned forms
34
+ SYNTH_ANN = "data/train_annotations.json" # synthetic / train split
35
+ VAL_ANN = "data/val_annotations.json" # validation set
36
+
37
+ DRIVE_BACKUP = "/content/drive/MyDrive/crnn_finetune/CRNN+CTC/checkpoints/best_model_v4.pth"
38
 
39
  IMG_HEIGHT = 64
40
  IMG_WIDTH = 512
 
45
  # ── Phase settings ────────────────────────────────────────────
46
  PHASES = [
47
  # (name, epochs, lr, freeze_cnn, patience)
48
+ ("Phase 1 β€” CNN frozen, warm up on actual crops", 20, 1e-4, True, 5),
49
+ ("Phase 2 β€” Full model, main training", 30, 1e-5, False, 6),
50
+ ("Phase 3 β€” Full model, slow burn", 30, 5e-6, False, 6),
51
+ ("Phase 4 β€” Full model, final polish", 20, 1e-6, False, 5),
52
  ]
53
 
54
+ # ── Fix Windows backslash paths ───────────────────────────────
55
+ def fix_paths(json_path):
56
+ with open(json_path) as f:
57
+ ann = json.load(f)
58
+ changed = False
59
+ for a in ann:
60
+ if 'image_path' in a and '\\' in a['image_path']:
61
+ a['image_path'] = a['image_path'].replace('\\', '/')
62
+ changed = True
63
+ if changed:
64
+ with open(json_path, 'w') as f:
65
+ json.dump(ann, f)
66
+ print(f" Fixed backslash paths in {json_path}")
67
+
68
  # ── Main ──────────────────────────────────────────────────────
69
  def main():
70
  print("=" * 60)
 
79
  print(f"ERROR: {f} not found.")
80
  sys.exit(1)
81
 
82
+ # ── Fix backslash paths ───────────────────────────────────
83
+ for ann_file in [ACTUAL_ANN, SYNTH_ANN, VAL_ANN]:
84
+ if os.path.exists(ann_file):
85
+ fix_paths(ann_file)
86
+
87
  # ── Datasets ──────────────────────────────────────────────
88
  datasets_to_merge = []
89
 
 
96
  datasets_to_merge.append(actual_dataset)
97
  print(f" Actual crops: {len(actual_dataset)} (real scanned forms)")
98
  else:
99
+ print(f" [!] {ACTUAL_ANN} not found")
100
 
101
+ # 2. Fully synthetic β€” keep so model doesn't forget basic characters
 
 
 
 
 
 
 
 
 
102
  if os.path.exists(SYNTH_ANN):
103
  synth_dataset = CivilRegistryDataset(
104
+ data_dir=".", annotations_file=SYNTH_ANN,
105
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=True
106
  )
107
  datasets_to_merge.append(synth_dataset)
108
  print(f" Synth crops : {len(synth_dataset)} (fully synthetic)")
109
 
110
  if not datasets_to_merge:
111
+ print("ERROR: No training data found.")
112
  sys.exit(1)
113
 
114
  val_dataset = CivilRegistryDataset(
115
+ data_dir=".", annotations_file=VAL_ANN,
116
  img_height=IMG_HEIGHT, img_width=IMG_WIDTH, augment=False
117
  )
118
 
 
130
  ckpt = torch.load(CHECKPOINT_IN, map_location=DEVICE, weights_only=False)
131
  config = ckpt.get('config', {})
132
 
133
+ ref_dataset = datasets_to_merge[0]
134
  model = get_crnn_model(
135
  model_type = config.get('model_type', 'standard'),
136
  img_height = config.get('img_height', 64),
 
159
  batch_size = images.size(0)
160
  if training:
161
  optimizer.zero_grad()
162
+ outputs = F.log_softmax(model(images), dim=2)
163
+ seq_len = outputs.size(0)
164
  input_lengths = torch.full((batch_size,), seq_len, dtype=torch.long)
165
  loss = criterion(outputs, targets, input_lengths, target_lengths)
166
  if not torch.isnan(loss) and not torch.isinf(loss):
 
201
  if vl < best_overall:
202
  best_overall = vl
203
  torch.save({
204
+ **ckpt,
205
  'model_state_dict': model.state_dict(),
206
  'config': config,
207
  'char_to_idx': ref_dataset.char_to_idx,
 
217
  print(f" Early stopping.")
218
  break
219
 
220
+ # ── Drive backup ──────────────────────────────────────────
221
+ if os.path.exists(CHECKPOINT_OUT) and os.path.exists(os.path.dirname(DRIVE_BACKUP)):
222
+ shutil.copy(CHECKPOINT_OUT, DRIVE_BACKUP)
223
+ print(f"\n Backed up to Drive: {DRIVE_BACKUP}")
224
+
225
  print(f"\n{'='*60}")
226
  print(f" Fine-tuning complete!")
227
  print(f" Best val loss : {best_overall:.4f}")
 
230
 
231
 
232
  if __name__ == '__main__':
233
+ main()
inference.py CHANGED
@@ -214,7 +214,7 @@ class CivilRegistryOCR:
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
- checkpoint_path : path to best_model.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
@@ -340,9 +340,9 @@ def demo_inference():
340
  print("=" * 70)
341
 
342
  ocr = CivilRegistryOCR(
343
- checkpoint_path='checkpoints/best_model.pth',
344
  device='cuda',
345
- mode='auto',
346
  verbose=True # shows which mode each image triggers
347
  )
348
 
 
214
  def __init__(self, checkpoint_path, device='cuda', mode='auto', verbose=False):
215
  """
216
  Args:
217
+ checkpoint_path : path to best_model_v4.pth
218
  device : 'cuda' or 'cpu'
219
  mode : 'auto' β†’ auto-detect per image (recommended)
220
  'simple' β†’ always use simple pipeline
 
340
  print("=" * 70)
341
 
342
  ocr = CivilRegistryOCR(
343
+ checkpoint_path='checkpoints/best_model_v4.pth',
344
  device='cuda',
345
+ mode='adaptive', # force adaptive for demo images (many are zoomed/physical)
346
  verbose=True # shows which mode each image triggers
347
  )
348
 
spacyNER/debug_and_retrain.py DELETED
@@ -1,316 +0,0 @@
1
- #!/usr/bin/env python3
2
- # debug_and_retrain.py
3
- # ============================================================
4
- # USE THIS WHEN: training crashes with E024 or any span error
5
- #
6
- # WHAT IT DOES (in order):
7
- # 1. Checks all .spacy files for bad spans (whitespace, empty)
8
- # 2. Runs spaCy's official debug data command
9
- # 3. Deletes corrupted .spacy files so they get rebuilt clean
10
- # 4. Rebuilds: prepare_data β†’ funsd_integration β†’ train
11
- #
12
- # USAGE:
13
- # python debug_and_retrain.py ← full check + retrain
14
- # python debug_and_retrain.py --check ← check only, no retrain
15
- # python debug_and_retrain.py --retrain ← skip check, just retrain
16
- # ============================================================
17
-
18
- import subprocess
19
- import sys
20
- import argparse
21
- from pathlib import Path
22
-
23
-
24
- # ── All .spacy files to check ─────────────────────────────
25
- SPACY_FILES = {
26
- "train.spacy": "data/training/train.spacy",
27
- "dev.spacy": "data/training/dev.spacy",
28
- "funsd_train.spacy": "data/training/funsd_train.spacy",
29
- "funsd_dev.spacy": "data/training/funsd_dev.spacy",
30
- "merged_train.spacy": "data/training/merged_train.spacy",
31
- "merged_dev.spacy": "data/training/merged_dev.spacy",
32
- }
33
-
34
- # Files that get REBUILT (delete these before retraining)
35
- REBUILT_FILES = list(SPACY_FILES.values())
36
-
37
- CFG = "training/config.cfg"
38
-
39
-
40
- # ══════════════════════════════════════════════════════════
41
- # STEP 1 β€” INSPECT .spacy FILES FOR BAD SPANS
42
- # ══════════════════════════════════════════════════════════
43
-
44
- def inspect_spacy_file(path: str):
45
- """
46
- Load a .spacy file and scan every entity span for problems.
47
- Returns (total_docs, total_ents, bad_spans_list).
48
-
49
- Bad span types that cause E024:
50
- - Leading whitespace: span.text starts with ' ' or '\\n'
51
- - Trailing whitespace: span.text ends with ' ' or '\\n'
52
- - Empty span: span.text == ''
53
- - Punctuation-only: e.g. '.' or ','
54
- """
55
- import spacy
56
- from spacy.tokens import DocBin
57
-
58
- nlp = spacy.blank("en")
59
- db = DocBin().from_disk(path)
60
- docs = list(db.get_docs(nlp.vocab))
61
-
62
- total_ents = 0
63
- bad_spans = []
64
-
65
- for i, doc in enumerate(docs):
66
- for ent in doc.ents:
67
- total_ents += 1
68
- t = ent.text
69
-
70
- if not t.strip():
71
- bad_spans.append({
72
- "doc": i, "label": ent.label_, "text": repr(t),
73
- "reason": "EMPTY or whitespace-only"
74
- })
75
- elif t != t.strip():
76
- bad_spans.append({
77
- "doc": i, "label": ent.label_, "text": repr(t),
78
- "reason": f"WHITESPACE β€” leading={repr(t[0])} trailing={repr(t[-1])}"
79
- })
80
- elif len(t) == 1 and not t.isalnum():
81
- bad_spans.append({
82
- "doc": i, "label": ent.label_, "text": repr(t),
83
- "reason": "SINGLE PUNCTUATION CHAR"
84
- })
85
-
86
- return len(docs), total_ents, bad_spans
87
-
88
-
89
- def check_all_spacy_files():
90
- """Check every .spacy file and report problems."""
91
- try:
92
- import spacy
93
- except ImportError:
94
- print(" ❌ spaCy not installed. Run: pip install spacy")
95
- return False
96
-
97
- print("\n" + "=" * 62)
98
- print(" STEP 1 β€” SCANNING .spacy FILES FOR BAD SPANS")
99
- print("=" * 62)
100
-
101
- any_problems = False
102
-
103
- for name, path in SPACY_FILES.items():
104
- if not Path(path).exists():
105
- print(f"\n βšͺ {name:30s} not found β€” will be created")
106
- continue
107
-
108
- print(f"\n πŸ“„ {name}")
109
- try:
110
- n_docs, n_ents, bad = inspect_spacy_file(path)
111
- print(f" docs: {n_docs} entities: {n_ents} bad spans: {len(bad)}")
112
-
113
- if bad:
114
- any_problems = True
115
- print(f" ❌ {len(bad)} PROBLEM SPAN(S):")
116
- for b in bad[:10]: # show first 10
117
- print(f" doc {b['doc']:>3} [{b['label']}] {b['text']:30s} ← {b['reason']}")
118
- if len(bad) > 10:
119
- print(f" ... and {len(bad) - 10} more")
120
- else:
121
- print(f" βœ… All spans clean")
122
-
123
- except Exception as e:
124
- print(f" ❌ Could not read file: {e}")
125
- any_problems = True
126
-
127
- return any_problems
128
-
129
-
130
- # ══════════════════════════════════════════════════════════
131
- # STEP 2 β€” spaCy OFFICIAL DEBUG DATA
132
- # ═══════════════════════════════════════���══════════════════
133
-
134
- def run_spacy_debug():
135
- """
136
- Run spaCy's built-in debug data command.
137
- This catches problems our scanner might miss.
138
- """
139
- print("\n" + "=" * 62)
140
- print(" STEP 2 β€” spaCy OFFICIAL DEBUG DATA")
141
- print("=" * 62)
142
-
143
- train = "data/training/merged_train.spacy"
144
- dev = "data/training/merged_dev.spacy"
145
-
146
- # Fall back to civil-only if merged doesn't exist
147
- if not Path(train).exists():
148
- train = "data/training/train.spacy"
149
- dev = "data/training/dev.spacy"
150
-
151
- if not Path(train).exists():
152
- print("\n βšͺ No training data found yet β€” skipping debug.")
153
- print(" β†’ Run: python training/prepare_data.py first")
154
- return
155
-
156
- if not Path(CFG).exists():
157
- print(f"\n βšͺ Config not found: {CFG} β€” skipping debug.")
158
- return
159
-
160
- print(f"\n Checking: {train}")
161
- print(f" Dev: {dev}\n")
162
-
163
- result = subprocess.run([
164
- sys.executable, "-m", "spacy", "debug", "data", CFG,
165
- "--paths.train", train,
166
- "--paths.dev", dev,
167
- ])
168
-
169
- if result.returncode != 0:
170
- print("\n ⚠️ debug data found issues β€” see above.")
171
- else:
172
- print("\n βœ… debug data passed β€” no issues found.")
173
-
174
-
175
- # ══════════════════════════════════════════════════════════
176
- # STEP 3 β€” DELETE OLD .spacy FILES
177
- # ══════════════════════════════════════════════════════════
178
-
179
- def delete_spacy_files():
180
- """Delete all generated .spacy files so they get rebuilt clean."""
181
- print("\n" + "=" * 62)
182
- print(" STEP 3 β€” DELETING OLD .spacy FILES")
183
- print("=" * 62)
184
-
185
- deleted = 0
186
- for path in REBUILT_FILES:
187
- p = Path(path)
188
- if p.exists():
189
- p.unlink()
190
- print(f" πŸ—‘οΈ Deleted: {path}")
191
- deleted += 1
192
-
193
- if deleted == 0:
194
- print(" βšͺ Nothing to delete.")
195
- else:
196
- print(f"\n βœ… Deleted {deleted} file(s) β€” will be rebuilt clean.")
197
-
198
-
199
- # ══════════════════════════════════════════════════════════
200
- # STEP 4 β€” REBUILD + RETRAIN
201
- # ══════════════════════════════════════════════════════════
202
-
203
- def run_script(script: str, label: str) -> bool:
204
- """Run a training script. Returns True on success."""
205
- print(f"\n{'─' * 62}")
206
- print(f" β–Ά {label}")
207
- print(f" Script: {script}")
208
- print(f"{'─' * 62}\n")
209
-
210
- if not Path(script).exists():
211
- print(f" ❌ Script not found: {script}")
212
- return False
213
-
214
- result = subprocess.run([sys.executable, script])
215
- if result.returncode != 0:
216
- print(f"\n ❌ {label} failed.")
217
- return False
218
-
219
- print(f"\n βœ… {label} complete.")
220
- return True
221
-
222
-
223
- def retrain():
224
- """Run the full rebuild pipeline: prepare β†’ funsd β†’ train."""
225
- print("\n" + "=" * 62)
226
- print(" STEP 4 β€” REBUILD + RETRAIN")
227
- print("=" * 62)
228
-
229
- steps = [
230
- ("training/prepare_data.py", "Step 1/3: Build civil registry data"),
231
- ("training/funsd_integration.py", "Step 2/3: Merge FUNSD + civil registry"),
232
- ("training/train.py", "Step 3/3: Train NER model"),
233
- ]
234
-
235
- for script, label in steps:
236
- ok = run_script(script, label)
237
- if not ok:
238
- print(f"\n ❌ Pipeline stopped at: {script}")
239
- print(f" Fix the error above, then re-run:")
240
- print(f" python debug_and_retrain.py --retrain")
241
- sys.exit(1)
242
-
243
- print("\n" + "=" * 62)
244
- print(" βœ… RETRAIN COMPLETE")
245
- print("=" * 62)
246
- print("\n Best model β†’ models/civil_registry_model/model-best/")
247
- print("\n NEXT: python training/evaluate.py")
248
-
249
-
250
- # ══════════════════════════════════════════════════════════
251
- # MAIN
252
- # ══════════════════════════════════════════════════════════
253
-
254
- def main():
255
- parser = argparse.ArgumentParser(
256
- description="Debug FUNSD/civil data and retrain NER model"
257
- )
258
- parser.add_argument("--check", action="store_true",
259
- help="Check for bad spans only β€” don't retrain")
260
- parser.add_argument("--retrain", action="store_true",
261
- help="Skip check β€” delete old files and retrain immediately")
262
- args = parser.parse_args()
263
-
264
- print("\n" + "=" * 62)
265
- print(" CIVIL REGISTRY NER β€” DEBUG & RETRAIN")
266
- print("=" * 62)
267
- print("\n This script fixes the E024 'bad span' training error.")
268
- print(" Root causes: whitespace in spans, wrong alignment_mode,")
269
- print(" offset shift from text.strip() after build.")
270
-
271
- if args.retrain:
272
- # Skip checking β€” just delete and rebuild
273
- delete_spacy_files()
274
- retrain()
275
- return
276
-
277
- # ── Always run checks ─────────────────────────────────
278
- has_problems = check_all_spacy_files()
279
- run_spacy_debug()
280
-
281
- if args.check:
282
- # Check-only mode β€” stop here
283
- print("\n" + "=" * 62)
284
- if has_problems:
285
- print(" ⚠️ Problems found β€” run without --check to fix:")
286
- print(" python debug_and_retrain.py")
287
- else:
288
- print(" βœ… No problems found β€” safe to train:")
289
- print(" python training/train.py")
290
- print("=" * 62)
291
- return
292
-
293
- # ── Ask before deleting ───────────────────────────────
294
- print("\n" + "=" * 62)
295
- if has_problems:
296
- print(" ⚠️ Bad spans detected in .spacy files.")
297
- print(" The fixed funsd_integration.py will rebuild them cleanly.")
298
- else:
299
- print(" βœ… No bad spans detected in existing files.")
300
-
301
- print("\n Proceeding to delete old .spacy files and retrain...")
302
- print(" (Ctrl+C now to cancel)")
303
- print("=" * 62)
304
-
305
- try:
306
- input("\n Press ENTER to continue, Ctrl+C to cancel...\n")
307
- except KeyboardInterrupt:
308
- print("\n Cancelled.")
309
- return
310
-
311
- delete_spacy_files()
312
- retrain()
313
-
314
-
315
- if __name__ == "__main__":
316
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spacyNER/models/phase1_funsd/model-last/vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff
 
spacyNER/models/phase1_funsd/model-last/vocab/vectors.cfg CHANGED
@@ -1,3 +1,3 @@
1
- {
2
- "mode":"default"
3
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4359091952c8cd16f1f0482f5770fb82d1707368d5cca3c46aa501f552e3c5
3
+ size 22
template_matcher.py CHANGED
@@ -41,7 +41,7 @@ _CRNN_DIR = os.path.join(os.path.dirname(__file__), 'CRNN+CTC')
41
  if _CRNN_DIR not in _sys.path:
42
  _sys.path.insert(0, _CRNN_DIR)
43
 
44
- _CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'latest_checkpoint.pth')
45
  _crnn_ocr = None
46
  _crnn_decode = None # reference to decode_ctc_predictions
47
 
 
41
  if _CRNN_DIR not in _sys.path:
42
  _sys.path.insert(0, _CRNN_DIR)
43
 
44
+ _CRNN_CHECKPOINT = os.path.join(_CRNN_DIR, 'checkpoints', 'best_model_v4.pth')
45
  _crnn_ocr = None
46
  _crnn_decode = None # reference to decode_ctc_predictions
47