File size: 20,535 Bytes
7111e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
"""
form_classifier.py
=======================
Multinomial Naive Bayes (MNB) Document Classifier
for Local Civil Registry Document Digitization System

Classifies extracted OCR text into:
  - Form 102  (Certificate of Live Birth)       ← Certifications page
  - Form 103  (Certificate of Death)             ← Certifications page
  - Form 97   (Certificate of Marriage)          ← Certifications page

NOTE: Form 90 (Application for Marriage License) is NOT classified here.
      Form 90 has its OWN upload page where the user uploads:
        - Groom's Birth Certificate (PSA/NSO sealed)
        - Bride's Birth Certificate (PSA/NSO sealed)
      The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
      See classify_sex() in classifier.py for that routing.

Usage:
    python form_classifier.py            # trains and saves model
    python form_classifier.py --test     # runs test predictions
"""

import os
import json
import random
import argparse
import pickle
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix
)

# ─────────────────────────────────────────────────────────────
# 1.  LABEL MAP  (Certifications page only — NO Form 90 here)
# ─────────────────────────────────────────────────────────────
# Class index -> human-readable form label (0 = birth, 1 = death, 2 = marriage).
LABEL_MAP = dict(enumerate((
    'Form 102 - Certificate of Live Birth',
    'Form 103 - Certificate of Death',
    'Form 97 - Certificate of Marriage',
)))
# Labels listed in class-index order.
LABEL_NAMES = [LABEL_MAP[class_id] for class_id in range(len(LABEL_MAP))]

# ─────────────────────────────────────────────────────────────
# 2.  VOCABULARY POOLS  (Filipino civil registry)
# ─────────────────────────────────────────────────────────────
# Sample pools for the synthetic training text. _name(), _place() and _date()
# draw from these with random.choice().

# Given names; _name() combines one of these with one entry of LAST_NAMES.
FIRST_NAMES = [
    'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani',
    'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo',
    'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael',
    'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary'
]
# Family names (some contain a space, e.g. 'Dela Cruz').
LAST_NAMES = [
    'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores',
    'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim',
    'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez',
    'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual'
]
# Middle component of a generated place string.
MUNICIPALITIES = [
    'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling',
    'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia',
    'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao'
]
# Last component of a generated place string.
PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales']
# First component of a generated place string.
BARANGAYS = [
    'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo',
    'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas',
    'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan'
]
# Fixed sample dates in MM/DD/YYYY form, reused for birth, death and marriage dates.
DATES = [
    '01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995',
    '05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980',
    '12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970',
]

def _name():
    """Return a random full name: one first name, a space, one last name."""
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    return " ".join((first, last))

def _date():
    """Return one randomly chosen entry of DATES."""
    picked = random.choice(DATES)
    return picked

def _place():
    """Return a random 'barangay, municipality, province' string."""
    # Draw in the same left-to-right order the parts appear in the result.
    parts = [
        random.choice(pool)
        for pool in (BARANGAYS, MUNICIPALITIES, PROVINCES)
    ]
    return ", ".join(parts)


# ─────────────────────────────────────────────────────────────
# 3.  SAMPLE GENERATORS
#     Each generator uses the EXACT Philippine form header
#     so MNB learns the real keywords from actual documents.
# ─────────────────────────────────────────────────────────────

def generate_form102():
    """
    Form 102 — Certificate of Live Birth

    Build one synthetic OCR-like text sample for a birth certificate.

    Header keywords: 'Municipal Form No. 102', 'Certificate of Live Birth'

    All five templates below are rendered on every call (each consumes its
    own random draws); one of the rendered strings is then returned at random.

    Returns:
        str: a single-line text sample starting with the Form 102 header.
    """
    templates = [
        # Template A: Exact header present
        f"Municipal Form No. 102 Certificate of Live Birth "
        f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} "
        f"Name of mother {_name()} Name of father {_name()} "
        f"Sex {random.choice(['Male', 'Female'])} "
        f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} "
        f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} "
        f"birth certificate registry birth registration infant newborn child",

        # Template B: header written as "No.102" (no space after "No.")
        f"Municipal Form No.102 Certificate of Live Birth "
        f"Child {_name()} born {_date()} at {_place()} "
        f"mother {_name()} father {_name()} "
        f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg "
        f"birth order {random.choice(['First', 'Second', 'Third'])} "
        f"birth certificate Form 102",

        # Template C: Registry number format (NNN-NN)
        f"Municipal Form No. 102 Certificate of Live Birth "
        f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"name of child {_name()} date of birth {_date()} "
        f"place of birth {_place()} birth certificate municipal civil registrar",

        # Template D: PSA/NSO sealed copy (used when filing Form 90)
        f"Municipal Form No. 102 Certificate of Live Birth "
        f"PSA {_name()} born on {_date()} "
        f"place of birth {_place()} "
        f"mother maiden name {_name()} father {_name()} "
        f"type of birth {random.choice(['Single', 'Twin'])} infant newborn",

        # Template E: NSO variation
        f"Municipal Form No.102 Certificate of Live Birth "
        f"NSO birth registration {_name()} "
        f"birth date {_date()} birthplace {_place()} "
        f"parents mother {_name()} father {_name()} "
        f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} "
        f"sex {random.choice(['male', 'female'])}",
    ]
    # Pick one of the five rendered variants.
    return random.choice(templates)


def generate_form103():
    """
    Form 103 — Certificate of Death

    Build one synthetic OCR-like text sample for a death certificate.

    Header keywords: 'Municipal Form No. 103', 'Certificate of Death'

    All five templates below are rendered on every call (each consumes its
    own random draws); one of the rendered strings is then returned at random.

    Returns:
        str: a single-line text sample starting with the Form 103 header.
    """
    # Pool of cause-of-death phrases shared by the templates.
    causes = [
        'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
        'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
        'Septicemia', 'Renal Failure'
    ]
    templates = [
        # Template A: Exact header
        f"Municipal Form No. 103 Certificate of Death "
        f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} "
        f"Cause of death {random.choice(causes)} Age at death {random.randint(1, 95)} "
        f"Sex {random.choice(['Male', 'Female'])} "
        f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} "
        f"death certificate deceased burial interment",

        # Template B: header written as "No.103" (no space after "No.")
        f"Municipal Form No.103 Certificate of Death "
        f"Deceased {_name()} died on {_date()} at {_place()} "
        f"cause {random.choice(causes)} corpse informant {_name()} "
        f"death certificate Form 103 municipal civil registrar",

        # Template C: Registry format (NNN-NN)
        f"Municipal Form No. 103 Certificate of Death "
        f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"name of deceased {_name()} date of death {_date()} "
        f"place of death {_place()} cause of death {random.choice(causes)} "
        f"death certificate burial permit",

        # Template D: Clinical format
        f"Municipal Form No.103 Certificate of Death "
        f"{_name()} died {_date()} "
        f"place {_place()} cause of death {random.choice(causes)} "
        f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} "
        f"death deceased cadaver",

        # Template E: Full form
        f"Municipal Form No. 103 Certificate of Death "
        f"Form 103 death registration {_name()} "
        f"date of death {_date()} place of death {_place()} "
        f"immediate cause {random.choice(causes)} "
        f"attending physician {_name()} certificate of death",
    ]
    # Pick one of the five rendered variants.
    return random.choice(templates)


def generate_form97():
    """
    Form 97 — Certificate of Marriage

    Build one synthetic OCR-like text sample for a marriage certificate.

    Header keywords: 'Municipal Form No. 97', 'Certificate of Marriage'

    All five templates below are rendered on every call (each consumes its
    own random draws); one of the rendered strings is then returned at random.

    Returns:
        str: a single-line text sample starting with the Form 97 header.
    """
    # Titles prefixed to the solemnizing officer's name.
    officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.']
    # Two candidate license numbers are drawn once per call (plain 5-digit and
    # "ML-NNNN" style); the templates then choose between these two.
    licenses = [f"{random.randint(10000, 99999)}", f"ML-{random.randint(1000, 9999)}"]
    templates = [
        # Template A: Exact header
        f"Municipal Form No. 97 Certificate of Marriage "
        f"Name of husband {_name()} Name of wife {_name()} "
        f"Date of marriage {_date()} Place of marriage {_place()} "
        f"Solemnizing officer {random.choice(officers)} {_name()} "
        f"Marriage license number {random.choice(licenses)} witnesses {_name()} {_name()} "
        f"marriage certificate contracting parties wedding",

        # Template B: header written as "No.97" (no space after "No.")
        f"Municipal Form No.97 Certificate of Marriage "
        f"Husband {_name()} wife {_name()} "
        f"married on {_date()} at {_place()} "
        f"officiated by {random.choice(officers)} {_name()} "
        f"marriage certificate Form 97 solemnizing officer",

        # Template C: Registry format (NNN-NN)
        f"Municipal Form No. 97 Certificate of Marriage "
        f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"husband {_name()} wife {_name()} "
        f"date of marriage {_date()} place {_place()} "
        f"marriage license {random.choice(licenses)} issued at {_place()} "
        f"marriage certificate civil registrar",

        # Template D: Ceremony format
        f"Municipal Form No.97 Certificate of Marriage "
        f"{_name()} and {_name()} "
        f"solemnized {_date()} at {_place()} "
        f"solemnizing officer {random.choice(officers)} {_name()} "
        f"witnesses {_name()} {_name()} "
        f"marriage contracting parties husband wife ceremony",

        # Template E: Full form
        f"Municipal Form No. 97 Certificate of Marriage "
        f"Form 97 marriage registration husband {_name()} "
        f"wife {_name()} date of marriage {_date()} "
        f"place of marriage {_place()} "
        f"license number {random.choice(licenses)} marriage nuptial wed",
    ]
    # Pick one of the five rendered variants.
    return random.choice(templates)


# ─────────────────────────────────────────────────────────────
# 4.  DATASET GENERATOR  (3 classes only — no Form 90)
# ─────────────────────────────────────────────────────────────
def generate_dataset(samples_per_class=150):
    """
    Build a shuffled synthetic dataset with an equal number of samples per form.

    Args:
        samples_per_class: How many texts to generate for each of the three
            forms. Zero or a negative value yields an empty dataset.

    Returns:
        Tuple (texts, labels) of two equally long lists. labels holds the
        class index of each text: 0 = Form 102, 1 = Form 103, 2 = Form 97.
    """
    # Position in this tuple is the class index.
    generators = (generate_form102, generate_form103, generate_form97)

    pairs = [
        (gen(), label)
        for label, gen in enumerate(generators)
        for _ in range(samples_per_class)
    ]
    if not pairs:
        # zip(*[]) would yield nothing to unpack into (texts, labels).
        return [], []

    # Shuffle texts and labels together so they stay aligned.
    random.shuffle(pairs)
    texts, labels = zip(*pairs)
    return list(texts), list(labels)


# ─────────────────────────────────────────────────────────────
# 5.  TRAIN & SAVE
# ─────────────────────────────────────────────────────────────
def train(samples_per_class=150, save_dir='models'):
    """
    Train the MNB form classifier on synthetic samples and save it to disk.

    Generates a dataset with generate_dataset(), holds out 20% as a
    stratified test split, fits a TF-IDF (unigram + bigram) vectorizer and a
    MultinomialNB model, prints an evaluation report, and writes three files
    into save_dir: mnb_classifier.pkl, tfidf_vectorizer.pkl and
    mnb_metadata.json.

    Args:
        samples_per_class: Number of synthetic documents generated per form
            type. Must be at least 1.
            NOTE(review): very small values may still be rejected by the
            stratified train/test split - confirm the practical minimum.
        save_dir: Directory for the saved artifacts; created if missing.

    Returns:
        Tuple (clf, vectorizer, acc): the fitted classifier, the fitted
        vectorizer, and the hold-out accuracy as a fraction in [0, 1].

    Raises:
        ValueError: If samples_per_class is smaller than 1.
    """
    if samples_per_class < 1:
        raise ValueError(
            f"samples_per_class must be at least 1, got {samples_per_class}"
        )

    # Derive the class count from the label map instead of hard-coding 3.
    n_classes = len(LABEL_MAP)
    total_samples = samples_per_class * n_classes

    os.makedirs(save_dir, exist_ok=True)

    print("=" * 60)
    print("  MNB Document Classifier  |  Filipino Civil Registry")
    print("  Certifications Page: Form 102 / 103 / 97 ONLY")
    print("  (Form 90 routing is handled separately via SEX field)")
    print("=" * 60)

    print(f"\n  Generating dataset ({samples_per_class} samples × {n_classes} forms = {total_samples} total)...")
    texts, labels = generate_dataset(samples_per_class)

    # Fixed random_state keeps the split itself reproducible for a given dataset.
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )
    print(f"  Train: {len(X_train)}  |  Test: {len(X_test)}")

    # TF-IDF vectorizer; fit on the training split only, then reuse on the test split
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=5000,
        sublinear_tf=True,
        min_df=1,
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec  = vectorizer.transform(X_test)

    # Train MNB
    clf = MultinomialNB(alpha=0.1)
    clf.fit(X_train_vec, y_train)

    # Evaluate
    y_pred = clf.predict(X_test_vec)
    acc    = accuracy_score(y_test, y_pred)

    print(f"\n  Accuracy : {acc * 100:.2f}%")
    print("\n  Classification Report:")
    print(classification_report(y_test, y_pred, target_names=LABEL_NAMES))

    print("  Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    headers = ['Form102', 'Form103', 'Form97']
    print(f"  {'':30s} " + "  ".join(headers))
    for i, row in enumerate(cm):
        print(f"  Actual {headers[i]}: {str(row)}")

    # Save
    model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
    vec_path   = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)
    with open(vec_path, 'wb') as f:
        pickle.dump(vectorizer, f)

    meta = {
        'accuracy': round(acc * 100, 2),
        'samples_per_class': samples_per_class,
        'total_samples': total_samples,
        'labels': LABEL_MAP,  # int keys are written as strings by json
        'note': 'Form 90 routing is handled by classify_sex() — not this model',
        'model_path': model_path,
        'vectorizer_path': vec_path,
    }
    with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)

    print(f"\n  Model saved     : {model_path}")
    print(f"  Vectorizer saved: {vec_path}")
    print("\n  Target accuracy : >90%")
    print(f"  Achieved        : {acc * 100:.2f}% {'✓' if acc >= 0.90 else '✗ (try increasing samples_per_class)'}")
    print("=" * 60)

    return clf, vectorizer, acc


# ─────────────────────────────────────────────────────────────
# 6.  DOCUMENT CLASSIFIER CLASS
# ─────────────────────────────────────────────────────────────
class DocumentClassifier:
    """
    Load trained MNB model and classify OCR text from Certifications page.

    Expects model_dir to contain 'mnb_classifier.pkl' and
    'tfidf_vectorizer.pkl', the two pickle files written by train().
    """

    # Class index -> short form code returned under 'form_code'.
    _FORM_CODES = {0: 'form102', 1: 'form103', 2: 'form97'}

    def __init__(self, model_dir='models'):
        """
        Load the pickled classifier and vectorizer from model_dir.

        Args:
            model_dir: Directory holding the two pickle files.

        Raises:
            FileNotFoundError: If either pickle file is missing.
        """
        model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
        vec_path   = os.path.join(model_dir, 'tfidf_vectorizer.pkl')

        # Check both artifacts up front so a missing vectorizer gets the same
        # actionable hint as a missing model.
        for what, path in (('Model', model_path), ('Vectorizer', vec_path)):
            if not os.path.exists(path):
                raise FileNotFoundError(
                    f"{what} not found at {path}. Run: python form_classifier.py"
                )

        # SECURITY: pickle.load can execute arbitrary code. Only point
        # model_dir at files produced by this project's own training run.
        with open(model_path, 'rb') as f:
            self.clf = pickle.load(f)
        with open(vec_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def predict(self, text: str) -> dict:
        """
        Classify OCR text from Certifications page.

        Args:
            text: Raw text extracted from the uploaded document.

        Returns:
            {
                'label':        'Form 102 - Certificate of Live Birth',
                'form_code':    'form102',
                'confidence':   0.95,
                'probabilities': { ... }
            }
            'confidence' is the probability of the winning class and
            'probabilities' maps every label to its probability; all values
            are rounded to 4 decimals.
        """
        vec   = self.vectorizer.transform([text])
        probs = self.clf.predict_proba(vec)[0]

        # predict_proba columns follow clf.classes_, so translate the column
        # position into a class id instead of assuming they coincide.
        classes = getattr(self.clf, 'classes_', None)
        if classes is None:
            classes = range(len(probs))
        class_ids = [int(c) for c in classes]

        best_col = int(np.argmax(probs))
        best_id  = class_ids[best_col]

        return {
            'label':      LABEL_MAP[best_id],
            'form_code':  self._FORM_CODES[best_id],
            'confidence': round(float(probs[best_col]), 4),
            'probabilities': {
                LABEL_MAP[class_id]: round(float(p), 4)
                for class_id, p in zip(class_ids, probs)
            }
        }


# ─────────────────────────────────────────────────────────────
# 7.  TEST DEMO
# ─────────────────────────────────────────────────────────────
def run_test():
    """
    Run a fixed set of sample texts through the saved classifier and print results.

    Loads DocumentClassifier from its default model directory, predicts each
    hand-written sample, prints expected vs. predicted label with the
    confidence, and finishes with the overall hit count.

    Raises:
        FileNotFoundError: Propagated from DocumentClassifier when the saved
            model files are missing.
    """
    print("\n" + "=" * 60)
    print("  Testing DocumentClassifier — Certifications Page")
    print("=" * 60)

    classifier = DocumentClassifier()

    # (input text, expected label) pairs: two per form, one with the
    # "No. NNN" header spelling and one with "No.NNN".
    test_cases = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 "
            "Place of birth Capas Tarlac mother Rosa Dela Cruz "
            "father Pedro Dela Cruz Sex Male",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
            "Age at death 75 death certificate deceased burial",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 "
            "cause Pneumonia burial permit interment",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
            "Solemnizing officer Rev. Santos witnesses marriage certificate",
            "Form 97 - Certificate of Marriage"
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes "
            "married 11/30/1995 contracting parties solemnizing officer",
            "Form 97 - Certificate of Marriage"
        ),
    ]

    correct = 0
    for text, expected in test_cases:
        result = classifier.predict(text)
        # Evaluate the match once and reuse it for both the glyph and the tally.
        matched = expected in result['label']
        status = '✓' if matched else '✗'
        if matched:
            correct += 1
        print(f"\n  {status} Expected : {expected}")
        print(f"    Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)")

    print(f"\n  Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)")
    print("=" * 60)


# ─────────────────────────────────────────────────────────────
# 8.  MAIN
# ─────────────────────────────────────────────────────────────
if __name__ == '__main__':
    # Command-line entry point: train by default, or only run the demo
    # predictions when --test is given.
    cli = argparse.ArgumentParser()
    cli.add_argument('--test', action='store_true', help='Run test predictions only')
    cli.add_argument('--samples', type=int, default=150, help='Samples per class (default: 150)')
    options = cli.parse_args()

    if not options.test:
        train(samples_per_class=options.samples)
        for hint in ("\nTo test predictions, run:", "  python form_classifier.py --test"):
            print(hint)
    else:
        run_test()