| """
|
| form_classifier.py
|
| =======================
|
| Multinomial Naive Bayes (MNB) Document Classifier
|
| for Local Civil Registry Document Digitization System
|
|
|
| Classifies extracted OCR text into:
|
| - Form 102 (Certificate of Live Birth) -> Certifications page
|
| - Form 103 (Certificate of Death) -> Certifications page
|
| - Form 97 (Certificate of Marriage) -> Certifications page
|
|
|
| NOTE: Form 90 (Application for Marriage License) is NOT classified here.
|
| Form 90 has its OWN upload page where the user uploads:
|
| - Groom's Birth Certificate (PSA/NSO sealed)
|
| - Bride's Birth Certificate (PSA/NSO sealed)
|
| The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
|
| See classify_sex() in classifier.py for that routing.
|
|
|
| Usage:
|
| python form_classifier.py # trains and saves model
|
| python form_classifier.py --test # runs test predictions
|
| """
|
|
|
| import os
|
| import json
|
| import random
|
| import argparse
|
| import pickle
|
| import numpy as np
|
| from sklearn.naive_bayes import MultinomialNB
|
| from sklearn.feature_extraction.text import TfidfVectorizer
|
| from sklearn.model_selection import train_test_split
|
| from sklearn.metrics import (
|
| accuracy_score, classification_report, confusion_matrix
|
| )
|
|
|
|
|
|
|
|
|
# Class index -> human-readable form label. The index is the integer class id
# used as the training label and as the key when reporting predictions.
LABEL_MAP = dict(enumerate((
    'Form 102 - Certificate of Live Birth',
    'Form 103 - Certificate of Death',
    'Form 97 - Certificate of Marriage',
)))

# Display names in LABEL_MAP order (class 0, 1, 2).
LABEL_NAMES = [form_label for form_label in LABEL_MAP.values()]
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Value pools sampled by the synthetic-text helpers and form generators.
# ---------------------------------------------------------------------------

# Given names (none contain a space, so a whitespace split is safe).
FIRST_NAMES = (
    'Juan Maria Jose Ana Pedro Rosa Carlos Lani '
    'Roberto Nena Ramon Cynthia Eduardo Marites Danilo '
    'Rowena Renato Melinda Ernesto Josephine Michael '
    'Jennifer Angelo Christine Mark Patricia John Mary'
).split()

# Surnames ('Dela Cruz' contains a space, so these stay as explicit literals).
LAST_NAMES = [
    'Dela Cruz', 'Santos', 'Reyes', 'Garcia',
    'Torres', 'Flores', 'Bautista', 'Villanueva',
    'Mendoza', 'Castro', 'Ramos', 'Lim',
    'Aquino', 'Diaz', 'Fernandez', 'Lopez',
    'Gonzales', 'Ramirez', 'Abad', 'Aguilar',
    'Manalo', 'Navarro', 'Ocampo', 'Pascual',
]

# Municipality / city names used as the middle part of a place string.
MUNICIPALITIES = [
    'Tarlac City', 'Capas', 'Paniqui',
    'Gerona', 'Camiling', 'Victoria',
    'San Manuel', 'Concepcion', 'La Paz',
    'Sta. Ignacia', 'Bamban', 'Moncada',
    'Pura', 'Ramos', 'Anao',
]

# Province names used as the last part of a place string.
PROVINCES = [
    'Tarlac',
    'Pampanga',
    'Nueva Ecija',
    'Bulacan',
    'Zambales',
]

# Barangay names; every entry carries the 'Brgy. ' prefix.
BARANGAYS = [
    f'Brgy. {barangay}'
    for barangay in (
        'San Jose', 'Poblacion', 'Sto. Cristo',
        'Tibag', 'Maliwalo', 'San Nicolas',
        'San Roque', 'San Vicente', 'Salapungan',
    )
]

# Fixed pool of MM/DD/YYYY date strings.
DATES = [
    '01/15/1990', '03/22/1985', '07/04/2000',
    '11/30/1995', '05/18/1988', '09/12/1975',
    '02/28/1993', '06/06/1980', '12/25/1998',
    '04/17/2001', '08/08/1965', '10/31/1970',
]
|
|
|
def _name():
    """Return a random full name: one FIRST_NAMES entry, a space, one LAST_NAMES entry."""
    # Draw the given name before the surname so the random stream is consumed
    # in a fixed order.
    given = random.choice(FIRST_NAMES)
    surname = random.choice(LAST_NAMES)
    return given + " " + surname
|
|
|
def _date():
    """Return one randomly chosen date string from the DATES pool."""
    picked = random.choice(DATES)
    return picked
|
|
|
def _place():
    """Return a random place string of the form 'barangay, municipality, province'."""
    # The pools are sampled in this order: barangay, municipality, province.
    parts = [
        random.choice(pool)
        for pool in (BARANGAYS, MUNICIPALITIES, PROVINCES)
    ]
    return ", ".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_form102():
    """Return one synthetic OCR text sample for Form 102 (Certificate of Live Birth).

    Five text variants are built, each starting with a 'Municipal Form No. 102'
    / 'Municipal Form No.102' header followed by 'Certificate of Live Birth',
    and one of them is returned at random.
    """
    pick = random.choice

    # The header appears both with and without a space after "No.".
    header_spaced = "Municipal Form No. 102 Certificate of Live Birth "
    header_tight = "Municipal Form No.102 Certificate of Live Birth "

    # All variants are built eagerly, in this order, before one is chosen.
    full_record = "".join((
        header_spaced,
        f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} ",
        f"Name of mother {_name()} Name of father {_name()} ",
        f"Sex {pick(['Male', 'Female'])} ",
        f"Legitimacy {pick(['Legitimate', 'Illegitimate'])} ",
        f"Attendant {pick(['Physician', 'Midwife', 'Nurse'])} ",
        "birth certificate registry birth registration infant newborn child",
    ))

    weight_record = "".join((
        header_tight,
        f"Child {_name()} born {_date()} at {_place()} ",
        f"mother {_name()} father {_name()} ",
        f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg ",
        f"birth order {pick(['First', 'Second', 'Third'])} ",
        "birth certificate Form 102",
    ))

    registry_record = "".join((
        header_spaced,
        f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} ",
        f"name of child {_name()} date of birth {_date()} ",
        f"place of birth {_place()} birth certificate municipal civil registrar",
    ))

    psa_record = "".join((
        header_spaced,
        f"PSA {_name()} born on {_date()} ",
        f"place of birth {_place()} ",
        f"mother maiden name {_name()} father {_name()} ",
        f"type of birth {pick(['Single', 'Twin'])} infant newborn",
    ))

    nso_record = "".join((
        header_tight,
        f"NSO birth registration {_name()} ",
        f"birth date {_date()} birthplace {_place()} ",
        f"parents mother {_name()} father {_name()} ",
        f"attendant at birth {pick(['hospital', 'midwife', 'physician'])} ",
        f"sex {pick(['male', 'female'])}",
    ))

    variants = [full_record, weight_record, registry_record, psa_record, nso_record]
    return pick(variants)
|
|
|
|
|
def generate_form103():
    """Return one synthetic OCR text sample for Form 103 (Certificate of Death).

    Five text variants are built, each starting with a 'Municipal Form No. 103'
    / 'Municipal Form No.103' header followed by 'Certificate of Death', and
    one of them is returned at random.
    """
    pick = random.choice

    causes = [
        'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
        'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
        'Septicemia', 'Renal Failure',
    ]

    # The header appears both with and without a space after "No.".
    header_spaced = "Municipal Form No. 103 Certificate of Death "
    header_tight = "Municipal Form No.103 Certificate of Death "

    # All variants are built eagerly, in this order, before one is chosen.
    full_record = "".join((
        header_spaced,
        f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} ",
        f"Cause of death {pick(causes)} Age at death {random.randint(1, 95)} ",
        f"Sex {pick(['Male', 'Female'])} ",
        f"Civil status {pick(['Single', 'Married', 'Widowed'])} ",
        "death certificate deceased burial interment",
    ))

    informant_record = "".join((
        header_tight,
        f"Deceased {_name()} died on {_date()} at {_place()} ",
        f"cause {pick(causes)} corpse informant {_name()} ",
        "death certificate Form 103 municipal civil registrar",
    ))

    registry_record = "".join((
        header_spaced,
        f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} ",
        f"name of deceased {_name()} date of death {_date()} ",
        f"place of death {_place()} cause of death {pick(causes)} ",
        "death certificate burial permit",
    ))

    relationship_record = "".join((
        header_tight,
        f"{_name()} died {_date()} ",
        f"place {_place()} cause of death {pick(causes)} ",
        f"informant {_name()} relationship {pick(['spouse', 'child', 'sibling', 'parent'])} ",
        "death deceased cadaver",
    ))

    physician_record = "".join((
        header_spaced,
        f"Form 103 death registration {_name()} ",
        f"date of death {_date()} place of death {_place()} ",
        f"immediate cause {pick(causes)} ",
        f"attending physician {_name()} certificate of death",
    ))

    variants = [
        full_record,
        informant_record,
        registry_record,
        relationship_record,
        physician_record,
    ]
    return pick(variants)
|
|
|
|
|
def generate_form97():
    """Return one synthetic OCR text sample for Form 97 (Certificate of Marriage).

    Five text variants are built, each starting with a 'Municipal Form No. 97'
    / 'Municipal Form No.97' header followed by 'Certificate of Marriage', and
    one of them is returned at random.
    """
    pick = random.choice

    officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.']

    # Two candidate license numbers (plain 5-digit, then "ML-" + 4 digits),
    # generated once per call and shared by every variant that needs one.
    plain_license = str(random.randint(10000, 99999))
    prefixed_license = "ML-" + str(random.randint(1000, 9999))
    licenses = [plain_license, prefixed_license]

    # The header appears both with and without a space after "No.".
    header_spaced = "Municipal Form No. 97 Certificate of Marriage "
    header_tight = "Municipal Form No.97 Certificate of Marriage "

    # All variants are built eagerly, in this order, before one is chosen.
    full_record = "".join((
        header_spaced,
        f"Name of husband {_name()} Name of wife {_name()} ",
        f"Date of marriage {_date()} Place of marriage {_place()} ",
        f"Solemnizing officer {pick(officers)} {_name()} ",
        f"Marriage license number {pick(licenses)} witnesses {_name()} {_name()} ",
        "marriage certificate contracting parties wedding",
    ))

    officiated_record = "".join((
        header_tight,
        f"Husband {_name()} wife {_name()} ",
        f"married on {_date()} at {_place()} ",
        f"officiated by {pick(officers)} {_name()} ",
        "marriage certificate Form 97 solemnizing officer",
    ))

    registry_record = "".join((
        header_spaced,
        f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} ",
        f"husband {_name()} wife {_name()} ",
        f"date of marriage {_date()} place {_place()} ",
        f"marriage license {pick(licenses)} issued at {_place()} ",
        "marriage certificate civil registrar",
    ))

    ceremony_record = "".join((
        header_tight,
        f"{_name()} and {_name()} ",
        f"solemnized {_date()} at {_place()} ",
        f"solemnizing officer {pick(officers)} {_name()} ",
        f"witnesses {_name()} {_name()} ",
        "marriage contracting parties husband wife ceremony",
    ))

    registration_record = "".join((
        header_spaced,
        f"Form 97 marriage registration husband {_name()} ",
        f"wife {_name()} date of marriage {_date()} ",
        f"place of marriage {_place()} ",
        f"license number {pick(licenses)} marriage nuptial wed",
    ))

    variants = [
        full_record,
        officiated_record,
        registry_record,
        ceremony_record,
        registration_record,
    ]
    return pick(variants)
|
|
|
|
|
|
|
|
|
|
|
def generate_dataset(samples_per_class=150):
    """Build a shuffled synthetic dataset of form texts and integer class labels.

    Args:
        samples_per_class: Number of texts to generate for each of the three
            forms. Zero or a negative value yields an empty dataset.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists. ``labels[i]`` is
        the class id of ``texts[i]``: 0 for Form 102, 1 for Form 103 and 2 for
        Form 97.
    """
    # A generator's position in this list is its class id (same order as the
    # 0/1/2 keys of LABEL_MAP), so no separate parallel label list is needed.
    generators = [generate_form102, generate_form103, generate_form97]

    # Generate class by class, in order, then shuffle the (text, label) pairs
    # together so texts and labels stay aligned.
    samples = [
        (make_text(), class_id)
        for class_id, make_text in enumerate(generators)
        for _ in range(samples_per_class)
    ]
    random.shuffle(samples)

    # Split with comprehensions instead of ``zip(*samples)``: unpacking the
    # result of zip() over an empty list raises ValueError, which made
    # ``samples_per_class=0`` crash.
    texts = [text for text, _ in samples]
    labels = [class_id for _, class_id in samples]
    return texts, labels
|
|
|
|
|
|
|
|
|
|
|
def train(samples_per_class=150, save_dir='models'):
    """Train the MNB form classifier on synthetic data, report metrics and save it.

    The function generates a dataset with ``generate_dataset``, holds out 20%
    as a stratified test split, fits a TF-IDF vectorizer plus a
    ``MultinomialNB`` model, prints accuracy / classification report /
    confusion matrix, and writes three files into ``save_dir``:
    ``mnb_classifier.pkl``, ``tfidf_vectorizer.pkl`` and ``mnb_metadata.json``.

    Note that the reported accuracy is measured on synthetic text produced by
    the same generators as the training data.

    Args:
        samples_per_class: Number of synthetic samples generated per form.
        save_dir: Directory for the saved artifacts; created if missing.

    Returns:
        A ``(clf, vectorizer, acc)`` tuple: the fitted classifier, the fitted
        vectorizer and the test-split accuracy as a fraction in [0, 1].
    """
    os.makedirs(save_dir, exist_ok=True)

    # Take the class ids and display names from LABEL_MAP rather than
    # repeating the literal class count.
    class_ids = sorted(LABEL_MAP)
    class_names = [LABEL_MAP[class_id] for class_id in class_ids]
    n_classes = len(class_ids)

    print("=" * 60)
    print(" MNB Document Classifier | Filipino Civil Registry")
    print(" Certifications Page: Form 102 / 103 / 97 ONLY")
    print(" (Form 90 routing is handled separately via SEX field)")
    print("=" * 60)

    # The multiplication sign here used to be a mis-encoded character.
    print(
        f"\n Generating dataset ({samples_per_class} samples x {n_classes} forms"
        f" = {samples_per_class * n_classes} total)..."
    )
    texts, labels = generate_dataset(samples_per_class)

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )
    print(f" Train: {len(X_train)} | Test: {len(X_test)}")

    # --- Feature extraction: unigram + bigram TF-IDF -----------------------
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=5000,
        sublinear_tf=True,
        min_df=1,
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # --- Model --------------------------------------------------------------
    clf = MultinomialNB(alpha=0.1)
    clf.fit(X_train_vec, y_train)

    # --- Evaluation ---------------------------------------------------------
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n Accuracy : {acc * 100:.2f}%")
    print("\n Classification Report:")
    # Pin the label order explicitly so names always line up with class ids.
    print(classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names
    ))

    print(" Confusion Matrix:")
    # Explicit labels keep the matrix n_classes x n_classes, matching headers.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    headers = ['Form102', 'Form103', 'Form97']
    print(f" {'':30s} " + " ".join(headers))
    for header, row in zip(headers, cm):
        print(f" Actual {header}: {row!s}")

    # --- Persist artifacts --------------------------------------------------
    model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
    vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)
    with open(vec_path, 'wb') as f:
        pickle.dump(vectorizer, f)

    meta = {
        'accuracy': round(acc * 100, 2),
        'samples_per_class': samples_per_class,
        # Record the number of samples actually generated.
        'total_samples': len(texts),
        'labels': LABEL_MAP,
        'note': 'Form 90 routing is handled by classify_sex() - not this model',
        'model_path': model_path,
        'vectorizer_path': vec_path,
    }
    # Explicit encoding so the metadata file does not depend on the platform
    # default.
    with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)

    print(f"\n Model saved : {model_path}")
    print(f" Vectorizer saved: {vec_path}")
    print("\n Target accuracy : >90%")
    # Pass and fail previously printed the same garbled glyph; use distinct
    # ASCII markers that print on any console encoding.
    verdict = 'PASS' if acc >= 0.90 else 'FAIL (try increasing samples_per_class)'
    print(f" Achieved : {acc * 100:.2f}% {verdict}")
    print("=" * 60)

    return clf, vectorizer, acc
|
|
|
|
|
|
|
|
|
|
|
class DocumentClassifier:
    """Load the trained MNB model and classify OCR text from the Certifications page."""

    # Class id -> short form code returned under the 'form_code' key.
    _FORM_CODES = {0: 'form102', 1: 'form103', 2: 'form97'}

    def __init__(self, model_dir='models'):
        """Load the pickled classifier and vectorizer from ``model_dir``.

        Args:
            model_dir: Directory containing ``mnb_classifier.pkl`` and
                ``tfidf_vectorizer.pkl``.

        Raises:
            FileNotFoundError: If either artifact is missing. The message
                names the missing file and the command that creates it.
        """
        model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
        vec_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl')

        # Check both artifacts before loading anything, so a missing
        # vectorizer gets the same guided error as a missing model.
        for artifact, path in (('Model', model_path), ('Vectorizer', vec_path)):
            if not os.path.exists(path):
                raise FileNotFoundError(
                    f"{artifact} not found at {path}. Run: python form_classifier.py"
                )

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file. Only point model_dir at artifacts produced by a trusted
        # training run.
        with open(model_path, 'rb') as f:
            self.clf = pickle.load(f)
        with open(vec_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def predict(self, text: str) -> dict:
        """Classify OCR text from the Certifications page.

        Args:
            text: The extracted OCR text of one document.

        Returns:
            A dict shaped like::

                {
                    'label': 'Form 102 - Certificate of Live Birth',
                    'form_code': 'form102',
                    'confidence': 0.95,
                    'probabilities': { ... }
                }

            ``confidence`` is the probability of the winning class, and
            ``probabilities`` maps every form label to its probability. Both
            are rounded to 4 decimals.
        """
        vec = self.vectorizer.transform([text])
        probs = self.clf.predict_proba(vec)[0]

        # predict_proba columns are ordered like clf.classes_, so translate
        # the column position into the class id. If the model exposes no
        # classes_, fall back to treating the position as the id.
        classes = getattr(self.clf, 'classes_', None)
        if classes is None:
            class_ids = list(range(len(probs)))
        else:
            class_ids = [int(class_id) for class_id in classes]

        best = int(np.argmax(probs))
        best_id = class_ids[best]

        return {
            'label': LABEL_MAP[best_id],
            'form_code': self._FORM_CODES[best_id],
            'confidence': round(float(probs[best]), 4),
            'probabilities': {
                LABEL_MAP[class_id]: round(float(p), 4)
                for class_id, p in zip(class_ids, probs)
            },
        }
|
|
|
|
|
|
|
|
|
|
|
def run_test():
    """Run the saved classifier on six hand-written samples and print the results.

    Loads a ``DocumentClassifier`` from its default model directory, predicts
    each sample (two per form), prints the expected and predicted label with
    the confidence, and finishes with an overall hit count.

    Raises:
        FileNotFoundError: Propagated from ``DocumentClassifier`` when the
            trained model has not been saved yet.
    """
    print("\n" + "=" * 60)
    print(" Testing DocumentClassifier - Certifications Page")
    print("=" * 60)

    classifier = DocumentClassifier()

    # (ocr_text, expected_label) pairs.
    test_cases = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 "
            "Place of birth Capas Tarlac mother Rosa Dela Cruz "
            "father Pedro Dela Cruz Sex Male",
            "Form 102 - Certificate of Live Birth"
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
            "Age at death 75 death certificate deceased burial",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 "
            "cause Pneumonia burial permit interment",
            "Form 103 - Certificate of Death"
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
            "Solemnizing officer Rev. Santos witnesses marriage certificate",
            "Form 97 - Certificate of Marriage"
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes "
            "married 11/30/1995 contracting parties solemnizing officer",
            "Form 97 - Certificate of Marriage"
        ),
    ]

    correct = 0
    for text, expected in test_cases:
        result = classifier.predict(text)
        # Evaluate the match once and reuse it for the marker and the tally.
        passed = expected in result['label']
        # Pass and fail previously printed the same garbled glyph, so a
        # failing case looked like a passing one. Use distinct ASCII markers.
        status = 'PASS' if passed else 'FAIL'
        if passed:
            correct += 1
        print(f"\n {status} Expected : {expected}")
        print(f" Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)")

    print(f"\n Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)")
    print("=" * 60)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: train and save the model by default, or run
    # only the sample predictions when --test is given.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--test',
        action='store_true',
        help='Run test predictions only',
    )
    cli.add_argument(
        '--samples',
        type=int,
        default=150,
        help='Samples per class (default: 150)',
    )
    options = cli.parse_args()

    if not options.test:
        train(samples_per_class=options.samples)
        print("\nTo test predictions, run:")
        print(" python form_classifier.py --test")
    else:
        run_test()
|
|
|