| """ |
| form_classifier.py |
| ======================= |
| Multinomial Naive Bayes (MNB) Document Classifier |
| for Local Civil Registry Document Digitization System |
| |
| Classifies extracted OCR text into: |
| - Form 102 (Certificate of Live Birth) β Certifications page |
| - Form 103 (Certificate of Death) β Certifications page |
| - Form 97 (Certificate of Marriage) β Certifications page |
| |
| NOTE: Form 90 (Application for Marriage License) is NOT classified here. |
| Form 90 has its OWN upload page where the user uploads: |
| - Groom's Birth Certificate (PSA/NSO sealed) |
| - Bride's Birth Certificate (PSA/NSO sealed) |
| The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female). |
| See classify_sex() in classifier.py for that routing. |
| |
| Usage: |
| python form_classifier.py # trains and saves model |
| python form_classifier.py --test # runs test predictions |
| """ |
|
|
| import os |
| import json |
| import random |
| import argparse |
| import pickle |
| import numpy as np |
| from sklearn.naive_bayes import MultinomialNB |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import ( |
| accuracy_score, classification_report, confusion_matrix |
| ) |
|
|
| |
| |
| |
| LABEL_MAP = { |
| 0: 'Form 102 - Certificate of Live Birth', |
| 1: 'Form 103 - Certificate of Death', |
| 2: 'Form 97 - Certificate of Marriage', |
| } |
| LABEL_NAMES = list(LABEL_MAP.values()) |
|
|
| |
| |
| |
| FIRST_NAMES = [ |
| 'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani', |
| 'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo', |
| 'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael', |
| 'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary' |
| ] |
| LAST_NAMES = [ |
| 'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores', |
| 'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim', |
| 'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez', |
| 'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual' |
| ] |
| MUNICIPALITIES = [ |
| 'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling', |
| 'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia', |
| 'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao' |
| ] |
| PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales'] |
| BARANGAYS = [ |
| 'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo', |
| 'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas', |
| 'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan' |
| ] |
| DATES = [ |
| '01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995', |
| '05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980', |
| '12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970', |
| ] |
|
|
| def _name(): |
| return f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}" |
|
|
| def _date(): |
| return random.choice(DATES) |
|
|
| def _place(): |
| return f"{random.choice(BARANGAYS)}, {random.choice(MUNICIPALITIES)}, {random.choice(PROVINCES)}" |
|
|
|
|
| |
| |
| |
| |
| |
|
|
| def generate_form102(): |
| """ |
| Form 102 β Certificate of Live Birth |
| Header keywords: 'Municipal Form No. 102', 'Certificate of Live Birth' |
| """ |
| templates = [ |
| |
| f"Municipal Form No. 102 Certificate of Live Birth " |
| f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} " |
| f"Name of mother {_name()} Name of father {_name()} " |
| f"Sex {random.choice(['Male', 'Female'])} " |
| f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} " |
| f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} " |
| f"birth certificate registry birth registration infant newborn child", |
|
|
| |
| f"Municipal Form No.102 Certificate of Live Birth " |
| f"Child {_name()} born {_date()} at {_place()} " |
| f"mother {_name()} father {_name()} " |
| f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg " |
| f"birth order {random.choice(['First', 'Second', 'Third'])} " |
| f"birth certificate Form 102", |
|
|
| |
| f"Municipal Form No. 102 Certificate of Live Birth " |
| f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} " |
| f"name of child {_name()} date of birth {_date()} " |
| f"place of birth {_place()} birth certificate municipal civil registrar", |
|
|
| |
| f"Municipal Form No. 102 Certificate of Live Birth " |
| f"PSA {_name()} born on {_date()} " |
| f"place of birth {_place()} " |
| f"mother maiden name {_name()} father {_name()} " |
| f"type of birth {random.choice(['Single', 'Twin'])} infant newborn", |
|
|
| |
| f"Municipal Form No.102 Certificate of Live Birth " |
| f"NSO birth registration {_name()} " |
| f"birth date {_date()} birthplace {_place()} " |
| f"parents mother {_name()} father {_name()} " |
| f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} " |
| f"sex {random.choice(['male', 'female'])}", |
| ] |
| return random.choice(templates) |
|
|
|
|
| def generate_form103(): |
| """ |
| Form 103 β Certificate of Death |
| Header keywords: 'Municipal Form No. 103', 'Certificate of Death' |
| """ |
| causes = [ |
| 'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus', |
| 'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer', |
| 'Septicemia', 'Renal Failure' |
| ] |
| templates = [ |
| |
| f"Municipal Form No. 103 Certificate of Death " |
| f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} " |
| f"Cause of death {random.choice(causes)} Age at death {random.randint(1, 95)} " |
| f"Sex {random.choice(['Male', 'Female'])} " |
| f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} " |
| f"death certificate deceased burial interment", |
|
|
| |
| f"Municipal Form No.103 Certificate of Death " |
| f"Deceased {_name()} died on {_date()} at {_place()} " |
| f"cause {random.choice(causes)} corpse informant {_name()} " |
| f"death certificate Form 103 municipal civil registrar", |
|
|
| |
| f"Municipal Form No. 103 Certificate of Death " |
| f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} " |
| f"name of deceased {_name()} date of death {_date()} " |
| f"place of death {_place()} cause of death {random.choice(causes)} " |
| f"death certificate burial permit", |
|
|
| |
| f"Municipal Form No.103 Certificate of Death " |
| f"{_name()} died {_date()} " |
| f"place {_place()} cause of death {random.choice(causes)} " |
| f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} " |
| f"death deceased cadaver", |
|
|
| |
| f"Municipal Form No. 103 Certificate of Death " |
| f"Form 103 death registration {_name()} " |
| f"date of death {_date()} place of death {_place()} " |
| f"immediate cause {random.choice(causes)} " |
| f"attending physician {_name()} certificate of death", |
| ] |
| return random.choice(templates) |
|
|
|
|
| def generate_form97(): |
| """ |
| Form 97 β Certificate of Marriage |
| Header keywords: 'Municipal Form No. 97', 'Certificate of Marriage' |
| """ |
| officers = ['Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.'] |
| licenses = [f"{random.randint(10000, 99999)}", f"ML-{random.randint(1000, 9999)}"] |
| templates = [ |
| |
| f"Municipal Form No. 97 Certificate of Marriage " |
| f"Name of husband {_name()} Name of wife {_name()} " |
| f"Date of marriage {_date()} Place of marriage {_place()} " |
| f"Solemnizing officer {random.choice(officers)} {_name()} " |
| f"Marriage license number {random.choice(licenses)} witnesses {_name()} {_name()} " |
| f"marriage certificate contracting parties wedding", |
|
|
| |
| f"Municipal Form No.97 Certificate of Marriage " |
| f"Husband {_name()} wife {_name()} " |
| f"married on {_date()} at {_place()} " |
| f"officiated by {random.choice(officers)} {_name()} " |
| f"marriage certificate Form 97 solemnizing officer", |
|
|
| |
| f"Municipal Form No. 97 Certificate of Marriage " |
| f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} " |
| f"husband {_name()} wife {_name()} " |
| f"date of marriage {_date()} place {_place()} " |
| f"marriage license {random.choice(licenses)} issued at {_place()} " |
| f"marriage certificate civil registrar", |
|
|
| |
| f"Municipal Form No.97 Certificate of Marriage " |
| f"{_name()} and {_name()} " |
| f"solemnized {_date()} at {_place()} " |
| f"solemnizing officer {random.choice(officers)} {_name()} " |
| f"witnesses {_name()} {_name()} " |
| f"marriage contracting parties husband wife ceremony", |
|
|
| |
| f"Municipal Form No. 97 Certificate of Marriage " |
| f"Form 97 marriage registration husband {_name()} " |
| f"wife {_name()} date of marriage {_date()} " |
| f"place of marriage {_place()} " |
| f"license number {random.choice(licenses)} marriage nuptial wed", |
| ] |
| return random.choice(templates) |
|
|
|
|
| |
| |
| |
| def generate_dataset(samples_per_class=150): |
| generators = [generate_form102, generate_form103, generate_form97] |
| labels_map = [0, 1, 2] |
|
|
| texts, labels = [], [] |
| for gen, label in zip(generators, labels_map): |
| for _ in range(samples_per_class): |
| texts.append(gen()) |
| labels.append(label) |
|
|
| combined = list(zip(texts, labels)) |
| random.shuffle(combined) |
| texts, labels = zip(*combined) |
| return list(texts), list(labels) |
|
|
|
|
| |
| |
| |
| def train(samples_per_class=150, save_dir='models'): |
| os.makedirs(save_dir, exist_ok=True) |
|
|
| print("=" * 60) |
| print(" MNB Document Classifier | Filipino Civil Registry") |
| print(" Certifications Page: Form 102 / 103 / 97 ONLY") |
| print(" (Form 90 routing is handled separately via SEX field)") |
| print("=" * 60) |
|
|
| print(f"\n Generating dataset ({samples_per_class} samples Γ 3 forms = {samples_per_class * 3} total)...") |
| texts, labels = generate_dataset(samples_per_class) |
|
|
| X_train, X_test, y_train, y_test = train_test_split( |
| texts, labels, test_size=0.2, random_state=42, stratify=labels |
| ) |
| print(f" Train: {len(X_train)} | Test: {len(X_test)}") |
|
|
| |
| vectorizer = TfidfVectorizer( |
| ngram_range=(1, 2), |
| max_features=5000, |
| sublinear_tf=True, |
| min_df=1, |
| ) |
| X_train_vec = vectorizer.fit_transform(X_train) |
| X_test_vec = vectorizer.transform(X_test) |
|
|
| |
| clf = MultinomialNB(alpha=0.1) |
| clf.fit(X_train_vec, y_train) |
|
|
| |
| y_pred = clf.predict(X_test_vec) |
| acc = accuracy_score(y_test, y_pred) |
|
|
| print(f"\n Accuracy : {acc * 100:.2f}%") |
| print("\n Classification Report:") |
| print(classification_report(y_test, y_pred, target_names=LABEL_NAMES)) |
|
|
| print(" Confusion Matrix:") |
| cm = confusion_matrix(y_test, y_pred) |
| headers = ['Form102', 'Form103', 'Form97'] |
| print(f" {'':30s} " + " ".join(headers)) |
| for i, row in enumerate(cm): |
| print(f" Actual {headers[i]}: {str(row)}") |
|
|
| |
| model_path = os.path.join(save_dir, 'mnb_classifier.pkl') |
| vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl') |
| with open(model_path, 'wb') as f: |
| pickle.dump(clf, f) |
| with open(vec_path, 'wb') as f: |
| pickle.dump(vectorizer, f) |
|
|
| meta = { |
| 'accuracy': round(acc * 100, 2), |
| 'samples_per_class': samples_per_class, |
| 'total_samples': samples_per_class * 3, |
| 'labels': LABEL_MAP, |
| 'note': 'Form 90 routing is handled by classify_sex() β not this model', |
| 'model_path': model_path, |
| 'vectorizer_path': vec_path, |
| } |
| with open(os.path.join(save_dir, 'mnb_metadata.json'), 'w') as f: |
| json.dump(meta, f, indent=2) |
|
|
| print(f"\n Model saved : {model_path}") |
| print(f" Vectorizer saved: {vec_path}") |
| print(f"\n Target accuracy : >90%") |
| print(f" Achieved : {acc * 100:.2f}% {'β' if acc >= 0.90 else 'β (try increasing samples_per_class)'}") |
| print("=" * 60) |
|
|
| return clf, vectorizer, acc |
|
|
|
|
| |
| |
| |
| class DocumentClassifier: |
| """Load trained MNB model and classify OCR text from Certifications page.""" |
|
|
| def __init__(self, model_dir='models'): |
| model_path = os.path.join(model_dir, 'mnb_classifier.pkl') |
| vec_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl') |
|
|
| if not os.path.exists(model_path): |
| raise FileNotFoundError( |
| f"Model not found at {model_path}. Run: python form_classifier.py" |
| ) |
|
|
| with open(model_path, 'rb') as f: |
| self.clf = pickle.load(f) |
| with open(vec_path, 'rb') as f: |
| self.vectorizer = pickle.load(f) |
|
|
| def predict(self, text: str) -> dict: |
| """ |
| Classify OCR text from Certifications page. |
| |
| Returns: |
| { |
| 'label': 'Form 102 - Certificate of Live Birth', |
| 'form_code': 'form102', |
| 'confidence': 0.95, |
| 'probabilities': { ... } |
| } |
| """ |
| vec = self.vectorizer.transform([text]) |
| probs = self.clf.predict_proba(vec)[0] |
| idx = int(np.argmax(probs)) |
|
|
| form_codes = ['form102', 'form103', 'form97'] |
| return { |
| 'label': LABEL_MAP[idx], |
| 'form_code': form_codes[idx], |
| 'confidence': round(float(probs[idx]), 4), |
| 'probabilities': { |
| LABEL_MAP[i]: round(float(p), 4) |
| for i, p in enumerate(probs) |
| } |
| } |
|
|
|
|
| |
| |
| |
| def run_test(): |
| print("\n" + "=" * 60) |
| print(" Testing DocumentClassifier β Certifications Page") |
| print("=" * 60) |
|
|
| classifier = DocumentClassifier() |
|
|
| test_cases = [ |
| ( |
| "Municipal Form No. 102 Certificate of Live Birth " |
| "Name of child Maria Santos Date of birth 01/15/1990 " |
| "Place of birth Brgy. San Jose, Tarlac City, Tarlac " |
| "Name of mother Lani Santos Name of father Jose Santos " |
| "Sex Female birth certificate infant", |
| "Form 102 - Certificate of Live Birth" |
| ), |
| ( |
| "Municipal Form No.102 Certificate of Live Birth " |
| "PSA Child Juan Dela Cruz born 03/22/1985 " |
| "Place of birth Capas Tarlac mother Rosa Dela Cruz " |
| "father Pedro Dela Cruz Sex Male", |
| "Form 102 - Certificate of Live Birth" |
| ), |
| ( |
| "Municipal Form No. 103 Certificate of Death " |
| "Name of deceased Pedro Reyes Date of death 03/22/2020 " |
| "Place of death Capas, Tarlac Cause of death Cardiac Arrest " |
| "Age at death 75 death certificate deceased burial", |
| "Form 103 - Certificate of Death" |
| ), |
| ( |
| "Municipal Form No.103 Certificate of Death " |
| "Deceased Ana Torres died 07/04/2000 " |
| "cause Pneumonia burial permit interment", |
| "Form 103 - Certificate of Death" |
| ), |
| ( |
| "Municipal Form No. 97 Certificate of Marriage " |
| "Name of husband Carlos Bautista Name of wife Ana Torres " |
| "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac " |
| "Solemnizing officer Rev. Santos witnesses marriage certificate", |
| "Form 97 - Certificate of Marriage" |
| ), |
| ( |
| "Municipal Form No.97 Certificate of Marriage " |
| "Husband Jose Santos wife Maria Reyes " |
| "married 11/30/1995 contracting parties solemnizing officer", |
| "Form 97 - Certificate of Marriage" |
| ), |
| ] |
|
|
| correct = 0 |
| for text, expected in test_cases: |
| result = classifier.predict(text) |
| status = 'β' if expected in result['label'] else 'β' |
| if expected in result['label']: |
| correct += 1 |
| print(f"\n {status} Expected : {expected}") |
| print(f" Predicted: {result['label']} ({result['confidence'] * 100:.1f}% confidence)") |
|
|
| print(f"\n Test Accuracy: {correct}/{len(test_cases)} ({correct / len(test_cases) * 100:.0f}%)") |
| print("=" * 60) |
|
|
|
|
| |
| |
| |
| if __name__ == '__main__': |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--test', action='store_true', help='Run test predictions only') |
| parser.add_argument('--samples', type=int, default=150, help='Samples per class (default: 150)') |
| args = parser.parse_args() |
|
|
| if args.test: |
| run_test() |
| else: |
| train(samples_per_class=args.samples) |
| print("\nTo test predictions, run:") |
| print(" python form_classifier.py --test") |
|
|