"""
form_classifier.py
=======================
Multinomial Naive Bayes (MNB) Document Classifier
for Local Civil Registry Document Digitization System

Classifies extracted OCR text into:
  - Form 102  (Certificate of Live Birth)       ← Certifications page
  - Form 103  (Certificate of Death)             ← Certifications page
  - Form 97   (Certificate of Marriage)          ← Certifications page

NOTE: Form 90 (Application for Marriage License) is NOT classified here.
      Form 90 has its OWN upload page where the user uploads:
        - Groom's Birth Certificate (PSA/NSO sealed)
        - Bride's Birth Certificate (PSA/NSO sealed)
      The SEX field on each birth cert determines GROOM (Male) or BRIDE (Female).
      See classify_sex() in classifier.py for that routing.

Usage:
    python form_classifier.py            # trains and saves model
    python form_classifier.py --test     # runs test predictions
"""

import os
import json
import random
import argparse
import pickle
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix
)

# ─────────────────────────────────────────────────────────────
# 1.  LABEL MAP  (Certifications page only — NO Form 90 here)
# ─────────────────────────────────────────────────────────────
# Class id -> human-readable form title. The ids are the integer labels the
# model is trained on; list position below is the id.
LABEL_MAP = dict(enumerate((
    'Form 102 - Certificate of Live Birth',
    'Form 103 - Certificate of Death',
    'Form 97 - Certificate of Marriage',
)))
# Titles ordered by ascending class id (used as report target names).
LABEL_NAMES = [LABEL_MAP[class_id] for class_id in sorted(LABEL_MAP)]

# ─────────────────────────────────────────────────────────────
# 2.  VOCABULARY POOLS  (Filipino civil registry)
# ─────────────────────────────────────────────────────────────
# Given names; _name() draws one of these uniformly at random for the first
# half of a synthetic full name.
FIRST_NAMES = [
    'Juan', 'Maria', 'Jose', 'Ana', 'Pedro', 'Rosa', 'Carlos', 'Lani',
    'Roberto', 'Nena', 'Ramon', 'Cynthia', 'Eduardo', 'Marites', 'Danilo',
    'Rowena', 'Renato', 'Melinda', 'Ernesto', 'Josephine', 'Michael',
    'Jennifer', 'Angelo', 'Christine', 'Mark', 'Patricia', 'John', 'Mary'
]
# Surnames; _name() draws one for the second half of a full name.
# Note that 'Dela Cruz' is a two-word entry, and 'Ramos' also appears in
# MUNICIPALITIES below.
LAST_NAMES = [
    'Dela Cruz', 'Santos', 'Reyes', 'Garcia', 'Torres', 'Flores',
    'Bautista', 'Villanueva', 'Mendoza', 'Castro', 'Ramos', 'Lim',
    'Aquino', 'Diaz', 'Fernandez', 'Lopez', 'Gonzales', 'Ramirez',
    'Abad', 'Aguilar', 'Manalo', 'Navarro', 'Ocampo', 'Pascual'
]
# Municipality / city names; the middle component of the address that
# _place() builds.
MUNICIPALITIES = [
    'Tarlac City', 'Capas', 'Paniqui', 'Gerona', 'Camiling',
    'Victoria', 'San Manuel', 'Concepcion', 'La Paz', 'Sta. Ignacia',
    'Bamban', 'Moncada', 'Pura', 'Ramos', 'Anao'
]
# Province names; the last component of a _place() address. _place() picks
# the province independently of the municipality, so pairs are not
# guaranteed to be geographically consistent.
PROVINCES = ['Tarlac', 'Pampanga', 'Nueva Ecija', 'Bulacan', 'Zambales']
# Barangay names (already prefixed with 'Brgy.'); the first component of a
# _place() address.
BARANGAYS = [
    'Brgy. San Jose', 'Brgy. Poblacion', 'Brgy. Sto. Cristo',
    'Brgy. Tibag', 'Brgy. Maliwalo', 'Brgy. San Nicolas',
    'Brgy. San Roque', 'Brgy. San Vicente', 'Brgy. Salapungan'
]
# Fixed pool of date strings returned by _date(). Entries are written
# MM/DD/YYYY (several have a second field greater than 12).
DATES = [
    '01/15/1990', '03/22/1985', '07/04/2000', '11/30/1995',
    '05/18/1988', '09/12/1975', '02/28/1993', '06/06/1980',
    '12/25/1998', '04/17/2001', '08/08/1965', '10/31/1970',
]

def _name():
    """Return a random full name: one FIRST_NAMES entry, a space, one LAST_NAMES entry."""
    given = random.choice(FIRST_NAMES)
    family = random.choice(LAST_NAMES)
    return " ".join((given, family))

def _date():
    """Return one date string picked at random from the DATES pool."""
    picked = random.choice(DATES)
    return picked

def _place():
    """Return a random address formatted as '<barangay>, <municipality>, <province>'."""
    # Draw order (barangay, municipality, province) matches the output order.
    components = (
        random.choice(BARANGAYS),
        random.choice(MUNICIPALITIES),
        random.choice(PROVINCES),
    )
    return ", ".join(components)


# ─────────────────────────────────────────────────────────────
# 3.  SAMPLE GENERATORS
#     Each generator uses the EXACT Philippine form header
#     so MNB learns the real keywords from actual documents.
# ─────────────────────────────────────────────────────────────

def generate_form102():
    """
    Build one synthetic OCR text sample for Form 102 — Certificate of Live Birth.

    Five layout variants are rendered on every call, each opening with the
    'Municipal Form No. 102' (or 'No.102') header followed by
    'Certificate of Live Birth'; one of the five is then returned at random.
    """
    header_spaced = "Municipal Form No. 102 Certificate of Live Birth "
    header_tight = "Municipal Form No.102 Certificate of Live Birth "

    # Variant A: exact header with the full set of labelled fields
    labelled_fields = header_spaced + (
        f"Name of child {_name()} Date of birth {_date()} Place of birth {_place()} "
        f"Name of mother {_name()} Name of father {_name()} "
        f"Sex {random.choice(['Male', 'Female'])} "
        f"Legitimacy {random.choice(['Legitimate', 'Illegitimate'])} "
        f"Attendant {random.choice(['Physician', 'Midwife', 'Nurse'])} "
        f"birth certificate registry birth registration infant newborn child"
    )

    # Variant B: header written without a space after "No."
    weight_and_order = header_tight + (
        f"Child {_name()} born {_date()} at {_place()} "
        f"mother {_name()} father {_name()} "
        f"birth weight {random.randint(2, 4)}.{random.randint(1, 9)} kg "
        f"birth order {random.choice(['First', 'Second', 'Third'])} "
        f"birth certificate Form 102"
    )

    # Variant C: leads with a registry number
    registry_entry = header_spaced + (
        f"Registry number {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"name of child {_name()} date of birth {_date()} "
        f"place of birth {_place()} birth certificate municipal civil registrar"
    )

    # Variant D: PSA/NSO sealed copy (the kind used when filing Form 90)
    psa_copy = header_spaced + (
        f"PSA {_name()} born on {_date()} "
        f"place of birth {_place()} "
        f"mother maiden name {_name()} father {_name()} "
        f"type of birth {random.choice(['Single', 'Twin'])} infant newborn"
    )

    # Variant E: NSO wording
    nso_copy = header_tight + (
        f"NSO birth registration {_name()} "
        f"birth date {_date()} birthplace {_place()} "
        f"parents mother {_name()} father {_name()} "
        f"attendant at birth {random.choice(['hospital', 'midwife', 'physician'])} "
        f"sex {random.choice(['male', 'female'])}"
    )

    variants = (labelled_fields, weight_and_order, registry_entry, psa_copy, nso_copy)
    return random.choice(variants)


def generate_form103():
    """
    Build one synthetic OCR text sample for Form 103 — Certificate of Death.

    Five layout variants are rendered on every call, each opening with the
    'Municipal Form No. 103' (or 'No.103') header followed by
    'Certificate of Death'; one of the five is then returned at random.
    """
    header_spaced = "Municipal Form No. 103 Certificate of Death "
    header_tight = "Municipal Form No.103 Certificate of Death "
    death_causes = (
        'Cardiac Arrest', 'Pneumonia', 'Hypertension', 'Diabetes Mellitus',
        'Stroke', 'Respiratory Failure', 'Natural Causes', 'Cancer',
        'Septicemia', 'Renal Failure',
    )

    # Variant A: exact header with the full set of labelled fields
    labelled_fields = header_spaced + (
        f"Name of deceased {_name()} Date of death {_date()} Place of death {_place()} "
        f"Cause of death {random.choice(death_causes)} Age at death {random.randint(1, 95)} "
        f"Sex {random.choice(['Male', 'Female'])} "
        f"Civil status {random.choice(['Single', 'Married', 'Widowed'])} "
        f"death certificate deceased burial interment"
    )

    # Variant B: header written without a space after "No."
    informant_short = header_tight + (
        f"Deceased {_name()} died on {_date()} at {_place()} "
        f"cause {random.choice(death_causes)} corpse informant {_name()} "
        f"death certificate Form 103 municipal civil registrar"
    )

    # Variant C: leads with a registry number
    registry_entry = header_spaced + (
        f"Registry number death {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"name of deceased {_name()} date of death {_date()} "
        f"place of death {_place()} cause of death {random.choice(death_causes)} "
        f"death certificate burial permit"
    )

    # Variant D: clinical wording with informant relationship
    clinical = header_tight + (
        f"{_name()} died {_date()} "
        f"place {_place()} cause of death {random.choice(death_causes)} "
        f"informant {_name()} relationship {random.choice(['spouse', 'child', 'sibling', 'parent'])} "
        f"death deceased cadaver"
    )

    # Variant E: full form with attending physician
    full_form = header_spaced + (
        f"Form 103 death registration {_name()} "
        f"date of death {_date()} place of death {_place()} "
        f"immediate cause {random.choice(death_causes)} "
        f"attending physician {_name()} certificate of death"
    )

    variants = (labelled_fields, informant_short, registry_entry, clinical, full_form)
    return random.choice(variants)


def generate_form97():
    """
    Build one synthetic OCR text sample for Form 97 — Certificate of Marriage.

    Five layout variants are rendered on every call, each opening with the
    'Municipal Form No. 97' (or 'No.97') header followed by
    'Certificate of Marriage'; one of the five is then returned at random.
    """
    header_spaced = "Municipal Form No. 97 Certificate of Marriage "
    header_tight = "Municipal Form No.97 Certificate of Marriage "
    officer_titles = ('Rev.', 'Judge', 'Mayor', 'Pastor', 'Fr.')

    # Two candidate license numbers are drawn once per call (plain 5-digit,
    # then 'ML-' prefixed); every variant that needs one picks between them.
    plain_number = str(random.randint(10000, 99999))
    prefixed_number = f"ML-{random.randint(1000, 9999)}"
    license_numbers = (plain_number, prefixed_number)

    # Variant A: exact header with the full set of labelled fields
    labelled_fields = header_spaced + (
        f"Name of husband {_name()} Name of wife {_name()} "
        f"Date of marriage {_date()} Place of marriage {_place()} "
        f"Solemnizing officer {random.choice(officer_titles)} {_name()} "
        f"Marriage license number {random.choice(license_numbers)} witnesses {_name()} {_name()} "
        f"marriage certificate contracting parties wedding"
    )

    # Variant B: header written without a space after "No."
    officiated = header_tight + (
        f"Husband {_name()} wife {_name()} "
        f"married on {_date()} at {_place()} "
        f"officiated by {random.choice(officer_titles)} {_name()} "
        f"marriage certificate Form 97 solemnizing officer"
    )

    # Variant C: leads with a registry number
    registry_entry = header_spaced + (
        f"Registry number marriage {random.randint(100, 999)}-{random.randint(1, 99):02d} "
        f"husband {_name()} wife {_name()} "
        f"date of marriage {_date()} place {_place()} "
        f"marriage license {random.choice(license_numbers)} issued at {_place()} "
        f"marriage certificate civil registrar"
    )

    # Variant D: ceremony wording
    ceremony = header_tight + (
        f"{_name()} and {_name()} "
        f"solemnized {_date()} at {_place()} "
        f"solemnizing officer {random.choice(officer_titles)} {_name()} "
        f"witnesses {_name()} {_name()} "
        f"marriage contracting parties husband wife ceremony"
    )

    # Variant E: full form
    full_form = header_spaced + (
        f"Form 97 marriage registration husband {_name()} "
        f"wife {_name()} date of marriage {_date()} "
        f"place of marriage {_place()} "
        f"license number {random.choice(license_numbers)} marriage nuptial wed"
    )

    variants = (labelled_fields, officiated, registry_entry, ceremony, full_form)
    return random.choice(variants)


# ─────────────────────────────────────────────────────────────
# 4.  DATASET GENERATOR  (3 classes only — no Form 90)
# ─────────────────────────────────────────────────────────────
def generate_dataset(samples_per_class=150, seed=None):
    """
    Generate a shuffled synthetic corpus for the three Certifications forms.

    Args:
        samples_per_class: Number of texts to generate for each form.
            Must be >= 0; 0 yields an empty dataset.
        seed: Optional value passed to ``random.seed`` before generating, so
            the same corpus can be reproduced. This reseeds the module-level
            ``random`` generator (the sample generators draw from it).
            ``None`` (default) leaves the generator state untouched.

    Returns:
        ``(texts, labels)`` — two equal-length lists. ``labels[i]`` is the
        class id of ``texts[i]``: 0 = Form 102, 1 = Form 103, 2 = Form 97.

    Raises:
        ValueError: if ``samples_per_class`` is negative.
    """
    if samples_per_class < 0:
        raise ValueError(
            f"samples_per_class must be >= 0, got {samples_per_class}"
        )
    if samples_per_class == 0:
        # zip(*[]) yields nothing to unpack, so return the empty dataset
        # explicitly instead of failing with an opaque unpacking error.
        return [], []
    if seed is not None:
        random.seed(seed)

    # Position in this tuple is the class id (0=Form102, 1=Form103, 2=Form97).
    generators = (generate_form102, generate_form103, generate_form97)
    pairs = [
        (make_sample(), label)
        for label, make_sample in enumerate(generators)
        for _ in range(samples_per_class)
    ]

    # Shuffle text/label pairs together so they stay aligned.
    random.shuffle(pairs)
    texts, labels = zip(*pairs)
    return list(texts), list(labels)


# ─────────────────────────────────────────────────────────────
# 5.  TRAIN & SAVE
# ─────────────────────────────────────────────────────────────
def train(samples_per_class=150, save_dir='models'):
    """
    Train the TF-IDF + Multinomial Naive Bayes classifier and persist it.

    Generates a synthetic dataset, holds out 20% (stratified, fixed
    random_state) for evaluation, prints accuracy / classification report /
    confusion matrix, then writes three files into ``save_dir``:
    ``mnb_classifier.pkl``, ``tfidf_vectorizer.pkl`` and ``mnb_metadata.json``.

    Args:
        samples_per_class: Synthetic samples to generate per form.
        save_dir: Output directory; created if it does not exist.

    Returns:
        ``(clf, vectorizer, acc)`` — the fitted classifier, the fitted
        vectorizer, and the hold-out accuracy as returned by accuracy_score.
    """
    os.makedirs(save_dir, exist_ok=True)

    rule = "=" * 60
    banner_lines = (
        rule,
        "  MNB Document Classifier  |  Filipino Civil Registry",
        "  Certifications Page: Form 102 / 103 / 97 ONLY",
        "  (Form 90 routing is handled separately via SEX field)",
        rule,
    )
    for banner_line in banner_lines:
        print(banner_line)

    total_samples = samples_per_class * 3
    print(f"\n  Generating dataset ({samples_per_class} samples × 3 forms = {total_samples} total)...")
    corpus, targets = generate_dataset(samples_per_class)

    # Stratified 80/20 split with a fixed random_state.
    train_texts, test_texts, train_targets, test_targets = train_test_split(
        corpus, targets, test_size=0.2, random_state=42, stratify=targets
    )
    print(f"  Train: {len(train_texts)}  |  Test: {len(test_texts)}")

    # TF-IDF features over unigrams and bigrams
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=5000,
        sublinear_tf=True,
        min_df=1,
    )
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)

    # Fit the Naive Bayes model
    clf = MultinomialNB(alpha=0.1)
    clf.fit(train_matrix, train_targets)

    # Score on the held-out split
    predictions = clf.predict(test_matrix)
    acc = accuracy_score(test_targets, predictions)
    acc_pct = acc * 100

    print(f"\n  Accuracy : {acc_pct:.2f}%")
    print("\n  Classification Report:")
    print(classification_report(test_targets, predictions, target_names=LABEL_NAMES))

    print("  Confusion Matrix:")
    matrix = confusion_matrix(test_targets, predictions)
    headers = ['Form102', 'Form103', 'Form97']
    print(f"  {'':30s} " + "  ".join(headers))
    for header, counts in zip(headers, matrix):
        print(f"  Actual {header}: {counts!s}")

    # Persist the fitted objects (classifier first, then vectorizer)
    model_path = os.path.join(save_dir, 'mnb_classifier.pkl')
    vec_path = os.path.join(save_dir, 'tfidf_vectorizer.pkl')
    for target_path, fitted in ((model_path, clf), (vec_path, vectorizer)):
        with open(target_path, 'wb') as out:
            pickle.dump(fitted, out)

    # Sidecar metadata describing this training run
    meta = {
        'accuracy': round(acc_pct, 2),
        'samples_per_class': samples_per_class,
        'total_samples': total_samples,
        'labels': LABEL_MAP,
        'note': 'Form 90 routing is handled by classify_sex() — not this model',
        'model_path': model_path,
        'vectorizer_path': vec_path,
    }
    meta_path = os.path.join(save_dir, 'mnb_metadata.json')
    with open(meta_path, 'w') as out:
        json.dump(meta, out, indent=2)

    if acc >= 0.90:
        verdict = '✓'
    else:
        verdict = '✗ (try increasing samples_per_class)'

    print(f"\n  Model saved     : {model_path}")
    print(f"  Vectorizer saved: {vec_path}")
    print(f"\n  Target accuracy : >90%")
    print(f"  Achieved        : {acc_pct:.2f}% {verdict}")
    print(rule)

    return clf, vectorizer, acc


# ─────────────────────────────────────────────────────────────
# 6.  DOCUMENT CLASSIFIER CLASS
# ─────────────────────────────────────────────────────────────
class DocumentClassifier:
    """Load the trained MNB model and classify OCR text from the Certifications page.

    The constructor reads ``mnb_classifier.pkl`` and ``tfidf_vectorizer.pkl``
    from ``model_dir``; ``predict`` vectorizes one text and reports the most
    probable form together with the per-form probabilities.
    """

    # Short machine-friendly code for each class id (same ids as LABEL_MAP keys).
    _FORM_CODES = {0: 'form102', 1: 'form103', 2: 'form97'}

    def __init__(self, model_dir='models'):
        """
        Load the pickled classifier and vectorizer from ``model_dir``.

        Raises:
            FileNotFoundError: if either artifact is missing. Both paths are
                checked before anything is loaded, and the message tells the
                user how to regenerate them.
        """
        model_path = os.path.join(model_dir, 'mnb_classifier.pkl')
        vec_path   = os.path.join(model_dir, 'tfidf_vectorizer.pkl')

        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Model not found at {model_path}. Run: python form_classifier.py"
            )
        if not os.path.exists(vec_path):
            raise FileNotFoundError(
                f"Vectorizer not found at {vec_path}. Run: python form_classifier.py"
            )

        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only point model_dir at artifacts produced by a trusted
        # training run.
        with open(model_path, 'rb') as f:
            self.clf = pickle.load(f)
        with open(vec_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

    def predict(self, text: str) -> dict:
        """
        Classify OCR text from Certifications page.

        Args:
            text: The OCR-extracted document text.

        Returns:
            {
                'label':        'Form 102 - Certificate of Live Birth',
                'form_code':    'form102',
                'confidence':   0.95,
                'probabilities': { ... }
            }
            ``confidence`` is the winning class probability and
            ``probabilities`` maps each form title to its probability; all
            values are rounded to 4 decimals.
        """
        vec   = self.vectorizer.transform([text])
        probs = self.clf.predict_proba(vec)[0]

        # predict_proba columns are ordered like clf.classes_, which is not
        # guaranteed to be 0..n-1 (e.g. a model fitted on a subset of the
        # labels). Translate column position -> class id before looking up
        # titles. Fall back to positional ids if the model has no classes_.
        classes = getattr(self.clf, 'classes_', None)
        if classes is None:
            classes = range(len(probs))
        class_ids = [int(c) for c in classes]

        best     = int(np.argmax(probs))
        label_id = class_ids[best]

        return {
            'label':      LABEL_MAP[label_id],
            'form_code':  self._FORM_CODES[label_id],
            'confidence': round(float(probs[best]), 4),
            'probabilities': {
                LABEL_MAP[class_id]: round(float(p), 4)
                for class_id, p in zip(class_ids, probs)
            }
        }


# ─────────────────────────────────────────────────────────────
# 7.  TEST DEMO
# ─────────────────────────────────────────────────────────────
def run_test():
    """
    Run six hand-written sample texts through DocumentClassifier and print results.

    Loads the classifier from its default model directory, predicts each
    sample (two per form), prints expected vs. predicted label with the
    confidence, and finishes with an overall hit count and percentage.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("  Testing DocumentClassifier — Certifications Page")
    print(rule)

    classifier = DocumentClassifier()

    birth_label = "Form 102 - Certificate of Live Birth"
    death_label = "Form 103 - Certificate of Death"
    marriage_label = "Form 97 - Certificate of Marriage"

    # (ocr_text, expected_label) pairs
    samples = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose, Tarlac City, Tarlac "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            birth_label,
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 "
            "Place of birth Capas Tarlac mother Rosa Dela Cruz "
            "father Pedro Dela Cruz Sex Male",
            birth_label,
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Place of death Capas, Tarlac Cause of death Cardiac Arrest "
            "Age at death 75 death certificate deceased burial",
            death_label,
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 "
            "cause Pneumonia burial permit interment",
            death_label,
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 Place of marriage Paniqui, Tarlac "
            "Solemnizing officer Rev. Santos witnesses marriage certificate",
            marriage_label,
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes "
            "married 11/30/1995 contracting parties solemnizing officer",
            marriage_label,
        ),
    ]

    hits = 0
    for ocr_text, expected in samples:
        outcome = classifier.predict(ocr_text)
        predicted = outcome['label']
        # A sample counts as a hit when the expected title occurs in the
        # predicted label.
        matched = expected in predicted
        if matched:
            hits += 1
        mark = '✓' if matched else '✗'
        print(f"\n  {mark} Expected : {expected}")
        print(f"    Predicted: {predicted} ({outcome['confidence'] * 100:.1f}% confidence)")

    total = len(samples)
    print(f"\n  Test Accuracy: {hits}/{total} ({hits / total * 100:.0f}%)")
    print(rule)


# ─────────────────────────────────────────────────────────────
# 8.  MAIN
# ─────────────────────────────────────────────────────────────
if __name__ == '__main__':
    # CLI entry point: train by default, or run the demo predictions with --test.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--test',
        action='store_true',
        help='Run test predictions only',
    )
    cli.add_argument(
        '--samples',
        type=int,
        default=150,
        help='Samples per class (default: 150)',
    )
    options = cli.parse_args()

    if not options.test:
        train(samples_per_class=options.samples)
        print("\nTo test predictions, run:")
        print("  python form_classifier.py --test")
    else:
        run_test()