"""
Model Overfitting Evaluation Script
=====================================
Evaluates the Random Forest fake news classifier for overfitting by
comparing Training vs. Testing performance.

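Split: 5-fold stratified cross-validation (each fold is 80% Train / 20% Test)
Metrics: classification_report, accuracy_score, confusion matrix plot,
         per-fold accuracy bar chart
Flag: Overfitting detected if avg Train Acc > 95% and avg Test Acc < 70%;
      mild overfitting if the average Train-Test accuracy gap exceeds 10%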

Usage:
    python backend/evaluate_model.py
"""

import sys
import os
import re
import time
import numpy as np
from textblob import TextBlob
import textstat

# Project root is two directory levels above this file, i.e. the parent of the
# directory that contains this script.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the project root first on the import path.
# NOTE(review): presumably so project-local packages resolve when the script is
# run directly; no such import is visible in this file — confirm it is needed.
sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import matplotlib

matplotlib.use("Agg")  # Non-interactive backend for saving plots
import matplotlib.pyplot as plt
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sentence_transformers import SentenceTransformer


# ── Paths ──
# NOTE(review): DATA_MODELS_DIR is not referenced anywhere in the visible part
# of this script — confirm whether it is still needed.
DATA_MODELS_DIR = os.path.join(PROJECT_ROOT, "data_models")
# Directory that main() creates (if missing) and saves the evaluation plots to.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "evaluation_results")


# ── MiniLM Model (lazy-loaded singleton) ──
# Model name passed to SentenceTransformer() the first time the model is needed.
MINILM_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Module-level cache filled by get_minilm_model(); stays None until first use.
_minilm_model = None


def get_minilm_model():
    """Return the shared MiniLM sentence encoder, creating it on first use.

    The instance is kept in the module-level ``_minilm_model`` variable, so
    only the first call constructs ``SentenceTransformer(MINILM_MODEL_NAME)``
    (and prints a loading notice); later calls return the cached object.
    """
    global _minilm_model
    # Fast path: the encoder was already built by an earlier call.
    if _minilm_model is not None:
        return _minilm_model

    print("  Loading MiniLM model...")
    _minilm_model = SentenceTransformer(MINILM_MODEL_NAME)
    return _minilm_model


# ───────────────────────────────────────────────────────────
# Text Cleaning (same as train.py)
# ───────────────────────────────────────────────────────────


def clean_text(text):
    """Normalise one raw article string before feature extraction.

    Anything that is falsy or not a ``str`` yields ``""``. Otherwise HTML-like
    tags and http(s) URLs are each replaced by a space, every run of
    whitespace is collapsed to one space, and the ends are stripped.
    """
    if not (text and isinstance(text, str)):
        return ""

    # Order matters: tags and URLs are blanked first, then the resulting
    # whitespace runs are collapsed.
    for pattern in (r"<[^>]+>", r"https?://\S+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()


# ───────────────────────────────────────────────────────────
# Stylometric Features (same as train.py)
# ───────────────────────────────────────────────────────────


# ── Word lists for linguistic features ──
# Vocabularies used by extract_stylometric_features(). Each set lists English
# entries first and Filipino entries second.
#
# NOTE: the extractor tests whitespace-split, lower-cased tokens for membership,
# so entries containing a space ("without doubt", "walang duda", "we must",
# "you must") can never match a token. They are kept so the sets stay identical
# to the original definitions.

# First-person singular/plural pronouns.
FIRST_PERSON_PRONOUNS = {
    # English
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
    # Filipino
    "ako", "ko", "akin", "aking", "natin", "atin",
    "namin", "amin", "tayo", "kami", "ta",
}

# Auxiliary / modal verbs.
AUXILIARY_VERBS = {
    # English
    "have", "has", "had", "do", "does", "did",
    "will", "would", "shall", "should", "may", "might",
    "can", "could", "must",
    "am", "is", "are", "was", "were", "be", "been", "being",
    # Filipino
    "ay", "dapat", "mayroon", "meron", "maaari", "pwede", "kailangan",
}

# Articles and prepositions counted for the "analytical_thinking" feature.
ANALYTICAL_WORDS = {
    # English
    "the", "a", "an", "of", "in", "on", "at", "to", "for", "with", "by",
    "from", "about", "between", "through", "during", "before", "after",
    # Filipino
    "ang", "ng", "sa", "mga", "nang", "para", "tungkol", "mula",
}

# Words counted for the "certainty_score" feature.
CERTAINTY_WORDS = {
    # English
    "always", "never", "absolutely", "definitely", "certainly",
    "undoubtedly", "clearly", "obviously", "without doubt", "guaranteed",
    "proven", "fact", "undeniable", "indisputable", "every", "all",
    # Filipino
    "palagi", "sigurado", "tiyak", "talaga", "totoo", "lagi", "walang duda",
}

# Words counted for the "tentative_score" feature.
TENTATIVE_WORDS = {
    # English
    "perhaps", "maybe", "possibly", "might", "could", "likely", "unlikely",
    "suggests", "appears", "seems", "allegedly", "reportedly", "according",
    "probable", "approximately", "estimated",
    # Filipino
    "siguro", "marahil", "maaaring", "mukhang", "parang", "umano", "diumano",
}

# Words counted for the "clout_score" feature.
CLOUT_WORDS = {
    # English
    "must", "demand", "require", "order", "command", "insist", "decree",
    "mandate", "authority", "power", "control", "dominant", "superior",
    "we must", "you must",
    # Filipino
    "kailangan", "dapat", "utos", "kapangyarihan", "kontrol", "mando",
}

# Words counted for the "past_focus_ratio" feature.
PAST_FOCUS_WORDS = {
    # English
    "talked", "did", "ago", "said", "was", "were", "had", "went", "told",
    # Filipino
    "noon", "nakaraan", "dati", "kahapon",
}

# Words counted for the "present_focus_ratio" feature.
PRESENT_FOCUS_WORDS = {
    # English
    "now", "is", "today", "are", "being", "currently", "ongoing",
    # Filipino
    "ngayon", "kasalukuyan",
}

# Words counted for the "future_focus_ratio" feature.
FUTURE_FOCUS_WORDS = {
    # English
    "soon", "will", "may", "shall", "going", "plan", "expect", "tomorrow",
    # Filipino
    "bukas", "darating", "magiging", "gagawin",
}


def extract_stylometric_features(text):
    """Compute the 25-value stylometric feature vector for one text.

    Returns a list of 25 floats ordered exactly like
    ``STYLOMETRIC_FEATURE_NAMES``. A falsy or non-``str`` input, or a string
    with no whitespace-separated tokens, yields 25 zeros.

    Tokens come from ``text.split()``, so punctuation stays attached to the
    neighbouring word when tokens are looked up in the word lists. The
    TextBlob subjectivity and the textstat readability scores fall back to
    0.0 when the library call raises; the four scores computed together
    (Flesch reading ease, Flesch-Kincaid, Coleman-Liau, ARI) are all reset
    to 0.0 if any one of them fails.
    """
    n_features = 25
    if not (text and isinstance(text, str)):
        return [0.0] * n_features

    tokens = text.split()
    n_tokens = len(tokens)
    if n_tokens == 0:
        return [0.0] * n_features

    lowered = [tok.lower() for tok in tokens]
    n_chars = len(text)  # always > 0 here: at least one token exists

    def share(vocabulary):
        """Fraction of lower-cased tokens that are members of *vocabulary*."""
        return sum(1 for tok in lowered if tok in vocabulary) / n_tokens

    def per_hundred_chars(count):
        """Express *count* as occurrences per 100 characters of the text."""
        return (count / n_chars) * 100

    # Words written entirely in capitals (two characters or longer).
    shouted = [tok for tok in tokens if len(tok) >= 2 and tok.isupper()]

    # Sentences are the non-blank chunks between runs of '.', '!' or '?'.
    chunks = [part.strip() for part in re.split(r"[.!?]+", text)]
    chunks = [part for part in chunks if part]
    if chunks:
        avg_sentence_length = sum(len(part.split()) for part in chunks) / len(chunks)
    else:
        avg_sentence_length = n_tokens

    punctuation_marks = set(".,;:!?-\"'()[]{}")
    punct_total = sum(1 for ch in text if ch in punctuation_marks)

    try:
        subjectivity = TextBlob(text).sentiment.subjectivity
    except Exception:
        subjectivity = 0.0

    try:
        flesch_ease, flesch_grade, coleman_liau, ari = (
            textstat.flesch_reading_ease(text),
            textstat.flesch_kincaid_grade(text),
            textstat.coleman_liau_index(text),
            textstat.automated_readability_index(text),
        )
    except Exception:
        flesch_ease = flesch_grade = coleman_liau = ari = 0.0

    try:
        gunning_fog = textstat.gunning_fog(text)
    except Exception:
        gunning_fog = 0.0

    # Brackets, dashes/hyphens and ellipses (ASCII and typographic forms).
    informal_total = sum(
        text.count(mark) for mark in ("(", ")", "—", "–", "-", "...", "…")
    )

    values = (
        text.count("!") / n_tokens,  # exclamation_density
        text.count("?"),  # question_count
        len(shouted) / n_tokens,  # caps_ratio
        avg_sentence_length,
        per_hundred_chars(punct_total),  # punctuation_density
        n_tokens,  # token_count
        len(set(lowered)) / n_tokens,  # unique_word_ratio
        sum(len(tok) for tok in tokens) / n_tokens,  # avg_word_length
        subjectivity,
        flesch_ease,
        flesch_grade,
        coleman_liau,
        ari,
        share(FIRST_PERSON_PRONOUNS),  # first_person_ratio
        share(AUXILIARY_VERBS),  # auxiliary_verb_ratio
        gunning_fog,
        share(ANALYTICAL_WORDS),  # analytical_thinking
        share(CERTAINTY_WORDS),  # certainty_score
        share(TENTATIVE_WORDS),  # tentative_score
        share(CLOUT_WORDS),  # clout_score
        per_hundred_chars(text.count(",") + text.count(".")),  # comma_period_density
        per_hundred_chars(informal_total),  # informal_punct_density
        share(PAST_FOCUS_WORDS),  # past_focus_ratio
        share(PRESENT_FOCUS_WORDS),  # present_focus_ratio
        share(FUTURE_FOCUS_WORDS),  # future_focus_ratio
    )
    return [float(value) for value in values]


# Column names for the vector returned by extract_stylometric_features(),
# listed in the same order as the values in that return list.
STYLOMETRIC_FEATURE_NAMES = [
    # surface / punctuation statistics
    "exclamation_density", "question_count", "caps_ratio",
    "avg_sentence_length", "punctuation_density", "token_count",
    "unique_word_ratio", "avg_word_length",
    # sentiment and readability scores
    "subjectivity", "flesch_reading_ease", "flesch_kincaid_grade",
    "coleman_liau_index", "ari",
    # word-list ratios (with the Gunning fog score in its original position)
    "first_person_ratio", "auxiliary_verb_ratio", "gunning_fog_index",
    "analytical_thinking", "certainty_score", "tentative_score", "clout_score",
    # punctuation densities per 100 characters
    "comma_period_density", "informal_punct_density",
    # temporal focus ratios
    "past_focus_ratio", "present_focus_ratio", "future_focus_ratio",
]


# ───────────────────────────────────────────────────────────
# Main Evaluation
# ───────────────────────────────────────────────────────────


def _build_fold_features(
    train_texts, test_texts, train_dense, test_dense, train_stylo, test_stylo
):
    """Assemble the hybrid feature matrices for one cross-validation fold.

    The TF-IDF vectorizer and the stylometric StandardScaler are fitted on the
    training part only and then applied to the test part, so no statistic of
    the test fold influences the features the model is trained on.

    Args:
        train_texts / test_texts: lists of cleaned article strings.
        train_dense / test_dense: MiniLM embedding rows for the same articles.
        train_stylo / test_stylo: unscaled stylometric rows for the same articles.

    Returns:
        ``(X_train, X_test, n_tfidf)`` where both matrices are CSR and
        ``n_tfidf`` is the number of TF-IDF columns learned on this fold.
    """
    tfidf = TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
    )
    train_tfidf = tfidf.fit_transform(train_texts)
    test_tfidf = tfidf.transform(test_texts)

    scaler = StandardScaler()
    train_stylo_scaled = scaler.fit_transform(train_stylo)
    test_stylo_scaled = scaler.transform(test_stylo)

    # format="csr" pins the result type instead of relying on the SciPy
    # version's default output format for hstack.
    X_train = hstack(
        [train_tfidf, csr_matrix(train_dense), csr_matrix(train_stylo_scaled)],
        format="csr",
    )
    X_test = hstack(
        [test_tfidf, csr_matrix(test_dense), csr_matrix(test_stylo_scaled)],
        format="csr",
    )
    return X_train, X_test, train_tfidf.shape[1]


def _save_confusion_matrix_plot(
    y_true, y_pred, label_names, n_splits, overfit_status, path
):
    """Plot the confusion matrix of the pooled out-of-fold predictions to *path*."""
    cm = confusion_matrix(y_true, y_pred)
    overall_acc = accuracy_score(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
    disp.plot(ax=ax, cmap="Blues", values_format="d")

    ax.set_title(
        f"Confusion Matrix — Aggregated {n_splits}-Fold CV\n"
        f"Overall Accuracy: {overall_acc:.1%} | {overfit_status}",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Predicted Label", fontsize=12)
    ax.set_ylabel("True Label", fontsize=12)

    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure once it is on disk


def _save_fold_accuracy_plot(
    train_accs, test_accs, avg_test, acc_std, consistency, path
):
    """Plot per-fold training vs. testing accuracy as grouped bars to *path*."""
    n_folds = len(test_accs)
    fig, ax = plt.subplots(figsize=(10, 5))

    x = np.arange(n_folds)
    width = 0.35
    series = (
        (x - width / 2, train_accs, "Training", "#2196F3"),
        (x + width / 2, test_accs, "Testing", "#FF9800"),
    )
    for positions, accs, label, color in series:
        bars = ax.bar(
            positions,
            accs * 100,
            width,
            label=label,
            color=color,
            edgecolor="black",
            linewidth=0.5,
        )
        # Annotate every bar with its accuracy value.
        for bar, val in zip(bars, accs):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.3,
                f"{val:.1%}",
                ha="center",
                va="bottom",
                fontsize=9,
                fontweight="bold",
            )

    ax.set_xticks(x)
    ax.set_xticklabels([f"Fold {i}" for i in range(1, n_folds + 1)])
    ax.set_ylim(0, 105)
    ax.set_ylabel("Accuracy (%)", fontsize=12)
    ax.set_title(
        f"Per-Fold Accuracy Comparison\n"
        f"Avg Test: {avg_test:.1%} (+/- {acc_std:.4f}) | {consistency}",
        fontsize=14,
        fontweight="bold",
    )
    # Draw the threshold line BEFORE building the legend; legend() only lists
    # the labelled artists that exist at the time it is called.
    ax.axhline(y=70, color="red", linestyle="--", alpha=0.5, label="70% threshold")
    ax.legend(loc="lower right")

    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure once it is on disk


def main():
    """Run the stratified 5-fold overfitting evaluation end to end.

    Steps:
      1. Load ``data/raw/fakenews/fakenews/full.csv`` under PROJECT_ROOT
         (prints an error and returns if the file is missing).
      2. Drop rows without an article or label and clean the article text.
      3. Compute the fold-independent features once: MiniLM embeddings and the
         unscaled stylometric vectors.
      4. For every fold, fit TF-IDF and the scaler on the training part only,
         train a RandomForestClassifier and record train/test metrics.
      5. Print the cross-fold summary, the overfitting verdict
         (avg train > 95% and avg test < 70%, or average gap > 10%) and the
         consistency verdict (std of test accuracy).
      6. Save ``confusion_matrix.png`` and ``accuracy_comparison.png`` to
         OUTPUT_DIR.

    Returns:
        None. All results are printed or written to OUTPUT_DIR.
    """
    n_splits = 5
    label_names = ["Real", "Fake"]
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # ── 1. Load Dataset ──
    print("=" * 60)
    print("  MODEL OVERFITTING EVALUATION")
    print("=" * 60)

    csv_path = os.path.join(
        PROJECT_ROOT, "data", "raw", "fakenews", "fakenews", "full.csv"
    )
    if not os.path.exists(csv_path):
        print(f"ERROR: Dataset not found at {csv_path}")
        return

    df = pd.read_csv(csv_path)
    print("\nDataset: jcblaise/fake_news_filipino")
    print(f"Total articles: {len(df)}")
    print("Distribution:")
    print(f"  Real (0): {(df['label'] == 0).sum()}")
    print(f"  Fake (1): {(df['label'] == 1).sum()}")

    # ── 2. Preprocess ──
    print("\nPreprocessing...")
    # A row without a label cannot be stratified or scored, so drop it as well.
    df = df.dropna(subset=["article", "label"]).copy()
    df = df[df["article"].str.len() > 0].copy()
    df.loc[:, "article_clean"] = df["article"].apply(clean_text)

    X_texts = df["article_clean"].tolist()
    y_full = np.array(df["label"].tolist())
    print(f"  Valid articles: {len(X_texts)}")
    if not X_texts:
        print("ERROR: No valid articles left after preprocessing.")
        return

    # ── 3. Fold-independent features ──
    # The MiniLM encoder is pretrained and the raw stylometric values depend on
    # a single text only, so both are computed once and sliced per fold.
    # TF-IDF and the scaler learn corpus statistics and are fitted per fold.
    print("\nBuilding hybrid features (TF-IDF + MiniLM + stylometric)...")
    print("  Encoding texts with MiniLM...")
    minilm = get_minilm_model()
    embeddings = np.asarray(
        minilm.encode(X_texts, show_progress_bar=True, batch_size=64)
    )

    print("  Extracting stylometric features...")
    stylo_raw = np.array([extract_stylometric_features(t) for t in X_texts])

    n_minilm = embeddings.shape[1]
    n_stylo = stylo_raw.shape[1]

    # ── 4. Stratified K-Fold Cross-Validation ──
    print("\n" + "=" * 60)
    print(f"  {n_splits}-FOLD CROSS-VALIDATION (Full Dataset)")
    print("=" * 60)
    print(f"  Total samples: {len(X_texts)}")

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_accuracies = []
    fold_precisions = []
    fold_recalls = []
    fold_f1s = []
    fold_train_accs = []
    all_y_true = []
    all_y_pred = []
    n_features = 0
    n_tfidf = 0

    for fold_idx, (train_idx, test_idx) in enumerate(
        cv.split(embeddings, y_full), 1
    ):
        X_fold_train, X_fold_test, n_tfidf = _build_fold_features(
            [X_texts[i] for i in train_idx],
            [X_texts[i] for i in test_idx],
            embeddings[train_idx],
            embeddings[test_idx],
            stylo_raw[train_idx],
            stylo_raw[test_idx],
        )
        y_fold_train = y_full[train_idx]
        y_fold_test = y_full[test_idx]
        n_features = X_fold_train.shape[1]

        print(f"\n{'─' * 60}")
        print(
            f"  FOLD {fold_idx}/{n_splits}  "
            f"(Train: {len(train_idx)}, Test: {len(test_idx)})"
        )
        print(f"{'─' * 60}")
        print(
            f"  Feature dimensions: {n_features} "
            f"(TF-IDF: {n_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})"
        )

        rf_fold = RandomForestClassifier(
            n_estimators=300,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=5,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42,
        )
        rf_fold.fit(X_fold_train, y_fold_train)

        # Predictions on both parts: the train/test gap is the overfitting signal.
        y_fold_train_pred = rf_fold.predict(X_fold_train)
        y_fold_test_pred = rf_fold.predict(X_fold_test)

        train_acc = accuracy_score(y_fold_train, y_fold_train_pred)
        test_acc = accuracy_score(y_fold_test, y_fold_test_pred)

        fold_train_accs.append(train_acc)
        fold_accuracies.append(test_acc)

        # Per-fold classification report (dict form for the weighted averages)
        report = classification_report(
            y_fold_test,
            y_fold_test_pred,
            target_names=label_names,
            output_dict=True,
        )
        fold_precisions.append(report["weighted avg"]["precision"])
        fold_recalls.append(report["weighted avg"]["recall"])
        fold_f1s.append(report["weighted avg"]["f1-score"])

        # Collect out-of-fold predictions for the final confusion matrix
        all_y_true.extend(y_fold_test)
        all_y_pred.extend(y_fold_test_pred)

        print(f"  Train Accuracy: {train_acc:.4f} ({train_acc:.1%})")
        print(f"  Test Accuracy:  {test_acc:.4f} ({test_acc:.1%})")
        print(f"  Gap:            {train_acc - test_acc:.4f}")
        print()
        print(
            classification_report(
                y_fold_test, y_fold_test_pred, target_names=label_names
            )
        )

    # ── 5. Cross-Fold Summary ──
    fold_accuracies = np.array(fold_accuracies)
    fold_train_accs = np.array(fold_train_accs)
    fold_precisions = np.array(fold_precisions)
    fold_recalls = np.array(fold_recalls)
    fold_f1s = np.array(fold_f1s)
    gaps = fold_train_accs - fold_accuracies

    print("\n" + "=" * 60)
    print(f"  CROSS-VALIDATION SUMMARY ({n_splits} Folds)")
    print("=" * 60)

    print("\n  Per-Fold Test Accuracies:")
    for i, (ta, te) in enumerate(zip(fold_train_accs, fold_accuracies), 1):
        print(f"    Fold {i}: Train {ta:.1%} | Test {te:.1%} | Gap {ta - te:.1%}")

    print(
        f"\n  Average Training Accuracy:  {fold_train_accs.mean():.4f} "
        f"(+/- {fold_train_accs.std():.4f})"
    )
    print(
        f"  Average Testing Accuracy:   {fold_accuracies.mean():.4f} "
        f"(+/- {fold_accuracies.std():.4f})"
    )
    print(
        f"  Average Precision:          {fold_precisions.mean():.4f} "
        f"(+/- {fold_precisions.std():.4f})"
    )
    print(
        f"  Average Recall:             {fold_recalls.mean():.4f} "
        f"(+/- {fold_recalls.std():.4f})"
    )
    print(
        f"  Average F1 Score:           {fold_f1s.mean():.4f} "
        f"(+/- {fold_f1s.std():.4f})"
    )
    print(f"  Average Gap:                {gaps.mean():.4f} (+/- {gaps.std():.4f})")

    # ── 6. Consistency Check ──
    print("\n" + "=" * 60)
    print("  VERDICT CONSISTENCY & OVERFITTING ANALYSIS")
    print("=" * 60)

    avg_train = fold_train_accs.mean()
    avg_test = fold_accuracies.mean()
    avg_gap = gaps.mean()
    acc_std = fold_accuracies.std()

    if avg_train > 0.95 and avg_test < 0.70:
        overfit_status = "OVERFITTING DETECTED"
        print("\n  *** OVERFITTING DETECTED ***")
        print(f"  Average training accuracy ({avg_train:.1%}) is much higher than")
        print(f"  average testing accuracy ({avg_test:.1%}).")
        print("  The model memorizes training data and fails to generalize.")
    elif avg_gap > 0.10:
        overfit_status = "MILD OVERFITTING"
        print("\n  ** MILD OVERFITTING **")
        print(f"  Average gap ({avg_gap:.1%}) exceeds 10%.")
    else:
        overfit_status = "NO OVERFITTING"
        print("\n  NO OVERFITTING DETECTED")
        print(f"  Average gap ({avg_gap:.1%}) is within acceptable range.")

    if acc_std < 0.01:
        consistency = "HIGHLY CONSISTENT"
        print(f"  Verdict Consistency: HIGHLY CONSISTENT (std={acc_std:.4f})")
        print(f"  Predictions are very stable across all {n_splits} folds.")
    elif acc_std < 0.03:
        consistency = "CONSISTENT"
        print(f"  Verdict Consistency: CONSISTENT (std={acc_std:.4f})")
        print("  Minor variance across folds — acceptable for production.")
    else:
        consistency = "INCONSISTENT"
        print(f"  Verdict Consistency: INCONSISTENT (std={acc_std:.4f})")
        print("  High variance suggests model stability issues.")

    # ── 7. Confusion Matrix (aggregated across all folds) ──
    print("\n\nGenerating plots...")
    cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
    _save_confusion_matrix_plot(
        all_y_true, all_y_pred, label_names, n_splits, overfit_status, cm_path
    )
    print(f"  Saved: {cm_path}")

    # ── 8. Per-Fold Accuracy Bar Chart ──
    bar_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png")
    _save_fold_accuracy_plot(
        fold_train_accs, fold_accuracies, avg_test, acc_std, consistency, bar_path
    )
    print(f"  Saved: {bar_path}")

    # ── Final Summary ──
    print("\n" + "=" * 60)
    print("  EVALUATION COMPLETE")
    print("=" * 60)
    print(f"  Dataset:               fake_news_filipino ({len(df)} articles)")
    # The TF-IDF width is learned per fold; the figures shown are the last fold's.
    print(
        f"  Feature set:           {n_features} "
        f"(TF-IDF: {n_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})"
    )
    print(f"  Cross-Validation:      {n_splits}-Fold Stratified")
    print(f"  Avg Training Accuracy: {avg_train:.4f} (+/- {fold_train_accs.std():.4f})")
    print(f"  Avg Testing Accuracy:  {avg_test:.4f} (+/- {acc_std:.4f})")
    print(f"  Avg F1 Score:          {fold_f1s.mean():.4f} (+/- {fold_f1s.std():.4f})")
    print(f"  Avg Gap:               {avg_gap:.4f}")
    print(f"  Overfitting Status:    {overfit_status}")
    print(f"  Verdict Consistency:   {consistency}")
    print(f"  Plots saved to:        {OUTPUT_DIR}/")
    print("=" * 60)


# Run the evaluation only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()