""" Model Overfitting Evaluation Script ===================================== Evaluates the Random Forest fake news classifier for overfitting by comparing Training vs. Testing performance. Split: 80% Train / 20% Test Metrics: classification_report, accuracy_score, confusion matrix plot Flag: Overfitting detected if Train Acc > 95% and Test Acc < 70% Usage: python backend/evaluate_model.py """ import sys import os import re import time import numpy as np from textblob import TextBlob import textstat PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, PROJECT_ROOT) import pandas as pd import matplotlib matplotlib.use("Agg") # Non-interactive backend for saving plots import matplotlib.pyplot as plt from scipy.sparse import hstack, csr_matrix from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler from sklearn.metrics import ( classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, ) from sentence_transformers import SentenceTransformer # ── Paths ── DATA_MODELS_DIR = os.path.join(PROJECT_ROOT, "data_models") OUTPUT_DIR = os.path.join(PROJECT_ROOT, "evaluation_results") # ── MiniLM Model (lazy-loaded singleton) ── MINILM_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2" _minilm_model = None def get_minilm_model(): """Load the multilingual MiniLM model (cached after first call).""" global _minilm_model if _minilm_model is None: print(" Loading MiniLM model...") _minilm_model = SentenceTransformer(MINILM_MODEL_NAME) return _minilm_model # ─────────────────────────────────────────────────────────── # Text Cleaning (same as train.py) # ─────────────────────────────────────────────────────────── def clean_text(text): """Basic text cleaning for Filipino news articles.""" if not text or not isinstance(text, str): return "" text = re.sub(r"<[^>]+>", " ", text) text = re.sub(r"https?://\S+", " ", text) text = re.sub(r"\s+", " ", text) return text.strip() # ─────────────────────────────────────────────────────────── # Stylometric Features (same as train.py) # ─────────────────────────────────────────────────────────── # ── Word lists for linguistic features ── FIRST_PERSON_PRONOUNS = { "i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves", "ako", "ko", "akin", "aking", "natin", "atin", "namin", "amin", "tayo", "kami", "ta", } AUXILIARY_VERBS = { "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "may", "might", "can", "could", "must", "am", "is", "are", "was", "were", "be", "been", "being", "ay", "dapat", "mayroon", "meron", "maaari", "pwede", "kailangan", } ANALYTICAL_WORDS = { "the", "a", "an", "of", "in", "on", "at", "to", "for", "with", "by", "from", "about", "between", "through", "during", "before", "after", "ang", "ng", "sa", "mga", "nang", "para", "tungkol", "mula", } CERTAINTY_WORDS = { "always", "never", "absolutely", "definitely", "certainly", "undoubtedly", "clearly", "obviously", "without doubt", "guaranteed", "proven", "fact", "undeniable", "indisputable", "every", "all", "palagi", "sigurado", "tiyak", "talaga", "totoo", "lagi", "walang duda", } TENTATIVE_WORDS = { "perhaps", "maybe", "possibly", "might", "could", "likely", "unlikely", "suggests", "appears", "seems", "allegedly", "reportedly", "according", "probable", "approximately", "estimated", "siguro", "marahil", "maaaring", "mukhang", "parang", "umano", "diumano", } CLOUT_WORDS = { "must", "demand", "require", "order", "command", "insist", "decree", "mandate", "authority", "power", "control", "dominant", "superior", "we must", "you must", "kailangan", "dapat", "utos", "kapangyarihan", "kontrol", "mando", } PAST_FOCUS_WORDS = { "talked", "did", "ago", "said", "was", "were", "had", "went", "told", "noon", "nakaraan", "dati", "kahapon", } PRESENT_FOCUS_WORDS = { "now", "is", "today", "are", "being", "currently", "ongoing", "ngayon", "kasalukuyan", } FUTURE_FOCUS_WORDS = { "soon", "will", "may", "shall", "going", "plan", "expect", "tomorrow", "bukas", "darating", "magiging", "gagawin", } def extract_stylometric_features(text): """Extract 25 stylometric features from text (matches train.py).""" if not text or not isinstance(text, str): return [0.0] * 25 words = text.split() token_count = len(words) if token_count == 0: return [0.0] * 25 words_lower = [w.lower() for w in words] text_len = len(text) exclamation_density = text.count("!") / token_count question_count = text.count("?") caps_words = sum(1 for w in words if len(w) >= 2 and w.isupper()) caps_ratio = caps_words / token_count sentences = re.split(r"[.!?]+", text) sentences = [s.strip() for s in sentences if s.strip()] avg_sentence_length = ( sum(len(s.split()) for s in sentences) / len(sentences) if sentences else token_count ) punct_chars = sum(1 for c in text if c in ".,;:!?-\"'()[]{}...") punctuation_density = (punct_chars / text_len) * 100 if text_len > 0 else 0 unique_words = len(set(words_lower)) unique_word_ratio = unique_words / token_count avg_word_length = sum(len(w) for w in words) / token_count try: subjectivity = TextBlob(text).sentiment.subjectivity except Exception: subjectivity = 0.0 try: flesch_reading_ease = textstat.flesch_reading_ease(text) flesch_kincaid_grade = textstat.flesch_kincaid_grade(text) coleman_liau_index = textstat.coleman_liau_index(text) ari = textstat.automated_readability_index(text) except Exception: flesch_reading_ease = 0.0 flesch_kincaid_grade = 0.0 coleman_liau_index = 0.0 ari = 0.0 first_person_count = sum(1 for w in words_lower if w in FIRST_PERSON_PRONOUNS) first_person_ratio = first_person_count / token_count aux_count = sum(1 for w in words_lower if w in AUXILIARY_VERBS) auxiliary_verb_ratio = aux_count / token_count try: gunning_fog_index = textstat.gunning_fog(text) except Exception: gunning_fog_index = 0.0 analytical_count = sum(1 for w in words_lower if w in ANALYTICAL_WORDS) analytical_thinking = analytical_count / token_count certainty_count = sum(1 for w in words_lower if w in CERTAINTY_WORDS) certainty_score = certainty_count / token_count tentative_count = sum(1 for w in words_lower if w in TENTATIVE_WORDS) tentative_score = tentative_count / token_count clout_count = sum(1 for w in words_lower if w in CLOUT_WORDS) clout_score = clout_count / token_count comma_period_count = text.count(",") + text.count(".") comma_period_density = (comma_period_count / text_len) * 100 if text_len > 0 else 0 informal_count = ( text.count("(") + text.count(")") + text.count("—") + text.count("–") + text.count("-") + text.count("...") + text.count("…") ) informal_punct_density = (informal_count / text_len) * 100 if text_len > 0 else 0 past_count = sum(1 for w in words_lower if w in PAST_FOCUS_WORDS) past_focus_ratio = past_count / token_count present_count = sum(1 for w in words_lower if w in PRESENT_FOCUS_WORDS) present_focus_ratio = present_count / token_count future_count = sum(1 for w in words_lower if w in FUTURE_FOCUS_WORDS) future_focus_ratio = future_count / token_count return [ float(exclamation_density), float(question_count), float(caps_ratio), float(avg_sentence_length), float(punctuation_density), float(token_count), float(unique_word_ratio), float(avg_word_length), float(subjectivity), float(flesch_reading_ease), float(flesch_kincaid_grade), float(coleman_liau_index), float(ari), float(first_person_ratio), float(auxiliary_verb_ratio), float(gunning_fog_index), float(analytical_thinking), float(certainty_score), float(tentative_score), float(clout_score), float(comma_period_density), float(informal_punct_density), float(past_focus_ratio), float(present_focus_ratio), float(future_focus_ratio), ] STYLOMETRIC_FEATURE_NAMES = [ "exclamation_density", "question_count", "caps_ratio", "avg_sentence_length", "punctuation_density", "token_count", "unique_word_ratio", "avg_word_length", "subjectivity", "flesch_reading_ease", "flesch_kincaid_grade", "coleman_liau_index", "ari", "first_person_ratio", "auxiliary_verb_ratio", "gunning_fog_index", "analytical_thinking", "certainty_score", "tentative_score", "clout_score", "comma_period_density", "informal_punct_density", "past_focus_ratio", "present_focus_ratio", "future_focus_ratio", ] # ─────────────────────────────────────────────────────────── # Main Evaluation # ─────────────────────────────────────────────────────────── def main(): os.makedirs(OUTPUT_DIR, exist_ok=True) label_names = ["Real", "Fake"] # ── 1. Load Dataset ── print("=" * 60) print(" MODEL OVERFITTING EVALUATION") print("=" * 60) csv_path = os.path.join( PROJECT_ROOT, "data", "raw", "fakenews", "fakenews", "full.csv" ) if not os.path.exists(csv_path): print(f"ERROR: Dataset not found at {csv_path}") return df = pd.read_csv(csv_path) print(f"\nDataset: jcblaise/fake_news_filipino") print(f"Total articles: {len(df)}") print(f"Distribution:") print(f" Real (0): {(df['label'] == 0).sum()}") print(f" Fake (1): {(df['label'] == 1).sum()}") # ── 2. Preprocess ── print("\nPreprocessing...") df = df.dropna(subset=["article"]).copy() df = df[df["article"].str.len() > 0].copy() df.loc[:, "article_clean"] = df["article"].apply(clean_text) X_texts = df["article_clean"].tolist() y_labels = df["label"].tolist() print(f" Valid articles: {len(X_texts)}") # ── 3. Split: 80% Train / 20% Test ── print("\nSplitting data: 80% Train / 20% Test...") X_train, X_test, y_train, y_test = train_test_split( X_texts, y_labels, test_size=0.20, random_state=42, stratify=y_labels, ) print(f" Training set: {len(X_train)} articles") print(f" Testing set: {len(X_test)} articles") # ── 4. Build Hybrid Features ── print("\nBuilding hybrid features (TF-IDF + MiniLM + stylometric)...") # TF-IDF tfidf = TfidfVectorizer( max_features=15000, ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True, ) X_train_tfidf = tfidf.fit_transform(X_train) X_test_tfidf = tfidf.transform(X_test) # MiniLM embeddings print(" Encoding texts with MiniLM...") minilm = get_minilm_model() train_embeddings = minilm.encode(X_train, show_progress_bar=True, batch_size=64) test_embeddings = minilm.encode(X_test, show_progress_bar=True, batch_size=64) # Stylometric print(" Extracting stylometric features...") train_stylo = np.array([extract_stylometric_features(t) for t in X_train]) test_stylo = np.array([extract_stylometric_features(t) for t in X_test]) scaler = StandardScaler() train_stylo_scaled = scaler.fit_transform(train_stylo) test_stylo_scaled = scaler.transform(test_stylo) # Combine X_train_feat = hstack( [X_train_tfidf, csr_matrix(train_embeddings), csr_matrix(train_stylo_scaled)] ) X_test_feat = hstack( [X_test_tfidf, csr_matrix(test_embeddings), csr_matrix(test_stylo_scaled)] ) n_tfidf = X_train_tfidf.shape[1] n_minilm = 384 n_stylo = len(STYLOMETRIC_FEATURE_NAMES) print( f" Feature dimensions: {X_train_feat.shape[1]} " f"(TF-IDF: {n_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})" ) # ── 5. Full 5-Fold Cross-Validation ── print("\n" + "=" * 60) print(" 5-FOLD CROSS-VALIDATION (Full Dataset)") print("=" * 60) # Build features on entire dataset print("\nBuilding features on full dataset...") tfidf_full = TfidfVectorizer( max_features=15000, ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True, ) X_tfidf_full = tfidf_full.fit_transform(X_texts) print(" Encoding full dataset with MiniLM...") full_embeddings = minilm.encode(X_texts, show_progress_bar=True, batch_size=64) stylo_full = np.array([extract_stylometric_features(t) for t in X_texts]) scaler_full = StandardScaler() stylo_full_scaled = scaler_full.fit_transform(stylo_full) X_full = hstack( [X_tfidf_full, csr_matrix(full_embeddings), csr_matrix(stylo_full_scaled)] ) y_full = np.array(y_labels) print(f" Total samples: {X_full.shape[0]}") print( f" Feature dimensions: {X_full.shape[1]} " f"(TF-IDF: {X_tfidf_full.shape[1]} + MiniLM: {n_minilm} + Stylometric: {n_stylo})" ) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) fold_accuracies = [] fold_precisions = [] fold_recalls = [] fold_f1s = [] fold_train_accs = [] all_y_true = [] all_y_pred = [] for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_full, y_full), 1): X_fold_train = X_full[train_idx] X_fold_test = X_full[test_idx] y_fold_train = y_full[train_idx] y_fold_test = y_full[test_idx] print(f"\n{'─' * 60}") print(f" FOLD {fold_idx}/5 (Train: {len(train_idx)}, Test: {len(test_idx)})") print(f"{'─' * 60}") rf_fold = RandomForestClassifier( n_estimators=300, max_depth=15, min_samples_split=5, min_samples_leaf=5, class_weight="balanced", n_jobs=-1, random_state=42, ) rf_fold.fit(X_fold_train, y_fold_train) # Predictions y_fold_train_pred = rf_fold.predict(X_fold_train) y_fold_test_pred = rf_fold.predict(X_fold_test) train_acc = accuracy_score(y_fold_train, y_fold_train_pred) test_acc = accuracy_score(y_fold_test, y_fold_test_pred) fold_train_accs.append(train_acc) fold_accuracies.append(test_acc) # Per-fold classification report report = classification_report( y_fold_test, y_fold_test_pred, target_names=label_names, output_dict=True, ) fold_precisions.append(report["weighted avg"]["precision"]) fold_recalls.append(report["weighted avg"]["recall"]) fold_f1s.append(report["weighted avg"]["f1-score"]) # Collect for final confusion matrix all_y_true.extend(y_fold_test) all_y_pred.extend(y_fold_test_pred) print(f" Train Accuracy: {train_acc:.4f} ({train_acc:.1%})") print(f" Test Accuracy: {test_acc:.4f} ({test_acc:.1%})") print(f" Gap: {train_acc - test_acc:.4f}") print() print( classification_report( y_fold_test, y_fold_test_pred, target_names=label_names ) ) # ── 6. Cross-Fold Summary ── fold_accuracies = np.array(fold_accuracies) fold_train_accs = np.array(fold_train_accs) fold_precisions = np.array(fold_precisions) fold_recalls = np.array(fold_recalls) fold_f1s = np.array(fold_f1s) gaps = fold_train_accs - fold_accuracies print("\n" + "=" * 60) print(" CROSS-VALIDATION SUMMARY (5 Folds)") print("=" * 60) print(f"\n Per-Fold Test Accuracies:") for i, (ta, te) in enumerate(zip(fold_train_accs, fold_accuracies), 1): print(f" Fold {i}: Train {ta:.1%} | Test {te:.1%} | Gap {ta - te:.1%}") print( f"\n Average Training Accuracy: {fold_train_accs.mean():.4f} " f"(+/- {fold_train_accs.std():.4f})" ) print( f" Average Testing Accuracy: {fold_accuracies.mean():.4f} " f"(+/- {fold_accuracies.std():.4f})" ) print( f" Average Precision: {fold_precisions.mean():.4f} " f"(+/- {fold_precisions.std():.4f})" ) print( f" Average Recall: {fold_recalls.mean():.4f} " f"(+/- {fold_recalls.std():.4f})" ) print( f" Average F1 Score: {fold_f1s.mean():.4f} " f"(+/- {fold_f1s.std():.4f})" ) print(f" Average Gap: {gaps.mean():.4f} " f"(+/- {gaps.std():.4f})") # ── 7. Consistency Check ── print("\n" + "=" * 60) print(" VERDICT CONSISTENCY & OVERFITTING ANALYSIS") print("=" * 60) avg_train = fold_train_accs.mean() avg_test = fold_accuracies.mean() avg_gap = gaps.mean() acc_std = fold_accuracies.std() if avg_train > 0.95 and avg_test < 0.70: overfit_status = "OVERFITTING DETECTED" print(f"\n *** OVERFITTING DETECTED ***") print(f" Average training accuracy ({avg_train:.1%}) is much higher than") print(f" average testing accuracy ({avg_test:.1%}).") print(f" The model memorizes training data and fails to generalize.") elif avg_gap > 0.10: overfit_status = "MILD OVERFITTING" print(f"\n ** MILD OVERFITTING **") print(f" Average gap ({avg_gap:.1%}) exceeds 10%.") else: overfit_status = "NO OVERFITTING" print(f"\n NO OVERFITTING DETECTED") print(f" Average gap ({avg_gap:.1%}) is within acceptable range.") if acc_std < 0.01: consistency = "HIGHLY CONSISTENT" print(f" Verdict Consistency: HIGHLY CONSISTENT (std={acc_std:.4f})") print(f" Predictions are very stable across all 5 folds.") elif acc_std < 0.03: consistency = "CONSISTENT" print(f" Verdict Consistency: CONSISTENT (std={acc_std:.4f})") print(f" Minor variance across folds — acceptable for production.") else: consistency = "INCONSISTENT" print(f" Verdict Consistency: INCONSISTENT (std={acc_std:.4f})") print(f" High variance suggests model stability issues.") # ── 8. Confusion Matrix (aggregated across all folds) ── print("\n\nGenerating plots...") cm = confusion_matrix(all_y_true, all_y_pred) overall_acc = accuracy_score(all_y_true, all_y_pred) fig, ax = plt.subplots(figsize=(8, 6)) disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names) disp.plot(ax=ax, cmap="Blues", values_format="d") ax.set_title( f"Confusion Matrix — Aggregated 5-Fold CV\n" f"Overall Accuracy: {overall_acc:.1%} | {overfit_status}", fontsize=14, fontweight="bold", ) ax.set_xlabel("Predicted Label", fontsize=12) ax.set_ylabel("True Label", fontsize=12) plt.tight_layout() cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png") fig.savefig(cm_path, dpi=150, bbox_inches="tight") print(f" Saved: {cm_path}") # ── 9. Per-Fold Accuracy Bar Chart ── fig2, ax2 = plt.subplots(figsize=(10, 5)) x = np.arange(5) width = 0.35 bars_train = ax2.bar( x - width / 2, fold_train_accs * 100, width, label="Training", color="#2196F3", edgecolor="black", linewidth=0.5, ) bars_test = ax2.bar( x + width / 2, fold_accuracies * 100, width, label="Testing", color="#FF9800", edgecolor="black", linewidth=0.5, ) for bar, val in zip(bars_train, fold_train_accs): ax2.text( bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.3, f"{val:.1%}", ha="center", va="bottom", fontsize=9, fontweight="bold", ) for bar, val in zip(bars_test, fold_accuracies): ax2.text( bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.3, f"{val:.1%}", ha="center", va="bottom", fontsize=9, fontweight="bold", ) ax2.set_xticks(x) ax2.set_xticklabels([f"Fold {i}" for i in range(1, 6)]) ax2.set_ylim(0, 105) ax2.set_ylabel("Accuracy (%)", fontsize=12) ax2.set_title( f"Per-Fold Accuracy Comparison\n" f"Avg Test: {avg_test:.1%} (+/- {acc_std:.4f}) | {consistency}", fontsize=14, fontweight="bold", ) ax2.legend(loc="lower right") ax2.axhline(y=70, color="red", linestyle="--", alpha=0.5, label="70% threshold") plt.tight_layout() bar_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png") fig2.savefig(bar_path, dpi=150, bbox_inches="tight") print(f" Saved: {bar_path}") # ── Final Summary ── print("\n" + "=" * 60) print(" EVALUATION COMPLETE") print("=" * 60) print(f" Dataset: fake_news_filipino ({len(df)} articles)") print(f" Feature set: {X_full.shape[1]} (TF-IDF + 9 stylometric)") print(f" Cross-Validation: 5-Fold Stratified") print(f" Avg Training Accuracy: {avg_train:.4f} (+/- {fold_train_accs.std():.4f})") print(f" Avg Testing Accuracy: {avg_test:.4f} (+/- {acc_std:.4f})") print(f" Avg F1 Score: {fold_f1s.mean():.4f} (+/- {fold_f1s.std():.4f})") print(f" Avg Gap: {avg_gap:.4f}") print(f" Overfitting Status: {overfit_status}") print(f" Verdict Consistency: {consistency}") print(f" Plots saved to: {OUTPUT_DIR}/") print("=" * 60) if __name__ == "__main__": main()