ThesisProject / backend /evaluate_model.py
JeyBii's picture
Upload folder using huggingface_hub
2b9b5b5 verified
Raw
History Blame Contribute Delete
24.3 kB
"""
Model Overfitting Evaluation Script
=====================================
Evaluates the Random Forest fake news classifier for overfitting by
comparing Training vs. Testing performance.
Split: 80% Train / 20% Test
Metrics: classification_report, accuracy_score, confusion matrix plot
Flag: Overfitting detected if Train Acc > 95% and Test Acc < 70%
Usage:
python backend/evaluate_model.py
"""
import sys
import os
import re
import time
import numpy as np
from textblob import TextBlob
import textstat
# Repository root: the parent of the directory that contains this file.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the root first on sys.path — presumably so project packages resolve when
# this file is run directly as a script (no project import is visible here).
sys.path.insert(0, PROJECT_ROOT)
import pandas as pd
import matplotlib
matplotlib.use("Agg") # Non-interactive backend for saving plots
import matplotlib.pyplot as plt
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
classification_report,
accuracy_score,
confusion_matrix,
ConfusionMatrixDisplay,
)
from sentence_transformers import SentenceTransformer
# ── Paths ──
# Directory name suggests trained model artifacts; not referenced elsewhere in
# the visible part of this file.
DATA_MODELS_DIR = os.path.join(PROJECT_ROOT, "data_models")
# Destination for the plots written by main(); created on demand.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "evaluation_results")
# ── MiniLM Model (lazy-loaded singleton) ──
# Sentence-Transformers model identifier passed to SentenceTransformer().
MINILM_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Module-level cache filled by get_minilm_model() on its first call.
_minilm_model = None
def get_minilm_model():
    """Return the shared MiniLM sentence encoder, creating it on first use.

    The instance is stored in the module-level ``_minilm_model`` so that
    later calls reuse it instead of constructing the model again.
    """
    global _minilm_model
    # Fast path: the encoder was already built by an earlier call.
    if _minilm_model is not None:
        return _minilm_model
    print(" Loading MiniLM model...")
    _minilm_model = SentenceTransformer(MINILM_MODEL_NAME)
    return _minilm_model
# ───────────────────────────────────────────────────────────
# Text Cleaning (same as train.py)
# ───────────────────────────────────────────────────────────
def clean_text(text):
    """Strip markup, URLs and redundant whitespace from an article.

    Returns an empty string for falsy or non-string input. Otherwise HTML-like
    tags and http(s) URLs are each replaced by a space, every run of
    whitespace is collapsed to a single space, and the result is trimmed.
    """
    if not (text and isinstance(text, str)):
        return ""
    # Order matters: whitespace is collapsed last so the gaps left behind by
    # removed tags and URLs are merged as well.
    for pattern in (r"<[^>]+>", r"https?://\S+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
# ───────────────────────────────────────────────────────────
# Stylometric Features (same as train.py)
# ───────────────────────────────────────────────────────────
# ── Word lists for linguistic features ──
# First-person pronouns; tokens are compared lower-cased against this set.
FIRST_PERSON_PRONOUNS = {
    # English
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
    # Filipino
    "ako", "ko", "akin", "aking",
    "natin", "atin", "namin", "amin",
    "tayo", "kami", "ta",
}
# Auxiliary / modal verbs used for the auxiliary_verb_ratio feature.
AUXILIARY_VERBS = {
    # English: have / do
    "have", "has", "had", "do", "does", "did",
    # English: modals
    "will", "would", "shall", "should", "may", "might", "can", "could", "must",
    # English: forms of "be"
    "am", "is", "are", "was", "were", "be", "been", "being",
    # Filipino
    "ay", "dapat", "mayroon", "meron", "maaari", "pwede", "kailangan",
}
# Articles, prepositions and markers counted by the analytical_thinking feature.
ANALYTICAL_WORDS = {
    # English articles
    "the", "a", "an",
    # English prepositions
    "of", "in", "on", "at", "to", "for", "with", "by", "from",
    "about", "between", "through", "during", "before", "after",
    # Filipino
    "ang", "ng", "sa", "mga", "nang", "para", "tungkol", "mula",
}
# Words expressing certainty, used for the certainty_score feature.
# NOTE(review): extract_stylometric_features looks up whitespace-split tokens,
# so the two-word entries ("without doubt", "walang duda") can never match.
CERTAINTY_WORDS = {
    # English
    "always", "never", "absolutely", "definitely", "certainly",
    "undoubtedly", "clearly", "obviously", "without doubt", "guaranteed",
    "proven", "fact", "undeniable", "indisputable", "every", "all",
    # Filipino
    "palagi", "sigurado", "tiyak", "talaga", "totoo", "lagi", "walang duda",
}
# Hedging / attribution words, used for the tentative_score feature.
TENTATIVE_WORDS = {
    # English
    "perhaps", "maybe", "possibly", "might", "could", "likely", "unlikely",
    "suggests", "appears", "seems", "allegedly", "reportedly", "according",
    "probable", "approximately", "estimated",
    # Filipino
    "siguro", "marahil", "maaaring", "mukhang", "parang", "umano", "diumano",
}
# Authority / command vocabulary, used for the clout_score feature.
# NOTE(review): extract_stylometric_features looks up whitespace-split tokens,
# so the two-word entries ("we must", "you must") can never match.
CLOUT_WORDS = {
    # English
    "must", "demand", "require", "order", "command", "insist", "decree",
    "mandate", "authority", "power", "control", "dominant", "superior",
    "we must", "you must",
    # Filipino
    "kailangan", "dapat", "utos", "kapangyarihan", "kontrol", "mando",
}
# Past-oriented words, used for the past_focus_ratio feature.
PAST_FOCUS_WORDS = {
    # English
    "talked", "did", "ago", "said", "was", "were", "had", "went", "told",
    # Filipino
    "noon", "nakaraan", "dati", "kahapon",
}
# Present-oriented words, used for the present_focus_ratio feature.
PRESENT_FOCUS_WORDS = {
    # English
    "now", "is", "today", "are", "being", "currently", "ongoing",
    # Filipino
    "ngayon", "kasalukuyan",
}
# Future-oriented words, used for the future_focus_ratio feature.
FUTURE_FOCUS_WORDS = {
    # English
    "soon", "will", "may", "shall", "going", "plan", "expect", "tomorrow",
    # Filipino
    "bukas", "darating", "magiging", "gagawin",
}
def extract_stylometric_features(text):
    """Extract 25 stylometric features from text.

    The values are returned as a list of 25 floats in the same order as
    ``STYLOMETRIC_FEATURE_NAMES``. Falsy, non-string, or whitespace-only input
    yields 25 zeros.

    Tokens are produced by ``str.split()`` and compared lower-cased against
    the module-level word sets, so punctuation attached to a word (``"must,"``)
    prevents a match and multi-word lexicon entries never match.

    NOTE(review): the original docstring states this mirrors train.py, which
    is not visible here. The em-dash literal below was mojibake (``"β€”"``) and
    has been restored to a real em dash; apply the same fix in train.py if it
    carries the same corruption, so both scripts compute identical features.
    """
    if not text or not isinstance(text, str):
        return [0.0] * 25
    words = text.split()
    token_count = len(words)
    if token_count == 0:
        return [0.0] * 25
    words_lower = [w.lower() for w in words]
    # text contains at least one token here, so text_len is always > 0.
    text_len = len(text)

    def lexicon_ratio(lexicon):
        """Share of lower-cased tokens that appear in *lexicon*."""
        return sum(1 for w in words_lower if w in lexicon) / token_count

    def per_100_chars(count):
        """Express *count* as occurrences per 100 characters of text."""
        return (count / text_len) * 100

    # ── Surface / punctuation features ──
    exclamation_density = text.count("!") / token_count
    question_count = text.count("?")
    caps_ratio = sum(1 for w in words if len(w) >= 2 and w.isupper()) / token_count
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    avg_sentence_length = (
        sum(len(s.split()) for s in sentences) / len(sentences)
        if sentences
        else token_count
    )
    punct_chars = set(".,;:!?-\"'()[]{}")
    punctuation_density = per_100_chars(sum(1 for c in text if c in punct_chars))
    unique_word_ratio = len(set(words_lower)) / token_count
    avg_word_length = sum(len(w) for w in words) / token_count

    # ── Third-party scores (best effort: fall back to 0.0 on any failure) ──
    try:
        subjectivity = TextBlob(text).sentiment.subjectivity
    except Exception:
        subjectivity = 0.0
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        coleman_liau_index = textstat.coleman_liau_index(text)
        ari = textstat.automated_readability_index(text)
    except Exception:
        # One failure zeroes all four scores, as in the original code.
        flesch_reading_ease = 0.0
        flesch_kincaid_grade = 0.0
        coleman_liau_index = 0.0
        ari = 0.0
    try:
        gunning_fog_index = textstat.gunning_fog(text)
    except Exception:
        gunning_fog_index = 0.0

    # ── Punctuation densities per 100 characters ──
    comma_period_density = per_100_chars(text.count(",") + text.count("."))
    # Parentheses, em dash, en dash, hyphen, ASCII ellipsis, ellipsis character.
    informal_marks = ("(", ")", "—", "–", "-", "...", "…")
    informal_punct_density = per_100_chars(
        sum(text.count(mark) for mark in informal_marks)
    )

    features = [
        exclamation_density,
        question_count,
        caps_ratio,
        avg_sentence_length,
        punctuation_density,
        token_count,
        unique_word_ratio,
        avg_word_length,
        subjectivity,
        flesch_reading_ease,
        flesch_kincaid_grade,
        coleman_liau_index,
        ari,
        lexicon_ratio(FIRST_PERSON_PRONOUNS),  # first_person_ratio
        lexicon_ratio(AUXILIARY_VERBS),  # auxiliary_verb_ratio
        gunning_fog_index,
        lexicon_ratio(ANALYTICAL_WORDS),  # analytical_thinking
        lexicon_ratio(CERTAINTY_WORDS),  # certainty_score
        lexicon_ratio(TENTATIVE_WORDS),  # tentative_score
        lexicon_ratio(CLOUT_WORDS),  # clout_score
        comma_period_density,
        informal_punct_density,
        lexicon_ratio(PAST_FOCUS_WORDS),  # past_focus_ratio
        lexicon_ratio(PRESENT_FOCUS_WORDS),  # present_focus_ratio
        lexicon_ratio(FUTURE_FOCUS_WORDS),  # future_focus_ratio
    ]
    return [float(value) for value in features]
# Column names for the vector returned by extract_stylometric_features();
# the order here is the order of the returned values.
STYLOMETRIC_FEATURE_NAMES = [
    # surface statistics
    "exclamation_density", "question_count", "caps_ratio",
    "avg_sentence_length", "punctuation_density", "token_count",
    "unique_word_ratio", "avg_word_length",
    # sentiment and readability
    "subjectivity", "flesch_reading_ease", "flesch_kincaid_grade",
    "coleman_liau_index", "ari",
    # lexicon ratios and fog index
    "first_person_ratio", "auxiliary_verb_ratio", "gunning_fog_index",
    "analytical_thinking", "certainty_score", "tentative_score", "clout_score",
    # punctuation densities
    "comma_period_density", "informal_punct_density",
    # temporal focus
    "past_focus_ratio", "present_focus_ratio", "future_focus_ratio",
]
# ───────────────────────────────────────────────────────────
# Main Evaluation
# ───────────────────────────────────────────────────────────
def _make_tfidf():
    """Return an unfitted TF-IDF vectorizer with the evaluation settings."""
    return TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
    )


def _fit_hybrid_features(texts, embeddings, stylo, train_idx, test_idx):
    """Build hybrid train/test matrices, fitting only on the training rows.

    The TF-IDF vocabulary/IDF weights and the stylometric scaler are learned
    from ``train_idx`` rows only and then applied to ``test_idx`` rows, so no
    statistic of the test rows influences the training features.

    Returns ``(X_train, X_test, n_tfidf)`` where both matrices are CSR and
    ``n_tfidf`` is the number of TF-IDF columns.
    """
    train_texts = [texts[i] for i in train_idx]
    test_texts = [texts[i] for i in test_idx]

    tfidf = _make_tfidf()
    train_tfidf = tfidf.fit_transform(train_texts)
    test_tfidf = tfidf.transform(test_texts)

    scaler = StandardScaler()
    train_stylo = scaler.fit_transform(stylo[train_idx])
    test_stylo = scaler.transform(stylo[test_idx])

    # format="csr" makes the result type explicit instead of relying on
    # scipy's default output format for hstack.
    X_train = hstack(
        [train_tfidf, csr_matrix(embeddings[train_idx]), csr_matrix(train_stylo)],
        format="csr",
    )
    X_test = hstack(
        [test_tfidf, csr_matrix(embeddings[test_idx]), csr_matrix(test_stylo)],
        format="csr",
    )
    return X_train, X_test, train_tfidf.shape[1]


def main():
    """Run the overfitting evaluation and write the result plots.

    Steps: load and clean the dataset, report an 80/20 stratified split and
    its feature dimensions, run 5-fold stratified cross-validation with a
    Random Forest on hybrid features (TF-IDF + MiniLM + stylometric), print
    per-fold and aggregate metrics, classify the result as overfitting / mild
    overfitting / no overfitting, and save a confusion matrix and a per-fold
    accuracy chart into ``OUTPUT_DIR``.

    MiniLM embeddings and raw stylometric values are computed once for the
    whole dataset (neither involves fitting on the data); TF-IDF and the
    scaler are re-fitted on each training fold to avoid leaking test-fold
    statistics into the features.

    Returns None. If the dataset CSV is missing, an error is printed and the
    function returns early.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    label_names = ["Real", "Fake"]
    n_folds = 5

    # ── 1. Load Dataset ──
    print("=" * 60)
    print(" MODEL OVERFITTING EVALUATION")
    print("=" * 60)
    csv_path = os.path.join(
        PROJECT_ROOT, "data", "raw", "fakenews", "fakenews", "full.csv"
    )
    if not os.path.exists(csv_path):
        print(f"ERROR: Dataset not found at {csv_path}")
        return
    df = pd.read_csv(csv_path)
    print("\nDataset: jcblaise/fake_news_filipino")
    print(f"Total articles: {len(df)}")
    print("Distribution:")
    print(f" Real (0): {(df['label'] == 0).sum()}")
    print(f" Fake (1): {(df['label'] == 1).sum()}")

    # ── 2. Preprocess ──
    print("\nPreprocessing...")
    df = df.dropna(subset=["article"]).copy()
    df = df[df["article"].str.len() > 0].copy()
    df.loc[:, "article_clean"] = df["article"].apply(clean_text)
    X_texts = df["article_clean"].tolist()
    y_labels = df["label"].tolist()
    y_full = np.array(y_labels)
    print(f" Valid articles: {len(X_texts)}")

    # ── 3. Split: 80% Train / 20% Test ──
    # Row indices are split instead of the texts so the embeddings and
    # stylometric rows computed below can be reused for this split.
    print("\nSplitting data: 80% Train / 20% Test...")
    sample_idx = np.arange(len(X_texts))
    holdout_train_idx, holdout_test_idx = train_test_split(
        sample_idx,
        test_size=0.20,
        random_state=42,
        stratify=y_labels,
    )
    print(f" Training set: {len(holdout_train_idx)} articles")
    print(f" Testing set: {len(holdout_test_idx)} articles")

    # ── 4. Build Hybrid Features ──
    print("\nBuilding hybrid features (TF-IDF + MiniLM + stylometric)...")
    print(" Encoding texts with MiniLM...")
    minilm = get_minilm_model()
    # Encoded once for the whole dataset and indexed per split/fold.
    embeddings = np.asarray(
        minilm.encode(X_texts, show_progress_bar=True, batch_size=64)
    )
    print(" Extracting stylometric features...")
    stylo = np.array([extract_stylometric_features(t) for t in X_texts])
    n_minilm = embeddings.shape[1]
    n_stylo = stylo.shape[1]

    X_train_feat, _, n_tfidf = _fit_hybrid_features(
        X_texts, embeddings, stylo, holdout_train_idx, holdout_test_idx
    )
    print(
        f" Feature dimensions: {X_train_feat.shape[1]} "
        f"(TF-IDF: {n_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})"
    )

    # ── 5. Full 5-Fold Cross-Validation ──
    print("\n" + "=" * 60)
    print(f" {n_folds}-FOLD CROSS-VALIDATION (Full Dataset)")
    print("=" * 60)
    print(f" Total samples: {len(y_full)}")
    print(" TF-IDF and scaler are fitted on each training fold only.")
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracies = []
    fold_precisions = []
    fold_recalls = []
    fold_f1s = []
    fold_train_accs = []
    fold_feature_dims = []
    all_y_true = []
    all_y_pred = []
    for fold_idx, (train_idx, test_idx) in enumerate(
        cv.split(sample_idx, y_full), 1
    ):
        X_fold_train, X_fold_test, n_fold_tfidf = _fit_hybrid_features(
            X_texts, embeddings, stylo, train_idx, test_idx
        )
        y_fold_train = y_full[train_idx]
        y_fold_test = y_full[test_idx]
        fold_feature_dims.append(X_fold_train.shape[1])
        print(f"\n{'─' * 60}")
        print(
            f" FOLD {fold_idx}/{n_folds} "
            f"(Train: {len(train_idx)}, Test: {len(test_idx)})"
        )
        print(
            f" Feature dimensions: {X_fold_train.shape[1]} "
            f"(TF-IDF: {n_fold_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})"
        )
        print(f"{'─' * 60}")
        rf_fold = RandomForestClassifier(
            n_estimators=300,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=5,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42,
        )
        rf_fold.fit(X_fold_train, y_fold_train)

        # Predictions
        y_fold_train_pred = rf_fold.predict(X_fold_train)
        y_fold_test_pred = rf_fold.predict(X_fold_test)
        train_acc = accuracy_score(y_fold_train, y_fold_train_pred)
        test_acc = accuracy_score(y_fold_test, y_fold_test_pred)
        fold_train_accs.append(train_acc)
        fold_accuracies.append(test_acc)

        # Per-fold classification report
        report = classification_report(
            y_fold_test,
            y_fold_test_pred,
            target_names=label_names,
            output_dict=True,
        )
        fold_precisions.append(report["weighted avg"]["precision"])
        fold_recalls.append(report["weighted avg"]["recall"])
        fold_f1s.append(report["weighted avg"]["f1-score"])

        # Collect for final confusion matrix
        all_y_true.extend(y_fold_test)
        all_y_pred.extend(y_fold_test_pred)
        print(f" Train Accuracy: {train_acc:.4f} ({train_acc:.1%})")
        print(f" Test Accuracy: {test_acc:.4f} ({test_acc:.1%})")
        print(f" Gap: {train_acc - test_acc:.4f}")
        print()
        print(
            classification_report(
                y_fold_test, y_fold_test_pred, target_names=label_names
            )
        )

    # ── 6. Cross-Fold Summary ──
    fold_accuracies = np.array(fold_accuracies)
    fold_train_accs = np.array(fold_train_accs)
    fold_precisions = np.array(fold_precisions)
    fold_recalls = np.array(fold_recalls)
    fold_f1s = np.array(fold_f1s)
    gaps = fold_train_accs - fold_accuracies
    print("\n" + "=" * 60)
    print(f" CROSS-VALIDATION SUMMARY ({n_folds} Folds)")
    print("=" * 60)
    print("\n Per-Fold Test Accuracies:")
    for i, (ta, te) in enumerate(zip(fold_train_accs, fold_accuracies), 1):
        print(f" Fold {i}: Train {ta:.1%} | Test {te:.1%} | Gap {ta - te:.1%}")
    print(
        f"\n Average Training Accuracy: {fold_train_accs.mean():.4f} "
        f"(+/- {fold_train_accs.std():.4f})"
    )
    print(
        f" Average Testing Accuracy: {fold_accuracies.mean():.4f} "
        f"(+/- {fold_accuracies.std():.4f})"
    )
    print(
        f" Average Precision: {fold_precisions.mean():.4f} "
        f"(+/- {fold_precisions.std():.4f})"
    )
    print(
        f" Average Recall: {fold_recalls.mean():.4f} "
        f"(+/- {fold_recalls.std():.4f})"
    )
    print(
        f" Average F1 Score: {fold_f1s.mean():.4f} "
        f"(+/- {fold_f1s.std():.4f})"
    )
    print(f" Average Gap: {gaps.mean():.4f} (+/- {gaps.std():.4f})")

    # ── 7. Consistency Check ──
    print("\n" + "=" * 60)
    print(" VERDICT CONSISTENCY & OVERFITTING ANALYSIS")
    print("=" * 60)
    avg_train = fold_train_accs.mean()
    avg_test = fold_accuracies.mean()
    avg_gap = gaps.mean()
    acc_std = fold_accuracies.std()
    if avg_train > 0.95 and avg_test < 0.70:
        overfit_status = "OVERFITTING DETECTED"
        print("\n *** OVERFITTING DETECTED ***")
        print(f" Average training accuracy ({avg_train:.1%}) is much higher than")
        print(f" average testing accuracy ({avg_test:.1%}).")
        print(" The model memorizes training data and fails to generalize.")
    elif avg_gap > 0.10:
        overfit_status = "MILD OVERFITTING"
        print("\n ** MILD OVERFITTING **")
        print(f" Average gap ({avg_gap:.1%}) exceeds 10%.")
    else:
        overfit_status = "NO OVERFITTING"
        print("\n NO OVERFITTING DETECTED")
        print(f" Average gap ({avg_gap:.1%}) is within acceptable range.")
    if acc_std < 0.01:
        consistency = "HIGHLY CONSISTENT"
        print(f" Verdict Consistency: HIGHLY CONSISTENT (std={acc_std:.4f})")
        print(f" Predictions are very stable across all {n_folds} folds.")
    elif acc_std < 0.03:
        consistency = "CONSISTENT"
        print(f" Verdict Consistency: CONSISTENT (std={acc_std:.4f})")
        print(" Minor variance across folds — acceptable for production.")
    else:
        consistency = "INCONSISTENT"
        print(f" Verdict Consistency: INCONSISTENT (std={acc_std:.4f})")
        print(" High variance suggests model stability issues.")

    # ── 8. Confusion Matrix (aggregated across all folds) ──
    print("\n\nGenerating plots...")
    cm = confusion_matrix(all_y_true, all_y_pred)
    overall_acc = accuracy_score(all_y_true, all_y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
    disp.plot(ax=ax, cmap="Blues", values_format="d")
    ax.set_title(
        f"Confusion Matrix — Aggregated {n_folds}-Fold CV\n"
        f"Overall Accuracy: {overall_acc:.1%} | {overfit_status}",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Predicted Label", fontsize=12)
    ax.set_ylabel("True Label", fontsize=12)
    plt.tight_layout()
    cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
    fig.savefig(cm_path, dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure once it is on disk
    print(f" Saved: {cm_path}")

    # ── 9. Per-Fold Accuracy Bar Chart ──
    fig2, ax2 = plt.subplots(figsize=(10, 5))
    x = np.arange(n_folds)
    width = 0.35
    bars_train = ax2.bar(
        x - width / 2,
        fold_train_accs * 100,
        width,
        label="Training",
        color="#2196F3",
        edgecolor="black",
        linewidth=0.5,
    )
    bars_test = ax2.bar(
        x + width / 2,
        fold_accuracies * 100,
        width,
        label="Testing",
        color="#FF9800",
        edgecolor="black",
        linewidth=0.5,
    )
    # Annotate every bar with its accuracy, slightly above the bar top.
    for bars, values in ((bars_train, fold_train_accs), (bars_test, fold_accuracies)):
        for bar, val in zip(bars, values):
            ax2.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.3,
                f"{val:.1%}",
                ha="center",
                va="bottom",
                fontsize=9,
                fontweight="bold",
            )
    ax2.set_xticks(x)
    ax2.set_xticklabels([f"Fold {i}" for i in range(1, n_folds + 1)])
    ax2.set_ylim(0, 105)
    ax2.set_ylabel("Accuracy (%)", fontsize=12)
    ax2.set_title(
        f"Per-Fold Accuracy Comparison\n"
        f"Avg Test: {avg_test:.1%} (+/- {acc_std:.4f}) | {consistency}",
        fontsize=14,
        fontweight="bold",
    )
    # The threshold line must exist before legend() is called, otherwise its
    # label is not included in the legend.
    ax2.axhline(y=70, color="red", linestyle="--", alpha=0.5, label="70% threshold")
    ax2.legend(loc="lower right")
    plt.tight_layout()
    bar_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png")
    fig2.savefig(bar_path, dpi=150, bbox_inches="tight")
    plt.close(fig2)
    print(f" Saved: {bar_path}")

    # ── Final Summary ──
    # The TF-IDF vocabulary is fitted per fold, so the total may differ
    # between folds; show a range when it does.
    dims_lo, dims_hi = min(fold_feature_dims), max(fold_feature_dims)
    dims_text = str(dims_hi) if dims_lo == dims_hi else f"{dims_lo}-{dims_hi}"
    print("\n" + "=" * 60)
    print(" EVALUATION COMPLETE")
    print("=" * 60)
    print(f" Dataset: fake_news_filipino ({len(df)} articles)")
    print(
        f" Feature set: {dims_text} "
        f"(TF-IDF + {n_minilm} MiniLM + {n_stylo} stylometric)"
    )
    print(f" Cross-Validation: {n_folds}-Fold Stratified")
    print(f" Avg Training Accuracy: {avg_train:.4f} (+/- {fold_train_accs.std():.4f})")
    print(f" Avg Testing Accuracy: {avg_test:.4f} (+/- {acc_std:.4f})")
    print(f" Avg F1 Score: {fold_f1s.mean():.4f} (+/- {fold_f1s.std():.4f})")
    print(f" Avg Gap: {avg_gap:.4f}")
    print(f" Overfitting Status: {overfit_status}")
    print(f" Verdict Consistency: {consistency}")
    print(f" Plots saved to: {OUTPUT_DIR}/")
    print("=" * 60)
# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()