Spaces:
Sleeping
Sleeping
| """ | |
Model Overfitting Evaluation Script
=====================================
Evaluates the Random Forest fake news classifier for overfitting by
comparing training vs. testing performance.
Split: stratified 5-fold cross-validation (80% train / 20% test per fold)
Metrics: classification_report, accuracy_score, confusion matrix plot
Flag: Overfitting detected if Train Acc > 95% and Test Acc < 70%;
      mild overfitting if the average train/test accuracy gap exceeds 10%
Usage:
    python backend/evaluate_model.py
| """ | |
| import sys | |
| import os | |
| import re | |
| import time | |
| import numpy as np | |
| from textblob import TextBlob | |
| import textstat | |
| PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| sys.path.insert(0, PROJECT_ROOT) | |
| import pandas as pd | |
| import matplotlib | |
| matplotlib.use("Agg") # Non-interactive backend for saving plots | |
| import matplotlib.pyplot as plt | |
| from scipy.sparse import hstack, csr_matrix | |
| from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.metrics import ( | |
| classification_report, | |
| accuracy_score, | |
| confusion_matrix, | |
| ConfusionMatrixDisplay, | |
| ) | |
| from sentence_transformers import SentenceTransformer | |
# ── Paths ──
# Both directories live directly under PROJECT_ROOT, which is the parent of
# the directory containing this script.
DATA_MODELS_DIR = os.path.join(PROJECT_ROOT, "data_models")
# Destination for the PNG plots; main() creates it if it does not exist.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "evaluation_results")
# ── MiniLM Model (lazy-loaded singleton) ──
# Model identifier handed to SentenceTransformer by get_minilm_model().
MINILM_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Cache slot for the loaded model; stays None until get_minilm_model() runs.
_minilm_model = None
def get_minilm_model():
    """Return the shared MiniLM sentence encoder, creating it on first use.

    The instance is stored in the module-level ``_minilm_model`` so that
    later calls reuse it instead of constructing the model again.
    """
    global _minilm_model
    if _minilm_model is not None:
        return _minilm_model
    print(" Loading MiniLM model...")
    _minilm_model = SentenceTransformer(MINILM_MODEL_NAME)
    return _minilm_model
# ───────────────────────────────────────────────────────────
# Text Cleaning (same as train.py)
# ───────────────────────────────────────────────────────────
def clean_text(text):
    """Strip HTML tags and URLs from an article and collapse whitespace.

    Falsy or non-string input yields an empty string. Each HTML tag and each
    http(s) URL is replaced by a space, then every whitespace run is squeezed
    to a single space and the result is trimmed.
    """
    if not (text and isinstance(text, str)):
        return ""
    # Order matters: tags first, then URLs, then whitespace normalisation.
    for pattern in (r"<[^>]+>", r"https?://\S+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
# ───────────────────────────────────────────────────────────
# Stylometric Features (same as train.py)
# ───────────────────────────────────────────────────────────
# ── Word lists for linguistic features ──
# Lexicons used by extract_stylometric_features(). Each set is the union of
# an English group and a Filipino group. The extractor looks up individual
# lower-cased, whitespace-separated tokens, so the multi-word entries
# ("without doubt", "walang duda", "we must", "you must") can never match a
# token; they are kept so the sets stay identical to the original definitions.
FIRST_PERSON_PRONOUNS = {
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
} | {
    "ako", "ko", "akin", "aking", "natin", "atin",
    "namin", "amin", "tayo", "kami", "ta",
}

AUXILIARY_VERBS = {
    "have", "has", "had", "do", "does", "did",
    "will", "would", "shall", "should", "may", "might",
    "can", "could", "must",
    "am", "is", "are", "was", "were", "be", "been", "being",
} | {
    "ay", "dapat", "mayroon", "meron", "maaari", "pwede", "kailangan",
}

# Articles and prepositions (function words).
ANALYTICAL_WORDS = {
    "the", "a", "an", "of", "in", "on", "at", "to", "for", "with", "by",
    "from", "about", "between", "through", "during", "before", "after",
} | {
    "ang", "ng", "sa", "mga", "nang", "para", "tungkol", "mula",
}

CERTAINTY_WORDS = {
    "always", "never", "absolutely", "definitely", "certainly",
    "undoubtedly", "clearly", "obviously", "without doubt", "guaranteed",
    "proven", "fact", "undeniable", "indisputable", "every", "all",
} | {
    "palagi", "sigurado", "tiyak", "talaga", "totoo", "lagi", "walang duda",
}

TENTATIVE_WORDS = {
    "perhaps", "maybe", "possibly", "might", "could", "likely", "unlikely",
    "suggests", "appears", "seems", "allegedly", "reportedly", "according",
    "probable", "approximately", "estimated",
} | {
    "siguro", "marahil", "maaaring", "mukhang", "parang", "umano", "diumano",
}

CLOUT_WORDS = {
    "must", "demand", "require", "order", "command", "insist", "decree",
    "mandate", "authority", "power", "control", "dominant", "superior",
    "we must", "you must",
} | {
    "kailangan", "dapat", "utos", "kapangyarihan", "kontrol", "mando",
}

PAST_FOCUS_WORDS = {
    "talked", "did", "ago", "said", "was", "were", "had", "went", "told",
} | {
    "noon", "nakaraan", "dati", "kahapon",
}

PRESENT_FOCUS_WORDS = {
    "now", "is", "today", "are", "being", "currently", "ongoing",
} | {
    "ngayon", "kasalukuyan",
}

FUTURE_FOCUS_WORDS = {
    "soon", "will", "may", "shall", "going", "plan", "expect", "tomorrow",
} | {
    "bukas", "darating", "magiging", "gagawin",
}
# Length of the vector returned by extract_stylometric_features().
_STYLOMETRIC_FEATURE_COUNT = 25

# Characters counted towards punctuation_density.
_PUNCTUATION_CHARS = frozenset(".,;:!?-\"'()[]{}")

# Substrings counted towards informal_punct_density. The en dash, em dash and
# horizontal ellipsis are written as escapes so the literals survive any
# re-encoding of this file; as mis-decoded characters they never matched the
# punctuation they were meant to count.
_INFORMAL_MARKS = (
    "(",
    ")",
    "\u2013",  # en dash
    "\u2014",  # em dash
    "-",
    "...",
    "\u2026",  # horizontal ellipsis
)


def _lexicon_ratio(tokens, lexicon):
    """Return the fraction of *tokens* (non-empty list) found in *lexicon*."""
    return sum(1 for tok in tokens if tok in lexicon) / len(tokens)


def extract_stylometric_features(text):
    """Extract 25 stylometric features from text (matches train.py).

    Args:
        text: Article text. Falsy or non-string input, or text with no
            whitespace-separated tokens, yields an all-zero vector.

    Returns:
        A list of 25 floats, ordered as in STYLOMETRIC_FEATURE_NAMES:
        exclamation density, question-mark count, all-caps word ratio,
        average sentence length, punctuation density, token count, unique
        word ratio, average word length, TextBlob subjectivity, four
        textstat readability scores, first-person and auxiliary-verb ratios,
        Gunning fog index, analytical/certainty/tentative/clout lexicon
        ratios, comma+period density, informal punctuation density, and
        past/present/future focus ratios.

    Notes:
        Lexicon ratios compare lower-cased whitespace tokens as-is, so a
        token with attached punctuation (e.g. "must,") does not match.
        TextBlob/textstat failures are deliberately swallowed and the
        affected features fall back to 0.0.
    """
    if not text or not isinstance(text, str):
        return [0.0] * _STYLOMETRIC_FEATURE_COUNT
    words = text.split()
    token_count = len(words)
    if token_count == 0:
        return [0.0] * _STYLOMETRIC_FEATURE_COUNT

    words_lower = [w.lower() for w in words]
    # At least one token exists, so text_len >= 1 and the divisions are safe.
    text_len = len(text)

    # ── Surface punctuation / casing ──
    exclamation_density = text.count("!") / token_count
    question_count = text.count("?")
    caps_words = sum(1 for w in words if len(w) >= 2 and w.isupper())
    caps_ratio = caps_words / token_count

    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    # Text made only of terminators has no sentences; fall back to token count.
    avg_sentence_length = (
        sum(len(s.split()) for s in sentences) / len(sentences)
        if sentences
        else token_count
    )

    punct_chars = sum(1 for c in text if c in _PUNCTUATION_CHARS)
    punctuation_density = (punct_chars / text_len) * 100
    unique_word_ratio = len(set(words_lower)) / token_count
    avg_word_length = sum(len(w) for w in words) / token_count

    # ── Third-party scores (best effort) ──
    try:
        subjectivity = TextBlob(text).sentiment.subjectivity
    except Exception:
        subjectivity = 0.0
    # The four scores share one try block: if any call fails, all four are
    # zeroed, which is the behaviour the feature vector has always had.
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        coleman_liau_index = textstat.coleman_liau_index(text)
        ari = textstat.automated_readability_index(text)
    except Exception:
        flesch_reading_ease = 0.0
        flesch_kincaid_grade = 0.0
        coleman_liau_index = 0.0
        ari = 0.0
    try:
        gunning_fog_index = textstat.gunning_fog(text)
    except Exception:
        gunning_fog_index = 0.0

    # ── Lexicon ratios ──
    first_person_ratio = _lexicon_ratio(words_lower, FIRST_PERSON_PRONOUNS)
    auxiliary_verb_ratio = _lexicon_ratio(words_lower, AUXILIARY_VERBS)
    analytical_thinking = _lexicon_ratio(words_lower, ANALYTICAL_WORDS)
    certainty_score = _lexicon_ratio(words_lower, CERTAINTY_WORDS)
    tentative_score = _lexicon_ratio(words_lower, TENTATIVE_WORDS)
    clout_score = _lexicon_ratio(words_lower, CLOUT_WORDS)
    past_focus_ratio = _lexicon_ratio(words_lower, PAST_FOCUS_WORDS)
    present_focus_ratio = _lexicon_ratio(words_lower, PRESENT_FOCUS_WORDS)
    future_focus_ratio = _lexicon_ratio(words_lower, FUTURE_FOCUS_WORDS)

    # ── Punctuation-style densities (per 100 characters) ──
    comma_period_count = text.count(",") + text.count(".")
    comma_period_density = (comma_period_count / text_len) * 100
    informal_count = sum(text.count(mark) for mark in _INFORMAL_MARKS)
    informal_punct_density = (informal_count / text_len) * 100

    # Order must stay aligned with STYLOMETRIC_FEATURE_NAMES.
    return [
        float(exclamation_density),
        float(question_count),
        float(caps_ratio),
        float(avg_sentence_length),
        float(punctuation_density),
        float(token_count),
        float(unique_word_ratio),
        float(avg_word_length),
        float(subjectivity),
        float(flesch_reading_ease),
        float(flesch_kincaid_grade),
        float(coleman_liau_index),
        float(ari),
        float(first_person_ratio),
        float(auxiliary_verb_ratio),
        float(gunning_fog_index),
        float(analytical_thinking),
        float(certainty_score),
        float(tentative_score),
        float(clout_score),
        float(comma_period_density),
        float(informal_punct_density),
        float(past_focus_ratio),
        float(present_focus_ratio),
        float(future_focus_ratio),
    ]
# Names of the 25 stylometric features, in the exact order in which
# extract_stylometric_features() returns their values.
STYLOMETRIC_FEATURE_NAMES = [
    # surface statistics
    "exclamation_density", "question_count", "caps_ratio",
    "avg_sentence_length", "punctuation_density", "token_count",
    "unique_word_ratio", "avg_word_length",
    # sentiment and readability
    "subjectivity", "flesch_reading_ease", "flesch_kincaid_grade",
    "coleman_liau_index", "ari",
    # lexicon ratios and fog index
    "first_person_ratio", "auxiliary_verb_ratio", "gunning_fog_index",
    "analytical_thinking", "certainty_score", "tentative_score",
    "clout_score",
    # punctuation style
    "comma_period_density", "informal_punct_density",
    # temporal focus
    "past_focus_ratio", "present_focus_ratio", "future_focus_ratio",
]
# ───────────────────────────────────────────────────────────
# Main Evaluation
# ───────────────────────────────────────────────────────────
# Cross-validation settings shared by the fold loop, the plots and the report.
_CV_SPLITS = 5
_RANDOM_STATE = 42
# Test-accuracy level drawn as a reference line and used by the overfit rule.
_TEST_ACC_THRESHOLD = 0.70


def _make_tfidf_vectorizer():
    """Return an unfitted TF-IDF vectorizer with the evaluation's settings."""
    return TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
    )


def _make_random_forest():
    """Return an unfitted Random Forest with the evaluation's settings."""
    return RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=5,
        class_weight="balanced",
        n_jobs=-1,
        random_state=_RANDOM_STATE,
    )


def _build_fold_features(texts, embeddings, stylo, train_idx, test_idx):
    """Build (X_train, X_test, n_tfidf) for one fold without test leakage.

    TF-IDF and the stylometric scaler are fitted on the training rows only
    and then applied to the test rows. MiniLM embeddings and raw stylometric
    values are computed per article, so they are simply sliced.
    """
    tfidf = _make_tfidf_vectorizer()
    train_tfidf = tfidf.fit_transform([texts[i] for i in train_idx])
    test_tfidf = tfidf.transform([texts[i] for i in test_idx])

    scaler = StandardScaler()
    train_stylo = scaler.fit_transform(stylo[train_idx])
    test_stylo = scaler.transform(stylo[test_idx])

    # format="csr" keeps the result row-sliceable regardless of SciPy version.
    X_train = hstack(
        [train_tfidf, csr_matrix(embeddings[train_idx]), csr_matrix(train_stylo)],
        format="csr",
    )
    X_test = hstack(
        [test_tfidf, csr_matrix(embeddings[test_idx]), csr_matrix(test_stylo)],
        format="csr",
    )
    return X_train, X_test, train_tfidf.shape[1]


def _save_confusion_matrix_plot(y_true, y_pred, label_names, overfit_status):
    """Plot the aggregated confusion matrix, save it and return the path."""
    # labels=[0, 1] pins the matrix to 2x2 so it always matches label_names.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    overall_acc = accuracy_score(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
    disp.plot(ax=ax, cmap="Blues", values_format="d")
    ax.set_title(
        f"Confusion Matrix — Aggregated {_CV_SPLITS}-Fold CV\n"
        f"Overall Accuracy: {overall_acc:.1%} | {overfit_status}",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Predicted Label", fontsize=12)
    ax.set_ylabel("True Label", fontsize=12)
    fig.tight_layout()
    cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
    fig.savefig(cm_path, dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure; nothing else uses it
    return cm_path


def _save_accuracy_bar_chart(train_accs, test_accs, consistency):
    """Plot per-fold train/test accuracy bars, save them and return the path."""
    n_folds = len(test_accs)
    x = np.arange(n_folds)
    width = 0.35
    fig, ax = plt.subplots(figsize=(10, 5))
    series = (
        (x - width / 2, train_accs, "Training", "#2196F3"),
        (x + width / 2, test_accs, "Testing", "#FF9800"),
    )
    for positions, accs, label, color in series:
        bars = ax.bar(
            positions,
            accs * 100,
            width,
            label=label,
            color=color,
            edgecolor="black",
            linewidth=0.5,
        )
        for bar, val in zip(bars, accs):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.3,
                f"{val:.1%}",
                ha="center",
                va="bottom",
                fontsize=9,
                fontweight="bold",
            )
    ax.set_xticks(x)
    ax.set_xticklabels([f"Fold {i}" for i in range(1, n_folds + 1)])
    ax.set_ylim(0, 105)
    ax.set_ylabel("Accuracy (%)", fontsize=12)
    ax.set_title(
        f"Per-Fold Accuracy Comparison\n"
        f"Avg Test: {test_accs.mean():.1%} (+/- {test_accs.std():.4f}) | {consistency}",
        fontsize=14,
        fontweight="bold",
    )
    # Draw the threshold before building the legend so its label is listed.
    ax.axhline(
        y=_TEST_ACC_THRESHOLD * 100,
        color="red",
        linestyle="--",
        alpha=0.5,
        label=f"{_TEST_ACC_THRESHOLD:.0%} threshold",
    )
    ax.legend(loc="lower right")
    fig.tight_layout()
    bar_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png")
    fig.savefig(bar_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    return bar_path


def main():
    """Run the stratified k-fold overfitting evaluation and save the plots.

    Steps: load the CSV dataset, clean the articles, compute the
    fold-independent features (MiniLM embeddings, raw stylometric values)
    once, then for every fold fit TF-IDF, the scaler and a Random Forest on
    the training rows only and score both training and test rows. Prints
    per-fold and aggregate metrics, an overfitting verdict and a consistency
    verdict, and writes ``confusion_matrix.png`` and
    ``accuracy_comparison.png`` to OUTPUT_DIR.

    Returns:
        None. Returns early (after printing an error) when the dataset file
        is missing or no usable article remains after preprocessing.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    label_names = ["Real", "Fake"]

    # ── 1. Load Dataset ──
    print("=" * 60)
    print(" MODEL OVERFITTING EVALUATION")
    print("=" * 60)
    csv_path = os.path.join(
        PROJECT_ROOT, "data", "raw", "fakenews", "fakenews", "full.csv"
    )
    if not os.path.exists(csv_path):
        print(f"ERROR: Dataset not found at {csv_path}")
        return
    df = pd.read_csv(csv_path)
    print("\nDataset: jcblaise/fake_news_filipino")
    print(f"Total articles: {len(df)}")
    print("Distribution:")
    print(f" Real (0): {(df['label'] == 0).sum()}")
    print(f" Fake (1): {(df['label'] == 1).sum()}")

    # ── 2. Preprocess ──
    print("\nPreprocessing...")
    df = df.dropna(subset=["article"]).copy()
    df = df[df["article"].str.len() > 0].copy()
    df.loc[:, "article_clean"] = df["article"].apply(clean_text)
    X_texts = df["article_clean"].tolist()
    y_full = np.array(df["label"].tolist())
    print(f" Valid articles: {len(X_texts)}")
    if not X_texts:
        print("ERROR: No valid articles left after preprocessing.")
        return

    # ── 3. Fold-independent features ──
    # These depend only on the individual article, so computing them once
    # for the whole dataset cannot leak information between folds.
    print("\nBuilding hybrid features (TF-IDF + MiniLM + stylometric)...")
    print(" Encoding texts with MiniLM...")
    embeddings = np.asarray(
        get_minilm_model().encode(X_texts, show_progress_bar=True, batch_size=64)
    )
    print(" Extracting stylometric features...")
    stylo = np.array([extract_stylometric_features(t) for t in X_texts])
    n_minilm = embeddings.shape[1]
    n_stylo = stylo.shape[1]

    # ── 4. Stratified K-Fold Cross-Validation ──
    print("\n" + "=" * 60)
    print(f" {_CV_SPLITS}-FOLD CROSS-VALIDATION (Full Dataset)")
    print("=" * 60)
    print(f" Total samples: {len(X_texts)}")

    cv = StratifiedKFold(n_splits=_CV_SPLITS, shuffle=True, random_state=_RANDOM_STATE)
    fold_accuracies = []
    fold_precisions = []
    fold_recalls = []
    fold_f1s = []
    fold_train_accs = []
    fold_feature_dims = []
    all_y_true = []
    all_y_pred = []

    # The splitter only needs the labels; X is a placeholder of equal length.
    splits = cv.split(np.zeros(len(y_full)), y_full)
    for fold_idx, (train_idx, test_idx) in enumerate(splits, 1):
        y_fold_train = y_full[train_idx]
        y_fold_test = y_full[test_idx]
        X_fold_train, X_fold_test, n_tfidf = _build_fold_features(
            X_texts, embeddings, stylo, train_idx, test_idx
        )
        fold_feature_dims.append(X_fold_train.shape[1])

        print("\n" + "─" * 60)
        print(
            f" FOLD {fold_idx}/{_CV_SPLITS} "
            f"(Train: {len(train_idx)}, Test: {len(test_idx)})"
        )
        print("─" * 60)
        print(
            f" Feature dimensions: {X_fold_train.shape[1]} "
            f"(TF-IDF: {n_tfidf} + MiniLM: {n_minilm} + Stylometric: {n_stylo})"
        )

        rf_fold = _make_random_forest()
        rf_fold.fit(X_fold_train, y_fold_train)

        y_fold_train_pred = rf_fold.predict(X_fold_train)
        y_fold_test_pred = rf_fold.predict(X_fold_test)
        train_acc = accuracy_score(y_fold_train, y_fold_train_pred)
        test_acc = accuracy_score(y_fold_test, y_fold_test_pred)
        fold_train_accs.append(train_acc)
        fold_accuracies.append(test_acc)

        # Dict form feeds the cross-fold averages; text form is printed below.
        report = classification_report(
            y_fold_test,
            y_fold_test_pred,
            target_names=label_names,
            output_dict=True,
        )
        fold_precisions.append(report["weighted avg"]["precision"])
        fold_recalls.append(report["weighted avg"]["recall"])
        fold_f1s.append(report["weighted avg"]["f1-score"])

        # Collect out-of-fold predictions for the aggregated confusion matrix.
        all_y_true.extend(y_fold_test)
        all_y_pred.extend(y_fold_test_pred)

        print(f" Train Accuracy: {train_acc:.4f} ({train_acc:.1%})")
        print(f" Test Accuracy: {test_acc:.4f} ({test_acc:.1%})")
        print(f" Gap: {train_acc - test_acc:.4f}")
        print()
        print(
            classification_report(
                y_fold_test, y_fold_test_pred, target_names=label_names
            )
        )

    # ── 5. Cross-Fold Summary ──
    fold_accuracies = np.array(fold_accuracies)
    fold_train_accs = np.array(fold_train_accs)
    fold_precisions = np.array(fold_precisions)
    fold_recalls = np.array(fold_recalls)
    fold_f1s = np.array(fold_f1s)
    gaps = fold_train_accs - fold_accuracies

    print("\n" + "=" * 60)
    print(f" CROSS-VALIDATION SUMMARY ({_CV_SPLITS} Folds)")
    print("=" * 60)
    print("\n Per-Fold Test Accuracies:")
    for i, (ta, te) in enumerate(zip(fold_train_accs, fold_accuracies), 1):
        print(f" Fold {i}: Train {ta:.1%} | Test {te:.1%} | Gap {ta - te:.1%}")
    print(
        f"\n Average Training Accuracy: {fold_train_accs.mean():.4f} "
        f"(+/- {fold_train_accs.std():.4f})"
    )
    print(
        f" Average Testing Accuracy: {fold_accuracies.mean():.4f} "
        f"(+/- {fold_accuracies.std():.4f})"
    )
    print(
        f" Average Precision: {fold_precisions.mean():.4f} "
        f"(+/- {fold_precisions.std():.4f})"
    )
    print(
        f" Average Recall: {fold_recalls.mean():.4f} "
        f"(+/- {fold_recalls.std():.4f})"
    )
    print(
        f" Average F1 Score: {fold_f1s.mean():.4f} "
        f"(+/- {fold_f1s.std():.4f})"
    )
    print(f" Average Gap: {gaps.mean():.4f} (+/- {gaps.std():.4f})")

    # ── 6. Consistency Check ──
    print("\n" + "=" * 60)
    print(" VERDICT CONSISTENCY & OVERFITTING ANALYSIS")
    print("=" * 60)
    avg_train = fold_train_accs.mean()
    avg_test = fold_accuracies.mean()
    avg_gap = gaps.mean()
    acc_std = fold_accuracies.std()

    if avg_train > 0.95 and avg_test < _TEST_ACC_THRESHOLD:
        overfit_status = "OVERFITTING DETECTED"
        print("\n *** OVERFITTING DETECTED ***")
        print(f" Average training accuracy ({avg_train:.1%}) is much higher than")
        print(f" average testing accuracy ({avg_test:.1%}).")
        print(" The model memorizes training data and fails to generalize.")
    elif avg_gap > 0.10:
        overfit_status = "MILD OVERFITTING"
        print("\n ** MILD OVERFITTING **")
        print(f" Average gap ({avg_gap:.1%}) exceeds 10%.")
    else:
        overfit_status = "NO OVERFITTING"
        print("\n NO OVERFITTING DETECTED")
        print(f" Average gap ({avg_gap:.1%}) is within acceptable range.")

    if acc_std < 0.01:
        consistency = "HIGHLY CONSISTENT"
        print(f" Verdict Consistency: HIGHLY CONSISTENT (std={acc_std:.4f})")
        print(f" Predictions are very stable across all {_CV_SPLITS} folds.")
    elif acc_std < 0.03:
        consistency = "CONSISTENT"
        print(f" Verdict Consistency: CONSISTENT (std={acc_std:.4f})")
        print(" Minor variance across folds — acceptable for production.")
    else:
        consistency = "INCONSISTENT"
        print(f" Verdict Consistency: INCONSISTENT (std={acc_std:.4f})")
        print(" High variance suggests model stability issues.")

    # ── 7. Plots ──
    print("\n\nGenerating plots...")
    cm_path = _save_confusion_matrix_plot(
        all_y_true, all_y_pred, label_names, overfit_status
    )
    print(f" Saved: {cm_path}")
    bar_path = _save_accuracy_bar_chart(fold_train_accs, fold_accuracies, consistency)
    print(f" Saved: {bar_path}")

    # ── Final Summary ──
    print("\n" + "=" * 60)
    print(" EVALUATION COMPLETE")
    print("=" * 60)
    print(f" Dataset: fake_news_filipino ({len(df)} articles)")
    print(
        f" Feature set: {max(fold_feature_dims)} "
        f"(TF-IDF + {n_minilm} MiniLM + {n_stylo} stylometric)"
    )
    print(f" Cross-Validation: {_CV_SPLITS}-Fold Stratified")
    print(f" Avg Training Accuracy: {avg_train:.4f} (+/- {fold_train_accs.std():.4f})")
    print(f" Avg Testing Accuracy: {avg_test:.4f} (+/- {acc_std:.4f})")
    print(f" Avg F1 Score: {fold_f1s.mean():.4f} (+/- {fold_f1s.std():.4f})")
    print(f" Avg Gap: {avg_gap:.4f}")
    print(f" Overfitting Status: {overfit_status}")
    print(f" Verdict Consistency: {consistency}")
    print(f" Plots saved to: {OUTPUT_DIR}/")
    print("=" * 60)
# Run the evaluation only when this file is executed as a script
# (e.g. `python backend/evaluate_model.py`), not when it is imported.
if __name__ == "__main__":
    main()