Spaces:

ABPThesisGroup
/

ThesisProject

Sleeping

App Files Files Community

ThesisProject / backend /evaluate_model.py

JeyBii

Upload folder using huggingface_hub

2b9b5b5 verified 3 months ago

Raw

History Blame Contribute Delete

24.3 kB

	"""
	Model Overfitting Evaluation Script
	=====================================
	Evaluates the Random Forest fake news classifier for overfitting by
	comparing Training vs. Testing performance.

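	Split: Stratified 5-fold cross-validation (each fold is 80% Train / 20% Test)
	Metrics: classification_report, accuracy_score, confusion matrix plot,
	per-fold accuracy bar chart
	Flag: Overfitting detected if average Train Acc > 95% and average Test Acc < 70%;
	mild overfitting if the average Train-Test accuracy gap exceeds 10%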

	Usage:
	python backend/evaluate_model.py
	"""

	import sys
	import os
	import re
	import time
	import numpy as np
	from textblob import TextBlob
	import textstat

	PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	sys.path.insert(0, PROJECT_ROOT)

	import pandas as pd
	import matplotlib

	matplotlib.use("Agg") # Non-interactive backend for saving plots
	import matplotlib.pyplot as plt
	from scipy.sparse import hstack, csr_matrix
	from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.preprocessing import StandardScaler
	from sklearn.metrics import (
	classification_report,
	accuracy_score,
	confusion_matrix,
	ConfusionMatrixDisplay,
	)
	from sentence_transformers import SentenceTransformer


	# ── Paths ──
	# Both directories are resolved relative to PROJECT_ROOT.
	# NOTE(review): DATA_MODELS_DIR is not referenced anywhere else in this file
	# as shown — confirm whether it is still needed.
	DATA_MODELS_DIR = os.path.join(PROJECT_ROOT, "data_models")
	# Created on demand by main(); the evaluation plots are written here.
	OUTPUT_DIR = os.path.join(PROJECT_ROOT, "evaluation_results")


	# ── MiniLM Model (lazy-loaded singleton) ──
	# Model identifier handed to SentenceTransformer() by get_minilm_model().
	MINILM_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
	# Module-level cache; stays None until get_minilm_model() is first called.
	_minilm_model = None


	def get_minilm_model():
	    """Return the shared multilingual MiniLM sentence encoder.

	    The ``SentenceTransformer`` instance is created on the first call and
	    stored in the module-level ``_minilm_model``; every later call returns
	    that same object without loading the model again.

	    Returns:
	        The cached ``SentenceTransformer`` built from ``MINILM_MODEL_NAME``.
	    """
	    global _minilm_model
	    if _minilm_model is not None:
	        return _minilm_model

	    print(" Loading MiniLM model...")
	    _minilm_model = SentenceTransformer(MINILM_MODEL_NAME)
	    return _minilm_model


	# ───────────────────────────────────────────────────────────
	# Text Cleaning (same as train.py)
	# ───────────────────────────────────────────────────────────


	# Patterns are compiled once because clean_text() runs on every article.
	_HTML_TAG_RE = re.compile(r"<[^>]+>")
	_URL_RE = re.compile(r"https?://\S+")
	_WHITESPACE_RE = re.compile(r"\s+")


	def clean_text(text):
	    """Basic text cleaning for Filipino news articles.

	    HTML-like tags and ``http``/``https`` URLs are each replaced by a single
	    space, every run of whitespace is collapsed to one space, and the result
	    is stripped.

	    Args:
	        text: The raw article. Anything that is not a non-empty ``str``
	            yields ``""``.

	    Returns:
	        The cleaned string, or ``""`` for empty or non-string input.
	    """
	    # The type check comes first so that values whose truthiness is ambiguous
	    # (NumPy arrays, pandas.NA, ...) return "" instead of raising.
	    if not isinstance(text, str) or not text:
	        return ""

	    without_tags = _HTML_TAG_RE.sub(" ", text)
	    without_urls = _URL_RE.sub(" ", without_tags)
	    return _WHITESPACE_RE.sub(" ", without_urls).strip()


	# ───────────────────────────────────────────────────────────
	# Stylometric Features (same as train.py)
	# ───────────────────────────────────────────────────────────


	# ── Word lists for linguistic features ──
	# Each set is matched against lower-cased, whitespace-split tokens by
	# extract_stylometric_features().

	# First-person pronouns, singular and plural.
	FIRST_PERSON_PRONOUNS = {
	    # English
	    "i", "me", "my", "mine", "myself",
	    "we", "us", "our", "ours", "ourselves",
	    # Filipino
	    "ako", "ko", "akin", "aking",
	    "natin", "atin", "namin", "amin",
	    "tayo", "kami", "ta",
	}

	# Auxiliary and modal verbs; feeds the auxiliary_verb_ratio feature.
	AUXILIARY_VERBS = {
	    # English: have / do / modals
	    "have", "has", "had",
	    "do", "does", "did",
	    "will", "would", "shall", "should",
	    "may", "might", "can", "could", "must",
	    # English: forms of "be"
	    "am", "is", "are", "was", "were", "be", "been", "being",
	    # Filipino
	    "ay", "dapat", "mayroon", "meron", "maaari", "pwede", "kailangan",
	}

	# Function words (articles, prepositions, markers); feeds analytical_thinking.
	ANALYTICAL_WORDS = {
	    # English
	    "the", "a", "an",
	    "of", "in", "on", "at", "to", "for", "with", "by", "from",
	    "about", "between", "through", "during", "before", "after",
	    # Filipino
	    "ang", "ng", "sa", "mga", "nang", "para", "tungkol", "mula",
	}

	# Words expressing certainty; feeds certainty_score.
	# NOTE(review): "without doubt" and "walang duda" contain a space, while
	# extract_stylometric_features() tests single whitespace-split tokens, so
	# these two entries can never match. Kept as-is for parity with train.py.
	CERTAINTY_WORDS = {
	    # English
	    "always", "never", "absolutely", "definitely", "certainly",
	    "undoubtedly", "clearly", "obviously", "without doubt",
	    "guaranteed", "proven", "fact", "undeniable", "indisputable",
	    "every", "all",
	    # Filipino
	    "palagi", "sigurado", "tiyak", "talaga", "totoo", "lagi", "walang duda",
	}

	# Hedging / attribution words; feeds tentative_score.
	TENTATIVE_WORDS = {
	    # English
	    "perhaps", "maybe", "possibly", "might", "could",
	    "likely", "unlikely", "suggests", "appears", "seems",
	    "allegedly", "reportedly", "according", "probable",
	    "approximately", "estimated",
	    # Filipino
	    "siguro", "marahil", "maaaring", "mukhang", "parang", "umano", "diumano",
	}

	# Authority / command vocabulary; feeds clout_score.
	# NOTE(review): "we must" and "you must" contain a space, while
	# extract_stylometric_features() tests single whitespace-split tokens, so
	# these two entries can never match. Kept as-is for parity with train.py.
	CLOUT_WORDS = {
	    # English
	    "must", "demand", "require", "order", "command", "insist",
	    "decree", "mandate", "authority", "power", "control",
	    "dominant", "superior", "we must", "you must",
	    # Filipino
	    "kailangan", "dapat", "utos", "kapangyarihan", "kontrol", "mando",
	}

	# Past-oriented vocabulary; feeds past_focus_ratio.
	PAST_FOCUS_WORDS = {
	    # English
	    "talked", "did", "ago", "said", "was", "were", "had", "went", "told",
	    # Filipino
	    "noon", "nakaraan", "dati", "kahapon",
	}

	# Present-oriented vocabulary; feeds present_focus_ratio.
	PRESENT_FOCUS_WORDS = {
	    # English
	    "now", "is", "today", "are", "being", "currently", "ongoing",
	    # Filipino
	    "ngayon", "kasalukuyan",
	}

	# Future-oriented vocabulary; feeds future_focus_ratio.
	FUTURE_FOCUS_WORDS = {
	    # English
	    "soon", "will", "may", "shall", "going", "plan", "expect", "tomorrow",
	    # Filipino
	    "bukas", "darating", "magiging", "gagawin",
	}


	# Length of the feature vector; one slot per STYLOMETRIC_FEATURE_NAMES entry.
	_N_STYLOMETRIC_FEATURES = 25
	# Characters counted by the punctuation_density feature.
	_PUNCTUATION_CHARS = frozenset(".,;:!?-\"'()[]{}")
	# Substrings counted by the informal_punct_density feature.
	_INFORMAL_MARKS = ("(", ")", "—", "–", "-", "...", "…")
	_SENTENCE_END_RE = re.compile(r"[.!?]+")


	def _lexicon_ratio(tokens_lower, lexicon):
	    """Return the share of ``tokens_lower`` that are members of ``lexicon``."""
	    return sum(1 for tok in tokens_lower if tok in lexicon) / len(tokens_lower)


	def extract_stylometric_features(text):
	    """Extract 25 stylometric features from text (matches train.py).

	    Tokens are obtained with ``text.split()``, so punctuation stays attached
	    to words and only single-token lexicon entries can match.

	    Args:
	        text: The article text. Anything that is not a ``str`` containing at
	            least one whitespace-delimited token yields an all-zero vector.

	    Returns:
	        A list of 25 floats in the order given by
	        ``STYLOMETRIC_FEATURE_NAMES``. Readability and subjectivity scores
	        fall back to ``0.0`` if ``textstat`` / ``TextBlob`` raise.
	    """
	    # The type check comes first so that values whose truthiness is ambiguous
	    # (NumPy arrays, pandas.NA, ...) give the zero vector instead of raising.
	    if not isinstance(text, str) or not text:
	        return [0.0] * _N_STYLOMETRIC_FEATURES

	    words = text.split()
	    token_count = len(words)
	    if token_count == 0:
	        return [0.0] * _N_STYLOMETRIC_FEATURES

	    words_lower = [w.lower() for w in words]
	    # token_count > 0 guarantees a non-empty string, so text_len is never 0.
	    text_len = len(text)

	    # ── Surface / punctuation statistics ──
	    exclamation_density = text.count("!") / token_count
	    question_count = text.count("?")
	    caps_ratio = sum(1 for w in words if len(w) >= 2 and w.isupper()) / token_count

	    sentences = [s.strip() for s in _SENTENCE_END_RE.split(text) if s.strip()]
	    if sentences:
	        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
	    else:
	        # Text made only of terminators: treat the whole input as one sentence.
	        avg_sentence_length = token_count

	    punct_chars = sum(1 for ch in text if ch in _PUNCTUATION_CHARS)
	    punctuation_density = (punct_chars / text_len) * 100

	    unique_word_ratio = len(set(words_lower)) / token_count
	    avg_word_length = sum(len(w) for w in words) / token_count

	    comma_period_density = ((text.count(",") + text.count(".")) / text_len) * 100
	    informal_count = sum(text.count(mark) for mark in _INFORMAL_MARKS)
	    informal_punct_density = (informal_count / text_len) * 100

	    # ── Library-based scores (deliberately best-effort: any failure -> 0.0) ──
	    try:
	        subjectivity = TextBlob(text).sentiment.subjectivity
	    except Exception:
	        subjectivity = 0.0

	    # All four readability scores share one try block: if any of them
	    # raises, the whole group is zeroed.
	    try:
	        flesch_reading_ease = textstat.flesch_reading_ease(text)
	        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
	        coleman_liau_index = textstat.coleman_liau_index(text)
	        ari = textstat.automated_readability_index(text)
	    except Exception:
	        flesch_reading_ease = 0.0
	        flesch_kincaid_grade = 0.0
	        coleman_liau_index = 0.0
	        ari = 0.0

	    try:
	        gunning_fog_index = textstat.gunning_fog(text)
	    except Exception:
	        gunning_fog_index = 0.0

	    # ── Lexicon ratios ──
	    first_person_ratio = _lexicon_ratio(words_lower, FIRST_PERSON_PRONOUNS)
	    auxiliary_verb_ratio = _lexicon_ratio(words_lower, AUXILIARY_VERBS)
	    analytical_thinking = _lexicon_ratio(words_lower, ANALYTICAL_WORDS)
	    certainty_score = _lexicon_ratio(words_lower, CERTAINTY_WORDS)
	    tentative_score = _lexicon_ratio(words_lower, TENTATIVE_WORDS)
	    clout_score = _lexicon_ratio(words_lower, CLOUT_WORDS)
	    past_focus_ratio = _lexicon_ratio(words_lower, PAST_FOCUS_WORDS)
	    present_focus_ratio = _lexicon_ratio(words_lower, PRESENT_FOCUS_WORDS)
	    future_focus_ratio = _lexicon_ratio(words_lower, FUTURE_FOCUS_WORDS)

	    # Order must stay aligned with STYLOMETRIC_FEATURE_NAMES.
	    ordered_values = (
	        exclamation_density,
	        question_count,
	        caps_ratio,
	        avg_sentence_length,
	        punctuation_density,
	        token_count,
	        unique_word_ratio,
	        avg_word_length,
	        subjectivity,
	        flesch_reading_ease,
	        flesch_kincaid_grade,
	        coleman_liau_index,
	        ari,
	        first_person_ratio,
	        auxiliary_verb_ratio,
	        gunning_fog_index,
	        analytical_thinking,
	        certainty_score,
	        tentative_score,
	        clout_score,
	        comma_period_density,
	        informal_punct_density,
	        past_focus_ratio,
	        present_focus_ratio,
	        future_focus_ratio,
	    )
	    return [float(value) for value in ordered_values]


	# Column names for the vector returned by extract_stylometric_features();
	# the order here mirrors the order of that function's return list.
	STYLOMETRIC_FEATURE_NAMES = [
	    # Surface statistics
	    "exclamation_density", "question_count", "caps_ratio",
	    "avg_sentence_length", "punctuation_density", "token_count",
	    "unique_word_ratio", "avg_word_length",
	    # Subjectivity and readability
	    "subjectivity", "flesch_reading_ease", "flesch_kincaid_grade",
	    "coleman_liau_index", "ari",
	    # Lexicon ratios (plus Gunning fog, which sits between them)
	    "first_person_ratio", "auxiliary_verb_ratio", "gunning_fog_index",
	    "analytical_thinking", "certainty_score", "tentative_score",
	    "clout_score",
	    # Punctuation densities
	    "comma_period_density", "informal_punct_density",
	    # Temporal focus
	    "past_focus_ratio", "present_focus_ratio", "future_focus_ratio",
	]


	# ───────────────────────────────────────────────────────────
	# Main Evaluation
	# ───────────────────────────────────────────────────────────


	_N_FOLDS = 5
	_LABEL_NAMES = ["Real", "Fake"]
	_RULE = "=" * 60


	def _print_banner(title, leading_newline=True):
	    """Print ``title`` framed by two 60-character ``=`` rules."""
	    print(("\n" if leading_newline else "") + _RULE)
	    print(title)
	    print(_RULE)


	def _load_articles(csv_path):
	    """Read the dataset CSV, print its label balance and return cleaned data.

	    Rows whose ``article`` is missing or empty are dropped before cleaning.

	    Returns:
	        ``(texts, labels)``: a list of cleaned article strings and a NumPy
	        array holding the matching values of the ``label`` column.
	    """
	    df = pd.read_csv(csv_path)
	    print("\nDataset: jcblaise/fake_news_filipino")
	    print(f"Total articles: {len(df)}")
	    print("Distribution:")
	    print(f" Real (0): {(df['label'] == 0).sum()}")
	    print(f" Fake (1): {(df['label'] == 1).sum()}")

	    print("\nPreprocessing...")
	    df = df.dropna(subset=["article"])
	    df = df[df["article"].str.len() > 0]
	    texts = df["article"].apply(clean_text).tolist()
	    labels = np.array(df["label"].tolist())
	    print(f" Valid articles: {len(texts)}")
	    return texts, labels


	def _build_fold_features(texts, embeddings, stylo_raw, train_idx, test_idx):
	    """Assemble the hybrid feature matrices for one cross-validation fold.

	    TF-IDF and the stylometric scaler are fitted on the training rows only
	    and then applied to the test rows, so no statistic of the held-out fold
	    reaches the features the classifier is trained on.

	    Returns:
	        ``(X_train, X_test, n_tfidf)``: two CSR matrices and the TF-IDF
	        vocabulary size of this fold.
	    """
	    tfidf = TfidfVectorizer(
	        max_features=15000,
	        ngram_range=(1, 2),
	        min_df=2,
	        max_df=0.95,
	        sublinear_tf=True,
	    )
	    train_tfidf = tfidf.fit_transform([texts[i] for i in train_idx])
	    test_tfidf = tfidf.transform([texts[i] for i in test_idx])

	    scaler = StandardScaler()
	    train_stylo = scaler.fit_transform(stylo_raw[train_idx])
	    test_stylo = scaler.transform(stylo_raw[test_idx])

	    # format="csr" is requested explicitly so the result type does not depend
	    # on which default the installed SciPy picks for hstack.
	    X_train = hstack(
	        [train_tfidf, csr_matrix(embeddings[train_idx]), csr_matrix(train_stylo)],
	        format="csr",
	    )
	    X_test = hstack(
	        [test_tfidf, csr_matrix(embeddings[test_idx]), csr_matrix(test_stylo)],
	        format="csr",
	    )
	    return X_train, X_test, train_tfidf.shape[1]


	def _cross_validate(texts, labels, embeddings, stylo_raw):
	    """Run stratified k-fold CV and collect per-fold train/test metrics.

	    Returns:
	        ``(scores, y_true_all, y_pred_all)`` where ``scores`` maps
	        ``train_acc``, ``test_acc``, ``precision``, ``recall``, ``f1`` and
	        ``n_features`` to NumPy arrays with one entry per fold, and the two
	        lists hold the test-fold labels and predictions of every fold.
	    """
	    splitter = StratifiedKFold(n_splits=_N_FOLDS, shuffle=True, random_state=42)
	    # Only the labels drive a stratified split, so a dummy X is sufficient.
	    placeholder = np.zeros(len(labels))

	    per_fold = {
	        key: []
	        for key in ("train_acc", "test_acc", "precision", "recall", "f1", "n_features")
	    }
	    y_true_all = []
	    y_pred_all = []

	    for fold_idx, (train_idx, test_idx) in enumerate(
	        splitter.split(placeholder, labels), 1
	    ):
	        X_fold_train, X_fold_test, n_tfidf = _build_fold_features(
	            texts, embeddings, stylo_raw, train_idx, test_idx
	        )
	        y_fold_train = labels[train_idx]
	        y_fold_test = labels[test_idx]

	        print(f"\n{'─' * 60}")
	        print(
	            f" FOLD {fold_idx}/{_N_FOLDS} "
	            f"(Train: {len(train_idx)}, Test: {len(test_idx)})"
	        )
	        print(f"{'─' * 60}")
	        print(
	            f" Feature dimensions: {X_fold_train.shape[1]} "
	            f"(TF-IDF: {n_tfidf} + MiniLM: {embeddings.shape[1]} "
	            f"+ Stylometric: {stylo_raw.shape[1]})"
	        )

	        rf_fold = RandomForestClassifier(
	            n_estimators=300,
	            max_depth=15,
	            min_samples_split=5,
	            min_samples_leaf=5,
	            class_weight="balanced",
	            n_jobs=-1,
	            random_state=42,
	        )
	        rf_fold.fit(X_fold_train, y_fold_train)

	        y_fold_train_pred = rf_fold.predict(X_fold_train)
	        y_fold_test_pred = rf_fold.predict(X_fold_test)
	        train_acc = accuracy_score(y_fold_train, y_fold_train_pred)
	        test_acc = accuracy_score(y_fold_test, y_fold_test_pred)

	        weighted = classification_report(
	            y_fold_test,
	            y_fold_test_pred,
	            target_names=_LABEL_NAMES,
	            output_dict=True,
	        )["weighted avg"]

	        per_fold["train_acc"].append(train_acc)
	        per_fold["test_acc"].append(test_acc)
	        per_fold["precision"].append(weighted["precision"])
	        per_fold["recall"].append(weighted["recall"])
	        per_fold["f1"].append(weighted["f1-score"])
	        per_fold["n_features"].append(X_fold_train.shape[1])

	        # Kept for the confusion matrix aggregated over all folds.
	        y_true_all.extend(y_fold_test)
	        y_pred_all.extend(y_fold_test_pred)

	        print(f" Train Accuracy: {train_acc:.4f} ({train_acc:.1%})")
	        print(f" Test Accuracy: {test_acc:.4f} ({test_acc:.1%})")
	        print(f" Gap: {train_acc - test_acc:.4f}")
	        print()
	        print(
	            classification_report(
	                y_fold_test, y_fold_test_pred, target_names=_LABEL_NAMES
	            )
	        )

	    scores = {key: np.array(values) for key, values in per_fold.items()}
	    return scores, y_true_all, y_pred_all


	def _print_cv_summary(scores):
	    """Print per-fold accuracies and the mean/std of every collected metric."""
	    train_accs = scores["train_acc"]
	    test_accs = scores["test_acc"]
	    gaps = train_accs - test_accs

	    _print_banner(f" CROSS-VALIDATION SUMMARY ({_N_FOLDS} Folds)")
	    print("\n Per-Fold Test Accuracies:")
	    for i, (ta, te) in enumerate(zip(train_accs, test_accs), 1):
	        print(f" Fold {i}: Train {ta:.1%} | Test {te:.1%} | Gap {ta - te:.1%}")

	    rows = (
	        ("\n Average Training Accuracy", train_accs),
	        (" Average Testing Accuracy", test_accs),
	        (" Average Precision", scores["precision"]),
	        (" Average Recall", scores["recall"]),
	        (" Average F1 Score", scores["f1"]),
	        (" Average Gap", gaps),
	    )
	    for label, values in rows:
	        print(f"{label}: {values.mean():.4f} (+/- {values.std():.4f})")


	def _print_verdict(scores):
	    """Print and return the overfitting status and the fold-consistency label.

	    Returns:
	        ``(overfit_status, consistency)`` as the strings shown in the report.
	    """
	    avg_train = scores["train_acc"].mean()
	    avg_test = scores["test_acc"].mean()
	    avg_gap = (scores["train_acc"] - scores["test_acc"]).mean()
	    acc_std = scores["test_acc"].std()

	    _print_banner(" VERDICT CONSISTENCY & OVERFITTING ANALYSIS")

	    if avg_train > 0.95 and avg_test < 0.70:
	        overfit_status = "OVERFITTING DETECTED"
	        print("\n * OVERFITTING DETECTED *")
	        print(f" Average training accuracy ({avg_train:.1%}) is much higher than")
	        print(f" average testing accuracy ({avg_test:.1%}).")
	        print(" The model memorizes training data and fails to generalize.")
	    elif avg_gap > 0.10:
	        overfit_status = "MILD OVERFITTING"
	        print("\n MILD OVERFITTING ")
	        print(f" Average gap ({avg_gap:.1%}) exceeds 10%.")
	    else:
	        overfit_status = "NO OVERFITTING"
	        print("\n NO OVERFITTING DETECTED")
	        print(f" Average gap ({avg_gap:.1%}) is within acceptable range.")

	    if acc_std < 0.01:
	        consistency = "HIGHLY CONSISTENT"
	        print(f" Verdict Consistency: HIGHLY CONSISTENT (std={acc_std:.4f})")
	        print(f" Predictions are very stable across all {_N_FOLDS} folds.")
	    elif acc_std < 0.03:
	        consistency = "CONSISTENT"
	        print(f" Verdict Consistency: CONSISTENT (std={acc_std:.4f})")
	        print(" Minor variance across folds — acceptable for production.")
	    else:
	        consistency = "INCONSISTENT"
	        print(f" Verdict Consistency: INCONSISTENT (std={acc_std:.4f})")
	        print(" High variance suggests model stability issues.")

	    return overfit_status, consistency


	def _plot_confusion_matrix(y_true, y_pred, overfit_status):
	    """Save the confusion matrix aggregated over all test folds as a PNG."""
	    cm = confusion_matrix(y_true, y_pred)
	    overall_acc = accuracy_score(y_true, y_pred)

	    fig, ax = plt.subplots(figsize=(8, 6))
	    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=_LABEL_NAMES)
	    disp.plot(ax=ax, cmap="Blues", values_format="d")
	    ax.set_title(
	        f"Confusion Matrix — Aggregated {_N_FOLDS}-Fold CV\n"
	        f"Overall Accuracy: {overall_acc:.1%} | {overfit_status}",
	        fontsize=14,
	        fontweight="bold",
	    )
	    ax.set_xlabel("Predicted Label", fontsize=12)
	    ax.set_ylabel("True Label", fontsize=12)

	    fig.tight_layout()
	    cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
	    fig.savefig(cm_path, dpi=150, bbox_inches="tight")
	    plt.close(fig)  # release the figure once it is on disk
	    print(f" Saved: {cm_path}")


	def _plot_fold_accuracies(scores, consistency):
	    """Save the grouped train/test accuracy bar chart (one group per fold)."""
	    train_accs = scores["train_acc"]
	    test_accs = scores["test_acc"]
	    n_folds = len(test_accs)

	    fig, ax = plt.subplots(figsize=(10, 5))
	    x = np.arange(n_folds)
	    width = 0.35
	    series = (
	        (x - width / 2, train_accs, "Training", "#2196F3"),
	        (x + width / 2, test_accs, "Testing", "#FF9800"),
	    )
	    for positions, values, label, color in series:
	        bars = ax.bar(
	            positions,
	            values * 100,
	            width,
	            label=label,
	            color=color,
	            edgecolor="black",
	            linewidth=0.5,
	        )
	        # Percentage label just above each bar.
	        for bar, val in zip(bars, values):
	            ax.text(
	                bar.get_x() + bar.get_width() / 2,
	                bar.get_height() + 0.3,
	                f"{val:.1%}",
	                ha="center",
	                va="bottom",
	                fontsize=9,
	                fontweight="bold",
	            )

	    ax.set_xticks(x)
	    ax.set_xticklabels([f"Fold {i}" for i in range(1, n_folds + 1)])
	    ax.set_ylim(0, 105)
	    ax.set_ylabel("Accuracy (%)", fontsize=12)
	    ax.set_title(
	        f"Per-Fold Accuracy Comparison\n"
	        f"Avg Test: {test_accs.mean():.1%} (+/- {test_accs.std():.4f}) | {consistency}",
	        fontsize=14,
	        fontweight="bold",
	    )
	    # The threshold line is drawn before legend() so its label is listed.
	    ax.axhline(y=70, color="red", linestyle="--", alpha=0.5, label="70% threshold")
	    ax.legend(loc="lower right")

	    fig.tight_layout()
	    bar_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png")
	    fig.savefig(bar_path, dpi=150, bbox_inches="tight")
	    plt.close(fig)
	    print(f" Saved: {bar_path}")


	def main():
	    """Evaluate the Random Forest fake-news classifier for overfitting.

	    Loads ``data/raw/fakenews/fakenews/full.csv`` under ``PROJECT_ROOT``,
	    cleans the articles, and runs stratified 5-fold cross-validation on
	    hybrid features (TF-IDF + MiniLM embeddings + stylometric features).
	    Per-fold and averaged metrics are printed together with an overfitting
	    and consistency verdict, and two PNG plots are written to ``OUTPUT_DIR``.

	    MiniLM embeddings and raw stylometric features are computed once for the
	    whole corpus because they depend only on each individual text. TF-IDF and
	    the scaler are fitted inside each fold on the training rows only.

	    Returns early, after printing an error, when the dataset file is missing.
	    """
	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    # ── 1. Load Dataset ──
	    _print_banner(" MODEL OVERFITTING EVALUATION", leading_newline=False)
	    csv_path = os.path.join(
	        PROJECT_ROOT, "data", "raw", "fakenews", "fakenews", "full.csv"
	    )
	    if not os.path.exists(csv_path):
	        print(f"ERROR: Dataset not found at {csv_path}")
	        return

	    # ── 2. Preprocess ──
	    texts, labels = _load_articles(csv_path)

	    # ── 3. Fold-independent features ──
	    print("\nBuilding fold-independent features (MiniLM + stylometric)...")
	    print(" Encoding full dataset with MiniLM...")
	    embeddings = np.asarray(
	        get_minilm_model().encode(texts, show_progress_bar=True, batch_size=64)
	    )
	    print(" Extracting stylometric features...")
	    stylo_raw = np.array([extract_stylometric_features(t) for t in texts])

	    # ── 4. Cross-validation ──
	    _print_banner(f" {_N_FOLDS}-FOLD CROSS-VALIDATION (Full Dataset)")
	    print(f" Total samples: {len(texts)}")
	    scores, y_true_all, y_pred_all = _cross_validate(
	        texts, labels, embeddings, stylo_raw
	    )

	    # ── 5. Summary and verdict ──
	    _print_cv_summary(scores)
	    overfit_status, consistency = _print_verdict(scores)

	    # ── 6. Plots ──
	    print("\n\nGenerating plots...")
	    _plot_confusion_matrix(y_true_all, y_pred_all, overfit_status)
	    _plot_fold_accuracies(scores, consistency)

	    # ── Final Summary ──
	    train_accs = scores["train_acc"]
	    test_accs = scores["test_acc"]
	    _print_banner(" EVALUATION COMPLETE")
	    print(f" Dataset: fake_news_filipino ({len(texts)} articles)")
	    print(
	        f" Feature set: up to {scores['n_features'].max()} per fold "
	        f"(TF-IDF + MiniLM + {stylo_raw.shape[1]} stylometric)"
	    )
	    print(f" Cross-Validation: {_N_FOLDS}-Fold Stratified")
	    print(
	        f" Avg Training Accuracy: {train_accs.mean():.4f} "
	        f"(+/- {train_accs.std():.4f})"
	    )
	    print(
	        f" Avg Testing Accuracy: {test_accs.mean():.4f} "
	        f"(+/- {test_accs.std():.4f})"
	    )
	    print(
	        f" Avg F1 Score: {scores['f1'].mean():.4f} "
	        f"(+/- {scores['f1'].std():.4f})"
	    )
	    print(f" Avg Gap: {(train_accs - test_accs).mean():.4f}")
	    print(f" Overfitting Status: {overfit_status}")
	    print(f" Verdict Consistency: {consistency}")
	    print(f" Plots saved to: {OUTPUT_DIR}/")
	    print(_RULE)


	# Run the evaluation only when the file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    main()