# Alalay/models/save_webapp_components.py
import json
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore")
print("=" * 60)
print("TRAINING AND EVALUATION")
print("=" * 60)
# ---------- Load & clean dataset ----------
df = pd.read_csv("Feature_Extracted_Corpus.csv")
df["sentence_construction_type"] = df["sentence_construction_type"].replace(["Unknown"], "Other")
df["sentence_type"] = df["sentence_type"].replace(["Compound-Complex"], "Other")
label_encoder = LabelEncoder()
y = df["group"].values
y_enc = label_encoder.fit_transform(y)
classes = label_encoder.classes_
print(f"Classes: {list(classes)}")
print(f"Class distribution:\n{pd.Series(y).value_counts().to_string()}\n")
X = df.drop(columns=["id", "text", "group", "grade"])
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = ["sentence_construction_type", "sentence_type"]
# ---------- Candidate models ----------
# Maps model name -> classifier. The candidate with the best 5-fold mean macro-F1 is retrained on all data and saved.
CANDIDATES = {
"RandomForest": RandomForestClassifier(
n_estimators=300,
max_depth=10,
min_samples_leaf=4,
min_samples_split=2,
max_features="sqrt",
class_weight="balanced",
random_state=42,
),
"GradientBoosting": GradientBoostingClassifier(
n_estimators=100,
max_depth=3,
learning_rate=0.1,
random_state=42,
),
"ExtraTrees": ExtraTreesClassifier(
n_estimators=300,
max_depth=10,
min_samples_leaf=4,
class_weight="balanced",
random_state=42,
),
"LogisticRegression": LogisticRegression(
max_iter=1000,
class_weight="balanced",
random_state=42,
),
}
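# Note: GradientBoostingClassifier exposes no class_weight parameter, so it is
# the only candidate trained without explicit class balancing.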
# ---------- Build pipeline ----------
def make_pipeline(classifier):
preprocessor = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
("num", StandardScaler(), numeric_cols),
],
remainder="passthrough",
)
return Pipeline([
("preprocessing", preprocessor),
("classifier", classifier),
])
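# Note: handle_unknown="ignore" encodes category values unseen at fit time as
# all zeros instead of raising, so the web app tolerates inputs outside the
# training vocabulary.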
# ---------- 5-fold stratified CV for every candidate ----------
# The winner (highest mean macro-F1) gets retrained on full data and saved.
print("=" * 60)
print("5-FOLD STRATIFIED CROSS-VALIDATION — ALL CANDIDATES")
print("=" * 60)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
candidate_results = {} # name -> {fold_metrics, all_y_true, all_y_pred, oof_probs}
for cname, clf in CANDIDATES.items():
print(f"\n--- {cname} ---")
fold_metrics = []
all_y_true, all_y_pred = [], []
    oof_probs = np.zeros((len(y_enc), len(classes)))  # out-of-fold probabilities, filled fold by fold; reused for threshold tuning
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_enc), start=1):
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y_enc[train_idx], y_enc[val_idx]
pipe = make_pipeline(clf)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
probs = pipe.predict_proba(X_val)
oof_probs[val_idx] = probs
acc = accuracy_score(y_val, y_pred)
prec = precision_score(y_val, y_pred, average="macro", zero_division=0)
rec = recall_score(y_val, y_pred, average="macro", zero_division=0)
f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)
fold_metrics.append({"accuracy": acc, "precision": prec, "recall": rec, "f1_macro": f1})
all_y_true.extend(y_val)
all_y_pred.extend(y_pred)
print(f" Fold {fold} | Acc: {acc:.4f} Prec: {prec:.4f} Rec: {rec:.4f} F1: {f1:.4f}")
mean_f1 = np.mean([m["f1_macro"] for m in fold_metrics])
mean_acc = np.mean([m["accuracy"] for m in fold_metrics])
print(f" Mean | Acc: {mean_acc:.4f} F1: {mean_f1:.4f}")
candidate_results[cname] = {
"fold_metrics": fold_metrics,
"all_y_true": all_y_true,
"all_y_pred": all_y_pred,
"oof_probs": oof_probs,
"mean_f1": mean_f1,
"mean_accuracy": mean_acc,
}
# ---------- Pick winner ----------
best_name = max(candidate_results, key=lambda n: candidate_results[n]["mean_f1"])
best = candidate_results[best_name]
fold_metrics = best["fold_metrics"]
all_y_true = best["all_y_true"]
all_y_pred = best["all_y_pred"]
oof_probs = best["oof_probs"]
cv_accuracy = best["mean_accuracy"]
cv_f1 = best["mean_f1"]
cv_precision = np.mean([m["precision"] for m in fold_metrics])
cv_recall = np.mean([m["recall"] for m in fold_metrics])
print("\n" + "=" * 60)
print(f"WINNER: {best_name} (mean macro-F1 = {cv_f1:.4f})")
print("=" * 60)
# Full classification report and confusion matrix for the winner
print("\nCLASSIFICATION REPORT (aggregated OOF predictions)")
print(classification_report(all_y_true, all_y_pred, target_names=classes, zero_division=0))
cm = confusion_matrix(all_y_true, all_y_pred)
print("CONFUSION MATRIX")
print(f" Labels: {list(classes)}\n")
print(pd.DataFrame(cm, index=classes, columns=classes).to_string())
# ---------- Learn thresholds from OOF probabilities ----------
print("\n" + "=" * 60)
print("THRESHOLD TUNING (from out-of-fold predictions)")
print("=" * 60)
thresholds = {}
for i, class_name in enumerate(classes):
best_t, best_f1_t = 0.5, -1.0
for t in np.arange(0.3, 0.8, 0.05):
            # Where class i's OOF probability clears t, predict i; elsewhere fall back to the argmax
            preds = np.where(oof_probs[:, i] >= t, i, np.argmax(oof_probs, axis=1))
score = f1_score(y_enc, preds, average="macro", zero_division=0)
if score > best_f1_t:
best_f1_t, best_t = score, t
thresholds[class_name] = round(float(best_t), 2)
print(f" {class_name}: threshold = {best_t:.2f} (macro-F1 at threshold: {best_f1_t:.4f})")
# ---------- Retrain winner on FULL dataset ----------
print("\n" + "=" * 60)
print(f"RETRAINING {best_name} ON FULL DATASET FOR PRODUCTION")
print("=" * 60)
final_model = make_pipeline(CANDIDATES[best_name])
final_model.fit(X, y_enc)
print(f"Final model trained on all {len(X)} samples.")
# ---------- Save all artifacts ----------
print("\n" + "=" * 60)
print("SAVING ARTIFACTS")
print("=" * 60)
joblib.dump(label_encoder, "label_encoder.pkl")
print(" label_encoder.pkl saved — classes:", list(classes))
feature_info = {
"numeric_cols": numeric_cols,
"categorical_cols": categorical_cols,
"all_features": numeric_cols + categorical_cols,
}
joblib.dump(feature_info, "feature_info.pkl")
print(" feature_info.pkl saved")
joblib.dump(final_model, "readability_model.pkl")
print(f" readability_model.pkl saved ({best_name} pipeline)")
grade_mapping = {
"lower": "Grades 2-3 (Lower Elementary)",
"higher": "Grades 4-6 (Higher Elementary)",
"secondary": "Grades 7-10 (Secondary)",
}
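# Sanity guard: every encoded class needs a grade-band entry here, otherwise
# grade_mapping lookups (e.g. in the sanity check below) raise KeyError.
assert set(classes).issubset(grade_mapping), f"unmapped classes: {set(classes) - set(grade_mapping)}"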
joblib.dump(grade_mapping, "grade_mapping.pkl")
print(" grade_mapping.pkl saved")
joblib.dump(thresholds, "thresholds.pkl")
print(" thresholds.pkl saved —", thresholds)
# Summary JSON
all_summaries = {
name: {
"mean_accuracy": round(np.mean([m["accuracy"] for m in r["fold_metrics"]]), 4),
"mean_precision": round(np.mean([m["precision"] for m in r["fold_metrics"]]), 4),
"mean_recall": round(np.mean([m["recall"] for m in r["fold_metrics"]]), 4),
"mean_f1_macro": round(r["mean_f1"], 4),
"std_f1_macro": round(np.std([m["f1_macro"] for m in r["fold_metrics"]]), 4),
"per_fold": r["fold_metrics"],
}
for name, r in candidate_results.items()
}
metrics_summary = {
"cv_folds": 5,
"winner": best_name,
"mean_accuracy": round(cv_accuracy, 4),
"mean_precision": round(cv_precision, 4),
"mean_recall": round(cv_recall, 4),
"mean_f1_macro": round(cv_f1, 4),
"std_accuracy": round(np.std([m["accuracy"] for m in fold_metrics]), 4),
"std_f1_macro": round(np.std([m["f1_macro"] for m in fold_metrics]), 4),
"thresholds": thresholds,
"all_candidates": all_summaries,
}
with open("training_metrics.json", "w") as f:
json.dump(metrics_summary, f, indent=2)
print(" training_metrics.json saved (all candidate CV results)")
# ---------- Sanity check ----------
print("\n" + "=" * 60)
print("SANITY CHECK")
print("=" * 60)
test_model = joblib.load("readability_model.pkl")
test_encoder = joblib.load("label_encoder.pkl")
test_mapping = joblib.load("grade_mapping.pkl")
sample_pred = test_model.predict(X.iloc[0:1])[0]
sample_class = test_encoder.inverse_transform([sample_pred])[0]
sample_grade = test_mapping[sample_class]
print(f" Sample prediction: {sample_class} -> {sample_grade}")
print(f" Probabilities: {test_model.predict_proba(X.iloc[0:1])[0]}")
print("\n" + "=" * 60)
print(f"ALL COMPONENTS SAVED SUCCESSFULLY! (model: {best_name})")
print("=" * 60)