"""Train and evaluate classifiers on ``Feature_Extracted_Corpus.csv``.

This opening section imports the dependencies, loads the corpus, folds two
category labels into ``"Other"``, label-encodes the ``group`` target, and
derives the feature matrix ``X`` together with its numeric and categorical
column lists.
"""

import json
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

print("=" * 60, "TRAINING AND EVALUATION", "=" * 60, sep="\n")

# ---------- Load & clean dataset ----------
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Column -> label that is folded into the catch-all "Other" category.
_MERGE_INTO_OTHER = {
    "sentence_construction_type": "Unknown",
    "sentence_type": "Compound-Complex",
}
for _column, _label in _MERGE_INTO_OTHER.items():
    df[_column] = df[_column].replace([_label], "Other")

# Encode the target labels as integer class ids; ``classes`` holds the
# original labels in the encoder's order.
label_encoder = LabelEncoder()
y = df["group"].values
y_enc = label_encoder.fit_transform(y)
classes = label_encoder.classes_

_class_counts = pd.Series(y).value_counts().to_string()
print(f"Classes: {list(classes)}")
print(f"Class distribution:\n{_class_counts}\n")

# Drop the target ("group") and the "id", "text" and "grade" columns so
# they are not used as features.
X = df.drop(columns=["id", "text", "group", "grade"])
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = ["sentence_construction_type", "sentence_type"]

# ---------- Candidate models ----------
# Maps a display name to an unfitted classifier. The one with the best
# 5-fold macro-F1 gets saved.
CANDIDATES = {
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        min_samples_leaf=4,
        min_samples_split=2,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=300,
        max_depth=10,
        min_samples_leaf=4,
        class_weight="balanced",
        random_state=42,
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=42,
    ),
}


# ---------- Build pipeline ----------
def make_pipeline(classifier):
    """Return an unfitted preprocessing + classification pipeline.

    The preprocessing step one-hot encodes the module-level
    ``categorical_cols`` (categories unseen during fit are ignored at
    transform time; output is dense), standardizes the module-level
    ``numeric_cols``, and forwards any other input column to the
    classifier untouched.

    Args:
        classifier: A scikit-learn classifier used as a template. It is
            cloned, so fitting the returned pipeline never mutates the
            object passed in (for example an entry of ``CANDIDATES``),
            and every pipeline owns an independent estimator.

    Returns:
        A ``Pipeline`` with the steps ``"preprocessing"`` and
        ``"classifier"``.
    """
    # Function-scope import keeps this block self-contained; scikit-learn is
    # already a dependency of this file.
    from sklearn.base import clone

    preprocessor = ColumnTransformer(
        transformers=[
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                categorical_cols,
            ),
            ("num", StandardScaler(), numeric_cols),
        ],
        remainder="passthrough",
    )
    return Pipeline([
        ("preprocessing", preprocessor),
        # clone() yields an unfitted copy with identical hyper-parameters
        # (including random_state), so results are unchanged while the
        # shared template stays pristine.
        ("classifier", clone(classifier)),
    ])


# ---------- 5-fold stratified CV for every candidate ----------
# The winner (highest mean macro-F1) gets retrained on full data and saved.
def _banner(title, leading_blank=True):
    """Print *title* framed by two 60-character rules of ``=``.

    Args:
        title: Text printed between the two rules.
        leading_blank: When true, an empty line precedes the first rule.
    """
    rule = "=" * 60
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


# Single source of truth for the fold count: used by the splitter and
# reported in the metrics summary.
N_SPLITS = 5

_banner("5-FOLD STRATIFIED CROSS-VALIDATION — ALL CANDIDATES", leading_blank=False)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# name -> {fold_metrics, all_y_true, all_y_pred, oof_probs, mean_f1, mean_accuracy}
candidate_results = {}

for cname, clf in CANDIDATES.items():
    print(f"\n--- {cname} ---")
    fold_metrics = []
    all_y_true, all_y_pred = [], []
    # Out-of-fold class probabilities, row-aligned with y_enc.
    oof_probs = np.zeros((len(y_enc), len(classes)))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_enc), start=1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_enc[train_idx], y_enc[val_idx]

        pipe = make_pipeline(clf)
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_val)
        probs = pipe.predict_proba(X_val)
        oof_probs[val_idx] = probs

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average="macro", zero_division=0)
        rec = recall_score(y_val, y_pred, average="macro", zero_division=0)
        f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

        fold_metrics.append(
            {"accuracy": acc, "precision": prec, "recall": rec, "f1_macro": f1}
        )
        all_y_true.extend(y_val)
        all_y_pred.extend(y_pred)

        print(f" Fold {fold} | Acc: {acc:.4f} Prec: {prec:.4f} Rec: {rec:.4f} F1: {f1:.4f}")

    mean_f1 = np.mean([m["f1_macro"] for m in fold_metrics])
    mean_acc = np.mean([m["accuracy"] for m in fold_metrics])
    print(f" Mean | Acc: {mean_acc:.4f} F1: {mean_f1:.4f}")

    candidate_results[cname] = {
        "fold_metrics": fold_metrics,
        "all_y_true": all_y_true,
        "all_y_pred": all_y_pred,
        "oof_probs": oof_probs,
        "mean_f1": mean_f1,
        "mean_accuracy": mean_acc,
    }

# ---------- Pick winner ----------
# Highest mean macro-F1 wins; on a tie max() keeps the earliest candidate.
best_name = max(candidate_results, key=lambda n: candidate_results[n]["mean_f1"])
best = candidate_results[best_name]

# From here on these names refer to the winner's results only.
fold_metrics = best["fold_metrics"]
all_y_true = best["all_y_true"]
all_y_pred = best["all_y_pred"]
oof_probs = best["oof_probs"]

cv_accuracy = best["mean_accuracy"]
cv_f1 = best["mean_f1"]
cv_precision = np.mean([m["precision"] for m in fold_metrics])
cv_recall = np.mean([m["recall"] for m in fold_metrics])

_banner(f"WINNER: {best_name} (mean macro-F1 = {cv_f1:.4f})")

# Full classification report and confusion matrix for the winner
print("\nCLASSIFICATION REPORT (aggregated OOF predictions)")
print(classification_report(all_y_true, all_y_pred, target_names=classes, zero_division=0))

cm = confusion_matrix(all_y_true, all_y_pred)
print("CONFUSION MATRIX")
print(f" Labels: {list(classes)}\n")
print(pd.DataFrame(cm, index=classes, columns=classes).to_string())

# ---------- Learn thresholds from OOF probabilities ----------
_banner("THRESHOLD TUNING (from out-of-fold predictions)")

# Both values are invariant across the search below, so compute them once.
argmax_preds = np.argmax(oof_probs, axis=1)
threshold_grid = np.arange(0.3, 0.8, 0.05)

thresholds = {}
for i, class_name in enumerate(classes):
    best_t, best_f1_t = 0.5, -1.0
    for t in threshold_grid:
        # Rows whose probability for class i reaches t are forced to class i;
        # all other rows keep their argmax prediction.
        preds = np.where(oof_probs[:, i] >= t, i, argmax_preds)
        score = f1_score(y_enc, preds, average="macro", zero_division=0)
        if score > best_f1_t:
            best_f1_t, best_t = score, t
    thresholds[class_name] = round(float(best_t), 2)
    print(f" {class_name}: threshold = {best_t:.2f} (macro-F1 at threshold: {best_f1_t:.4f})")

# ---------- Retrain winner on FULL dataset ----------
_banner(f"RETRAINING {best_name} ON FULL DATASET FOR PRODUCTION")

final_model = make_pipeline(CANDIDATES[best_name])
final_model.fit(X, y_enc)
print(f"Final model trained on all {len(X)} samples.")

# ---------- Save all artifacts ----------
_banner("SAVING ARTIFACTS")

joblib.dump(label_encoder, "label_encoder.pkl")
print(" label_encoder.pkl saved — classes:", list(classes))

feature_info = {
    "numeric_cols": numeric_cols,
    "categorical_cols": categorical_cols,
    "all_features": numeric_cols + categorical_cols,
}
joblib.dump(feature_info, "feature_info.pkl")
print(" feature_info.pkl saved")

joblib.dump(final_model, "readability_model.pkl")
print(f" readability_model.pkl saved ({best_name} pipeline)")

grade_mapping = {
    "lower": "Grades 2-3 (Lower Elementary)",
    "higher": "Grades 4-6 (Higher Elementary)",
    "secondary": "Grades 7-10 (Secondary)",
}
joblib.dump(grade_mapping, "grade_mapping.pkl")
print(" grade_mapping.pkl saved")

joblib.dump(thresholds, "thresholds.pkl")
print(" thresholds.pkl saved —", thresholds)

# Summary JSON
all_summaries = {
    name: {
        "mean_accuracy": round(r["mean_accuracy"], 4),
        "mean_precision": round(np.mean([m["precision"] for m in r["fold_metrics"]]), 4),
        "mean_recall": round(np.mean([m["recall"] for m in r["fold_metrics"]]), 4),
        "mean_f1_macro": round(r["mean_f1"], 4),
        "std_f1_macro": round(np.std([m["f1_macro"] for m in r["fold_metrics"]]), 4),
        "per_fold": r["fold_metrics"],
    }
    for name, r in candidate_results.items()
}

metrics_summary = {
    "cv_folds": N_SPLITS,
    "winner": best_name,
    "mean_accuracy": round(cv_accuracy, 4),
    "mean_precision": round(cv_precision, 4),
    "mean_recall": round(cv_recall, 4),
    "mean_f1_macro": round(cv_f1, 4),
    "std_accuracy": round(np.std([m["accuracy"] for m in fold_metrics]), 4),
    "std_f1_macro": round(np.std([m["f1_macro"] for m in fold_metrics]), 4),
    "thresholds": thresholds,
    "all_candidates": all_summaries,
}

# Explicit encoding so the file does not depend on the platform default.
with open("training_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics_summary, f, indent=2)
print(" training_metrics.json saved (all candidate CV results)")

# ---------- Sanity check ----------
# Reload the artifacts just written and run one prediction end to end.
_banner("SANITY CHECK")

test_model = joblib.load("readability_model.pkl")
test_encoder = joblib.load("label_encoder.pkl")
test_mapping = joblib.load("grade_mapping.pkl")

sample_row = X.iloc[0:1]
sample_pred = test_model.predict(sample_row)[0]
sample_class = test_encoder.inverse_transform([sample_pred])[0]
# Raises KeyError if the predicted class has no entry in grade_mapping.
sample_grade = test_mapping[sample_class]

print(f" Sample prediction: {sample_class} -> {sample_grade}")
print(f" Probabilities: {test_model.predict_proba(sample_row)[0]}")

_banner(f"ALL COMPONENTS SAVED SUCCESSFULLY! (model: {best_name})")