import json
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
|
|
# Suppress every warning for the rest of the run so the console report stays compact.
# NOTE(review): this also hides any warnings raised by the estimators or the CV splitter.
warnings.filterwarnings("ignore")

# Opening banner of the console report.
print("=" * 60, "TRAINING AND EVALUATION", "=" * 60, sep="\n")
|
|
| |
|
|
# --- Load the feature table ---------------------------------------------------
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Fold one category value per column into the shared "Other" bucket.
df["sentence_construction_type"] = df["sentence_construction_type"].replace({"Unknown": "Other"})
df["sentence_type"] = df["sentence_type"].replace({"Compound-Complex": "Other"})

# --- Encode the target ----------------------------------------------------------
# The "group" column is the prediction target; it is integer-encoded for training.
y = df["group"].values
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)
classes = label_encoder.classes_
print(f"Classes: {list(classes)}")
print(f"Class distribution:\n{pd.Series(y).value_counts().to_string()}\n")

# --- Assemble the feature matrix ------------------------------------------------
# Every column except id/text/group/grade is treated as a model input.
X = df.drop(columns=["id", "text", "group", "grade"])
categorical_cols = ["sentence_construction_type", "sentence_type"]
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
|
|
| |
| |
|
|
# Candidate classifiers compared during cross-validation, keyed by display name.
# All share random_state=42; every estimator except GradientBoosting is
# constructed with class_weight="balanced".
# Insertion order matters: on a tie in mean macro-F1 the earlier entry wins.
CANDIDATES = {
    "RandomForest": RandomForestClassifier(
        random_state=42,
        class_weight="balanced",
        n_estimators=300,
        max_features="sqrt",
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=4,
    ),
    "GradientBoosting": GradientBoostingClassifier(
        random_state=42,
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
    ),
    "ExtraTrees": ExtraTreesClassifier(
        random_state=42,
        class_weight="balanced",
        n_estimators=300,
        max_depth=10,
        min_samples_leaf=4,
    ),
    "LogisticRegression": LogisticRegression(
        random_state=42,
        class_weight="balanced",
        max_iter=1000,
    ),
}
|
|
| |
|
|
def make_pipeline(classifier):
    """Build a preprocessing + classifier pipeline around a fresh copy of *classifier*.

    The categorical and numeric column lists are read from the module-level
    ``categorical_cols`` and ``numeric_cols``.

    Args:
        classifier: A scikit-learn estimator used as the final step. It is
            cloned, so the instance passed in is never fitted or mutated and
            every returned pipeline owns an independent, unfitted estimator
            with the same hyperparameters.

    Returns:
        An unfitted ``Pipeline`` with two steps: ``"preprocessing"`` (a
        ``ColumnTransformer``) and ``"classifier"``.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": categories unseen at fit time do not raise at predict time.
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
            ("num", StandardScaler(), numeric_cols),
        ],
        # Columns in neither list are forwarded to the classifier untouched.
        remainder="passthrough",
    )
    return Pipeline([
        ("preprocessing", preprocessor),
        # Clone so pipelines built from the same template never share fitted state.
        ("classifier", clone(classifier)),
    ])
|
|
| |
| |
|
|
print("=" * 60)
print("5-FOLD STRATIFIED CROSS-VALIDATION — ALL CANDIDATES")
print("=" * 60)

# One shared splitter; the fixed seed gives every candidate the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation results per candidate, keyed by candidate name.
candidate_results = {}

for cname, clf in CANDIDATES.items():
    print(f"\n--- {cname} ---")

    fold_metrics = []
    all_y_true, all_y_pred = [], []
    # Out-of-fold class probabilities, one row per sample in original row order.
    oof_probs = np.zeros((len(y_enc), len(classes)))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_enc), start=1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_enc[train_idx], y_enc[val_idx]

        pipe = make_pipeline(clf)
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_val)
        probs = pipe.predict_proba(X_val)
        # predict_proba has one column per label seen in this training fold.
        # Place each column by its encoded label so a fold that lacks a rare
        # class leaves that class at probability 0 instead of failing on a
        # shape mismatch. With all classes present this is the plain row-wise
        # assignment.
        fold_classes = pipe.named_steps["classifier"].classes_
        oof_probs[np.ix_(val_idx, fold_classes)] = probs

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average="macro", zero_division=0)
        rec = recall_score(y_val, y_pred, average="macro", zero_division=0)
        f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

        fold_metrics.append({"accuracy": acc, "precision": prec, "recall": rec, "f1_macro": f1})
        # Accumulated in fold order (not original row order); the two lists stay aligned.
        all_y_true.extend(y_val)
        all_y_pred.extend(y_pred)

        print(f" Fold {fold} | Acc: {acc:.4f} Prec: {prec:.4f} Rec: {rec:.4f} F1: {f1:.4f}")

    mean_f1 = np.mean([m["f1_macro"] for m in fold_metrics])
    mean_acc = np.mean([m["accuracy"] for m in fold_metrics])
    print(f" Mean | Acc: {mean_acc:.4f} F1: {mean_f1:.4f}")

    candidate_results[cname] = {
        "fold_metrics": fold_metrics,
        "all_y_true": all_y_true,
        "all_y_pred": all_y_pred,
        "oof_probs": oof_probs,
        "mean_f1": mean_f1,
        "mean_accuracy": mean_acc,
    }
|
|
| |
|
|
# Pick the candidate with the highest mean macro-F1; the first one wins on a tie.
best_name, best = max(candidate_results.items(), key=lambda item: item[1]["mean_f1"])

# Unpack the winner's CV artefacts for the report and the later stages.
fold_metrics, all_y_true = best["fold_metrics"], best["all_y_true"]
all_y_pred, oof_probs = best["all_y_pred"], best["oof_probs"]
cv_accuracy, cv_f1 = best["mean_accuracy"], best["mean_f1"]
cv_precision = np.mean([fm["precision"] for fm in fold_metrics])
cv_recall = np.mean([fm["recall"] for fm in fold_metrics])

print("\n" + "=" * 60, f"WINNER: {best_name} (mean macro-F1 = {cv_f1:.4f})", "=" * 60, sep="\n")

# Per-class report over the winner's pooled out-of-fold predictions.
print("\nCLASSIFICATION REPORT (aggregated OOF predictions)")
report_text = classification_report(
    all_y_true,
    all_y_pred,
    target_names=classes,
    zero_division=0,
)
print(report_text)

# Confusion matrix labelled with the original class names on both axes.
cm = confusion_matrix(all_y_true, all_y_pred)
print("CONFUSION MATRIX")
print(f" Labels: {list(classes)}\n")
print(pd.DataFrame(cm, index=classes, columns=classes).to_string())
|
|
| |
|
|
print("\n" + "=" * 60)
print("THRESHOLD TUNING (from out-of-fold predictions)")
print("=" * 60)

# The argmax fallback is independent of class and threshold, so compute it
# once instead of once per (class, threshold) pair.
argmax_preds = np.argmax(oof_probs, axis=1)

# Per class: the threshold that maximises overall macro-F1 when any sample
# whose probability for that class reaches the threshold is forced to that
# class and every other sample keeps its argmax prediction.
thresholds = {}
for i, class_name in enumerate(classes):
    best_t, best_f1_t = 0.5, -1.0
    # Candidate grid: nominally 0.30 up to (not including) 0.80 in steps of 0.05.
    for t in np.arange(0.3, 0.8, 0.05):
        preds = np.where(oof_probs[:, i] >= t, i, argmax_preds)
        score = f1_score(y_enc, preds, average="macro", zero_division=0)
        # Strict ">" keeps the lowest threshold among equally good ones.
        if score > best_f1_t:
            best_f1_t, best_t = score, t
    # Round away float noise from the grid (e.g. 0.35000000000000003).
    thresholds[class_name] = round(float(best_t), 2)
    print(f" {class_name}: threshold = {best_t:.2f} (macro-F1 at threshold: {best_f1_t:.4f})")
|
|
| |
|
|
print(
    "\n" + "=" * 60,
    f"RETRAINING {best_name} ON FULL DATASET FOR PRODUCTION",
    "=" * 60,
    sep="\n",
)

# Fit the winning configuration on every available sample (no held-out split).
final_model = make_pipeline(CANDIDATES[best_name])
final_model.fit(X, y_enc)
print(f"Final model trained on all {len(X)} samples.")
|
|
| |
|
|
print("\n" + "=" * 60, "SAVING ARTIFACTS", "=" * 60, sep="\n")

# Column lists for the saved model's inputs.
feature_info = {
    "numeric_cols": numeric_cols,
    "categorical_cols": categorical_cols,
    "all_features": numeric_cols + categorical_cols,
}

# Human-readable grade band per group label.
# NOTE(review): keys are presumably the values of the "group" column — confirm against the data.
grade_mapping = {
    "lower": "Grades 2-3 (Lower Elementary)",
    "higher": "Grades 4-6 (Higher Elementary)",
    "secondary": "Grades 7-10 (Secondary)",
}

# Each artifact is written to the current working directory and confirmed right after.
joblib.dump(label_encoder, "label_encoder.pkl")
print(" label_encoder.pkl saved — classes:", list(classes))

joblib.dump(feature_info, "feature_info.pkl")
print(" feature_info.pkl saved")

joblib.dump(final_model, "readability_model.pkl")
print(f" readability_model.pkl saved ({best_name} pipeline)")

joblib.dump(grade_mapping, "grade_mapping.pkl")
print(" grade_mapping.pkl saved")

joblib.dump(thresholds, "thresholds.pkl")
print(" thresholds.pkl saved —", thresholds)
|
|
| |
# Rounded CV summary for every candidate, plus its raw per-fold metrics.
all_summaries = {
    name: {
        "mean_accuracy": round(np.mean([m["accuracy"] for m in r["fold_metrics"]]), 4),
        "mean_precision": round(np.mean([m["precision"] for m in r["fold_metrics"]]), 4),
        "mean_recall": round(np.mean([m["recall"] for m in r["fold_metrics"]]), 4),
        "mean_f1_macro": round(r["mean_f1"], 4),
        "std_f1_macro": round(np.std([m["f1_macro"] for m in r["fold_metrics"]]), 4),
        "per_fold": r["fold_metrics"],
    }
    for name, r in candidate_results.items()
}

# Top-level fields describe the winning candidate; "all_candidates" holds the rest.
metrics_summary = {
    # Read from the splitter so the report cannot drift from the CV configuration.
    "cv_folds": skf.get_n_splits(),
    "winner": best_name,
    "mean_accuracy": round(cv_accuracy, 4),
    "mean_precision": round(cv_precision, 4),
    "mean_recall": round(cv_recall, 4),
    "mean_f1_macro": round(cv_f1, 4),
    "std_accuracy": round(np.std([m["accuracy"] for m in fold_metrics]), 4),
    "std_f1_macro": round(np.std([m["f1_macro"] for m in fold_metrics]), 4),
    "thresholds": thresholds,
    "all_candidates": all_summaries,
}
# Explicit encoding keeps the output independent of the platform default.
with open("training_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics_summary, f, indent=2)
print(" training_metrics.json saved (all candidate CV results)")
|
|
| |
|
|
print("\n" + "=" * 60, "SANITY CHECK", "=" * 60, sep="\n")

# Reload the artifacts from disk so the check exercises the saved files,
# not the in-memory objects.
test_model, test_encoder, test_mapping = (
    joblib.load(path)
    for path in ("readability_model.pkl", "label_encoder.pkl", "grade_mapping.pkl")
)

# Predict the first row and decode it: encoded label -> class name -> grade band.
sample_row = X.head(1)
sample_pred = test_model.predict(sample_row)[0]
sample_class = test_encoder.inverse_transform([sample_pred])[0]
sample_grade = test_mapping[sample_class]
print(f" Sample prediction: {sample_class} -> {sample_grade}")
print(f" Probabilities: {test_model.predict_proba(sample_row)[0]}")

print(
    "\n" + "=" * 60,
    f"ALL COMPONENTS SAVED SUCCESSFULLY! (model: {best_name})",
    "=" * 60,
    sep="\n",
)