# Spaces: Jandayl / Alalay - Sleeping
# File size: 9,358 Bytes

import json

import numpy as np
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import warnings

# Suppress every warning for the remainder of the run.
warnings.filterwarnings("ignore")

print("=" * 60, "TRAINING AND EVALUATION", "=" * 60, sep="\n")

# ---------- Load & clean dataset ----------

df = pd.read_csv("Feature_Extracted_Corpus.csv")

# In each categorical column, relabel one specific value as "Other".
for _column, _relabelled_value in (
    ("sentence_construction_type", "Unknown"),
    ("sentence_type", "Compound-Complex"),
):
    df[_column] = df[_column].replace([_relabelled_value], "Other")

# Integer-encode the "group" labels; the fitted encoder is kept so encoded
# predictions can be turned back into the original label values.
y = df["group"].values
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)
classes = label_encoder.classes_
print(f"Classes: {list(classes)}")
print(f"Class distribution:\n{pd.Series(y).value_counts().to_string()}\n")

# The feature matrix is every column except the identifier, raw text and
# the two label columns.
non_feature_cols = ["id", "text", "group", "grade"]
X = df.drop(columns=non_feature_cols)
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = ["sentence_construction_type", "sentence_type"]

# ---------- Candidate models ----------
# Maps a display name to an unfitted classifier. Insertion order is the
# order in which the candidates are cross-validated and reported.

_SEED = 42

# Hyper-parameters common to the two bagged tree ensembles.
_BAGGED_TREE_PARAMS = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=4,
    class_weight="balanced",
    random_state=_SEED,
)

CANDIDATES = {
    "RandomForest": RandomForestClassifier(
        min_samples_split=2,
        max_features="sqrt",
        **_BAGGED_TREE_PARAMS,
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=_SEED,
    ),
    "ExtraTrees": ExtraTreesClassifier(**_BAGGED_TREE_PARAMS),
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=_SEED,
    ),
}

# ---------- Build pipeline ----------

def make_pipeline(classifier):
    """Build an unfitted preprocessing + classification pipeline.

    The preprocessing step one-hot encodes the columns named in the
    module-level ``categorical_cols`` (categories not seen during fitting
    are ignored at transform time), standardizes the columns named in
    ``numeric_cols``, and passes every remaining column through untouched.

    The classifier is cloned before being placed in the pipeline. A
    ``Pipeline`` fits its steps in place, so without the clone every
    pipeline built from the same estimator object would share (and
    overwrite) one fitted model, and fitting would mutate the caller's
    object.

    Args:
        classifier: A scikit-learn estimator used as the final step; only
            its hyper-parameters are used, the object itself is not fitted.

    Returns:
        An unfitted ``Pipeline`` with steps ``"preprocessing"`` and
        ``"classifier"``.
    """
    # Local import so this function does not depend on a file-level change.
    from sklearn.base import clone

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
            ("num", StandardScaler(), numeric_cols),
        ],
        remainder="passthrough",
    )
    return Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", clone(classifier)),
    ])

# ---------- 5-fold stratified CV for every candidate ----------
# Every candidate is evaluated with the same shuffled, stratified 5-fold
# splitter. Per-fold metrics, the pooled validation predictions and the
# out-of-fold class probabilities are stored per candidate.

print("=" * 60, "5-FOLD STRATIFIED CROSS-VALIDATION — ALL CANDIDATES", "=" * 60, sep="\n")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# name -> {fold_metrics, all_y_true, all_y_pred, oof_probs, mean_f1, mean_accuracy}
candidate_results = {}

# Keyword arguments shared by the three macro-averaged metrics.
macro_kwargs = {"average": "macro", "zero_division": 0}

for cname, clf in CANDIDATES.items():
    print(f"\n--- {cname} ---")

    fold_metrics = []
    all_y_true = []
    all_y_pred = []
    # Each row is filled with the probabilities predicted for it while it
    # was part of a validation fold.
    oof_probs = np.zeros((len(y_enc), len(classes)))

    fold = 0
    for train_idx, val_idx in skf.split(X, y_enc):
        fold += 1
        X_train, y_train = X.iloc[train_idx], y_enc[train_idx]
        X_val, y_val = X.iloc[val_idx], y_enc[val_idx]

        pipe = make_pipeline(clf).fit(X_train, y_train)

        y_pred = pipe.predict(X_val)
        probs = pipe.predict_proba(X_val)
        oof_probs[val_idx] = probs

        acc = accuracy_score(y_val, y_pred)
        prec, rec, f1 = (
            metric(y_val, y_pred, **macro_kwargs)
            for metric in (precision_score, recall_score, f1_score)
        )

        fold_metrics.append(dict(accuracy=acc, precision=prec, recall=rec, f1_macro=f1))
        all_y_true.extend(y_val)
        all_y_pred.extend(y_pred)

        print(f"  Fold {fold}  |  Acc: {acc:.4f}  Prec: {prec:.4f}  Rec: {rec:.4f}  F1: {f1:.4f}")

    # Averages over the folds of this candidate.
    mean_f1 = np.mean([m["f1_macro"] for m in fold_metrics])
    mean_acc = np.mean([m["accuracy"] for m in fold_metrics])
    print(f"  Mean   |  Acc: {mean_acc:.4f}  F1: {mean_f1:.4f}")

    candidate_results[cname] = dict(
        fold_metrics=fold_metrics,
        all_y_true=all_y_true,
        all_y_pred=all_y_pred,
        oof_probs=oof_probs,
        mean_f1=mean_f1,
        mean_accuracy=mean_acc,
    )

# ---------- Pick winner ----------
# The winner is the candidate with the highest mean macro-F1; on a tie the
# candidate evaluated first is kept.

best_name, best = max(candidate_results.items(), key=lambda item: item[1]["mean_f1"])

# Re-bind the working names to the winner's stored results.
fold_metrics, all_y_true, all_y_pred, oof_probs = (
    best[key] for key in ("fold_metrics", "all_y_true", "all_y_pred", "oof_probs")
)
cv_accuracy = best["mean_accuracy"]
cv_f1 = best["mean_f1"]
cv_precision, cv_recall = (
    np.mean([m[key] for m in fold_metrics]) for key in ("precision", "recall")
)

print("\n" + "=" * 60, f"WINNER: {best_name}  (mean macro-F1 = {cv_f1:.4f})", "=" * 60, sep="\n")

# Per-class report and confusion matrix over the winner's pooled
# validation predictions.
print("\nCLASSIFICATION REPORT (aggregated OOF predictions)")
report = classification_report(all_y_true, all_y_pred, target_names=classes, zero_division=0)
print(report)

cm = confusion_matrix(all_y_true, all_y_pred)
print("CONFUSION MATRIX")
print(f"  Labels: {list(classes)}\n")
cm_table = pd.DataFrame(cm, index=classes, columns=classes)
print(cm_table.to_string())

# ---------- Learn thresholds from OOF probabilities ----------
# For each class, try a grid of probability thresholds. A sample is assigned
# to the class when its out-of-fold probability for that class reaches the
# threshold; otherwise it keeps its argmax prediction. The threshold giving
# the highest macro-F1 against the true labels is kept (the first one wins
# on ties).

print("\n" + "=" * 60)
print("THRESHOLD TUNING (from out-of-fold predictions)")
print("=" * 60)

# Loop-invariant values, computed once instead of once per
# (class, threshold) pair.
argmax_preds = np.argmax(oof_probs, axis=1)
candidate_thresholds = np.arange(0.3, 0.8, 0.05)

thresholds = {}
for i, class_name in enumerate(classes):
    best_t, best_f1_t = 0.5, -1.0
    class_probs = oof_probs[:, i]
    for t in candidate_thresholds:
        preds = np.where(class_probs >= t, i, argmax_preds)
        score = f1_score(y_enc, preds, average="macro", zero_division=0)
        if score > best_f1_t:
            best_f1_t, best_t = score, t
    # Stored as a plain Python float rounded to two decimals.
    thresholds[class_name] = round(float(best_t), 2)
    print(f"  {class_name}: threshold = {best_t:.2f}  (macro-F1 at threshold: {best_f1_t:.4f})")

# ---------- Retrain winner on FULL dataset ----------
# A pipeline for the winning candidate is fitted on every row; this is the
# model that gets saved.

print(
    "\n" + "=" * 60,
    f"RETRAINING {best_name} ON FULL DATASET FOR PRODUCTION",
    "=" * 60,
    sep="\n",
)

winning_classifier = CANDIDATES[best_name]
final_model = make_pipeline(winning_classifier).fit(X, y_enc)
print(f"Final model trained on all {len(X)} samples.")

# ---------- Save all artifacts ----------
# Writes the label encoder, feature lists, fitted pipeline, grade mapping
# and thresholds with joblib, plus a JSON summary of the CV results.

print("\n" + "=" * 60, "SAVING ARTIFACTS", "=" * 60, sep="\n")

joblib.dump(label_encoder, "label_encoder.pkl")
print("  label_encoder.pkl saved  — classes:", list(classes))

feature_info = dict(
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    all_features=[*numeric_cols, *categorical_cols],
)
joblib.dump(feature_info, "feature_info.pkl")
print("  feature_info.pkl saved")

joblib.dump(final_model, "readability_model.pkl")
print(f"  readability_model.pkl saved  ({best_name} pipeline)")

# Class label -> human-readable grade band.
grade_mapping = {
    "lower":     "Grades 2-3 (Lower Elementary)",
    "higher":    "Grades 4-6 (Higher Elementary)",
    "secondary": "Grades 7-10 (Secondary)",
}
joblib.dump(grade_mapping, "grade_mapping.pkl")
print("  grade_mapping.pkl saved")

joblib.dump(thresholds, "thresholds.pkl")
print("  thresholds.pkl saved  —", thresholds)


def _fold_mean(metrics, key):
    """Return the mean of ``key`` over the per-fold dicts, rounded to 4 places."""
    return round(np.mean([m[key] for m in metrics]), 4)


def _fold_std(metrics, key):
    """Return the std of ``key`` over the per-fold dicts, rounded to 4 places."""
    return round(np.std([m[key] for m in metrics]), 4)


# Summary JSON: one entry per candidate, then the winner's headline numbers.
all_summaries = {}
for name, r in candidate_results.items():
    per_fold = r["fold_metrics"]
    all_summaries[name] = {
        "mean_accuracy":  _fold_mean(per_fold, "accuracy"),
        "mean_precision": _fold_mean(per_fold, "precision"),
        "mean_recall":    _fold_mean(per_fold, "recall"),
        "mean_f1_macro":  round(r["mean_f1"], 4),
        "std_f1_macro":   _fold_std(per_fold, "f1_macro"),
        "per_fold":       per_fold,
    }

metrics_summary = {
    "cv_folds":       5,
    "winner":         best_name,
    "mean_accuracy":  round(cv_accuracy, 4),
    "mean_precision": round(cv_precision, 4),
    "mean_recall":    round(cv_recall, 4),
    "mean_f1_macro":  round(cv_f1, 4),
    "std_accuracy":   _fold_std(fold_metrics, "accuracy"),
    "std_f1_macro":   _fold_std(fold_metrics, "f1_macro"),
    "thresholds":     thresholds,
    "all_candidates": all_summaries,
}
with open("training_metrics.json", "w") as metrics_file:
    json.dump(metrics_summary, metrics_file, indent=2)
print("  training_metrics.json saved  (all candidate CV results)")

# ---------- Sanity check ----------
# Reload the saved model, encoder and grade mapping from disk and run them
# on the first row of X to confirm the artifacts work together.

print("\n" + "=" * 60, "SANITY CHECK", "=" * 60, sep="\n")

test_model, test_encoder, test_mapping = (
    joblib.load(artifact_path)
    for artifact_path in ("readability_model.pkl", "label_encoder.pkl", "grade_mapping.pkl")
)

# Encoded prediction -> original label -> grade description.
sample_pred = test_model.predict(X.iloc[0:1])[0]
sample_class = test_encoder.inverse_transform([sample_pred])[0]
sample_grade = test_mapping[sample_class]
print(f"  Sample prediction: {sample_class} -> {sample_grade}")
print(f"  Probabilities: {test_model.predict_proba(X.iloc[0:1])[0]}")

print(
    "\n" + "=" * 60,
    f"ALL COMPONENTS SAVED SUCCESSFULLY!  (model: {best_name})",
    "=" * 60,
    sep="\n",
)