# Alalay/models/save_webapp_components.py
import json
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore")
print("=" * 60)
print("TRAINING AND EVALUATION")
print("=" * 60)
# ---------- Load & clean dataset ----------
df = pd.read_csv("Feature_Extracted_Corpus.csv")
df["sentence_construction_type"] = df["sentence_construction_type"].replace(["Unknown"], "Other")
df["sentence_type"] = df["sentence_type"].replace(["Compound-Complex"], "Other")
label_encoder = LabelEncoder()
y = df["group"].values
y_enc = label_encoder.fit_transform(y)
classes = label_encoder.classes_
print(f"Classes: {list(classes)}")
print(f"Class distribution:\n{pd.Series(y).value_counts().to_string()}\n")
X = df.drop(columns=["id", "text", "group", "grade"])
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = ["sentence_construction_type", "sentence_type"]
# ---------- Candidate models ----------
# Maps model name -> classifier. The candidate with the best 5-fold mean macro-F1 is retrained on all data and saved.
CANDIDATES = {
"RandomForest": RandomForestClassifier(
n_estimators=300,
max_depth=10,
min_samples_leaf=4,
min_samples_split=2,
max_features="sqrt",
class_weight="balanced",
random_state=42,
),
"GradientBoosting": GradientBoostingClassifier(
n_estimators=100,
max_depth=3,
learning_rate=0.1,
random_state=42,
),
"ExtraTrees": ExtraTreesClassifier(
n_estimators=300,
max_depth=10,
min_samples_leaf=4,
class_weight="balanced",
random_state=42,
),
"LogisticRegression": LogisticRegression(
max_iter=1000,
class_weight="balanced",
random_state=42,
),
}
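# Note: GradientBoostingClassifier exposes no class_weight parameter, so it is
# the only candidate trained without explicit class balancing.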
# ---------- Build pipeline ----------
def make_pipeline(classifier):
preprocessor = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
("num", StandardScaler(), numeric_cols),
],
remainder="passthrough",
)
return Pipeline([
("preprocessing", preprocessor),
("classifier", classifier),
])
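# Note: handle_unknown="ignore" encodes category values unseen at fit time as
# all zeros instead of raising, so the web app tolerates inputs outside the
# training vocabulary.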
# ---------- 5-fold stratified CV for every candidate ----------
# The winner (highest mean macro-F1) gets retrained on full data and saved.
print("=" * 60)
print("5-FOLD STRATIFIED CROSS-VALIDATION — ALL CANDIDATES")
print("=" * 60)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
candidate_results = {} # name -> {fold_metrics, all_y_true, all_y_pred, oof_probs}
for cname, clf in CANDIDATES.items():
print(f"\n--- {cname} ---")
fold_metrics = []
all_y_true, all_y_pred = [], []
    oof_probs = np.zeros((len(y_enc), len(classes)))  # out-of-fold probabilities, filled fold by fold; reused for threshold tuning
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_enc), start=1):
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y_enc[train_idx], y_enc[val_idx]
pipe = make_pipeline(clf)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
probs = pipe.predict_proba(X_val)
oof_probs[val_idx] = probs
acc = accuracy_score(y_val, y_pred)
prec = precision_score(y_val, y_pred, average="macro", zero_division=0)
rec = recall_score(y_val, y_pred, average="macro", zero_division=0)
f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)
fold_metrics.append({"accuracy": acc, "precision": prec, "recall": rec, "f1_macro": f1})
all_y_true.extend(y_val)
all_y_pred.extend(y_pred)
print(f" Fold {fold} | Acc: {acc:.4f} Prec: {prec:.4f} Rec: {rec:.4f} F1: {f1:.4f}")
mean_f1 = np.mean([m["f1_macro"] for m in fold_metrics])
mean_acc = np.mean([m["accuracy"] for m in fold_metrics])
print(f" Mean | Acc: {mean_acc:.4f} F1: {mean_f1:.4f}")
candidate_results[cname] = {
"fold_metrics": fold_metrics,
"all_y_true": all_y_true,
"all_y_pred": all_y_pred,
"oof_probs": oof_probs,
"mean_f1": mean_f1,
"mean_accuracy": mean_acc,
}
# ---------- Pick winner ----------
best_name = max(candidate_results, key=lambda n: candidate_results[n]["mean_f1"])
best = candidate_results[best_name]
fold_metrics = best["fold_metrics"]
all_y_true = best["all_y_true"]
all_y_pred = best["all_y_pred"]
oof_probs = best["oof_probs"]
cv_accuracy = best["mean_accuracy"]
cv_f1 = best["mean_f1"]
cv_precision = np.mean([m["precision"] for m in fold_metrics])
cv_recall = np.mean([m["recall"] for m in fold_metrics])
print("\n" + "=" * 60)
print(f"WINNER: {best_name} (mean macro-F1 = {cv_f1:.4f})")
print("=" * 60)
# Full classification report and confusion matrix for the winner
print("\nCLASSIFICATION REPORT (aggregated OOF predictions)")
print(classification_report(all_y_true, all_y_pred, target_names=classes, zero_division=0))
cm = confusion_matrix(all_y_true, all_y_pred)
print("CONFUSION MATRIX")
print(f" Labels: {list(classes)}\n")
print(pd.DataFrame(cm, index=classes, columns=classes).to_string())
# ---------- Learn thresholds from OOF probabilities ----------
print("\n" + "=" * 60)
print("THRESHOLD TUNING (from out-of-fold predictions)")
print("=" * 60)
thresholds = {}
for i, class_name in enumerate(classes):
best_t, best_f1_t = 0.5, -1.0
for t in np.arange(0.3, 0.8, 0.05):
            # Where class i's OOF probability clears t, predict i; elsewhere fall back to the argmax
            preds = np.where(oof_probs[:, i] >= t, i, np.argmax(oof_probs, axis=1))
score = f1_score(y_enc, preds, average="macro", zero_division=0)
if score > best_f1_t:
best_f1_t, best_t = score, t
thresholds[class_name] = round(float(best_t), 2)
print(f" {class_name}: threshold = {best_t:.2f} (macro-F1 at threshold: {best_f1_t:.4f})")
# ---------- Retrain winner on FULL dataset ----------
print("\n" + "=" * 60)
print(f"RETRAINING {best_name} ON FULL DATASET FOR PRODUCTION")
print("=" * 60)
final_model = make_pipeline(CANDIDATES[best_name])
final_model.fit(X, y_enc)
print(f"Final model trained on all {len(X)} samples.")
# ---------- Save all artifacts ----------
print("\n" + "=" * 60)
print("SAVING ARTIFACTS")
print("=" * 60)
joblib.dump(label_encoder, "label_encoder.pkl")
print(" label_encoder.pkl saved — classes:", list(classes))
feature_info = {
"numeric_cols": numeric_cols,
"categorical_cols": categorical_cols,
"all_features": numeric_cols + categorical_cols,
}
joblib.dump(feature_info, "feature_info.pkl")
print(" feature_info.pkl saved")
joblib.dump(final_model, "readability_model.pkl")
print(f" readability_model.pkl saved ({best_name} pipeline)")
grade_mapping = {
"lower": "Grades 2-3 (Lower Elementary)",
"higher": "Grades 4-6 (Higher Elementary)",
"secondary": "Grades 7-10 (Secondary)",
}
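# Sanity guard: every encoded class needs a grade-band entry here, otherwise
# grade_mapping lookups (e.g. in the sanity check below) raise KeyError.
assert set(classes).issubset(grade_mapping), f"unmapped classes: {set(classes) - set(grade_mapping)}"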
joblib.dump(grade_mapping, "grade_mapping.pkl")
print(" grade_mapping.pkl saved")
joblib.dump(thresholds, "thresholds.pkl")
print(" thresholds.pkl saved —", thresholds)
# Summary JSON
all_summaries = {
name: {
"mean_accuracy": round(np.mean([m["accuracy"] for m in r["fold_metrics"]]), 4),
"mean_precision": round(np.mean([m["precision"] for m in r["fold_metrics"]]), 4),
"mean_recall": round(np.mean([m["recall"] for m in r["fold_metrics"]]), 4),
"mean_f1_macro": round(r["mean_f1"], 4),
"std_f1_macro": round(np.std([m["f1_macro"] for m in r["fold_metrics"]]), 4),
"per_fold": r["fold_metrics"],
}
for name, r in candidate_results.items()
}
metrics_summary = {
"cv_folds": 5,
"winner": best_name,
"mean_accuracy": round(cv_accuracy, 4),
"mean_precision": round(cv_precision, 4),
"mean_recall": round(cv_recall, 4),
"mean_f1_macro": round(cv_f1, 4),
"std_accuracy": round(np.std([m["accuracy"] for m in fold_metrics]), 4),
"std_f1_macro": round(np.std([m["f1_macro"] for m in fold_metrics]), 4),
"thresholds": thresholds,
"all_candidates": all_summaries,
}
with open("training_metrics.json", "w") as f:
json.dump(metrics_summary, f, indent=2)
print(" training_metrics.json saved (all candidate CV results)")
# ---------- Sanity check ----------
print("\n" + "=" * 60)
print("SANITY CHECK")
print("=" * 60)
test_model = joblib.load("readability_model.pkl")
test_encoder = joblib.load("label_encoder.pkl")
test_mapping = joblib.load("grade_mapping.pkl")
sample_pred = test_model.predict(X.iloc[0:1])[0]
sample_class = test_encoder.inverse_transform([sample_pred])[0]
sample_grade = test_mapping[sample_class]
print(f" Sample prediction: {sample_class} -> {sample_grade}")
print(f" Probabilities: {test_model.predict_proba(X.iloc[0:1])[0]}")
print("\n" + "=" * 60)
print(f"ALL COMPONENTS SAVED SUCCESSFULLY! (model: {best_name})")
print("=" * 60)