"""
Train all models and save them for the Streamlit app.

Run this once:
    python3 train_models.py

Reads ``diabetes.csv`` from the working directory and writes the fitted
scaler, the individual models, the soft-voting ensemble, the evaluation
results, the imputation medians, the feature column list and the hold-out
data into ``./models/``.
"""
import json
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

print("📂 Loading dataset...")
df = pd.read_csv("diabetes.csv")

# ── Train / test split ─────────────────────────────────────────────────────
# Split BEFORE computing any statistics so that the imputation medians (and
# everything derived from them) never see the hold-out rows.  Only the row
# labels of each partition are needed here.
train_rows, test_rows = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["Outcome"]
)
train_idx, test_idx = train_rows.index, test_rows.index

# ── Imputation ─────────────────────────────────────────────────────────────
# A zero in these columns is treated as a missing value and replaced by the
# column median of the non-zero *training* values.
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
medians = {
    col: train_rows[col].replace(0, np.nan).median() for col in zero_cols
}

df_clean = df.copy()
for col, med in medians.items():
    df_clean[col] = df_clean[col].replace(0, med)


# ── Feature Engineering ────────────────────────────────────────────────────
def engineer_features(df_in):
    """Return a copy of *df_in* with the derived feature columns appended.

    Added columns: three pairwise products (``Glucose_BMI``,
    ``Age_Pregnancies``, ``BMI_Age``), ``Glucose_Insulin_ratio`` and an
    integer ``Risk_Score`` counting how many of the three thresholds
    (Glucose > 140, BMI > 30, Age > 40) a row exceeds.

    The input frame is not modified.
    """
    d = df_in.copy()
    d["Glucose_BMI"] = d["Glucose"] * d["BMI"]
    d["Age_Pregnancies"] = d["Age"] * d["Pregnancies"]
    d["BMI_Age"] = d["BMI"] * d["Age"]
    # +1 keeps the denominator non-zero when Insulin is 0.
    d["Glucose_Insulin_ratio"] = d["Glucose"] / (d["Insulin"] + 1)
    d["Risk_Score"] = (
        (d["Glucose"] > 140).astype(int)
        + (d["BMI"] > 30).astype(int)
        + (d["Age"] > 40).astype(int)
    )
    return d


df_fe = engineer_features(df_clean)
feature_cols = [c for c in df_fe.columns if c != "Outcome"]

X = df_fe[feature_cols]
y = df_fe["Outcome"]

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

scaler = RobustScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# ── Model definitions ──────────────────────────────────────────────────────
models = {
    "Logistic Regression": LogisticRegression(
        C=1.0, class_weight="balanced", max_iter=1000, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, class_weight="balanced", random_state=42, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced", max_depth=6, random_state=42
    ),
    "SVM": SVC(
        probability=True, class_weight="balanced", kernel="rbf",
        C=10, gamma="scale", random_state=42,
    ),
    "KNN": KNeighborsClassifier(n_neighbors=7, weights="distance"),
    "XGBoost": XGBClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=4,
        scale_pos_weight=2, random_state=42,
        eval_metric="logloss", verbosity=0,
    ),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _cv_accuracy(model):
    """Return the mean 5-fold cross-validated accuracy on the training set."""
    return cross_val_score(
        model, X_train_s, y_train, cv=cv, scoring="accuracy"
    ).mean()


def _evaluate(model, cv_accuracy):
    """Build the results entry for an already-fitted classifier.

    Hold-out metrics are computed on the scaled test set; *cv_accuracy* is
    stored as given.  The key order matches what is written to
    ``results.pkl``.
    """
    pred = model.predict(X_test_s)
    prob = model.predict_proba(X_test_s)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    return dict(
        accuracy=accuracy_score(y_test, pred),
        precision=precision_score(y_test, pred),
        recall=recall_score(y_test, pred),
        f1=f1_score(y_test, pred),
        auc=roc_auc_score(y_test, prob),
        cv_accuracy=cv_accuracy,
        fpr=fpr.tolist(),
        tpr=tpr.tolist(),
        confusion_matrix=confusion_matrix(y_test, pred).tolist(),
        thresholds=thresholds.tolist(),
    )


results = {}

print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
print("─" * 70)

for name, model in models.items():
    model.fit(X_train_s, y_train)
    res = _evaluate(model, _cv_accuracy(model))
    results[name] = res
    acc, prec, rec = res["accuracy"], res["precision"], res["recall"]
    f1, auc, cv_sc = res["f1"], res["auc"], res["cv_accuracy"]
    print(f"{name:<25} {acc:>6.4f} {prec:>6.4f} {rec:>6.4f} {f1:>6.4f} {auc:>7.4f} {cv_sc:>8.4f}")

# ── Ensemble ───────────────────────────────────────────────────────────────
print("\nTraining ensemble...")
ensemble = VotingClassifier(estimators=list(models.items()), voting="soft")
ensemble.fit(X_train_s, y_train)
# cv_accuracy is a genuine cross-validation score on the training folds,
# consistent with the individual models (not the hold-out accuracy).
results["Ensemble"] = _evaluate(ensemble, _cv_accuracy(ensemble))

# ── Save everything ────────────────────────────────────────────────────────
joblib.dump(scaler, f"{MODELS_DIR}/scaler.pkl")
joblib.dump(models, f"{MODELS_DIR}/models.pkl")
joblib.dump(ensemble, f"{MODELS_DIR}/ensemble.pkl")
joblib.dump(results, f"{MODELS_DIR}/results.pkl")
joblib.dump(medians, f"{MODELS_DIR}/medians.pkl")
joblib.dump(feature_cols, f"{MODELS_DIR}/feature_cols.pkl")

# Save test data for later analysis (unscaled feature values).
test_data = {
    "X_test": X_test.values.tolist(),
    "y_test": y_test.tolist(),
    "columns": feature_cols,
}
with open(f"{MODELS_DIR}/test_data.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f)

best = max(results, key=lambda k: results[k]["auc"])
print(f"\n🏆 Best model by AUC: {best} — AUC={results[best]['auc']:.4f}")
print("✅ All models saved to ./models/")