Spaces:

saifmontaser
/

DiabetesPro

Runtime error

File size: 6,447 Bytes

33d0f9f

"""
Train all models and save them for the Streamlit app.
Run this once: python3 train_models.py
"""

import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, precision_score
)
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# Directory that receives every artifact written by this script.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

print("📂 Loading dataset...")
df = pd.read_csv("diabetes.csv")

# ── Imputation ─────────────────────────────────────────────────────────────
# In these columns a 0 is treated as a missing value: it is replaced by the
# column median computed with the zeros excluded.
# NOTE(review): the medians are taken over the full dataset, i.e. before the
# train/test split performed later in this script — confirm this is intended.
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
medians = {
    name: df[name].replace(0, np.nan).median()
    for name in zero_cols
}
df_clean = df.copy()
for name, fill_value in medians.items():
    df_clean[name] = df_clean[name].replace(0, fill_value)

# ── Feature Engineering ────────────────────────────────────────────────────
def engineer_features(df_in):
    """Return a copy of ``df_in`` with five derived feature columns appended.

    The input frame is not modified. It must provide the columns
    ``Glucose``, ``BMI``, ``Age``, ``Pregnancies`` and ``Insulin``.

    Added columns, in order:
      * ``Glucose_BMI``           – Glucose * BMI
      * ``Age_Pregnancies``       – Age * Pregnancies
      * ``BMI_Age``               – BMI * Age
      * ``Glucose_Insulin_ratio`` – Glucose / (Insulin + 1)
      * ``Risk_Score``            – how many of ``Glucose > 140``,
        ``BMI > 30`` and ``Age > 40`` hold (0 to 3)
    """
    glucose = df_in["Glucose"]
    bmi = df_in["BMI"]
    age = df_in["Age"]

    # One 0/1 indicator per threshold; their sum is the risk score.
    glucose_flag = (glucose > 140).astype(int)
    bmi_flag = (bmi > 30).astype(int)
    age_flag = (age > 40).astype(int)

    return df_in.assign(
        Glucose_BMI=glucose * bmi,
        Age_Pregnancies=age * df_in["Pregnancies"],
        BMI_Age=bmi * age,
        # The +1 keeps the denominator non-zero when Insulin is 0.
        Glucose_Insulin_ratio=glucose / (df_in["Insulin"] + 1),
        Risk_Score=glucose_flag + bmi_flag + age_flag,
    )

# Build the modelling table: every column except "Outcome" is a feature,
# "Outcome" is the target.
df_fe = engineer_features(df_clean)
y = df_fe["Outcome"]
X = df_fe.drop(columns="Outcome")
feature_cols = list(X.columns)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The scaler is fitted on the training split only and then applied to both.
scaler = RobustScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# ── Model definitions ──────────────────────────────────────────────────────
# Registry of candidate classifiers, keyed by display name. Insertion order
# is the order in which they are trained and reported.
models = {}
models["Logistic Regression"] = LogisticRegression(
    C=1.0,
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
models["Gradient Boosting"] = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
models["Decision Tree"] = DecisionTreeClassifier(
    class_weight="balanced",
    max_depth=6,
    random_state=42,
)
# probability=True is needed so that predict_proba is available on the SVC.
models["SVM"] = SVC(
    probability=True,
    class_weight="balanced",
    kernel="rbf",
    C=10,
    gamma="scale",
    random_state=42,
)
models["KNN"] = KNeighborsClassifier(
    n_neighbors=7,
    weights="distance",
)
models["XGBoost"] = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    scale_pos_weight=2,
    random_state=42,
    eval_metric="logloss",
    verbosity=0,
)

# Shared 5-fold stratified splitter used for the cross-validated accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

# Header of the metrics table printed while training.
print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
print("─" * 70)

# Fit every model on the scaled training split, score it on the scaled test
# split, and record the mean cross-validated accuracy on the training split.
for name, model in models.items():
    model.fit(X_train_s, y_train)
    y_hat = model.predict(X_test_s)
    # Column 1 of predict_proba is used as the score for AUC / ROC.
    p_pos = model.predict_proba(X_test_s)[:, 1]
    roc_fpr, roc_tpr, roc_thr = roc_curve(y_test, p_pos)

    entry = {
        "accuracy": accuracy_score(y_test, y_hat),
        "precision": precision_score(y_test, y_hat),
        "recall": recall_score(y_test, y_hat),
        "f1": f1_score(y_test, y_hat),
        "auc": roc_auc_score(y_test, p_pos),
        "cv_accuracy": cross_val_score(
            model, X_train_s, y_train, cv=cv, scoring="accuracy"
        ).mean(),
        # Arrays are converted to plain lists before being stored.
        "fpr": roc_fpr.tolist(),
        "tpr": roc_tpr.tolist(),
        "confusion_matrix": confusion_matrix(y_test, y_hat).tolist(),
        "thresholds": roc_thr.tolist(),
    }
    results[name] = entry

    print(
        f"{name:<25} {entry['accuracy']:>6.4f} {entry['precision']:>6.4f} "
        f"{entry['recall']:>6.4f} {entry['f1']:>6.4f} {entry['auc']:>7.4f} "
        f"{entry['cv_accuracy']:>8.4f}"
    )

# ── Ensemble ───────────────────────────────────────────────────────────────
print("\nTraining ensemble...")
# Soft-voting ensemble over every classifier in `models`; voting="soft"
# combines the base models' predicted class probabilities.
ensemble = VotingClassifier(estimators=list(models.items()), voting="soft")
ensemble.fit(X_train_s, y_train)
ens_pred = ensemble.predict(X_test_s)
ens_prob = ensemble.predict_proba(X_test_s)[:, 1]
fpr_e, tpr_e, thr_e = roc_curve(y_test, ens_prob)

# Cross-validate the ensemble on the training split with the same folds and
# scoring used for the individual models, so that "cv_accuracy" is comparable
# across all entries. (It previously just repeated the test-set accuracy.)
ens_cv_acc = cross_val_score(
    ensemble, X_train_s, y_train, cv=cv, scoring="accuracy"
).mean()

results["Ensemble"] = dict(
    accuracy=accuracy_score(y_test, ens_pred),
    precision=precision_score(y_test, ens_pred),
    recall=recall_score(y_test, ens_pred),
    f1=f1_score(y_test, ens_pred),
    auc=roc_auc_score(y_test, ens_prob),
    cv_accuracy=ens_cv_acc,
    fpr=fpr_e.tolist(), tpr=tpr_e.tolist(),
    confusion_matrix=confusion_matrix(y_test, ens_pred).tolist(),
    thresholds=thr_e.tolist()
)

# ── Save everything ────────────────────────────────────────────────────────
# File name -> object; each one is written with joblib into MODELS_DIR.
artifacts = {
    "scaler.pkl": scaler,
    "models.pkl": models,
    "ensemble.pkl": ensemble,
    "results.pkl": results,
    "medians.pkl": medians,
    "feature_cols.pkl": feature_cols,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, f"{MODELS_DIR}/{filename}")

# Save test data for later analysis
# (the unscaled test features, the test labels and the feature column names).
import json
test_data = {
    "X_test": X_test.values.tolist(),
    "y_test": y_test.tolist(),
    "columns": feature_cols,
}
with open(f"{MODELS_DIR}/test_data.json", "w") as f:
    json.dump(test_data, f)

# Report the entry of `results` with the highest AUC.
best, best_stats = max(results.items(), key=lambda item: item[1]["auc"])
print(f"\n🏆 Best model by AUC: {best} — AUC={best_stats['auc']:.4f}")
print("✅ All models saved to ./models/")