# DiabetesPro / src/train_models.py
# Uploaded by saifmontaser: "Upload train_models.py" (commit 33d0f9f, verified)
"""
Train all models and save them for the Streamlit app.
Run this once: python3 train_models.py
"""
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score, recall_score, f1_score,
roc_auc_score, roc_curve, confusion_matrix, precision_score
)
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
# Directory that receives every artifact written by this script.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

print("📂 Loading dataset...")
df = pd.read_csv("diabetes.csv")

# ── Imputation ─────────────────────────────────────────────────────────────
# A recorded 0 in these columns is treated as a missing measurement and is
# replaced by the column median of the non-zero values.
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Learn the medians from the training rows only, so the held-out test rows do
# not influence preprocessing (data leakage). The positional split below uses
# the same test_size / random_state / stratify settings as the
# train_test_split call on the engineered features later in this script, so
# it selects the same rows -- keep the two calls in sync.
train_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["Outcome"],
)
train_rows = df.iloc[train_pos]

df_clean = df.copy()
medians = {}  # column name -> imputation value; saved alongside the models
for col in zero_cols:
    med = train_rows[col].replace(0, np.nan).median()
    medians[col] = med
    df_clean[col] = df_clean[col].replace(0, med)
# ── Feature Engineering ────────────────────────────────────────────────────
def engineer_features(df_in, *, glucose_threshold=140, bmi_threshold=30,
                      age_threshold=40):
    """Return a copy of *df_in* with interaction and risk features added.

    The input frame is not modified. It must contain the columns
    ``Glucose``, ``BMI``, ``Age``, ``Pregnancies`` and ``Insulin``; a missing
    column raises ``KeyError``.

    Added columns:
        Glucose_BMI            Glucose * BMI
        Age_Pregnancies        Age * Pregnancies
        BMI_Age                BMI * Age
        Glucose_Insulin_ratio  Glucose / (Insulin + 1)
        Risk_Score             number of thresholds strictly exceeded (0-3)

    Args:
        df_in: DataFrame holding the raw feature columns.
        glucose_threshold: Glucose value above which a row scores a risk point.
        bmi_threshold: BMI value above which a row scores a risk point.
        age_threshold: Age value above which a row scores a risk point.

    Returns:
        A new DataFrame with the original columns plus the five listed above.
    """
    d = df_in.copy()
    d["Glucose_BMI"] = d["Glucose"] * d["BMI"]
    d["Age_Pregnancies"] = d["Age"] * d["Pregnancies"]
    d["BMI_Age"] = d["BMI"] * d["Age"]
    # The +1 keeps the ratio finite when Insulin is 0.
    d["Glucose_Insulin_ratio"] = d["Glucose"] / (d["Insulin"] + 1)
    # Each comparison is strict: a value equal to its threshold scores 0.
    d["Risk_Score"] = (
        (d["Glucose"] > glucose_threshold).astype(int)
        + (d["BMI"] > bmi_threshold).astype(int)
        + (d["Age"] > age_threshold).astype(int)
    )
    return d
# Build the modelling table: imputed columns plus the engineered ones.
df_fe = engineer_features(df_clean)

# Every column except the target is used as a predictor.
feature_cols = [name for name in df_fe.columns if name != "Outcome"]
X, y = df_fe[feature_cols], df_fe["Outcome"]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# The scaler is fitted on the training rows only and then applied to both
# splits.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# ── Model definitions ──────────────────────────────────────────────────────
models = {
"Logistic Regression": LogisticRegression(C=1.0, class_weight="balanced", max_iter=1000, random_state=42),
"Random Forest": RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42, n_jobs=-1),
"Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
"Decision Tree": DecisionTreeClassifier(class_weight="balanced", max_depth=6, random_state=42),
"SVM": SVC(probability=True, class_weight="balanced", kernel="rbf", C=10, gamma="scale", random_state=42),
"KNN": KNeighborsClassifier(n_neighbors=7, weights="distance"),
"XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4,
scale_pos_weight=2, random_state=42,
eval_metric="logloss", verbosity=0),
}
# Shared 5-fold stratified splitter used for the cross-validated accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model name -> dict of hold-out metrics, ROC curve points and confusion matrix.
results = {}

print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
# The header above is 70 characters wide; draw a rule of the same width.
print("─" * 70)

for name, model in models.items():
    # Fit on the scaled training split, then score the scaled hold-out split.
    model.fit(X_train_s, y_train)
    pred = model.predict(X_test_s)
    prob = model.predict_proba(X_test_s)[:, 1]  # probability of class 1

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, prob)
    # Mean accuracy over the folds of the training split only.
    cv_sc = cross_val_score(model, X_train_s, y_train, cv=cv, scoring="accuracy").mean()
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    cm = confusion_matrix(y_test, pred).tolist()

    # Arrays are converted to plain lists before being stored.
    results[name] = dict(
        accuracy=acc, precision=prec, recall=rec, f1=f1, auc=auc,
        cv_accuracy=cv_sc, fpr=fpr.tolist(), tpr=tpr.tolist(),
        confusion_matrix=cm, thresholds=thresholds.tolist()
    )
    print(f"{name:<25} {acc:>6.4f} {prec:>6.4f} {rec:>6.4f} {f1:>6.4f} {auc:>7.4f} {cv_sc:>8.4f}")
# ── Ensemble ───────────────────────────────────────────────────────────────
print("\nTraining ensemble...")
ensemble = VotingClassifier(
estimators=[(n, m) for n, m in models.items()], voting="soft"
)
ensemble.fit(X_train_s, y_train)
ens_pred = ensemble.predict(X_test_s)
ens_prob = ensemble.predict_proba(X_test_s)[:, 1]
fpr_e, tpr_e, thr_e = roc_curve(y_test, ens_prob)
results["Ensemble"] = dict(
accuracy=accuracy_score(y_test, ens_pred),
precision=precision_score(y_test, ens_pred),
recall=recall_score(y_test, ens_pred),
f1=f1_score(y_test, ens_pred),
auc=roc_auc_score(y_test, ens_prob),
cv_accuracy=accuracy_score(y_test, ens_pred),
fpr=fpr_e.tolist(), tpr=tpr_e.tolist(),
confusion_matrix=confusion_matrix(y_test, ens_pred).tolist(),
thresholds=thr_e.tolist()
)
# ── Save everything ────────────────────────────────────────────────────────
joblib.dump(scaler, f"{MODELS_DIR}/scaler.pkl")
joblib.dump(models, f"{MODELS_DIR}/models.pkl")
joblib.dump(ensemble, f"{MODELS_DIR}/ensemble.pkl")
joblib.dump(results, f"{MODELS_DIR}/results.pkl")
joblib.dump(medians, f"{MODELS_DIR}/medians.pkl")
joblib.dump(feature_cols, f"{MODELS_DIR}/feature_cols.pkl")
# Save test data for later analysis
import json
test_data = {"X_test": X_test.values.tolist(), "y_test": y_test.tolist(),
"columns": feature_cols}
with open(f"{MODELS_DIR}/test_data.json", "w") as f:
json.dump(test_data, f)
best = max(results, key=lambda k: results[k]["auc"])
print(f"\nπŸ† Best model by AUC: {best} β€” AUC={results[best]['auc']:.4f}")
print("βœ… All models saved to ./models/")