Spaces:
Runtime error
Runtime error
File size: 6,447 Bytes
33d0f9f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | """
Train all models and save them for the Streamlit app.
Run this once: python3 train_models.py
"""
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score, recall_score, f1_score,
roc_auc_score, roc_curve, confusion_matrix, precision_score
)
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
# Directory that receives every artifact written by this script.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

# NOTE(review): the leading glyph in this message looks like a mis-decoded
# emoji; restore it from the original file if available.
print("π Loading dataset...")
df = pd.read_csv("diabetes.csv")

# ── Imputation ──────────────────────────────────────────────────────────────
# In the columns below a stored 0 is replaced by the median of the column's
# non-zero values.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in this script, so test rows contribute to
# the imputed values — confirm this is acceptable.
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df_clean = df.copy()  # keep the raw frame `df` untouched
medians = {}  # column name -> median used for imputation (saved to disk later)
for col in zero_cols:
    # Turn zeros into NaN first so they are excluded from the median.
    med = df_clean[col].replace(0, np.nan).median()
    medians[col] = med
    df_clean[col] = df_clean[col].replace(0, med)
# ── Feature Engineering ─────────────────────────────────────────────────────
def engineer_features(df_in):
    """Return a copy of *df_in* extended with the derived model features.

    Added columns:

    * ``Glucose_BMI``           -- ``Glucose * BMI``
    * ``Age_Pregnancies``       -- ``Age * Pregnancies``
    * ``BMI_Age``               -- ``BMI * Age``
    * ``Glucose_Insulin_ratio`` -- ``Glucose / (Insulin + 1)``
    * ``Risk_Score``            -- how many of the conditions ``Glucose > 140``,
      ``BMI > 30`` and ``Age > 40`` hold for the row (0 to 3)

    Args:
        df_in: DataFrame providing the columns ``Glucose``, ``BMI``, ``Age``,
            ``Pregnancies`` and ``Insulin``. It is not modified.

    Returns:
        A new DataFrame holding every column of *df_in* followed by the five
        derived columns, in the order listed above.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    d = df_in.copy()  # work on a copy so the caller's frame stays unchanged
    d["Glucose_BMI"] = d["Glucose"] * d["BMI"]
    d["Age_Pregnancies"] = d["Age"] * d["Pregnancies"]
    d["BMI_Age"] = d["BMI"] * d["Age"]
    # The +1 keeps the denominator non-zero when Insulin is 0.
    d["Glucose_Insulin_ratio"] = d["Glucose"] / (d["Insulin"] + 1)
    # Each comparison yields a boolean column; cast to int and add them up.
    d["Risk_Score"] = (
        (d["Glucose"] > 140).astype(int)
        + (d["BMI"] > 30).astype(int)
        + (d["Age"] > 40).astype(int)
    )
    return d
# Add the engineered columns, then separate the predictors from the label.
df_fe = engineer_features(df_clean)
feature_cols = [name for name in df_fe.columns if name != "Outcome"]
X = df_fe[feature_cols]
y = df_fe["Outcome"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The scaler is fitted on the training rows only; the same fitted scaler is
# then applied to both splits.
scaler = RobustScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# ── Model definitions ───────────────────────────────────────────────────────
# Display name -> unfitted estimator. Insertion order is the order in which
# the models are trained and reported. Every estimator that takes a seed uses
# random_state=42.
models = {
    "Logistic Regression": LogisticRegression(
        C=1.0,
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        max_depth=6,
        random_state=42,
    ),
    # probability=True is needed because predict_proba is called on every model.
    "SVM": SVC(
        probability=True,
        class_weight="balanced",
        kernel="rbf",
        C=10,
        gamma="scale",
        random_state=42,
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=7,
        weights="distance",
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        scale_pos_weight=2,
        random_state=42,
        eval_metric="logloss",
        verbosity=0,
    ),
}
# Fit every model on the scaled training split, score it on the held-out test
# split, and collect the metrics under the model's display name.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}  # display name -> dict of metrics and curve data

# NOTE(review): the "β" rule character below looks like a mis-decoded glyph;
# restore it from the original file if available.
print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
print("β" * 70)

for name, model in models.items():
    model.fit(X_train_s, y_train)
    pred = model.predict(X_test_s)
    prob = model.predict_proba(X_test_s)[:, 1]  # probability of class 1

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, prob)

    # Mean 5-fold accuracy on the training split. cross_val_score fits clones
    # of the estimator, so the `model` fitted above is not altered.
    cv_sc = cross_val_score(model, X_train_s, y_train, cv=cv, scoring="accuracy").mean()

    fpr, tpr, thresholds = roc_curve(y_test, prob)
    cm = confusion_matrix(y_test, pred).tolist()

    # Arrays are converted to plain lists before being stored.
    results[name] = dict(
        accuracy=acc, precision=prec, recall=rec, f1=f1, auc=auc,
        cv_accuracy=cv_sc, fpr=fpr.tolist(), tpr=tpr.tolist(),
        confusion_matrix=cm, thresholds=thresholds.tolist()
    )
    print(f"{name:<25} {acc:>6.4f} {prec:>6.4f} {rec:>6.4f} {f1:>6.4f} {auc:>7.4f} {cv_sc:>8.4f}")
# ── Ensemble ────────────────────────────────────────────────────────────────
print("\nTraining ensemble...")

# Soft-voting ensemble over every base model defined in `models`.
# VotingClassifier fits clones of the estimators it is given, so the fitted
# entries of `models` are left as they are.
ensemble = VotingClassifier(estimators=list(models.items()), voting="soft")
ensemble.fit(X_train_s, y_train)

ens_pred = ensemble.predict(X_test_s)
ens_prob = ensemble.predict_proba(X_test_s)[:, 1]  # probability of class 1
fpr_e, tpr_e, thr_e = roc_curve(y_test, ens_prob)

# Cross-validated accuracy on the training split, using the same splitter and
# scoring as the individual models. The hold-out test accuracy used to be
# stored under `cv_accuracy`, which made that entry a duplicate of `accuracy`
# rather than a CV score.
ens_cv = cross_val_score(ensemble, X_train_s, y_train, cv=cv, scoring="accuracy").mean()

results["Ensemble"] = dict(
    accuracy=accuracy_score(y_test, ens_pred),
    precision=precision_score(y_test, ens_pred),
    recall=recall_score(y_test, ens_pred),
    f1=f1_score(y_test, ens_pred),
    auc=roc_auc_score(y_test, ens_prob),
    cv_accuracy=ens_cv,
    fpr=fpr_e.tolist(), tpr=tpr_e.tolist(),
    confusion_matrix=confusion_matrix(y_test, ens_pred).tolist(),
    thresholds=thr_e.tolist()
)
# ── Save everything ─────────────────────────────────────────────────────────
# One pickle per artifact, all written into MODELS_DIR.
joblib.dump(scaler, f"{MODELS_DIR}/scaler.pkl")
joblib.dump(models, f"{MODELS_DIR}/models.pkl")
joblib.dump(ensemble, f"{MODELS_DIR}/ensemble.pkl")
joblib.dump(results, f"{MODELS_DIR}/results.pkl")
joblib.dump(medians, f"{MODELS_DIR}/medians.pkl")
joblib.dump(feature_cols, f"{MODELS_DIR}/feature_cols.pkl")

# Save test data for later analysis. X_test is the unscaled test split; it is
# converted to plain lists so it can be written as JSON.
import json
test_data = {"X_test": X_test.values.tolist(), "y_test": y_test.tolist(),
             "columns": feature_cols}
with open(f"{MODELS_DIR}/test_data.json", "w") as f:
    json.dump(test_data, f)

# Report the entry of `results` (ensemble included) with the highest test AUC.
best = max(results, key=lambda k: results[k]["auc"])
# NOTE(review): the "π" and "β" glyphs in the next message look like
# mis-decoded characters; restore them from the original file if available.
print(f"\nπ Best model by AUC: {best} β AUC={results[best]['auc']:.4f}")
# This literal was split across two lines (unterminated string); it is
# restored as a single-line message.
print("✅ All models saved to ./models/")
|