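# NOTE(review): "Spaces: Runtime error / Runtime error" — apparent page-capture residue that preceded the script; not part of the program.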
"""
Train all models and save them for the Streamlit app.

Run this once: python3 train_models.py
"""
import json
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, precision_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Suppress every warning for the remainder of the run.
warnings.filterwarnings(action="ignore")

# All artifacts written by this script go under this directory.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

# NOTE(review): the leading glyph in this message looks mis-encoded — confirm the intended character.
print("π Loading dataset...")
df = pd.read_csv("diabetes.csv")

# ── Imputation ──────────────────────────────────────────────────────────────
# In these columns a 0 is treated as a missing reading: zeros are excluded
# when the median is computed and are then overwritten with that median.
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df_clean = df.copy()

# NOTE(review): the medians are taken over the full dataset, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# imputed values — confirm this is acceptable.
medians = {
    column: df_clean[column].replace(0, np.nan).median()
    for column in zero_cols
}
for column, fill_value in medians.items():
    df_clean[column] = df_clean[column].replace(0, fill_value)
# ── Feature Engineering ─────────────────────────────────────────────────────
def engineer_features(
    df_in: pd.DataFrame,
    *,
    glucose_threshold: float = 140,
    bmi_threshold: float = 30,
    age_threshold: float = 40,
) -> pd.DataFrame:
    """Return a copy of *df_in* with five derived feature columns appended.

    Added columns, in this order:

    * ``Glucose_BMI``            -- ``Glucose * BMI``
    * ``Age_Pregnancies``        -- ``Age * Pregnancies``
    * ``BMI_Age``                -- ``BMI * Age``
    * ``Glucose_Insulin_ratio``  -- ``Glucose / (Insulin + 1)``
    * ``Risk_Score``             -- number (0-3) of the three thresholds
      strictly exceeded by ``Glucose``, ``BMI`` and ``Age`` respectively

    Args:
        df_in: Table containing at least the columns ``Glucose``, ``BMI``,
            ``Age``, ``Pregnancies`` and ``Insulin``. It is not modified.
        glucose_threshold: ``Glucose`` value above which one risk point is
            added.
        bmi_threshold: ``BMI`` value above which one risk point is added.
        age_threshold: ``Age`` value above which one risk point is added.

    Returns:
        A new object with the original columns followed by the derived ones.

    Raises:
        KeyError: If any required column is absent; the message lists every
            missing column rather than only the first one encountered.
    """
    required = ("Glucose", "BMI", "Age", "Pregnancies", "Insulin")
    missing = [column for column in required if column not in df_in]
    if missing:
        raise KeyError(
            f"engineer_features: missing required column(s): {missing}"
        )

    d = df_in.copy()
    d["Glucose_BMI"] = d["Glucose"] * d["BMI"]
    d["Age_Pregnancies"] = d["Age"] * d["Pregnancies"]
    d["BMI_Age"] = d["BMI"] * d["Age"]
    # The +1 keeps the denominator non-zero when Insulin is 0.
    d["Glucose_Insulin_ratio"] = d["Glucose"] / (d["Insulin"] + 1)
    d["Risk_Score"] = (
        (d["Glucose"] > glucose_threshold).astype(int)
        + (d["BMI"] > bmi_threshold).astype(int)
        + (d["Age"] > age_threshold).astype(int)
    )
    return d
# Build the modelling table: imputed raw columns plus the engineered ones.
df_fe = engineer_features(df_clean)

# Every column except the label is a model input; column order is preserved.
feature_cols = df_fe.columns[df_fe.columns != "Outcome"].tolist()
X = df_fe[feature_cols]
y = df_fe["Outcome"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# ── Model definitions ───────────────────────────────────────────────────────
# Candidate classifiers keyed by display name. Insertion order matters: it is
# the order in which the models are trained, reported and handed to the
# ensemble.
models = {
    "Logistic Regression": LogisticRegression(
        C=1.0,
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        max_depth=6,
        random_state=42,
    ),
    # probability=True is required because predict_proba is called on every model.
    "SVM": SVC(
        probability=True,
        class_weight="balanced",
        kernel="rbf",
        C=10,
        gamma="scale",
        random_state=42,
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=7,
        weights="distance",
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        scale_pos_weight=2,
        random_state=42,
        eval_metric="logloss",
        verbosity=0,
    ),
}
# Seeded, shuffled 5-fold stratified splitter used for the CV-accuracy column.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model metrics, keyed by display name; persisted later for the app.
results = {}

print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
# NOTE(review): the "β" glyph looks mis-encoded (possibly a rule character) — confirm.
print("β" * 70)

for label, estimator in models.items():
    # Fit on the scaled training split, then score the held-out split.
    estimator.fit(X_train_s, y_train)
    y_hat = estimator.predict(X_test_s)
    p_positive = estimator.predict_proba(X_test_s)[:, 1]

    # Cross-validated accuracy on the training split only.
    fold_scores = cross_val_score(
        estimator, X_train_s, y_train, cv=cv, scoring="accuracy"
    )
    roc_fpr, roc_tpr, roc_thr = roc_curve(y_test, p_positive)

    # Arrays are converted to plain lists before being stored.
    entry = {
        "accuracy": accuracy_score(y_test, y_hat),
        "precision": precision_score(y_test, y_hat),
        "recall": recall_score(y_test, y_hat),
        "f1": f1_score(y_test, y_hat),
        "auc": roc_auc_score(y_test, p_positive),
        "cv_accuracy": fold_scores.mean(),
        "fpr": roc_fpr.tolist(),
        "tpr": roc_tpr.tolist(),
        "confusion_matrix": confusion_matrix(y_test, y_hat).tolist(),
        "thresholds": roc_thr.tolist(),
    }
    results[label] = entry

    print(
        f"{label:<25} {entry['accuracy']:>6.4f} {entry['precision']:>6.4f} "
        f"{entry['recall']:>6.4f} {entry['f1']:>6.4f} {entry['auc']:>7.4f} "
        f"{entry['cv_accuracy']:>8.4f}"
    )
# ── Ensemble ────────────────────────────────────────────────────────────────
# Soft-voting ensemble over every model defined above.
print("\nTraining ensemble...")
ensemble = VotingClassifier(estimators=list(models.items()), voting="soft")
ensemble.fit(X_train_s, y_train)

ens_pred = ensemble.predict(X_test_s)
ens_prob = ensemble.predict_proba(X_test_s)[:, 1]
fpr_e, tpr_e, thr_e = roc_curve(y_test, ens_prob)

# Cross-validated accuracy on the training split, computed the same way as for
# the individual models. Previously this field held the hold-out accuracy, so
# "cv_accuracy" was just a duplicate of "accuracy" for the ensemble.
ens_cv_accuracy = cross_val_score(
    ensemble, X_train_s, y_train, cv=cv, scoring="accuracy"
).mean()

results["Ensemble"] = dict(
    accuracy=accuracy_score(y_test, ens_pred),
    precision=precision_score(y_test, ens_pred),
    recall=recall_score(y_test, ens_pred),
    f1=f1_score(y_test, ens_pred),
    auc=roc_auc_score(y_test, ens_prob),
    cv_accuracy=ens_cv_accuracy,
    fpr=fpr_e.tolist(),
    tpr=tpr_e.tolist(),
    confusion_matrix=confusion_matrix(y_test, ens_pred).tolist(),
    thresholds=thr_e.tolist(),
)
# ── Save everything ─────────────────────────────────────────────────────────
# Each object is pickled to "<MODELS_DIR>/<name>.pkl".
artifacts = {
    "scaler": scaler,
    "models": models,
    "ensemble": ensemble,
    "results": results,
    "medians": medians,
    "feature_cols": feature_cols,
}
for stem, obj in artifacts.items():
    joblib.dump(obj, f"{MODELS_DIR}/{stem}.pkl")

# Save test data for later analysis. X_test is the unscaled hold-out split.
# `json` is imported at the top of the file with the other imports.
test_data = {
    "X_test": X_test.values.tolist(),
    "y_test": y_test.tolist(),
    "columns": feature_cols,
}
with open(f"{MODELS_DIR}/test_data.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f)

# Report the entry (ensemble included) with the highest hold-out AUC.
best = max(results, key=lambda k: results[k]["auc"])
# NOTE(review): the glyphs in the two messages below look mis-encoded — confirm the intended characters.
print(f"\nπ Best model by AUC: {best} β AUC={results[best]['auc']:.4f}")
print("β All models saved to ./models/")