Spaces:

saifmontaser
/

DiabetesPro

Runtime error

App Files Files Community

DiabetesPro / src /train_models.py

saifmontaser

Upload train_models.py

33d0f9f verified 9 days ago

raw

history blame contribute delete

6.45 kB

	"""
	Train all models and save them for the Streamlit app.
	Run this once: python3 train_models.py
	"""

	import pandas as pd
	import numpy as np
	import joblib
	import os
	from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
	from sklearn.preprocessing import RobustScaler
	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.svm import SVC
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.metrics import (
	accuracy_score, recall_score, f1_score,
	roc_auc_score, roc_curve, confusion_matrix, precision_score
	)
	from xgboost import XGBClassifier
	import warnings
	warnings.filterwarnings("ignore")

	MODELS_DIR = "models"
	os.makedirs(MODELS_DIR, exist_ok=True)

	print("📂 Loading dataset...")
	df = pd.read_csv("diabetes.csv")

	# ── Imputation ─────────────────────────────────────────────────────────────
	# A zero in any of these columns is treated as a missing reading and is
	# replaced by the median of that column's non-zero values. The medians are
	# kept in a dict because they are saved alongside the models later on.
	# NOTE(review): the medians are taken over the full dataset, i.e. before the
	# train/test split further down -- confirm this is acceptable.
	zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
	medians = {
	    name: df[name].replace(0, np.nan).median() for name in zero_cols
	}
	df_clean = df.assign(
	    **{name: df[name].replace(0, medians[name]) for name in zero_cols}
	)

	# ── Feature Engineering ────────────────────────────────────────────────────
	def engineer_features(df_in):
	    """Return a copy of *df_in* with five derived feature columns appended.

	    The input frame is not modified. It must provide the columns
	    ``Glucose``, ``BMI``, ``Age``, ``Pregnancies`` and ``Insulin``.

	    Added columns, in order:
	      * ``Glucose_BMI``           -- Glucose * BMI
	      * ``Age_Pregnancies``       -- Age * Pregnancies
	      * ``BMI_Age``               -- BMI * Age
	      * ``Glucose_Insulin_ratio`` -- Glucose / (Insulin + 1)
	      * ``Risk_Score``            -- how many of Glucose > 140, BMI > 30,
	        Age > 40 hold for the row (0 to 3)
	    """
	    glucose = df_in["Glucose"]
	    bmi = df_in["BMI"]
	    age = df_in["Age"]

	    # One 0/1 indicator per threshold; their sum is the risk score.
	    risk_flags = [
	        (glucose > 140).astype(int),
	        (bmi > 30).astype(int),
	        (age > 40).astype(int),
	    ]

	    return df_in.assign(
	        Glucose_BMI=glucose * bmi,
	        Age_Pregnancies=age * df_in["Pregnancies"],
	        BMI_Age=bmi * age,
	        # The +1 keeps the denominator non-zero when Insulin is 0.
	        Glucose_Insulin_ratio=glucose / (df_in["Insulin"] + 1),
	        Risk_Score=risk_flags[0] + risk_flags[1] + risk_flags[2],
	    )

	df_fe = engineer_features(df_clean)

	# Every column other than the "Outcome" label is used as a model input.
	y = df_fe["Outcome"]
	X = df_fe.drop(columns="Outcome")
	feature_cols = list(X.columns)

	# 80/20 hold-out split, stratified on the label, with a fixed seed.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    stratify=y,
	    random_state=42,
	)

	# The scaler is fitted on the training rows only, then applied to both splits.
	scaler = RobustScaler().fit(X_train)
	X_train_s = scaler.transform(X_train)
	X_test_s = scaler.transform(X_test)

	# ── Model definitions ──────────────────────────────────────────────────────
	# Display name -> unfitted estimator. The insertion order is the order in
	# which the models are trained and reported.
	_SEED = 42
	# Hyper-parameters shared by the two boosted-tree models.
	_BOOSTING_PARAMS = dict(
	    n_estimators=200,
	    learning_rate=0.05,
	    max_depth=4,
	    random_state=_SEED,
	)
	models = {
	    "Logistic Regression": LogisticRegression(
	        C=1.0,
	        class_weight="balanced",
	        max_iter=1000,
	        random_state=_SEED,
	    ),
	    "Random Forest": RandomForestClassifier(
	        n_estimators=300,
	        class_weight="balanced",
	        random_state=_SEED,
	        n_jobs=-1,
	    ),
	    "Gradient Boosting": GradientBoostingClassifier(**_BOOSTING_PARAMS),
	    "Decision Tree": DecisionTreeClassifier(
	        class_weight="balanced",
	        max_depth=6,
	        random_state=_SEED,
	    ),
	    # probability=True is required because predict_proba is called on every model.
	    "SVM": SVC(
	        probability=True,
	        class_weight="balanced",
	        kernel="rbf",
	        C=10,
	        gamma="scale",
	        random_state=_SEED,
	    ),
	    "KNN": KNeighborsClassifier(n_neighbors=7, weights="distance"),
	    "XGBoost": XGBClassifier(
	        scale_pos_weight=2,
	        eval_metric="logloss",
	        verbosity=0,
	        **_BOOSTING_PARAMS,
	    ),
	}

	# Fixed-seed 5-fold stratified splitter used for the cross-validated accuracy.
	cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
	# Display name -> dict of metrics and curve data (plain floats and lists).
	results = {}

	print(f"\n{'Model':<25} {'Acc':>6} {'Prec':>6} {'Rec':>6} {'F1':>6} {'AUC':>7} {'CV-Acc':>8}")
	print("─" * 70)

	for name, model in models.items():
	    # Fit on the scaled training split, then score on the scaled hold-out split.
	    model.fit(X_train_s, y_train)
	    y_hat = model.predict(X_test_s)
	    # Column 1 of predict_proba is the probability of the positive class.
	    p_pos = model.predict_proba(X_test_s)[:, 1]
	    roc_fpr, roc_tpr, roc_thr = roc_curve(y_test, p_pos)

	    metrics = {
	        "accuracy": accuracy_score(y_test, y_hat),
	        "precision": precision_score(y_test, y_hat),
	        "recall": recall_score(y_test, y_hat),
	        "f1": f1_score(y_test, y_hat),
	        "auc": roc_auc_score(y_test, p_pos),
	        # Mean accuracy over the CV folds of the training split.
	        "cv_accuracy": cross_val_score(
	            model, X_train_s, y_train, cv=cv, scoring="accuracy"
	        ).mean(),
	        # Arrays are converted to lists before being stored.
	        "fpr": roc_fpr.tolist(),
	        "tpr": roc_tpr.tolist(),
	        "confusion_matrix": confusion_matrix(y_test, y_hat).tolist(),
	        "thresholds": roc_thr.tolist(),
	    }
	    results[name] = metrics

	    print(
	        "{:<25} {:>6.4f} {:>6.4f} {:>6.4f} {:>6.4f} {:>7.4f} {:>8.4f}".format(
	            name,
	            metrics["accuracy"],
	            metrics["precision"],
	            metrics["recall"],
	            metrics["f1"],
	            metrics["auc"],
	            metrics["cv_accuracy"],
	        )
	    )

	# ── Ensemble ───────────────────────────────────────────────────────────────
	# Soft-voting ensemble over every model in `models`, evaluated on the same
	# hold-out split and stored under the "Ensemble" key with the same metric
	# keys as the individual models.
	print("\nTraining ensemble...")
	ensemble = VotingClassifier(
	    estimators=[(n, m) for n, m in models.items()], voting="soft"
	)
	ensemble.fit(X_train_s, y_train)
	ens_pred = ensemble.predict(X_test_s)
	# Column 1 of predict_proba is the probability of the positive class.
	ens_prob = ensemble.predict_proba(X_test_s)[:, 1]
	fpr_e, tpr_e, thr_e = roc_curve(y_test, ens_prob)

	# Cross-validated accuracy on the training split, computed the same way as
	# for the individual models. Previously this field held the hold-out test
	# accuracy, so "cv_accuracy" was not comparable across rows.
	ens_cv = cross_val_score(
	    ensemble, X_train_s, y_train, cv=cv, scoring="accuracy"
	).mean()

	results["Ensemble"] = dict(
	    accuracy=accuracy_score(y_test, ens_pred),
	    precision=precision_score(y_test, ens_pred),
	    recall=recall_score(y_test, ens_pred),
	    f1=f1_score(y_test, ens_pred),
	    auc=roc_auc_score(y_test, ens_prob),
	    cv_accuracy=ens_cv,
	    fpr=fpr_e.tolist(), tpr=tpr_e.tolist(),
	    confusion_matrix=confusion_matrix(y_test, ens_pred).tolist(),
	    thresholds=thr_e.tolist()
	)

	# ── Save everything ────────────────────────────────────────────────────────
	# Each entry is written with joblib to <MODELS_DIR>/<key>.pkl.
	artifacts = {
	    "scaler": scaler,
	    "models": models,
	    "ensemble": ensemble,
	    "results": results,
	    "medians": medians,
	    "feature_cols": feature_cols,
	}
	for stem, obj in artifacts.items():
	    joblib.dump(obj, f"{MODELS_DIR}/{stem}.pkl")

	# Save test data for later analysis. X_test is the unscaled hold-out frame;
	# it is stored as plain nested lists together with its column names.
	import json
	test_data = {
	    "X_test": X_test.to_numpy().tolist(),
	    "y_test": y_test.tolist(),
	    "columns": feature_cols,
	}
	with open(f"{MODELS_DIR}/test_data.json", "w") as fh:
	    json.dump(test_data, fh)

	# Report the entry (ensemble included) with the highest hold-out AUC.
	best = max(results.items(), key=lambda item: item[1]["auc"])[0]
	print(f"\n🏆 Best model by AUC: {best} — AUC={results[best]['auc']:.4f}")
	print("✅ All models saved to ./models/")