update demo for ensemble learning
- app.py +4 -2
- requirements.txt +2 -1
- src/heart_disease_core.py +85 -8
app.py
CHANGED

@@ -28,7 +28,7 @@ vlai_template.set_meta(
     description="Predict heart disease risk from patient data with ML models trained on the Cleveland dataset.",
     meta_items=[
         ("Dataset", "Cleveland Heart Disease"),
-        ("Models", "Decision Tree, k-NN, Naive Bayes"),
+        ("Models", "Decision Tree, k-NN, Naive Bayes, Random Forest, AdaBoost, Gradient Boosting, XGBoost"),
         ("Ensemble", "Soft Voting"),
     ],
 )

@@ -247,7 +247,9 @@ with gr.Blocks(theme="gstaff/sketch", css=vlai_template.custom_css, fill_width=T

     - **Models are trained once at launch** on `data/cleveland.csv` (80/20 split).
     - **Target is binarized automatically** (0 = no disease, >0 = disease).
-    - **
+    - **Seven models are compared**: Decision Tree, k-NN, Naive Bayes, Random Forest, AdaBoost, Gradient Boosting, and XGBoost.
+    - **Ensemble uses soft voting** over all individual models.
+    - **Best performing model** on test set is highlighted with 🏆 in the validation metrics table.
     - **Feature descriptions**:
       - `age`: Patient age in years
       - `sex`: Gender (0=female, 1=male)
requirements.txt
CHANGED

@@ -5,4 +5,5 @@ numpy>=1.24.0
 dtreeviz>=2.2.2
 graphviz>=0.20.3
 plotly>=5.15.0
-supertree>=0.5.5
+supertree>=0.5.5
+xgboost>=1.6.0
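To pick up the new optional dependency locally, reinstall and sanity-check the import (standard pip/python commands, not part of this repo):

    pip install -r requirements.txt
    python -c "import xgboost; print(xgboost.__version__)"

If the native library fails to load, the demo still runs: src/heart_disease_core.py guards the import, as shown below.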
src/heart_disease_core.py
CHANGED

@@ -12,7 +12,18 @@ from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, reca
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.naive_bayes import GaussianNB
-from sklearn.ensemble import VotingClassifier
+from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
+
+# Handle XGBoost import gracefully
+XGBOOST_AVAILABLE = False
+XGBClassifier = None
+try:
+    from xgboost import XGBClassifier
+    XGBOOST_AVAILABLE = True
+except Exception:
+    # Handle both import errors and library loading errors
+    XGBOOST_AVAILABLE = False
+    XGBClassifier = None


 CLEVELAND_FEATURES_ORDER: List[str] = [
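The module-level XGBOOST_AVAILABLE flag lets consuming code branch once instead of re-trying the import. A minimal sketch of that pattern, assuming the names above (the make_boosted_model helper and the import path are illustrative, not part of this commit):

    from src.heart_disease_core import XGBOOST_AVAILABLE, XGBClassifier
    from sklearn.ensemble import GradientBoostingClassifier

    def make_boosted_model():
        # Prefer XGBoost when it imported cleanly; otherwise fall back to
        # sklearn's gradient boosting so the demo keeps working.
        if XGBOOST_AVAILABLE:
            return XGBClassifier(random_state=42, n_estimators=100, eval_metric="logloss")
        return GradientBoostingClassifier(random_state=42, n_estimators=100)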
@@ -142,22 +153,77 @@ def build_models() -> Dict[str, Pipeline]:
         ("clf", GaussianNB())
     ])

+    rf = Pipeline(steps=[
+        ("prep", pre),
+        ("clf", RandomForestClassifier(
+            random_state=42,
+            n_estimators=100,
+            max_depth=5,
+            min_samples_split=2,
+            min_samples_leaf=1
+        ))
+    ])
+
+    ada = Pipeline(steps=[
+        ("prep", pre),
+        ("clf", AdaBoostClassifier(
+            random_state=42,
+            n_estimators=100,
+            learning_rate=1.0
+        ))
+    ])
+
+    gb = Pipeline(steps=[
+        ("prep", pre),
+        ("clf", GradientBoostingClassifier(
+            random_state=42,
+            n_estimators=100,
+            learning_rate=0.1,
+            max_depth=3
+        ))
+    ])
+
+    models = {"Decision Tree": dt, "k-NN": knn, "Naive Bayes": nb, "Random Forest": rf, "AdaBoost": ada, "Gradient Boosting": gb}
+
+    # Add XGBoost if available
+    if XGBOOST_AVAILABLE:
+        xgb = Pipeline(steps=[
+            ("prep", pre),
+            ("clf", XGBClassifier(
+                random_state=42,
+                n_estimators=100,
+                learning_rate=0.1,
+                max_depth=3,
+                eval_metric='logloss'
+            ))
+        ])
+        models["XGBoost"] = xgb
+
     # Soft Voting requires raw estimators, not Pipelines that share the same preprocessor.
     # Easiest: ensemble as a single Pipeline with a VotingClassifier inside.
+    estimators = [
+        ("dt", DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini")),
+        ("knn", KNeighborsClassifier(n_neighbors=5)),
+        ("nb", GaussianNB()),
+        ("rf", RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1)),
+        ("ada", AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=1.0)),
+        ("gb", GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)),
+    ]
+
+    if XGBOOST_AVAILABLE:
+        estimators.append(("xgb", XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss')))
+
     ensemble = Pipeline(steps=[
         ("prep", pre),
         ("clf", VotingClassifier(
-            estimators=[
-                ("dt", DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini")),
-                ("knn", KNeighborsClassifier(n_neighbors=5)),
-                ("nb", GaussianNB()),
-            ],
+            estimators=estimators,
             voting="soft",
             weights=None # can tweak later
         ))
     ])

-
+    models["Ensemble (Soft Voting)"] = ensemble
+    return models

 def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> Tuple[Dict[str, Pipeline], pd.DataFrame]:
     """
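For reference, voting="soft" averages the members' predict_proba outputs and predicts the class with the highest mean probability. A minimal NumPy sketch of the equivalent computation, assuming already-fitted estimators (soft_vote is an illustrative helper, not part of the codebase):

    import numpy as np

    def soft_vote(fitted_estimators, X, weights=None):
        # Stack per-model class probabilities: shape (n_models, n_samples, n_classes).
        probas = np.stack([est.predict_proba(X) for est in fitted_estimators])
        # Weighted mean over models, then argmax over classes -- this is what
        # VotingClassifier(voting="soft") computes under the hood.
        avg = np.average(probas, axis=0, weights=weights)
        return avg.argmax(axis=1), avg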
@@ -191,7 +257,7 @@ def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int =
         f1 = f1_score(y_te, y_pred, zero_division=0)

         metrics.append({
-            "
+            "Model": name,
             "ROC-AUC": round(float(auc), 4),
             "Accuracy": round(float(acc), 4),
             "Precision": round(float(prec), 4),
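With the "Model" key restored, every appended row carries the model's name next to its scores, which is what the sorting, ranking, and 🏆 highlighting below key on. Illustrative shape of one row (numbers are placeholders, not real results):

    {"Model": "Random Forest", "ROC-AUC": 0.91, "Accuracy": 0.85, "Precision": 0.84, "Recall": 0.83, "F1": 0.84}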
@@ -200,6 +266,17 @@ def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int =
         })

     metrics_df = pd.DataFrame(metrics).sort_values("ROC-AUC", ascending=False, ignore_index=True)
+
+    # Add performance ranking and highlight best performance
+    metrics_df["Rank"] = range(1, len(metrics_df) + 1)
+
+    # Mark the best performing model
+    best_model_idx = metrics_df["ROC-AUC"].idxmax()
+    metrics_df.loc[best_model_idx, "Model"] = "🏆 " + metrics_df.loc[best_model_idx, "Model"] + " (BEST)"
+
+    # Reorder columns to show rank first
+    metrics_df = metrics_df[["Rank", "Model", "ROC-AUC", "Accuracy", "Precision", "Recall", "F1"]]
+
     return models, metrics_df

 def predict_all(models: Dict[str, Pipeline], input_dict: Dict[str, float]) -> Dict[str, Dict[str, float]]:
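Two small notes on this hunk: because metrics_df was just sorted by ROC-AUC descending with ignore_index=True, best_model_idx is always 0, so the idxmax call mainly documents intent. Also, the decorated label no longer matches the keys of the models dict, so any downstream lookup by the table's Model column must strip the decoration first. A hypothetical helper for that (not in this commit; removeprefix/removesuffix need Python 3.9+):

    def raw_model_name(label: str) -> str:
        # Undo the "🏆 ... (BEST)" decoration applied in fit_all_models.
        return label.removeprefix("🏆 ").removesuffix(" (BEST)")

    assert raw_model_name("🏆 XGBoost (BEST)") == "XGBoost"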