wjnwjn59 committed on
Commit
8a0750c
·
1 Parent(s): 473cc44

update demo for ensemble learning

Browse files
Files changed (3) hide show
  1. app.py +4 -2
  2. requirements.txt +2 -1
  3. src/heart_disease_core.py +85 -8
app.py CHANGED
@@ -28,7 +28,7 @@ vlai_template.set_meta(
28
  description="Predict heart disease risk from patient data with ML models trained on the Cleveland dataset.",
29
  meta_items=[
30
  ("Dataset", "Cleveland Heart Disease"),
31
- ("Models", "Decision Tree, k-NN, Naive Bayes"),
32
  ("Ensemble", "Soft Voting"),
33
  ],
34
  )
@@ -247,7 +247,9 @@ with gr.Blocks(theme="gstaff/sketch", css=vlai_template.custom_css, fill_width=T
247
 
248
  - **Models are trained once at launch** on `data/cleveland.csv` (80/20 split).
249
  - **Target is binarized automatically** (0 = no disease, >0 = disease).
250
- - **Ensemble uses soft voting** over Decision Tree, k-NN, and Naive Bayes.
 
 
251
  - **Feature descriptions**:
252
  - `age`: Patient age in years
253
  - `sex`: Gender (0=female, 1=male)
 
28
  description="Predict heart disease risk from patient data with ML models trained on the Cleveland dataset.",
29
  meta_items=[
30
  ("Dataset", "Cleveland Heart Disease"),
31
+ ("Models", "Decision Tree, k-NN, Naive Bayes, Random Forest, AdaBoost, Gradient Boosting, XGBoost"),
32
  ("Ensemble", "Soft Voting"),
33
  ],
34
  )
 
247
 
248
  - **Models are trained once at launch** on `data/cleveland.csv` (80/20 split).
249
  - **Target is binarized automatically** (0 = no disease, >0 = disease).
250
+ - **Seven models are compared**: Decision Tree, k-NN, Naive Bayes, Random Forest, AdaBoost, Gradient Boosting, and XGBoost.
251
+ - **Ensemble uses soft voting** over all individual models.
252
+ - **Best performing model** on test set is highlighted with 🏆 in the validation metrics table.
253
  - **Feature descriptions**:
254
  - `age`: Patient age in years
255
  - `sex`: Gender (0=female, 1=male)
requirements.txt CHANGED
@@ -5,4 +5,5 @@ numpy>=1.24.0
5
  dtreeviz>=2.2.2
6
  graphviz>=0.20.3
7
  plotly>=5.15.0
8
- supertree>=0.5.5
 
 
5
  dtreeviz>=2.2.2
6
  graphviz>=0.20.3
7
  plotly>=5.15.0
8
+ supertree>=0.5.5
9
+ xgboost>=1.6.0
src/heart_disease_core.py CHANGED
@@ -12,7 +12,18 @@ from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, reca
12
  from sklearn.tree import DecisionTreeClassifier
13
  from sklearn.neighbors import KNeighborsClassifier
14
  from sklearn.naive_bayes import GaussianNB
15
- from sklearn.ensemble import VotingClassifier
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  CLEVELAND_FEATURES_ORDER: List[str] = [
@@ -142,22 +153,77 @@ def build_models() -> Dict[str, Pipeline]:
142
  ("clf", GaussianNB())
143
  ])
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  # Soft Voting requires raw estimators, not Pipelines that share the same preprocessor.
146
  # Easiest: ensemble as a single Pipeline with a VotingClassifier inside.
 
 
 
 
 
 
 
 
 
 
 
 
147
  ensemble = Pipeline(steps=[
148
  ("prep", pre),
149
  ("clf", VotingClassifier(
150
- estimators=[
151
- ("dt", DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini")),
152
- ("knn", KNeighborsClassifier(n_neighbors=5)),
153
- ("nb", GaussianNB()),
154
- ],
155
  voting="soft",
156
  weights=None # can tweak later
157
  ))
158
  ])
159
 
160
- return {"Decision Tree": dt, "k-NN": knn, "Naive Bayes": nb, "Ensemble (Soft Voting)": ensemble}
 
161
 
162
  def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> Tuple[Dict[str, Pipeline], pd.DataFrame]:
163
  """
@@ -191,7 +257,7 @@ def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int =
191
  f1 = f1_score(y_te, y_pred, zero_division=0)
192
 
193
  metrics.append({
194
- "model": name,
195
  "ROC-AUC": round(float(auc), 4),
196
  "Accuracy": round(float(acc), 4),
197
  "Precision": round(float(prec), 4),
@@ -200,6 +266,17 @@ def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int =
200
  })
201
 
202
  metrics_df = pd.DataFrame(metrics).sort_values("ROC-AUC", ascending=False, ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
203
  return models, metrics_df
204
 
205
  def predict_all(models: Dict[str, Pipeline], input_dict: Dict[str, float]) -> Dict[str, Dict[str, float]]:
 
12
  from sklearn.tree import DecisionTreeClassifier
13
  from sklearn.neighbors import KNeighborsClassifier
14
  from sklearn.naive_bayes import GaussianNB
15
+ from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
16
+
17
+ # Handle XGBoost import gracefully
18
+ XGBOOST_AVAILABLE = False
19
+ XGBClassifier = None
20
+ try:
21
+ from xgboost import XGBClassifier
22
+ XGBOOST_AVAILABLE = True
23
+ except (ImportError, Exception):
24
+ # Handle both import errors and library loading errors
25
+ XGBOOST_AVAILABLE = False
26
+ XGBClassifier = None
27
 
28
 
29
  CLEVELAND_FEATURES_ORDER: List[str] = [
 
153
  ("clf", GaussianNB())
154
  ])
155
 
156
+ rf = Pipeline(steps=[
157
+ ("prep", pre),
158
+ ("clf", RandomForestClassifier(
159
+ random_state=42,
160
+ n_estimators=100,
161
+ max_depth=5,
162
+ min_samples_split=2,
163
+ min_samples_leaf=1
164
+ ))
165
+ ])
166
+
167
+ ada = Pipeline(steps=[
168
+ ("prep", pre),
169
+ ("clf", AdaBoostClassifier(
170
+ random_state=42,
171
+ n_estimators=100,
172
+ learning_rate=1.0
173
+ ))
174
+ ])
175
+
176
+ gb = Pipeline(steps=[
177
+ ("prep", pre),
178
+ ("clf", GradientBoostingClassifier(
179
+ random_state=42,
180
+ n_estimators=100,
181
+ learning_rate=0.1,
182
+ max_depth=3
183
+ ))
184
+ ])
185
+
186
+ models = {"Decision Tree": dt, "k-NN": knn, "Naive Bayes": nb, "Random Forest": rf, "AdaBoost": ada, "Gradient Boosting": gb}
187
+
188
+ # Add XGBoost if available
189
+ if XGBOOST_AVAILABLE:
190
+ xgb = Pipeline(steps=[
191
+ ("prep", pre),
192
+ ("clf", XGBClassifier(
193
+ random_state=42,
194
+ n_estimators=100,
195
+ learning_rate=0.1,
196
+ max_depth=3,
197
+ eval_metric='logloss'
198
+ ))
199
+ ])
200
+ models["XGBoost"] = xgb
201
+
202
  # Soft Voting requires raw estimators, not Pipelines that share the same preprocessor.
203
  # Easiest: ensemble as a single Pipeline with a VotingClassifier inside.
204
+ estimators = [
205
+ ("dt", DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini")),
206
+ ("knn", KNeighborsClassifier(n_neighbors=5)),
207
+ ("nb", GaussianNB()),
208
+ ("rf", RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1)),
209
+ ("ada", AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=1.0)),
210
+ ("gb", GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)),
211
+ ]
212
+
213
+ if XGBOOST_AVAILABLE:
214
+ estimators.append(("xgb", XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss')))
215
+
216
  ensemble = Pipeline(steps=[
217
  ("prep", pre),
218
  ("clf", VotingClassifier(
219
+ estimators=estimators,
 
 
 
 
220
  voting="soft",
221
  weights=None # can tweak later
222
  ))
223
  ])
224
 
225
+ models["Ensemble (Soft Voting)"] = ensemble
226
+ return models
227
 
228
  def fit_all_models(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> Tuple[Dict[str, Pipeline], pd.DataFrame]:
229
  """
 
257
  f1 = f1_score(y_te, y_pred, zero_division=0)
258
 
259
  metrics.append({
260
+ "Model": name,
261
  "ROC-AUC": round(float(auc), 4),
262
  "Accuracy": round(float(acc), 4),
263
  "Precision": round(float(prec), 4),
 
266
  })
267
 
268
  metrics_df = pd.DataFrame(metrics).sort_values("ROC-AUC", ascending=False, ignore_index=True)
269
+
270
+ # Add performance ranking and highlight best performance
271
+ metrics_df["Rank"] = range(1, len(metrics_df) + 1)
272
+
273
+ # Mark the best performing model
274
+ best_model_idx = metrics_df["ROC-AUC"].idxmax()
275
+ metrics_df.loc[best_model_idx, "Model"] = "🏆 " + metrics_df.loc[best_model_idx, "Model"] + " (BEST)"
276
+
277
+ # Reorder columns to show rank first
278
+ metrics_df = metrics_df[["Rank", "Model", "ROC-AUC", "Accuracy", "Precision", "Recall", "F1"]]
279
+
280
  return models, metrics_df
281
 
282
  def predict_all(models: Dict[str, Pipeline], input_dict: Dict[str, float]) -> Dict[str, Dict[str, float]]: