Synav committed on
Commit
7d7e2cb
·
verified ·
1 Parent(s): ef9ce84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -14
app.py CHANGED
@@ -9,7 +9,15 @@ import matplotlib.pyplot as plt
9
  import os
10
  from huggingface_hub import hf_hub_download, HfApi
11
  import hmac
12
-
 
 
 
 
 
 
 
 
13
 
14
 
15
  from sklearn.pipeline import Pipeline
@@ -76,14 +84,6 @@ def build_pipeline(num_cols, cat_cols):
76
  # ============================================================
77
  # Validation utilities
78
  # ============================================================
79
def validate_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Check that *df* carries every required feature and label column.

    Returns a copy of *df* restricted to ``FEATURE_COLS + [LABEL_COL]``.
    Raises ``ValueError`` naming every required column that is absent.
    """
    required = FEATURE_COLS + [LABEL_COL]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(
            f"Missing required columns: {missing}. "
            f"Excel must contain columns A..Z and AA exactly."
        )
    return df[required].copy()
87
 
88
 
89
  def coerce_binary_label(y: pd.Series):
@@ -156,13 +156,42 @@ def train_and_save(df: pd.DataFrame, feature_cols, num_cols, cat_cols):
156
  proba = pipe.predict_proba(X_test)[:, 1]
157
  pred = (proba >= 0.5).astype(int)
158
 
 
 
 
 
 
 
 
 
 
159
  metrics = {
160
- "roc_auc": float(roc_auc_score(y_test, proba)),
161
- "accuracy@0.5": float(accuracy_score(y_test, pred)),
162
  "n_train": int(len(X_train)),
163
  "n_test": int(len(X_test)),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  }
165
 
 
166
  joblib.dump(pipe, "model.joblib")
167
 
168
  meta = {
@@ -184,6 +213,30 @@ def train_and_save(df: pd.DataFrame, feature_cols, num_cols, cat_cols):
184
 
185
  return pipe, meta, X
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
 
189
 
@@ -402,12 +455,69 @@ with tab_train:
402
 
403
  st.success("Training complete. model.joblib and meta.json created.")
404
 
 
 
 
 
405
  m = meta["metrics"]
 
 
406
  c1, c2, c3, c4 = st.columns(4)
407
  c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
408
- c2.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
409
- c3.metric("Train N", m["n_train"])
410
- c4.metric("Test N", m["n_test"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
  # ---------------- PUBLISH (only after training) ----------------
413
 
 
9
  import os
10
  from huggingface_hub import hf_hub_download, HfApi
11
  import hmac
12
+ from sklearn.metrics import (
13
+ roc_auc_score, accuracy_score,
14
+ roc_curve, confusion_matrix,
15
+ precision_score, recall_score, f1_score,
16
+ balanced_accuracy_score,
17
+ precision_recall_curve, average_precision_score,
18
+ brier_score_loss
19
+ )
20
+ from sklearn.calibration import calibration_curve
21
 
22
 
23
  from sklearn.pipeline import Pipeline
 
84
  # ============================================================
85
  # Validation utilities
86
  # ============================================================
 
 
 
 
 
 
 
 
87
 
88
 
89
  def coerce_binary_label(y: pd.Series):
 
156
  proba = pipe.predict_proba(X_test)[:, 1]
157
  pred = (proba >= 0.5).astype(int)
158
 
159
+ # Core probability-based metrics
160
+ roc_auc = float(roc_auc_score(y_test, proba))
161
+
162
+ # ROC curve coordinates
163
+ fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
164
+
165
+ # Threshold-based metrics (default at 0.5)
166
+ cls = compute_classification_metrics(y_test, proba, threshold=0.5)
167
+
168
  metrics = {
169
+ "roc_auc": roc_auc,
 
170
  "n_train": int(len(X_train)),
171
  "n_test": int(len(X_test)),
172
+
173
+ # Store threshold metrics at 0.5
174
+ "threshold@0.5": cls["threshold"],
175
+ "accuracy@0.5": cls["accuracy"],
176
+ "balanced_accuracy@0.5": cls["balanced_accuracy"],
177
+ "precision@0.5": cls["precision"],
178
+ "recall@0.5": cls["recall"],
179
+ "f1@0.5": cls["f1"],
180
+ "sensitivity@0.5": cls["sensitivity"],
181
+ "specificity@0.5": cls["specificity"],
182
+ "confusion_matrix@0.5": {
183
+ "tn": cls["tn"], "fp": cls["fp"], "fn": cls["fn"], "tp": cls["tp"]
184
+ },
185
+
186
+ # Store ROC curve arrays for later plotting if needed
187
+ "roc_curve": {
188
+ "fpr": [float(x) for x in fpr],
189
+ "tpr": [float(x) for x in tpr],
190
+ "thresholds": [float(x) for x in roc_thresholds],
191
+ },
192
  }
193
 
194
+
195
  joblib.dump(pipe, "model.joblib")
196
 
197
  meta = {
 
213
 
214
  return pipe, meta, X
215
 
216
def compute_classification_metrics(y_true, y_proba, threshold: float = 0.5):
    """Binarize *y_proba* at *threshold* and report binary-classification metrics.

    Returns a dict holding the threshold, the confusion-matrix cells
    (tn/fp/fn/tp), sensitivity (TPR), specificity (TNR), precision, recall,
    F1, accuracy, and balanced accuracy — all cast to plain ``int``/``float``
    so the result is JSON-serializable.
    """
    y_pred = (y_proba >= threshold).astype(int)

    # labels=[0, 1] pins the 2x2 layout even when one class is absent
    # from y_true/y_pred, so ravel() always yields (tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    actual_pos = tp + fn
    actual_neg = tn + fp
    sensitivity = (tp / actual_pos) if actual_pos else 0.0  # recall / TPR
    specificity = (tn / actual_neg) if actual_neg else 0.0  # TNR

    return {
        "threshold": float(threshold),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
        "sensitivity": float(sensitivity),
        "specificity": float(specificity),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
    }
240
 
241
 
242
 
 
455
 
456
  st.success("Training complete. model.joblib and meta.json created.")
457
 
458
+
459
+ st.divider()
460
+ st.subheader("Training performance (test split)")
461
+
462
  m = meta["metrics"]
463
+
464
+ # Show key metrics at threshold 0.5
465
  c1, c2, c3, c4 = st.columns(4)
466
  c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
467
+ c2.metric("Sensitivity (Recall)", f"{m['sensitivity@0.5']:.3f}")
468
+ c3.metric("Specificity", f"{m['specificity@0.5']:.3f}")
469
+ c4.metric("F1", f"{m['f1@0.5']:.3f}")
470
+
471
+ c5, c6, c7, c8 = st.columns(4)
472
+ c5.metric("Precision", f"{m['precision@0.5']:.3f}")
473
+ c6.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
474
+ c7.metric("Balanced Acc", f"{m['balanced_accuracy@0.5']:.3f}")
475
+ c8.metric("Test N", m["n_test"])
476
+
477
+ # Confusion matrix display
478
+ cm = m["confusion_matrix@0.5"]
479
+ cm_df = pd.DataFrame(
480
+ [[cm["tn"], cm["fp"]], [cm["fn"], cm["tp"]]],
481
+ index=["Actual 0", "Actual 1"],
482
+ columns=["Pred 0", "Pred 1"]
483
+ )
484
+ st.markdown("**Confusion Matrix (threshold = 0.5)**")
485
+ st.dataframe(cm_df)
486
+
487
+ # ROC curve plot (matplotlib)
488
+ roc = m["roc_curve"]
489
+ fig = plt.figure()
490
+ plt.plot(roc["fpr"], roc["tpr"])
491
+ plt.plot([0, 1], [0, 1])
492
+ plt.xlabel("False Positive Rate (1 - Specificity)")
493
+ plt.ylabel("True Positive Rate (Sensitivity)")
494
+ plt.title(f"ROC Curve (AUC = {m['roc_auc']:.3f})")
495
+ st.pyplot(fig, clear_figure=True)
496
+
497
+ st.divider()
498
+ st.subheader("Threshold analysis")
499
+
500
+ thr = st.slider("Decision threshold", 0.0, 1.0, 0.5, 0.01)
501
+
502
+ # Recompute threshold-based metrics quickly using stored probabilities
503
+ # You need y_test and proba in scope. Easiest is to store them in session_state during training.
504
+ st.session_state.y_test_last = y_test
505
+ st.session_state.proba_last = proba
506
+ if "y_test_last" in st.session_state and "proba_last" in st.session_state:
507
+ cls = compute_classification_metrics(st.session_state.y_test_last, st.session_state.proba_last, threshold=thr)
508
+ st.write({
509
+ "Sensitivity": cls["sensitivity"],
510
+ "Specificity": cls["specificity"],
511
+ "Precision": cls["precision"],
512
+ "Recall": cls["recall"],
513
+ "F1": cls["f1"],
514
+ "Accuracy": cls["accuracy"],
515
+ "Balanced Accuracy": cls["balanced_accuracy"],
516
+ })
517
+
518
+
519
+
520
+
521
 
522
  # ---------------- PUBLISH (only after training) ----------------
523