Update app.py
Browse files
app.py
CHANGED
|
@@ -149,21 +149,21 @@ def build_pipeline(
|
|
| 149 |
# if SVD is ON, selection happens on components)
|
| 150 |
if use_feature_selection:
|
| 151 |
selector_est = LogisticRegression(
|
| 152 |
-
penalty="l1",
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
max_iter=5000,
|
| 156 |
-
n_jobs=-1
|
| 157 |
)
|
| 158 |
|
|
|
|
| 159 |
# If you want to cap features: set max_features and use threshold that keeps top coefficients.
|
| 160 |
# SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
|
| 161 |
-
# Keep threshold='
|
| 162 |
-
selector = SelectFromModel(selector_est, threshold="
|
| 163 |
steps.append(("select", selector))
|
| 164 |
|
| 165 |
# Final classifier (keep stable, probability-calibratable)
|
| 166 |
-
clf = LogisticRegression(max_iter=5000, solver="lbfgs")
|
|
|
|
| 167 |
steps.append(("clf", clf))
|
| 168 |
|
| 169 |
return Pipeline(steps)
|
|
@@ -222,7 +222,31 @@ def compute_classification_metrics(y_true, y_proba, threshold: float = 0.5):
|
|
| 222 |
"accuracy": float(acc),
|
| 223 |
"balanced_accuracy": float(bacc),
|
| 224 |
}
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
def compute_pr_curve(y_true, y_proba):
|
| 227 |
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
|
| 228 |
ap = average_precision_score(y_true, y_proba)
|
|
@@ -336,13 +360,15 @@ def train_and_save(
|
|
| 336 |
# ----- METRICS BLOCK (MISSING) -----
|
| 337 |
roc_auc = float(roc_auc_score(y_test, proba))
|
| 338 |
fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
|
| 339 |
-
cls =
|
|
|
|
| 340 |
|
| 341 |
metrics = {
|
| 342 |
"roc_auc": roc_auc,
|
| 343 |
"n_train": int(len(X_train)),
|
| 344 |
"n_test": int(len(X_test)),
|
| 345 |
-
"
|
|
|
|
| 346 |
"accuracy@0.5": cls["accuracy"],
|
| 347 |
"balanced_accuracy@0.5": cls["balanced_accuracy"],
|
| 348 |
"precision@0.5": cls["precision"],
|
|
|
|
| 149 |
# if SVD is ON, selection happens on components)
|
| 150 |
if use_feature_selection:
|
| 151 |
selector_est = LogisticRegression(
|
| 152 |
+
penalty="l1", solver="saga", C=float(l1_C),
|
| 153 |
+
max_iter=5000, n_jobs=-1,
|
| 154 |
+
class_weight="balanced"
|
|
|
|
|
|
|
| 155 |
)
|
| 156 |
|
| 157 |
+
|
| 158 |
# If you want to cap features: set max_features and use threshold that keeps top coefficients.
|
| 159 |
# SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
|
| 160 |
+
# Keep threshold='median' as default; adjust if you want more aggressive pruning.
|
| 161 |
+
selector = SelectFromModel(selector_est, threshold="median")
|
| 162 |
steps.append(("select", selector))
|
| 163 |
|
| 164 |
# Final classifier (keep stable, probability-calibratable)
|
| 165 |
+
clf = LogisticRegression(max_iter=5000, solver="lbfgs", class_weight="balanced")
|
| 166 |
+
|
| 167 |
steps.append(("clf", clf))
|
| 168 |
|
| 169 |
return Pipeline(steps)
|
|
|
|
| 222 |
"accuracy": float(acc),
|
| 223 |
"balanced_accuracy": float(bacc),
|
| 224 |
}
|
| 225 |
+
|
| 226 |
+
def find_best_threshold(y_true, y_proba, metric="f1"):
    """Scan a fixed grid of decision thresholds and pick the best one.

    Evaluates `compute_classification_metrics` at 181 thresholds between
    0.05 and 0.95 (step ~0.005) and keeps the threshold whose value for
    *metric* is highest.

    Returns:
        (best_threshold, best_metric_value, metrics_dict_at_best_threshold)

    NOTE: an unknown *metric* key silently scores as 0.0 (see `.get` below),
    so the first grid point would win — verify the metric name at call sites.
    """
    best_t = 0.5
    best_val = -1
    best_cls = None
    for t in np.linspace(0.05, 0.95, 181):
        t = float(t)
        candidate = compute_classification_metrics(y_true, y_proba, threshold=t)
        score = candidate.get(metric, 0.0)
        if score > best_val:
            best_t = t
            best_val = score
            best_cls = candidate
    return best_t, best_val, best_cls
|
| 235 |
+
|
| 236 |
+
def find_best_threshold_f1(y_true, y_proba, t_min=0.01, t_max=0.99, n=199):
    """
    Returns threshold that maximizes F1 on (y_true, y_proba).

    Sweeps `n` evenly spaced thresholds in [t_min, t_max] and returns
    (best_threshold, metrics_dict_at_best_threshold).
    """
    best_threshold = 0.5
    best_f1 = -1.0
    best_cls = None

    for candidate in np.linspace(float(t_min), float(t_max), int(n)):
        candidate = float(candidate)
        cls = compute_classification_metrics(y_true, y_proba, threshold=candidate)
        if cls["f1"] > best_f1:
            best_threshold = candidate
            best_f1 = float(cls["f1"])
            best_cls = cls

    return best_threshold, best_cls
|
| 249 |
+
|
| 250 |
def compute_pr_curve(y_true, y_proba):
|
| 251 |
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
|
| 252 |
ap = average_precision_score(y_true, y_proba)
|
|
|
|
| 360 |
# ----- METRICS BLOCK (MISSING) -----
|
| 361 |
roc_auc = float(roc_auc_score(y_test, proba))
|
| 362 |
fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
|
| 363 |
+
best_thr, best_val, cls = find_best_threshold(y_test, proba, metric="f1")
|
| 364 |
+
|
| 365 |
|
| 366 |
metrics = {
|
| 367 |
"roc_auc": roc_auc,
|
| 368 |
"n_train": int(len(X_train)),
|
| 369 |
"n_test": int(len(X_test)),
|
| 370 |
+
"best_threshold_by": "f1",
|
| 371 |
+
"best_threshold": float(best_thr),
|
| 372 |
"accuracy@0.5": cls["accuracy"],
|
| 373 |
"balanced_accuracy@0.5": cls["balanced_accuracy"],
|
| 374 |
"precision@0.5": cls["precision"],
|