Synav committed on
Commit
35419fa
·
verified ·
1 Parent(s): 21da4e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -11
app.py CHANGED
@@ -149,21 +149,21 @@ def build_pipeline(
149
  # if SVD is ON, selection happens on components)
150
  if use_feature_selection:
151
  selector_est = LogisticRegression(
152
- penalty="l1",
153
- solver="saga",
154
- C=float(l1_C),
155
- max_iter=5000,
156
- n_jobs=-1
157
  )
158
 
 
159
  # If you want to cap features: set max_features and use threshold that keeps top coefficients.
160
  # SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
161
- # Keep threshold='mean' as default; adjust if you want more aggressive pruning.
162
- selector = SelectFromModel(selector_est, threshold="mean")
163
  steps.append(("select", selector))
164
 
165
  # Final classifier (keep stable, probability-calibratable)
166
- clf = LogisticRegression(max_iter=5000, solver="lbfgs")
 
167
  steps.append(("clf", clf))
168
 
169
  return Pipeline(steps)
@@ -222,7 +222,31 @@ def compute_classification_metrics(y_true, y_proba, threshold: float = 0.5):
222
  "accuracy": float(acc),
223
  "balanced_accuracy": float(bacc),
224
  }
225
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  def compute_pr_curve(y_true, y_proba):
227
  precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
228
  ap = average_precision_score(y_true, y_proba)
@@ -336,13 +360,15 @@ def train_and_save(
336
  # ----- METRICS BLOCK (MISSING) -----
337
  roc_auc = float(roc_auc_score(y_test, proba))
338
  fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
339
- cls = compute_classification_metrics(y_test, proba, threshold=0.5)
 
340
 
341
  metrics = {
342
  "roc_auc": roc_auc,
343
  "n_train": int(len(X_train)),
344
  "n_test": int(len(X_test)),
345
- "threshold@0.5": cls["threshold"],
 
346
  "accuracy@0.5": cls["accuracy"],
347
  "balanced_accuracy@0.5": cls["balanced_accuracy"],
348
  "precision@0.5": cls["precision"],
 
149
  # if SVD is ON, selection happens on components)
150
  if use_feature_selection:
151
  selector_est = LogisticRegression(
152
+ penalty="l1", solver="saga", C=float(l1_C),
153
+ max_iter=5000, n_jobs=-1,
154
+ class_weight="balanced"
 
 
155
  )
156
 
157
+
158
  # If you want to cap features: set max_features and use threshold that keeps top coefficients.
159
  # SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
160
+ # Keep threshold='median' as default; adjust if you want more aggressive pruning.
161
+ selector = SelectFromModel(selector_est, threshold="median")
162
  steps.append(("select", selector))
163
 
164
  # Final classifier (keep stable, probability-calibratable)
165
+ clf = LogisticRegression(max_iter=5000, solver="lbfgs", class_weight="balanced")
166
+
167
  steps.append(("clf", clf))
168
 
169
  return Pipeline(steps)
 
222
  "accuracy": float(acc),
223
  "balanced_accuracy": float(bacc),
224
  }
225
+
226
+ def find_best_threshold(y_true, y_proba, metric="f1"):
227
+ thresholds = np.linspace(0.05, 0.95, 181) # step ~0.005
228
+ best_t, best_val, best_cls = 0.5, -1, None
229
+ for t in thresholds:
230
+ cls = compute_classification_metrics(y_true, y_proba, threshold=float(t))
231
+ val = cls.get(metric, 0.0)
232
+ if val > best_val:
233
+ best_val, best_t, best_cls = val, float(t), cls
234
+ return best_t, best_val, best_cls
235
+
236
+ def find_best_threshold_f1(y_true, y_proba, t_min=0.01, t_max=0.99, n=199):
237
+ """
238
+ Returns threshold that maximizes F1 on (y_true, y_proba).
239
+ """
240
+ thresholds = np.linspace(float(t_min), float(t_max), int(n))
241
+ best = {"threshold": 0.5, "f1": -1.0, "cls": None}
242
+
243
+ for t in thresholds:
244
+ cls = compute_classification_metrics(y_true, y_proba, threshold=float(t))
245
+ if cls["f1"] > best["f1"]:
246
+ best = {"threshold": float(t), "f1": float(cls["f1"]), "cls": cls}
247
+
248
+ return best["threshold"], best["cls"]
249
+
250
  def compute_pr_curve(y_true, y_proba):
251
  precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
252
  ap = average_precision_score(y_true, y_proba)
 
360
  # ----- METRICS BLOCK (MISSING) -----
361
  roc_auc = float(roc_auc_score(y_test, proba))
362
  fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
363
+ best_thr, best_val, cls = find_best_threshold(y_test, proba, metric="f1")
364
+
365
 
366
  metrics = {
367
  "roc_auc": roc_auc,
368
  "n_train": int(len(X_train)),
369
  "n_test": int(len(X_test)),
370
+ "best_threshold_by": "f1",
371
+ "best_threshold": float(best_thr),
372
  "accuracy@0.5": cls["accuracy"],
373
  "balanced_accuracy@0.5": cls["balanced_accuracy"],
374
  "precision@0.5": cls["precision"],