Update app.py
Browse files
app.py
CHANGED
|
@@ -149,21 +149,21 @@ def build_pipeline(
|
|
| 149 |
# if SVD is ON, selection happens on components)
|
| 150 |
if use_feature_selection:
|
| 151 |
selector_est = LogisticRegression(
|
| 152 |
-
penalty="l1",
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
max_iter=5000,
|
| 156 |
-
n_jobs=-1
|
| 157 |
)
|
| 158 |
|
|
|
|
| 159 |
# If you want to cap features: set max_features and use threshold that keeps top coefficients.
|
| 160 |
# SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
|
| 161 |
-
# Keep threshold='
|
| 162 |
-
selector = SelectFromModel(selector_est, threshold="
|
| 163 |
steps.append(("select", selector))
|
| 164 |
|
| 165 |
# Final classifier (keep stable, probability-calibratable)
|
| 166 |
-
clf = LogisticRegression(max_iter=5000, solver="lbfgs")
|
|
|
|
| 167 |
steps.append(("clf", clf))
|
| 168 |
|
| 169 |
return Pipeline(steps)
|
|
@@ -222,7 +222,31 @@ def compute_classification_metrics(y_true, y_proba, threshold: float = 0.5):
|
|
| 222 |
"accuracy": float(acc),
|
| 223 |
"balanced_accuracy": float(bacc),
|
| 224 |
}
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
def compute_pr_curve(y_true, y_proba):
|
| 227 |
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
|
| 228 |
ap = average_precision_score(y_true, y_proba)
|
|
@@ -336,13 +360,15 @@ def train_and_save(
|
|
| 336 |
# ----- METRICS BLOCK (MISSING) -----
|
| 337 |
roc_auc = float(roc_auc_score(y_test, proba))
|
| 338 |
fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
|
| 339 |
-
cls =
|
|
|
|
| 340 |
|
| 341 |
metrics = {
|
| 342 |
"roc_auc": roc_auc,
|
| 343 |
"n_train": int(len(X_train)),
|
| 344 |
"n_test": int(len(X_test)),
|
| 345 |
-
"
|
|
|
|
| 346 |
"accuracy@0.5": cls["accuracy"],
|
| 347 |
"balanced_accuracy@0.5": cls["balanced_accuracy"],
|
| 348 |
"precision@0.5": cls["precision"],
|
|
|
|
| 149 |
# if SVD is ON, selection happens on components)
|
| 150 |
if use_feature_selection:
|
| 151 |
selector_est = LogisticRegression(
|
| 152 |
+
penalty="l1", solver="saga", C=float(l1_C),
|
| 153 |
+
max_iter=5000, n_jobs=-1,
|
| 154 |
+
class_weight="balanced"
|
|
|
|
|
|
|
| 155 |
)
|
| 156 |
|
| 157 |
+
|
| 158 |
# If you want to cap features: set max_features and use threshold that keeps top coefficients.
|
| 159 |
# SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
|
| 160 |
+
# Keep threshold='median' as default; adjust if you want more aggressive pruning.
|
| 161 |
+
selector = SelectFromModel(selector_est, threshold="median")
|
| 162 |
steps.append(("select", selector))
|
| 163 |
|
| 164 |
# Final classifier (keep stable, probability-calibratable)
|
| 165 |
+
clf = LogisticRegression(max_iter=5000, solver="lbfgs", class_weight="balanced")
|
| 166 |
+
|
| 167 |
steps.append(("clf", clf))
|
| 168 |
|
| 169 |
return Pipeline(steps)
|
|
|
|
| 222 |
"accuracy": float(acc),
|
| 223 |
"balanced_accuracy": float(bacc),
|
| 224 |
}
|
| 225 |
+
|
| 226 |
+
def find_best_threshold(y_true, y_proba, metric="f1"):
    """Scan a fixed grid of decision thresholds and pick the best one.

    Evaluates `compute_classification_metrics` at 181 thresholds between
    0.05 and 0.95 (step ~0.005) and keeps the threshold whose value for
    *metric* is highest.

    Returns:
        (best_threshold, best_metric_value, metrics_dict_at_best_threshold)

    NOTE: an unknown *metric* key silently scores as 0.0 (see `.get` below),
    so the first grid point would win — verify the metric name at call sites.
    """
    best_t = 0.5
    best_val = -1
    best_cls = None
    for t in np.linspace(0.05, 0.95, 181):
        t = float(t)
        candidate = compute_classification_metrics(y_true, y_proba, threshold=t)
        score = candidate.get(metric, 0.0)
        if score > best_val:
            best_t = t
            best_val = score
            best_cls = candidate
    return best_t, best_val, best_cls
|
| 235 |
+
|
| 236 |
+
def find_best_threshold_f1(y_true, y_proba, t_min=0.01, t_max=0.99, n=199):
    """
    Returns threshold that maximizes F1 on (y_true, y_proba).

    Sweeps `n` evenly spaced thresholds in [t_min, t_max] and returns
    (best_threshold, metrics_dict_at_best_threshold).
    """
    best_threshold = 0.5
    best_f1 = -1.0
    best_cls = None

    for candidate in np.linspace(float(t_min), float(t_max), int(n)):
        candidate = float(candidate)
        cls = compute_classification_metrics(y_true, y_proba, threshold=candidate)
        if cls["f1"] > best_f1:
            best_threshold = candidate
            best_f1 = float(cls["f1"])
            best_cls = cls

    return best_threshold, best_cls
|
| 249 |
+
|
| 250 |
def compute_pr_curve(y_true, y_proba):
|
| 251 |
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
|
| 252 |
ap = average_precision_score(y_true, y_proba)
|
|
|
|
| 360 |
# ----- METRICS BLOCK (MISSING) -----
|
| 361 |
roc_auc = float(roc_auc_score(y_test, proba))
|
| 362 |
fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
|
| 363 |
+
best_thr, best_val, cls = find_best_threshold(y_test, proba, metric="f1")
|
| 364 |
+
|
| 365 |
|
| 366 |
metrics = {
|
| 367 |
"roc_auc": roc_auc,
|
| 368 |
"n_train": int(len(X_train)),
|
| 369 |
"n_test": int(len(X_test)),
|
| 370 |
+
"best_threshold_by": "f1",
|
| 371 |
+
"best_threshold": float(best_thr),
|
| 372 |
"accuracy@0.5": cls["accuracy"],
|
| 373 |
"balanced_accuracy@0.5": cls["balanced_accuracy"],
|
| 374 |
"precision@0.5": cls["precision"],
|