Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -322,7 +322,7 @@ def train_classifier(
|
|
| 322 |
max_depth: float,
|
| 323 |
random_state: float,
|
| 324 |
) -> Tuple[Optional[plt.Figure], str]:
|
| 325 |
-
"""Train a four-class classifier
|
| 326 |
if not feature_columns:
|
| 327 |
return None, "### ❌ Please select at least one feature."
|
| 328 |
|
|
@@ -332,31 +332,26 @@ def train_classifier(
|
|
| 332 |
train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
|
| 333 |
|
| 334 |
if len(train_df) < 20:
|
| 335 |
-
return None, "### ❌ Not enough
|
| 336 |
|
| 337 |
X = train_df[feature_columns]
|
| 338 |
y = train_df["noise_label"]
|
| 339 |
|
| 340 |
seed = int(random_state)
|
| 341 |
depth = int(max_depth) if max_depth and int(max_depth) > 0 else None
|
| 342 |
-
|
| 343 |
|
|
|
|
| 344 |
try:
|
| 345 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 346 |
-
X,
|
| 347 |
-
y,
|
| 348 |
-
test_size=test_size,
|
| 349 |
-
random_state=seed,
|
| 350 |
-
stratify=y,
|
| 351 |
)
|
| 352 |
except ValueError:
|
| 353 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 354 |
-
X,
|
| 355 |
-
y,
|
| 356 |
-
test_size=test_size,
|
| 357 |
-
random_state=seed,
|
| 358 |
)
|
| 359 |
|
|
|
|
| 360 |
model = Pipeline(
|
| 361 |
steps=[
|
| 362 |
("imputer", SimpleImputer(strategy="median")),
|
|
@@ -364,10 +359,13 @@ def train_classifier(
|
|
| 364 |
(
|
| 365 |
"classifier",
|
| 366 |
HistGradientBoostingClassifier(
|
| 367 |
-
max_iter=
|
| 368 |
max_depth=depth,
|
| 369 |
random_state=seed,
|
| 370 |
min_samples_leaf=1,
|
|
|
|
|
|
|
|
|
|
| 371 |
),
|
| 372 |
),
|
| 373 |
]
|
|
|
|
| 322 |
max_depth: float,
|
| 323 |
random_state: float,
|
| 324 |
) -> Tuple[Optional[plt.Figure], str]:
|
| 325 |
+
"""Train a four-class classifier with better handling of class imbalance."""
|
| 326 |
if not feature_columns:
|
| 327 |
return None, "### ❌ Please select at least one feature."
|
| 328 |
|
|
|
|
| 332 |
train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
|
| 333 |
|
| 334 |
if len(train_df) < 20:
|
| 335 |
+
return None, "### ❌ Not enough rows after filtering missing values."
|
| 336 |
|
| 337 |
X = train_df[feature_columns]
|
| 338 |
y = train_df["noise_label"]
|
| 339 |
|
| 340 |
seed = int(random_state)
|
| 341 |
depth = int(max_depth) if max_depth and int(max_depth) > 0 else None
|
| 342 |
+
max_iter = int(n_estimators)
|
| 343 |
|
| 344 |
+
# --- Stratified split ---
|
| 345 |
try:
|
| 346 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 347 |
+
X, y, test_size=test_size, random_state=seed, stratify=y
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
)
|
| 349 |
except ValueError:
|
| 350 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 351 |
+
X, y, test_size=test_size, random_state=seed
|
|
|
|
|
|
|
|
|
|
| 352 |
)
|
| 353 |
|
| 354 |
+
# --- Pipeline with class_weight='balanced' ---
|
| 355 |
model = Pipeline(
|
| 356 |
steps=[
|
| 357 |
("imputer", SimpleImputer(strategy="median")),
|
|
|
|
| 359 |
(
|
| 360 |
"classifier",
|
| 361 |
HistGradientBoostingClassifier(
|
| 362 |
+
max_iter=max_iter,
|
| 363 |
max_depth=depth,
|
| 364 |
random_state=seed,
|
| 365 |
min_samples_leaf=1,
|
| 366 |
+
class_weight="balanced", # ← the main improvement
|
| 367 |
+
learning_rate=0.1, # worth experimenting with (0.05-0.2)
|
| 368 |
+
max_bins=255, # standard, sensible value (scikit-learn's default)
|
| 369 |
),
|
| 370 |
),
|
| 371 |
]
|