QSBench committed on
Commit
94dd65f
·
verified ·
1 Parent(s): a4d74cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -13
app.py CHANGED
@@ -322,7 +322,7 @@ def train_classifier(
322
  max_depth: float,
323
  random_state: float,
324
  ) -> Tuple[Optional[plt.Figure], str]:
325
- """Train a four-class classifier and return metrics plus a plot."""
326
  if not feature_columns:
327
  return None, "### ❌ Please select at least one feature."
328
 
@@ -332,31 +332,26 @@ def train_classifier(
332
  train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
333
 
334
  if len(train_df) < 20:
335
- return None, "### ❌ Not enough clean rows after filtering missing values."
336
 
337
  X = train_df[feature_columns]
338
  y = train_df["noise_label"]
339
 
340
  seed = int(random_state)
341
  depth = int(max_depth) if max_depth and int(max_depth) > 0 else None
342
- trees = int(n_estimators)
343
 
 
344
  try:
345
  X_train, X_test, y_train, y_test = train_test_split(
346
- X,
347
- y,
348
- test_size=test_size,
349
- random_state=seed,
350
- stratify=y,
351
  )
352
  except ValueError:
353
  X_train, X_test, y_train, y_test = train_test_split(
354
- X,
355
- y,
356
- test_size=test_size,
357
- random_state=seed,
358
  )
359
 
 
360
  model = Pipeline(
361
  steps=[
362
  ("imputer", SimpleImputer(strategy="median")),
@@ -364,10 +359,13 @@ def train_classifier(
364
  (
365
  "classifier",
366
  HistGradientBoostingClassifier(
367
- max_iter=trees,
368
  max_depth=depth,
369
  random_state=seed,
370
  min_samples_leaf=1,
 
 
 
371
  ),
372
  ),
373
  ]
 
322
  max_depth: float,
323
  random_state: float,
324
  ) -> Tuple[Optional[plt.Figure], str]:
325
+ """Train a four-class classifier with better handling of class imbalance."""
326
  if not feature_columns:
327
  return None, "### ❌ Please select at least one feature."
328
 
 
332
  train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
333
 
334
  if len(train_df) < 20:
335
+ return None, "### ❌ Not enough rows after filtering missing values."
336
 
337
  X = train_df[feature_columns]
338
  y = train_df["noise_label"]
339
 
340
  seed = int(random_state)
341
  depth = int(max_depth) if max_depth and int(max_depth) > 0 else None
342
+ max_iter = int(n_estimators)
343
 
344
+ # --- Stratified split ---
345
  try:
346
  X_train, X_test, y_train, y_test = train_test_split(
347
+ X, y, test_size=test_size, random_state=seed, stratify=y
 
 
 
 
348
  )
349
  except ValueError:
350
  X_train, X_test, y_train, y_test = train_test_split(
351
+ X, y, test_size=test_size, random_state=seed
 
 
 
352
  )
353
 
354
+ # --- Pipeline with class_weight='balanced' ---
355
  model = Pipeline(
356
  steps=[
357
  ("imputer", SimpleImputer(strategy="median")),
 
359
  (
360
  "classifier",
361
  HistGradientBoostingClassifier(
362
+ max_iter=max_iter,
363
  max_depth=depth,
364
  random_state=seed,
365
  min_samples_leaf=1,
366
+ class_weight="balanced", # ← the main improvement
367
+ learning_rate=0.1, # can be tuned (try 0.05-0.2)
368
+ max_bins=255, # a good standard value
369
  ),
370
  ),
371
  ]