Akshay4506 committed on
Commit
f1435ee
·
1 Parent(s): da50127

perf: subsample to 1000 rows on HF and fix encoder consistency for correct predictions

Browse files
Files changed (3) hide show
  1. webapp/benchmark.py +15 -3
  2. webapp/ensemble.py +10 -4
  3. webapp/main.py +4 -0
webapp/benchmark.py CHANGED
@@ -224,14 +224,17 @@ def _run_cv(builder, X, y, task):
224
  else:
225
  splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
226
 
 
 
 
227
  fold_results = []
228
  for tr_idx, val_idx in splits:
229
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
230
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
231
 
232
- # Capture encoders from training set and apply to validation set
233
- Xtr_p, encoders = _prep(Xtr)
234
- Xval_p, _ = _prep(Xval, encoders=encoders)
235
 
236
  model = builder(task)
237
  if task == "classification":
@@ -440,6 +443,15 @@ def run_benchmark(df: pd.DataFrame, target_col: str) -> dict:
440
 
441
  y_raw = df[target_col].copy()
442
  X = df.drop(columns=[target_col]).copy()
 
 
 
 
 
 
 
 
 
443
  task = infer_task(y_raw)
444
  y, _ = _encode_target(y_raw, task)
445
 
 
224
  else:
225
  splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
226
 
227
+ # Pre-fit encoders on the full dataset to ensure consistent feature space
228
+ X_full_p, global_encoders = _prep(X)
229
+
230
  fold_results = []
231
  for tr_idx, val_idx in splits:
232
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
233
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
234
 
235
+ # Apply the global encoders to both splits
236
+ Xtr_p, _ = _prep(Xtr, encoders=global_encoders)
237
+ Xval_p, _ = _prep(Xval, encoders=global_encoders)
238
 
239
  model = builder(task)
240
  if task == "classification":
 
443
 
444
  y_raw = df[target_col].copy()
445
  X = df.drop(columns=[target_col]).copy()
446
+
447
+ # Subsample for benchmarking if the dataset is too large (>1000 rows)
448
+ # This prevents the "decades of time" issue on Hugging Face CPU spaces.
449
+ if len(df) > 1000:
450
+ print(f"Subsampling dataset from {len(df)} to 1000 rows for benchmarking speed.")
451
+ df = df.sample(n=1000, random_state=RAND)
452
+ y_raw = df[target_col].copy()
453
+ X = df.drop(columns=[target_col]).copy()
454
+
455
  task = infer_task(y_raw)
456
  y, _ = _encode_target(y_raw, task)
457
 
webapp/ensemble.py CHANGED
@@ -67,11 +67,14 @@ def run_voting_ensemble(top_pairs: list, X: pd.DataFrame, y: pd.Series,
67
  n_classes = int(y.nunique()) if task == "classification" else None
68
  fold_results = []
69
 
 
 
 
70
  for tr_idx, val_idx in splits:
71
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
72
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
73
- Xtr_p, encoders = prep_fn(Xtr)
74
- Xval_p, _ = prep_fn(Xval, encoders=encoders)
75
 
76
  t0 = time.perf_counter()
77
 
@@ -171,11 +174,14 @@ def run_stacking_ensemble(sklearn_pairs: list, X: pd.DataFrame, y: pd.Series,
171
 
172
  fold_results = []
173
 
 
 
 
174
  for tr_idx, val_idx in splits:
175
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
176
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
177
- Xtr_p, encoders = prep_fn(Xtr)
178
- Xval_p, _ = prep_fn(Xval, encoders=encoders)
179
 
180
  estimators = [(name, builder(task)) for name, builder in sklearn_pairs]
181
 
 
67
  n_classes = int(y.nunique()) if task == "classification" else None
68
  fold_results = []
69
 
70
+ # Pre-fit encoders on full X
71
+ X_full_p, global_encoders = prep_fn(X)
72
+
73
  for tr_idx, val_idx in splits:
74
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
75
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
76
+ Xtr_p, _ = prep_fn(Xtr, encoders=global_encoders)
77
+ Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
78
 
79
  t0 = time.perf_counter()
80
 
 
174
 
175
  fold_results = []
176
 
177
+ # Pre-fit encoders on full X
178
+ X_full_p, global_encoders = prep_fn(X)
179
+
180
  for tr_idx, val_idx in splits:
181
  Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
182
  ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
183
+ Xtr_p, _ = prep_fn(Xtr, encoders=global_encoders)
184
+ Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
185
 
186
  estimators = [(name, builder(task)) for name, builder in sklearn_pairs]
187
 
webapp/main.py CHANGED
@@ -196,6 +196,10 @@ async def benchmark(
196
 
197
  # Cache the Best Overall model for the Live Playground
198
  best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
 
 
 
 
199
  X = df.drop(columns=[target_col])
200
  y_raw = df[target_col]
201
  task = result["dataset_info"]["task"]
 
196
 
197
  # Cache the Best Overall model for the Live Playground
198
  best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
199
+ # Subsample for training the champion model if too large (Demo speed)
200
+ if len(df) > 1000:
201
+ df = df.sample(n=1000, random_state=42)
202
+
203
  X = df.drop(columns=[target_col])
204
  y_raw = df[target_col]
205
  task = result["dataset_info"]["task"]