Spaces:
Sleeping
Sleeping
Commit ·
f1435ee
1
Parent(s): da50127
perf: subsample to 1000 rows on HF and fix encoder consistency for correct predictions
Browse files
- webapp/benchmark.py +15 -3
- webapp/ensemble.py +10 -4
- webapp/main.py +4 -0
webapp/benchmark.py
CHANGED
|
@@ -224,14 +224,17 @@ def _run_cv(builder, X, y, task):
|
|
| 224 |
else:
|
| 225 |
splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
|
| 226 |
|
|
|
|
|
|
|
|
|
|
| 227 |
fold_results = []
|
| 228 |
for tr_idx, val_idx in splits:
|
| 229 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 230 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 231 |
|
| 232 |
-
#
|
| 233 |
-
Xtr_p,
|
| 234 |
-
Xval_p, _
|
| 235 |
|
| 236 |
model = builder(task)
|
| 237 |
if task == "classification":
|
|
@@ -440,6 +443,15 @@ def run_benchmark(df: pd.DataFrame, target_col: str) -> dict:
|
|
| 440 |
|
| 441 |
y_raw = df[target_col].copy()
|
| 442 |
X = df.drop(columns=[target_col]).copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
task = infer_task(y_raw)
|
| 444 |
y, _ = _encode_target(y_raw, task)
|
| 445 |
|
|
|
|
| 224 |
else:
|
| 225 |
splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
|
| 226 |
|
| 227 |
+
# Pre-fit encoders on the full dataset to ensure consistent feature space
|
| 228 |
+
X_full_p, global_encoders = _prep(X)
|
| 229 |
+
|
| 230 |
fold_results = []
|
| 231 |
for tr_idx, val_idx in splits:
|
| 232 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 233 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 234 |
|
| 235 |
+
# Apply the global encoders to both splits
|
| 236 |
+
Xtr_p, _ = _prep(Xtr, encoders=global_encoders)
|
| 237 |
+
Xval_p, _ = _prep(Xval, encoders=global_encoders)
|
| 238 |
|
| 239 |
model = builder(task)
|
| 240 |
if task == "classification":
|
|
|
|
| 443 |
|
| 444 |
y_raw = df[target_col].copy()
|
| 445 |
X = df.drop(columns=[target_col]).copy()
|
| 446 |
+
|
| 447 |
+
# Subsample for benchmarking if the dataset is too large (>1000 rows)
|
| 448 |
+
# This prevents the "decades of time" issue on Hugging Face CPU spaces.
|
| 449 |
+
if len(df) > 1000:
|
| 450 |
+
print(f"Subsampling dataset from {len(df)} to 1000 rows for benchmarking speed.")
|
| 451 |
+
df = df.sample(n=1000, random_state=RAND)
|
| 452 |
+
y_raw = df[target_col].copy()
|
| 453 |
+
X = df.drop(columns=[target_col]).copy()
|
| 454 |
+
|
| 455 |
task = infer_task(y_raw)
|
| 456 |
y, _ = _encode_target(y_raw, task)
|
| 457 |
|
webapp/ensemble.py
CHANGED
|
@@ -67,11 +67,14 @@ def run_voting_ensemble(top_pairs: list, X: pd.DataFrame, y: pd.Series,
|
|
| 67 |
n_classes = int(y.nunique()) if task == "classification" else None
|
| 68 |
fold_results = []
|
| 69 |
|
|
|
|
|
|
|
|
|
|
| 70 |
for tr_idx, val_idx in splits:
|
| 71 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 72 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 73 |
-
Xtr_p,
|
| 74 |
-
Xval_p, _
|
| 75 |
|
| 76 |
t0 = time.perf_counter()
|
| 77 |
|
|
@@ -171,11 +174,14 @@ def run_stacking_ensemble(sklearn_pairs: list, X: pd.DataFrame, y: pd.Series,
|
|
| 171 |
|
| 172 |
fold_results = []
|
| 173 |
|
|
|
|
|
|
|
|
|
|
| 174 |
for tr_idx, val_idx in splits:
|
| 175 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 176 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 177 |
-
Xtr_p,
|
| 178 |
-
Xval_p, _
|
| 179 |
|
| 180 |
estimators = [(name, builder(task)) for name, builder in sklearn_pairs]
|
| 181 |
|
|
|
|
| 67 |
n_classes = int(y.nunique()) if task == "classification" else None
|
| 68 |
fold_results = []
|
| 69 |
|
| 70 |
+
# Pre-fit encoders on full X
|
| 71 |
+
X_full_p, global_encoders = prep_fn(X)
|
| 72 |
+
|
| 73 |
for tr_idx, val_idx in splits:
|
| 74 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 75 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 76 |
+
Xtr_p, _ = prep_fn(Xtr, encoders=global_encoders)
|
| 77 |
+
Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
|
| 78 |
|
| 79 |
t0 = time.perf_counter()
|
| 80 |
|
|
|
|
| 174 |
|
| 175 |
fold_results = []
|
| 176 |
|
| 177 |
+
# Pre-fit encoders on full X
|
| 178 |
+
X_full_p, global_encoders = prep_fn(X)
|
| 179 |
+
|
| 180 |
for tr_idx, val_idx in splits:
|
| 181 |
Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
|
| 182 |
ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
|
| 183 |
+
Xtr_p, _ = prep_fn(Xtr, encoders=global_encoders)
|
| 184 |
+
Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
|
| 185 |
|
| 186 |
estimators = [(name, builder(task)) for name, builder in sklearn_pairs]
|
| 187 |
|
webapp/main.py
CHANGED
|
@@ -196,6 +196,10 @@ async def benchmark(
|
|
| 196 |
|
| 197 |
# Cache the Best Overall model for the Live Playground
|
| 198 |
best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
X = df.drop(columns=[target_col])
|
| 200 |
y_raw = df[target_col]
|
| 201 |
task = result["dataset_info"]["task"]
|
|
|
|
| 196 |
|
| 197 |
# Cache the Best Overall model for the Live Playground
|
| 198 |
best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
|
| 199 |
+
# Subsample for training the champion model if too large (Demo speed)
|
| 200 |
+
if len(df) > 1000:
|
| 201 |
+
df = df.sample(n=1000, random_state=42)
|
| 202 |
+
|
| 203 |
X = df.drop(columns=[target_col])
|
| 204 |
y_raw = df[target_col]
|
| 205 |
task = result["dataset_info"]["task"]
|