Spaces:

Akshay4506
/

ModelMatrix

Sleeping

Akshay4506 committed 22 days ago

Commit

f1435ee

1 Parent(s): da50127

perf: subsample to 1000 rows on HF and fix encoder consistency for correct predictions

Files changed (3) hide show

webapp/benchmark.py CHANGED Viewed

@@ -224,14 +224,17 @@ def _run_cv(builder, X, y, task):
     else:
         splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
     fold_results = []
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
-        # Capture encoders from training set and apply to validation set
-        Xtr_p, encoders = _prep(Xtr)
-        Xval_p, _       = _prep(Xval, encoders=encoders)
         model = builder(task)
         if task == "classification":
@@ -440,6 +443,15 @@ def run_benchmark(df: pd.DataFrame, target_col: str) -> dict:
     y_raw = df[target_col].copy()
     X     = df.drop(columns=[target_col]).copy()
     task = infer_task(y_raw)
     y, _  = _encode_target(y_raw, task)

     else:
         splits = list(KFold(N_FOLDS, shuffle=True, random_state=RAND).split(X))
+    # Pre-fit encoders on the full dataset to ensure consistent feature space
+    X_full_p, global_encoders = _prep(X)
     fold_results = []
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
+        # Apply the global encoders to both splits
+        Xtr_p, _  = _prep(Xtr, encoders=global_encoders)
+        Xval_p, _ = _prep(Xval, encoders=global_encoders)
         model = builder(task)
         if task == "classification":
     y_raw = df[target_col].copy()
     X     = df.drop(columns=[target_col]).copy()
+    # Subsample for benchmarking if the dataset is too large (>1000 rows)
+    # This prevents the "decades of time" issue on Hugging Face CPU spaces.
+    if len(df) > 1000:
+        print(f"Subsampling dataset from {len(df)} to 1000 rows for benchmarking speed.")
+        df = df.sample(n=1000, random_state=RAND)
+        y_raw = df[target_col].copy()
+        X     = df.drop(columns=[target_col]).copy()
     task = infer_task(y_raw)
     y, _  = _encode_target(y_raw, task)

webapp/ensemble.py CHANGED Viewed

@@ -67,11 +67,14 @@ def run_voting_ensemble(top_pairs: list, X: pd.DataFrame, y: pd.Series,
     n_classes = int(y.nunique()) if task == "classification" else None
     fold_results = []
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
-        Xtr_p, encoders = prep_fn(Xtr)
-        Xval_p, _       = prep_fn(Xval, encoders=encoders)
         t0 = time.perf_counter()
@@ -171,11 +174,14 @@ def run_stacking_ensemble(sklearn_pairs: list, X: pd.DataFrame, y: pd.Series,
     fold_results = []
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
-        Xtr_p, encoders = prep_fn(Xtr)
-        Xval_p, _       = prep_fn(Xval, encoders=encoders)
         estimators = [(name, builder(task)) for name, builder in sklearn_pairs]

     n_classes = int(y.nunique()) if task == "classification" else None
     fold_results = []
+    # Pre-fit encoders on full X
+    X_full_p, global_encoders = prep_fn(X)
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
+        Xtr_p, _  = prep_fn(Xtr, encoders=global_encoders)
+        Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
         t0 = time.perf_counter()
     fold_results = []
+    # Pre-fit encoders on full X
+    X_full_p, global_encoders = prep_fn(X)
     for tr_idx, val_idx in splits:
         Xtr, Xval = X.iloc[tr_idx], X.iloc[val_idx]
         ytr, yval = y.iloc[tr_idx], y.iloc[val_idx]
+        Xtr_p, _  = prep_fn(Xtr, encoders=global_encoders)
+        Xval_p, _ = prep_fn(Xval, encoders=global_encoders)
         estimators = [(name, builder(task)) for name, builder in sklearn_pairs]

webapp/main.py CHANGED Viewed

@@ -196,6 +196,10 @@ async def benchmark(
         # Cache the Best Overall model for the Live Playground
         best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
         X = df.drop(columns=[target_col])
         y_raw = df[target_col]
         task = result["dataset_info"]["task"]

         # Cache the Best Overall model for the Live Playground
         best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
+        # Subsample for training the champion model if too large (Demo speed)
+        if len(df) > 1000:
+            df = df.sample(n=1000, random_state=42)
         X = df.drop(columns=[target_col])
         y_raw = df[target_col]
         task = result["dataset_info"]["task"]