Synav committed on
Commit
86c037c
·
verified ·
1 Parent(s): 7fafbeb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -11
app.py CHANGED
@@ -948,21 +948,50 @@ def ensure_model_repo_exists(model_repo_id: str, token: str):
948
 
949
  def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
950
  """
951
- Ensure X has correct columns and coercions, matching your training/inference convention.
 
 
 
952
  """
953
- X = X[feature_cols].copy().replace({pd.NA: np.nan})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
 
955
  for c in num_cols:
956
- if c in X.columns:
957
- X[c] = pd.to_numeric(X[c], errors="coerce")
958
 
959
  for c in cat_cols:
960
- if c in X.columns:
961
- X[c] = X[c].astype("object")
962
- X.loc[X[c].isna(), c] = np.nan
963
- X[c] = X[c].map(lambda v: v if pd.isna(v) else str(v))
 
 
964
 
965
- return X
966
 
967
 
968
  def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
@@ -2291,9 +2320,21 @@ with tab_predict:
2291
  infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
2292
  if infer_file:
2293
  st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")
2294
- st.session_state.df_inf = add_ethnicity_region(st.session_state.df_inf, "Ethnicity", "Ethnicity_Region")
 
 
 
 
 
 
 
 
 
 
 
2295
  else:
2296
  st.session_state.df_inf = None
 
2297
 
2298
  df_for_options = st.session_state.df_inf
2299
 
@@ -2798,7 +2839,8 @@ with tab_predict:
2798
  st.stop()
2799
 
2800
 
2801
- X_inf = df_inf[feature_cols].copy()
 
2802
  X_inf = X_inf.replace({pd.NA: np.nan})
2803
 
2804
  for c in num_cols:
 
948
 
949
  def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
950
  """
951
+ Robustly align an inference dataframe to the training schema:
952
+ - normalizes column names (spaces/newlines/underscores/case)
953
+ - renames matching columns to the model's exact feature names
954
+ - creates missing columns as NaN (so prediction can still run)
955
  """
956
+ X = X.copy()
957
+
958
+ # Build normalized lookup from inference file columns -> actual column name in inference
959
+ inf_lookup = {norm_col(c): c for c in X.columns}
960
+
961
+ # Build output with exact training columns
962
+ X_out = pd.DataFrame(index=X.index)
963
+
964
+ missing = []
965
+ for col in feature_cols:
966
+ k = norm_col(col)
967
+ if k in inf_lookup:
968
+ X_out[col] = X[inf_lookup[k]]
969
+ else:
970
+ X_out[col] = np.nan
971
+ missing.append(col)
972
+
973
+ # Optional: show missing columns (don’t hard fail)
974
+ if missing:
975
+ st.warning(
976
+ "Inference file is missing some training columns (filled as blank/NaN): "
977
+ + ", ".join(missing[:12]) + (" ..." if len(missing) > 12 else "")
978
+ )
979
+
980
+ # Coercions (same as your existing logic)
981
+ X_out = X_out.replace({pd.NA: np.nan})
982
 
983
  for c in num_cols:
984
+ if c in X_out.columns:
985
+ X_out[c] = pd.to_numeric(X_out[c], errors="coerce")
986
 
987
  for c in cat_cols:
988
+ if c in X_out.columns:
989
+ X_out[c] = X_out[c].astype("object")
990
+ X_out.loc[X_out[c].isna(), c] = np.nan
991
+ X_out[c] = X_out[c].map(lambda v: v if pd.isna(v) else str(v))
992
+
993
+ return X_out
994
 
 
995
 
996
 
997
  def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
 
2320
  infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
2321
  if infer_file:
2322
  st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")
2323
+
2324
+ # 🔴 PLACE IT HERE — normalize headers immediately after load
2325
+ st.session_state.df_inf.columns = [
2326
+ " ".join(str(c).replace("\u00A0"," ").split())
2327
+ for c in st.session_state.df_inf.columns
2328
+ ]
2329
+
2330
+ st.session_state.df_inf = add_ethnicity_region(
2331
+ st.session_state.df_inf,
2332
+ "Ethnicity",
2333
+ "Ethnicity_Region"
2334
+ )
2335
  else:
2336
  st.session_state.df_inf = None
2337
+
2338
 
2339
  df_for_options = st.session_state.df_inf
2340
 
 
2839
  st.stop()
2840
 
2841
 
2842
+ X_inf = coerce_X_like_schema(df_inf, feature_cols, num_cols, cat_cols)
2843
+
2844
  X_inf = X_inf.replace({pd.NA: np.nan})
2845
 
2846
  for c in num_cols: