Update app.py
Browse files
app.py
CHANGED
|
@@ -948,21 +948,50 @@ def ensure_model_repo_exists(model_repo_id: str, token: str):
|
|
| 948 |
|
| 949 |
def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
|
| 950 |
"""
|
| 951 |
-
|
|
|
|
|
|
|
|
|
|
| 952 |
"""
|
| 953 |
-
X = X
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
|
| 955 |
for c in num_cols:
|
| 956 |
-
if c in
|
| 957 |
-
|
| 958 |
|
| 959 |
for c in cat_cols:
|
| 960 |
-
if c in
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
|
|
|
|
|
|
| 964 |
|
| 965 |
-
return X
|
| 966 |
|
| 967 |
|
| 968 |
def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
|
|
@@ -2291,9 +2320,21 @@ with tab_predict:
|
|
| 2291 |
infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
|
| 2292 |
if infer_file:
|
| 2293 |
st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")
|
| 2294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2295 |
else:
|
| 2296 |
st.session_state.df_inf = None
|
|
|
|
| 2297 |
|
| 2298 |
df_for_options = st.session_state.df_inf
|
| 2299 |
|
|
@@ -2798,7 +2839,8 @@ with tab_predict:
|
|
| 2798 |
st.stop()
|
| 2799 |
|
| 2800 |
|
| 2801 |
-
X_inf = df_inf
|
|
|
|
| 2802 |
X_inf = X_inf.replace({pd.NA: np.nan})
|
| 2803 |
|
| 2804 |
for c in num_cols:
|
|
|
|
| 948 |
|
| 949 |
def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
    """
    Robustly align an inference dataframe to the training schema.

    - matches uploaded columns to the model's exact feature names via
      ``norm_col`` (tolerates spacing/newline/underscore/case differences)
    - creates any missing training columns as NaN so prediction can still
      run (a Streamlit warning lists them; no hard failure)
    - coerces numeric columns with ``pd.to_numeric(errors="coerce")`` and
      categorical columns to plain ``str`` objects, preserving NaN

    Returns a new DataFrame indexed like ``X`` whose columns are exactly
    ``feature_cols`` in training order. ``X`` itself is never mutated, so
    no defensive copy of the (possibly large) input frame is taken.
    """
    # Normalized inference-column name -> actual column name in the upload.
    # NOTE(review): if two uploaded columns normalize to the same key, the
    # later one silently wins — confirm that is acceptable for this app.
    inf_lookup = {norm_col(c): c for c in X.columns}

    # Build the output with the exact training columns, in training order.
    X_out = pd.DataFrame(index=X.index)
    missing: list[str] = []
    for col in feature_cols:
        src = inf_lookup.get(norm_col(col))
        if src is not None:
            X_out[col] = X[src]
        else:
            X_out[col] = np.nan
            missing.append(col)

    # Surface (but don't fail on) columns the upload lacked; cap the list
    # at 12 names so the warning stays readable.
    if missing:
        st.warning(
            "Inference file is missing some training columns (filled as blank/NaN): "
            + ", ".join(missing[:12]) + (" ..." if len(missing) > 12 else "")
        )

    # Normalize pandas' NA sentinel so downstream numpy-based code sees NaN.
    X_out = X_out.replace({pd.NA: np.nan})

    for c in num_cols:
        if c in X_out.columns:
            X_out[c] = pd.to_numeric(X_out[c], errors="coerce")

    for c in cat_cols:
        if c in X_out.columns:
            X_out[c] = X_out[c].astype("object")
            # Re-assert NaN for any missing entries after the dtype change.
            X_out.loc[X_out[c].isna(), c] = np.nan
            # Stringify real values only; keep missing values as NaN.
            X_out[c] = X_out[c].map(lambda v: v if pd.isna(v) else str(v))

    return X_out
|
| 994 |
|
|
|
|
| 995 |
|
| 996 |
|
| 997 |
def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
|
|
|
|
| 2320 |
# Optional inference workbook: used both to populate the dropdown options
# below and for batch prediction later in this tab.
infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
if infer_file:
    st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")

    # Normalize headers immediately after load: replace non-breaking spaces
    # (\u00A0), collapse internal whitespace runs to single spaces, and strip
    # the ends, so later matching against the training schema is robust.
    st.session_state.df_inf.columns = [
        " ".join(str(c).replace("\u00A0"," ").split())
        for c in st.session_state.df_inf.columns
    ]

    # Derive the "Ethnicity_Region" column from "Ethnicity".
    # NOTE(review): `add_ethnicity_region` is a project helper defined
    # elsewhere in this file — presumably maps ethnicities to regions.
    st.session_state.df_inf = add_ethnicity_region(
        st.session_state.df_inf,
        "Ethnicity",
        "Ethnicity_Region"
    )
else:
    # No upload: clear any previously cached inference frame.
    st.session_state.df_inf = None

df_for_options = st.session_state.df_inf
|
| 2340 |
|
|
|
|
| 2839 |
st.stop()
|
| 2840 |
|
| 2841 |
|
| 2842 |
+
X_inf = coerce_X_like_schema(df_inf, feature_cols, num_cols, cat_cols)
|
| 2843 |
+
|
| 2844 |
X_inf = X_inf.replace({pd.NA: np.nan})
|
| 2845 |
|
| 2846 |
for c in num_cols:
|