Update app.py
Browse files
app.py
CHANGED
|
@@ -948,21 +948,50 @@ def ensure_model_repo_exists(model_repo_id: str, token: str):
|
|
| 948 |
|
| 949 |
def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
|
| 950 |
"""
|
| 951 |
-
|
|
|
|
|
|
|
|
|
|
| 952 |
"""
|
| 953 |
-
X = X
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
|
| 955 |
for c in num_cols:
|
| 956 |
-
if c in
|
| 957 |
-
|
| 958 |
|
| 959 |
for c in cat_cols:
|
| 960 |
-
if c in
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
|
|
|
|
|
|
| 964 |
|
| 965 |
-
return X
|
| 966 |
|
| 967 |
|
| 968 |
def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
|
|
@@ -2291,9 +2320,21 @@ with tab_predict:
|
|
| 2291 |
infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
|
| 2292 |
if infer_file:
|
| 2293 |
st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")
|
| 2294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2295 |
else:
|
| 2296 |
st.session_state.df_inf = None
|
|
|
|
| 2297 |
|
| 2298 |
df_for_options = st.session_state.df_inf
|
| 2299 |
|
|
@@ -2798,7 +2839,8 @@ with tab_predict:
|
|
| 2798 |
st.stop()
|
| 2799 |
|
| 2800 |
|
| 2801 |
-
X_inf = df_inf
|
|
|
|
| 2802 |
X_inf = X_inf.replace({pd.NA: np.nan})
|
| 2803 |
|
| 2804 |
for c in num_cols:
|
|
|
|
| 948 |
|
| 949 |
def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
    """
    Robustly align an inference dataframe to the training schema.

    - matches uploaded columns to the model's exact feature names via
      ``norm_col`` (tolerates spacing/newline/underscore/case differences)
    - creates any missing training columns as NaN so prediction can still
      run (a Streamlit warning lists them; no hard failure)
    - coerces numeric columns with ``pd.to_numeric(errors="coerce")`` and
      categorical columns to plain ``str`` objects, preserving NaN

    Returns a new DataFrame indexed like ``X`` whose columns are exactly
    ``feature_cols`` in training order. ``X`` itself is never mutated, so
    no defensive copy of the (possibly large) input frame is taken.
    """
    # Normalized inference-column name -> actual column name in the upload.
    # NOTE(review): if two uploaded columns normalize to the same key, the
    # later one silently wins — confirm that is acceptable for this app.
    inf_lookup = {norm_col(c): c for c in X.columns}

    # Build the output with the exact training columns, in training order.
    X_out = pd.DataFrame(index=X.index)
    missing: list[str] = []
    for col in feature_cols:
        src = inf_lookup.get(norm_col(col))
        if src is not None:
            X_out[col] = X[src]
        else:
            X_out[col] = np.nan
            missing.append(col)

    # Surface (but don't fail on) columns the upload lacked; cap the list
    # at 12 names so the warning stays readable.
    if missing:
        st.warning(
            "Inference file is missing some training columns (filled as blank/NaN): "
            + ", ".join(missing[:12]) + (" ..." if len(missing) > 12 else "")
        )

    # Normalize pandas' NA sentinel so downstream numpy-based code sees NaN.
    X_out = X_out.replace({pd.NA: np.nan})

    for c in num_cols:
        if c in X_out.columns:
            X_out[c] = pd.to_numeric(X_out[c], errors="coerce")

    for c in cat_cols:
        if c in X_out.columns:
            X_out[c] = X_out[c].astype("object")
            # Re-assert NaN for any missing entries after the dtype change.
            X_out.loc[X_out[c].isna(), c] = np.nan
            # Stringify real values only; keep missing values as NaN.
            X_out[c] = X_out[c].map(lambda v: v if pd.isna(v) else str(v))

    return X_out
|
| 994 |
|
|
|
|
| 995 |
|
| 996 |
|
| 997 |
def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
|
|
|
|
| 2320 |
# Optional inference workbook: used both to populate the dropdown options
# below and for batch prediction later in this tab.
infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
if infer_file:
    st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")

    # Normalize headers immediately after load: replace non-breaking spaces
    # (\u00A0), collapse internal whitespace runs to single spaces, and strip
    # the ends, so later matching against the training schema is robust.
    st.session_state.df_inf.columns = [
        " ".join(str(c).replace("\u00A0"," ").split())
        for c in st.session_state.df_inf.columns
    ]

    # Derive the "Ethnicity_Region" column from "Ethnicity".
    # NOTE(review): `add_ethnicity_region` is a project helper defined
    # elsewhere in this file — presumably maps ethnicities to regions.
    st.session_state.df_inf = add_ethnicity_region(
        st.session_state.df_inf,
        "Ethnicity",
        "Ethnicity_Region"
    )
else:
    # No upload: clear any previously cached inference frame.
    st.session_state.df_inf = None

df_for_options = st.session_state.df_inf
|
| 2340 |
|
|
|
|
| 2839 |
st.stop()
|
| 2840 |
|
| 2841 |
|
| 2842 |
+
X_inf = coerce_X_like_schema(df_inf, feature_cols, num_cols, cat_cols)
|
| 2843 |
+
|
| 2844 |
X_inf = X_inf.replace({pd.NA: np.nan})
|
| 2845 |
|
| 2846 |
for c in num_cols:
|