Synav committed on
Commit
95e8c3b
·
verified ·
1 Parent(s): 3dda313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -23
app.py CHANGED
@@ -677,58 +677,56 @@ def train_and_save(
677
 
678
 
679
  # ---- Train survival model (CoxPH) ----
 
 
680
  survival_trained = False
681
  surv_notes = None
682
 
683
  if time_days is not None and event01 is not None:
684
  try:
685
- # Build survival training frame from ORIGINAL df
686
  df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})
687
 
688
- # Coerce numeric/categorical exactly like your main pipeline convention
689
  for c in num_cols:
690
  if c in df_surv.columns:
691
  df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")
692
-
693
  for c in cat_cols:
694
  if c in df_surv.columns:
695
  df_surv[c] = df_surv[c].astype("object")
696
  df_surv.loc[df_surv[c].isna(), c] = np.nan
697
  df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))
698
 
699
- # Add survival targets
700
  df_surv["time_days"] = time_days
701
  df_surv["event"] = event01
702
 
703
- # Keep only rows with valid survival targets
704
  df_surv = df_surv.dropna(subset=["time_days", "event"])
705
 
706
- # One-hot categoricals (drop_first=True matches your intention)
707
- df_surv_oh = pd.get_dummies(df_surv, columns=[c for c in cat_cols if c in df_surv.columns], drop_first=True)
708
 
709
  duration_col = "time_days"
710
  event_col = "event"
711
  X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
712
-
713
- # Ensure numeric matrix for Cox
714
- df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")
715
 
716
- # Fit an imputer ONLY for Cox predictors and store it in the bundle
 
 
 
717
  imp = SimpleImputer(strategy="median")
718
- df_surv_oh[X_cols] = imp.fit_transform(df_surv_oh[X_cols])
719
 
720
- # Final sanity: remove any remaining bad rows (rare)
721
- df_surv_oh = df_surv_oh.dropna(subset=[duration_col, event_col])
722
 
723
- # Fit penalized Cox
724
  cph = CoxPHFitter(penalizer=0.1)
725
- cph.fit(df_surv_oh, duration_col=duration_col, event_col=event_col)
726
-
727
- surv_columns = X_cols # predictors used in Cox
728
 
729
  bundle = {
730
  "model": cph,
731
- "columns": surv_columns,
732
  "imputer": imp,
733
  "cat_cols": cat_cols,
734
  "num_cols": num_cols,
@@ -741,18 +739,17 @@ def train_and_save(
741
 
742
  survival_trained = True
743
  surv_notes = "Survival model trained successfully."
744
-
745
  except Exception as e:
746
  survival_trained = False
747
  surv_notes = f"Survival model training failed: {e}"
748
  else:
749
  surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
750
 
751
-
752
-
753
 
754
 
755
-
756
 
757
  joblib.dump(pipe, "model.joblib",compress=3)
758
 
@@ -1586,6 +1583,7 @@ with tab_train:
1586
  else:
1587
 
1588
  df = pd.read_excel(train_file, engine="openpyxl")
 
1589
  feature_cols = get_feature_cols_from_df(df)
1590
 
1591
  st.dataframe(df.head(), use_container_width=True)
 
677
 
678
 
679
  # ---- Train survival model (CoxPH) ----
680
+ from sklearn.impute import SimpleImputer
681
+
682
  survival_trained = False
683
  surv_notes = None
684
 
685
  if time_days is not None and event01 is not None:
686
  try:
 
687
  df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})
688
 
689
+ # coerce numeric/cat like you already do
690
  for c in num_cols:
691
  if c in df_surv.columns:
692
  df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")
 
693
  for c in cat_cols:
694
  if c in df_surv.columns:
695
  df_surv[c] = df_surv[c].astype("object")
696
  df_surv.loc[df_surv[c].isna(), c] = np.nan
697
  df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))
698
 
 
699
  df_surv["time_days"] = time_days
700
  df_surv["event"] = event01
701
 
 
702
  df_surv = df_surv.dropna(subset=["time_days", "event"])
703
 
704
+ # one-hot
705
+ df_surv_oh = pd.get_dummies(df_surv, columns=cat_cols, drop_first=True)
706
 
707
  duration_col = "time_days"
708
  event_col = "event"
709
  X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
 
 
 
710
 
711
+ # numeric coercion
712
+ df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")
713
+
714
+ # impute predictors ONLY
715
  imp = SimpleImputer(strategy="median")
716
+ X_imp = imp.fit_transform(df_surv_oh[X_cols])
717
 
718
+ # safe assignment back
719
+ df_surv_oh.loc[:, X_cols] = pd.DataFrame(X_imp, columns=X_cols, index=df_surv_oh.index)
720
 
721
+ # fit Cox
722
  cph = CoxPHFitter(penalizer=0.1)
723
+ cph.fit(df_surv_oh[[duration_col, event_col] + X_cols],
724
+ duration_col=duration_col,
725
+ event_col=event_col)
726
 
727
  bundle = {
728
  "model": cph,
729
+ "columns": X_cols,
730
  "imputer": imp,
731
  "cat_cols": cat_cols,
732
  "num_cols": num_cols,
 
739
 
740
  survival_trained = True
741
  surv_notes = "Survival model trained successfully."
742
+
743
  except Exception as e:
744
  survival_trained = False
745
  surv_notes = f"Survival model training failed: {e}"
746
  else:
747
  surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
748
 
749
+
 
750
 
751
 
752
+
753
 
754
  joblib.dump(pipe, "model.joblib",compress=3)
755
 
 
1583
  else:
1584
 
1585
  df = pd.read_excel(train_file, engine="openpyxl")
1586
+ df.columns = [c.strip() for c in df.columns]
1587
  feature_cols = get_feature_cols_from_df(df)
1588
 
1589
  st.dataframe(df.head(), use_container_width=True)