Update app.py
Browse files
app.py
CHANGED
|
@@ -677,58 +677,56 @@ def train_and_save(
|
|
| 677 |
|
| 678 |
|
| 679 |
# ---- Train survival model (CoxPH) ----
|
|
|
|
|
|
|
| 680 |
survival_trained = False
|
| 681 |
surv_notes = None
|
| 682 |
|
| 683 |
if time_days is not None and event01 is not None:
|
| 684 |
try:
|
| 685 |
-
# Build survival training frame from ORIGINAL df
|
| 686 |
df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})
|
| 687 |
|
| 688 |
-
#
|
| 689 |
for c in num_cols:
|
| 690 |
if c in df_surv.columns:
|
| 691 |
df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")
|
| 692 |
-
|
| 693 |
for c in cat_cols:
|
| 694 |
if c in df_surv.columns:
|
| 695 |
df_surv[c] = df_surv[c].astype("object")
|
| 696 |
df_surv.loc[df_surv[c].isna(), c] = np.nan
|
| 697 |
df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))
|
| 698 |
|
| 699 |
-
# Add survival targets
|
| 700 |
df_surv["time_days"] = time_days
|
| 701 |
df_surv["event"] = event01
|
| 702 |
|
| 703 |
-
# Keep only rows with valid survival targets
|
| 704 |
df_surv = df_surv.dropna(subset=["time_days", "event"])
|
| 705 |
|
| 706 |
-
#
|
| 707 |
-
df_surv_oh = pd.get_dummies(df_surv, columns=
|
| 708 |
|
| 709 |
duration_col = "time_days"
|
| 710 |
event_col = "event"
|
| 711 |
X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
|
| 712 |
-
|
| 713 |
-
# Ensure numeric matrix for Cox
|
| 714 |
-
df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")
|
| 715 |
|
| 716 |
-
#
|
|
|
|
|
|
|
|
|
|
| 717 |
imp = SimpleImputer(strategy="median")
|
| 718 |
-
|
| 719 |
|
| 720 |
-
#
|
| 721 |
-
df_surv_oh =
|
| 722 |
|
| 723 |
-
#
|
| 724 |
cph = CoxPHFitter(penalizer=0.1)
|
| 725 |
-
cph.fit(df_surv_oh
|
| 726 |
-
|
| 727 |
-
|
| 728 |
|
| 729 |
bundle = {
|
| 730 |
"model": cph,
|
| 731 |
-
"columns":
|
| 732 |
"imputer": imp,
|
| 733 |
"cat_cols": cat_cols,
|
| 734 |
"num_cols": num_cols,
|
|
@@ -741,18 +739,17 @@ def train_and_save(
|
|
| 741 |
|
| 742 |
survival_trained = True
|
| 743 |
surv_notes = "Survival model trained successfully."
|
| 744 |
-
|
| 745 |
except Exception as e:
|
| 746 |
survival_trained = False
|
| 747 |
surv_notes = f"Survival model training failed: {e}"
|
| 748 |
else:
|
| 749 |
surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
|
| 750 |
|
| 751 |
-
|
| 752 |
-
|
| 753 |
|
| 754 |
|
| 755 |
-
|
| 756 |
|
| 757 |
joblib.dump(pipe, "model.joblib",compress=3)
|
| 758 |
|
|
@@ -1586,6 +1583,7 @@ with tab_train:
|
|
| 1586 |
else:
|
| 1587 |
|
| 1588 |
df = pd.read_excel(train_file, engine="openpyxl")
|
|
|
|
| 1589 |
feature_cols = get_feature_cols_from_df(df)
|
| 1590 |
|
| 1591 |
st.dataframe(df.head(), use_container_width=True)
|
|
|
|
| 677 |
|
| 678 |
|
| 679 |
# ---- Train survival model (CoxPH) ----
|
| 680 |
+
from sklearn.impute import SimpleImputer
|
| 681 |
+
|
| 682 |
survival_trained = False
|
| 683 |
surv_notes = None
|
| 684 |
|
| 685 |
if time_days is not None and event01 is not None:
|
| 686 |
try:
|
|
|
|
| 687 |
df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})
|
| 688 |
|
| 689 |
+
# coerce numeric/cat like you already do
|
| 690 |
for c in num_cols:
|
| 691 |
if c in df_surv.columns:
|
| 692 |
df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")
|
|
|
|
| 693 |
for c in cat_cols:
|
| 694 |
if c in df_surv.columns:
|
| 695 |
df_surv[c] = df_surv[c].astype("object")
|
| 696 |
df_surv.loc[df_surv[c].isna(), c] = np.nan
|
| 697 |
df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))
|
| 698 |
|
|
|
|
| 699 |
df_surv["time_days"] = time_days
|
| 700 |
df_surv["event"] = event01
|
| 701 |
|
|
|
|
| 702 |
df_surv = df_surv.dropna(subset=["time_days", "event"])
|
| 703 |
|
| 704 |
+
# one-hot
|
| 705 |
+
df_surv_oh = pd.get_dummies(df_surv, columns=cat_cols, drop_first=True)
|
| 706 |
|
| 707 |
duration_col = "time_days"
|
| 708 |
event_col = "event"
|
| 709 |
X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
|
|
|
|
|
|
|
|
|
|
| 710 |
|
| 711 |
+
# numeric coercion
|
| 712 |
+
df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")
|
| 713 |
+
|
| 714 |
+
# impute predictors ONLY
|
| 715 |
imp = SimpleImputer(strategy="median")
|
| 716 |
+
X_imp = imp.fit_transform(df_surv_oh[X_cols])
|
| 717 |
|
| 718 |
+
# safe assignment back
|
| 719 |
+
df_surv_oh.loc[:, X_cols] = pd.DataFrame(X_imp, columns=X_cols, index=df_surv_oh.index)
|
| 720 |
|
| 721 |
+
# fit Cox
|
| 722 |
cph = CoxPHFitter(penalizer=0.1)
|
| 723 |
+
cph.fit(df_surv_oh[[duration_col, event_col] + X_cols],
|
| 724 |
+
duration_col=duration_col,
|
| 725 |
+
event_col=event_col)
|
| 726 |
|
| 727 |
bundle = {
|
| 728 |
"model": cph,
|
| 729 |
+
"columns": X_cols,
|
| 730 |
"imputer": imp,
|
| 731 |
"cat_cols": cat_cols,
|
| 732 |
"num_cols": num_cols,
|
|
|
|
| 739 |
|
| 740 |
survival_trained = True
|
| 741 |
surv_notes = "Survival model trained successfully."
|
| 742 |
+
|
| 743 |
except Exception as e:
|
| 744 |
survival_trained = False
|
| 745 |
surv_notes = f"Survival model training failed: {e}"
|
| 746 |
else:
|
| 747 |
surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
|
| 748 |
|
| 749 |
+
|
|
|
|
| 750 |
|
| 751 |
|
| 752 |
+
|
| 753 |
|
| 754 |
joblib.dump(pipe, "model.joblib",compress=3)
|
| 755 |
|
|
|
|
| 1583 |
else:
|
| 1584 |
|
| 1585 |
df = pd.read_excel(train_file, engine="openpyxl")
|
| 1586 |
+
df.columns = [c.strip() for c in df.columns]
|
| 1587 |
feature_cols = get_feature_cols_from_df(df)
|
| 1588 |
|
| 1589 |
st.dataframe(df.head(), use_container_width=True)
|