# lspred / pipeline.py
# Masterhazi's picture
# Upload 4 files
# cb034e5 verified
import pandas as pd
import numpy as np
class LiverSurvivalPipeline2:
    """Feature-engineering and prediction wrapper around a fitted classifier.

    Raw patient records are turned into the feature matrix the model expects
    (age conversion, imputation with missing-value flags, threshold-based
    clinical indicators, a Child-Pugh style score, one-hot encoding) and then
    passed to ``model.predict_proba``.

    Attributes:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array;
            column 1 is used as the positive-class probability.
        feature_columns: Ordered column names the processed frame is
            reindexed to before it is handed to the model.
        medians: Mapping from numeric column name to the value used to fill
            missing entries (presumably training-set medians -- confirm
            against the training code).
    """

    # Categorical inputs; missing entries are replaced by the literal "unknown".
    _CATEGORICAL_COLUMNS = (
        'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema',
    )

    # Numeric inputs that get a "<col>_missing" flag and are imputed.
    _NUMERIC_COLUMNS = (
        'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
        'Alk_Phos', 'SGOT', 'Tryglicerides',
        'Platelets', 'Prothrombin', 'Stage',
    )

    # Columns removed before encoding: identifiers/outcome time plus the
    # per-component scores, which are only intermediates of Child_Pugh_Score.
    _DROP_COLUMNS = (
        'id',
        'N_Days',
        'Bilirubin_Score',
        'Albumin_Score',
        'Ascites_Score',
        'Prothrombin_Score',
    )

    def __init__(self, model, feature_columns, medians):
        """Store the fitted model, the expected feature order and the fill values."""
        self.model = model
        self.feature_columns = feature_columns
        self.medians = medians

    # -----------------------------
    # Child-Pugh Scoring Functions
    # -----------------------------
    def bilirubin_score(self, bilirubin: float) -> int:
        """Return 1 for bilirubin below 2, 2 for 2..3 inclusive, otherwise 3."""
        if bilirubin < 2:
            return 1
        elif 2 <= bilirubin <= 3:
            return 2
        else:
            return 3

    def albumin_score(self, albumin: float) -> int:
        """Return 1 for albumin above 3.5, 2 for 2.8..3.5 inclusive, otherwise 3."""
        if albumin > 3.5:
            return 1
        elif 2.8 <= albumin <= 3.5:
            return 2
        else:
            return 3

    def ascites_score(self, ascites: str) -> int:
        """Return 1 when ascites is ``'N'``; any other value scores 3.

        "Any other value" includes the ``"unknown"`` placeholder that
        ``preprocess`` substitutes for missing entries.
        """
        if ascites == 'N':
            return 1
        else:
            return 3

    def prothrombin_score(self, prothrombin: float) -> int:
        """Return 1 for prothrombin below 4, 2 for 4..6 inclusive, otherwise 3.

        NOTE(review): ``preprocess`` flags ``el_clot`` with a 13.5 cut-off on
        the same column, while this score uses 4/6. Confirm both expect the
        same unit; the thresholds are kept as-is to match the trained model.
        """
        if prothrombin < 4:
            return 1
        elif 4 <= prothrombin <= 6:
            return 2
        else:
            return 3

    # -----------------------------
    # PREPROCESSING (FINAL VERSION)
    # -----------------------------
    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the model-ready feature frame from raw records.

        The caller's frame is not modified; all work happens on a copy.

        Args:
            df: Raw records. Must contain ``Age`` and every categorical and
                numeric column listed in the class constants. ``id`` and
                ``N_Days`` are dropped when present.

        Returns:
            A frame whose columns are exactly ``self.feature_columns``, in
            that order. Expected columns that were not produced are filled
            with 0; produced columns that are not expected are discarded.

        Raises:
            KeyError: If a required input column, or the fill value for a
                numeric column in ``self.medians``, is missing.
        """
        df = df.copy()

        # ---- Age conversion ----
        # Age is divided by 365.25, i.e. treated as a number of days.
        df['age_in_years'] = df['Age'] / 365.25
        df.drop('Age', axis=1, inplace=True)

        # ---- Fill categorical ----
        for col in self._CATEGORICAL_COLUMNS:
            df[col] = df[col].fillna("unknown")

        # ---- Missing flags + imputation ----
        # The flag must be taken before the fill, otherwise it is always 0.
        for col in self._NUMERIC_COLUMNS:
            df[f"{col}_missing"] = df[col].isna().astype(int)
            df[col] = df[col].fillna(self.medians[col])

        # ---- Clinical engineered features ----
        df['Thrombocytopenia'] = np.where(df['Platelets'] < 150, 1, 0)
        # el_bil is 0 only strictly inside (0.2, 1.3); both tails are flagged.
        df['el_bil'] = np.where(
            (df['Bilirubin'] > 0.2) & (df['Bilirubin'] < 1.3), 0, 1
        )
        df['lo_alb'] = np.where(df['Albumin'] < 3.5, 1, 0)
        df['el_co'] = np.where(df['Copper'] > 140, 1, 0)
        df['el_phos'] = np.where(df['Alk_Phos'] > 147, 1, 0)
        df['el_sgot'] = np.where(df['SGOT'] > 45, 1, 0)
        df['el_clot'] = np.where(df['Prothrombin'] > 13.5, 1, 0)

        # ---- Child-Pugh ----
        df['Bilirubin_Score'] = df['Bilirubin'].apply(self.bilirubin_score)
        df['Albumin_Score'] = df['Albumin'].apply(self.albumin_score)
        df['Ascites_Score'] = df['Ascites'].apply(self.ascites_score)
        df['Prothrombin_Score'] = df['Prothrombin'].apply(self.prothrombin_score)
        df['Child_Pugh_Score'] = df[
            ['Bilirubin_Score', 'Albumin_Score',
             'Ascites_Score', 'Prothrombin_Score']
        ].sum(axis=1)

        # ---- Drop columns EXACTLY like training ----
        df.drop(list(self._DROP_COLUMNS), axis=1, inplace=True, errors='ignore')

        # ---- One-hot encoding ----
        # Encode every category present. Using drop_first=True here would drop
        # whichever category happens to sort first in *this* batch, so a
        # single-row request (e.g. Sex == 'M') would lose its indicator and be
        # silently encoded as the baseline. The reindex below already discards
        # the baseline columns that are absent from feature_columns.
        df = pd.get_dummies(df)

        # ---- Align to training columns ----
        return df.reindex(columns=self.feature_columns, fill_value=0)

    # -----------------------------
    # Prediction
    # -----------------------------
    def predict_proba(self, df: pd.DataFrame) -> np.ndarray:
        """Return column 1 of ``model.predict_proba`` for the preprocessed rows."""
        df_processed = self.preprocess(df)
        return self.model.predict_proba(df_processed)[:, 1]

    def predict(self, df: pd.DataFrame, threshold: float = 0.5) -> np.ndarray:
        """Return 1 where the predicted probability is >= ``threshold``, else 0."""
        probs = self.predict_proba(df)
        return (probs >= threshold).astype(int)