# Shared imports and lookup tables for the HSCT/GVHD preprocessing module.
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Constants
UNKNOWN_TOKEN = "X"  # placeholder written wherever a value is missing/unknown
DATE_FORMAT = '%d/%m/%Y'
BLOOD_GROUP_COLS = ["D_Blood group", "Recepient_Blood group before HSCT"]

# Misspelt / country-name variants -> canonical nationality label.
NATIONALITY_CORRECTIONS = {
    "AFGHANISTAN": "AFGHAN",
    "ALGERIA": "ALGERIAN",
    "EMARATI": "EMIRATI",
    "UAE": "EMIRATI",
    "PHILIPPINO": "FILIPINO",
    "JORDAN": "JORDANIAN",
    "JORDANI": "JORDANIAN",
    "PAKISTAN": "PAKISTANI",
    "PAKISTANII": "PAKISTANI",
    "PALESTINE": "PALESTINIAN",
    "PALESTENIAN": "PALESTINIAN",
    "USA": "AMERICAN",
}

# 1. Regional Grouping (Geography-Based)
# Built from per-region runs; insertion order matches the original literal.
REGIONAL_GROUPING = {
    # Middle East
    **dict.fromkeys(
        (
            'EMIRATI', 'OMANI', 'SAUDI', 'KUWAIT', 'JORDANIAN', 'LEBANESE',
            'IRAQI', 'SYRIAN', 'YEMENI', 'PALESTINIAN', 'BAHRAINI', 'LIBYAN',
        ),
        'Middle East',
    ),
    # North Africa
    **dict.fromkeys(
        ('EGYPTIAN', 'SUDANESE', 'ALGERIAN', 'MOROCCAN', 'MAURITANIA', 'COMORAN'),
        'North Africa',
    ),
    # South Asia
    **dict.fromkeys(
        ('INDIAN', 'PAKISTANI', 'BANGLADESHI', 'SRI LANKAN', 'AFGHAN'),
        'South Asia',
    ),
    # Southeast Asia
    **dict.fromkeys(('FILIPINO', 'INDONESIAN'), 'Southeast Asia'),
    # East Africa
    **dict.fromkeys(('ETHIOPIAN', 'SOMALI', 'ERITREAN'), 'East Africa'),
    # Central Asia
    'UZBEKISTANI': 'Central Asia',
    # Western Nations / Oceania / Americas
    **dict.fromkeys(('AMERICAN', 'BRITISH'), 'Western'),
    **dict.fromkeys(('NEW ZEALANDER', 'FIJI'), 'Oceania'),
}
# 2. Cultural-Linguistic Grouping
# Assembled from consecutive runs so the key order equals the original literal.
CULTURAL_GROUPING = {
    **dict.fromkeys(
        (
            'EMIRATI', 'OMANI', 'SAUDI', 'KUWAIT', 'JORDANIAN', 'LEBANESE',
            'IRAQI', 'SYRIAN', 'YEMENI', 'PALESTINIAN', 'BAHRAINI', 'LIBYAN',
            'EGYPTIAN',
        ),
        'Arab',
    ),
    'SUDANESE': 'Arab-African',
    **dict.fromkeys(('ALGERIAN', 'MOROCCAN', 'MAURITANIA'), 'Arab'),
    'COMORAN': 'Arab-African',
    **dict.fromkeys(
        ('INDIAN', 'PAKISTANI', 'BANGLADESHI', 'SRI LANKAN', 'AFGHAN'),
        'South Asian',
    ),
    **dict.fromkeys(('FILIPINO', 'INDONESIAN'), 'Southeast Asian'),
    **dict.fromkeys(('ETHIOPIAN', 'SOMALI', 'ERITREAN'), 'East African'),
    'UZBEKISTANI': 'Central Asian',
    **dict.fromkeys(
        ('AMERICAN', 'BRITISH', 'NEW ZEALANDER'),
        'Western/English-speaking',
    ),
    'FIJI': 'Pacific Islander',
}

# 3. World Bank Income Grouping
# Short aliases keep the table readable; the stored values are the full labels.
_HIGH = 'High income'
_UPPER_MID = 'Upper-middle income'
_LOWER_MID = 'Lower-middle income'
_LOW = 'Low income'

INCOME_GROUPING = {
    'EMIRATI': _HIGH,
    'OMANI': _HIGH,
    'SAUDI': _HIGH,
    'KUWAIT': _HIGH,
    'JORDANIAN': _UPPER_MID,
    'LEBANESE': _UPPER_MID,
    'IRAQI': _UPPER_MID,
    'SYRIAN': _LOW,
    'YEMENI': _LOW,
    'PALESTINIAN': _LOWER_MID,
    'BAHRAINI': _HIGH,
    'LIBYAN': _UPPER_MID,
    'EGYPTIAN': _LOWER_MID,
    'SUDANESE': _LOW,
    'ALGERIAN': _LOWER_MID,
    'MOROCCAN': _LOWER_MID,
    'MAURITANIA': _LOW,
    'COMORAN': _LOW,
    'INDIAN': _LOWER_MID,
    'PAKISTANI': _LOWER_MID,
    'BANGLADESHI': _LOWER_MID,
    'SRI LANKAN': _LOWER_MID,
    'AFGHAN': _LOW,
    'FILIPINO': _LOWER_MID,
    'INDONESIAN': _LOWER_MID,
    'ETHIOPIAN': _LOW,
    'SOMALI': _LOW,
    'ERITREAN': _LOW,
    'UZBEKISTANI': _LOWER_MID,
    'AMERICAN': _HIGH,
    'BRITISH': _HIGH,
    'NEW ZEALANDER': _HIGH,
    'FIJI': _UPPER_MID,
}
# 4. WHO Regional Office Grouping
WHO_REGION_GROUPING = {
    'EMIRATI': 'EMRO',
    'OMANI': 'EMRO',
    'SAUDI': 'EMRO',
    'KUWAIT': 'EMRO',
    'JORDANIAN': 'EMRO',
    'LEBANESE': 'EMRO',
    'IRAQI': 'EMRO',
    'SYRIAN': 'EMRO',
    'YEMENI': 'EMRO',
    'PALESTINIAN': 'EMRO',
    'BAHRAINI': 'EMRO',
    'LIBYAN': 'EMRO',
    'EGYPTIAN': 'EMRO',
    'SUDANESE': 'EMRO',
    'ALGERIAN': 'AFRO',
    'MOROCCAN': 'EMRO',
    'MAURITANIA': 'AFRO',
    'COMORAN': 'AFRO',
    'INDIAN': 'SEARO',
    'PAKISTANI': 'EMRO',
    'BANGLADESHI': 'SEARO',
    'SRI LANKAN': 'SEARO',
    'AFGHAN': 'EMRO',
    'FILIPINO': 'WPRO',
    'INDONESIAN': 'SEARO',
    'ETHIOPIAN': 'AFRO',
    'SOMALI': 'EMRO',
    'ERITREAN': 'AFRO',
    'UZBEKISTANI': 'EURO',
    'AMERICAN': 'AMRO',
    'BRITISH': 'EURO',
    'NEW ZEALANDER': 'WPRO',
    'FIJI': 'WPRO',
}

# New column name -> nationality lookup; consumed by apply_nationality_groupings.
groupings = {
    'Recepient_Nationality_Geographical': REGIONAL_GROUPING,
    'Recepient_Nationality_Cultural': CULTURAL_GROUPING,
    'Recepient_Nationality_Regional_Income': INCOME_GROUPING,
    'Recepient_Nationality_Regional_WHO': WHO_REGION_GROUPING,
}

# FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
# NOTE(review): "TAC" and "MTX" are expanded here, yet load_train_features()
# lists 'First_GVHD_prophylaxis_TAC', 'First_GVHD_prophylaxis_MTX' and
# 'PreHSCT_MTX'. After this correction those one-hot columns would be named
# ..._TACROLIMUS / ..._METHOTREXATE and the trained names get zero-filled.
# Confirm which spelling the model was trained on.
# NOTE(review): "CY" is mapped to CYCLOSPORIN; confirm it is not meant to be
# cyclophosphamide.
DRUG_SPELLING_CORRECTIONS = {
    "CYCLOSPOPRIN": "CYCLOSPORIN",
    "CYCLOSPRIN": "CYCLOSPORIN",
    "CYCLOSPOROIN": "CYCLOSPORIN",
    "CY": "CYCLOSPORIN",
    "TAC": "TACROLIMUS",
    "MTX": "METHOTREXATE",
    "BUDESONIDE": "STEROID",
    "STEROIDS": "STEROID",
    "ATG.": "ATG",
    "FLUDARABINIE": "FLUDARABINE",
    "FLUDRABINE": "FLUDARABINE",
    "BUSULPHAN": "BUSULFAN",
    "MEPHALAN": "MELPHALAN",
    "GEMCITABIBE": "GEMCITABINE",
}

# Gender codes may arrive as ints or as their string form.
GENDER_MAP = {
    0: "MALE",
    1: "FEMALE",
    2: UNKNOWN_TOKEN,
    "0": "MALE",
    "1": "FEMALE",
    "2": UNKNOWN_TOKEN,
}

# Regex patterns (unanchored, case-insensitive) -> relationship category.
RELATION_CORRECTIONS = {
    r"(?i)BROTHER": "SIBLING",
    r"(?i)SISTER": "SIBLING",
    r"(?i)FATHER": "FIRST DEGREE RELATIVE",
    r"(?i)MOTHER": "FIRST DEGREE RELATIVE",
    r"(?i)SON": "FIRST DEGREE RELATIVE",
    r"(?i)DAUGHTER": "FIRST DEGREE RELATIVE",
    r"(?i)COUSIN": "SECOND DEGREE RELATIVE",
    r"(?i)UNCLE": "SECOND DEGREE RELATIVE",
    r"(?i)AUNT": "SECOND DEGREE RELATIVE",
    r"(?i)other": UNKNOWN_TOKEN,
}

# Applied to the whole frame with regex=True by normalize_strings.
STRING_NORMALIZATION_MAP = {
    r"(?i)unknown": UNKNOWN_TOKEN,
    r"(?i)unkown": UNKNOWN_TOKEN,
    r"(?i)Unknwon": UNKNOWN_TOKEN,
    np.nan: UNKNOWN_TOKEN,
    r"(?i)\bMale\b": "MALE",
    r"(?i)\bFemale\b": "FEMALE",
    "1o": "10",
    r"(?i)Umbilical Cord": "UMBILICAL CORD",
    r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
    r"(?i)Bone Marrow": "BONE MARROW",
    "MDS": "MYELODYSPLASTIC SYNDROME",
}

# Exact (non-regex) diagnosis label -> diagnosis group.
DIAGNOSIS_GROUP_MAP = {
    "MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
    "CML": "MYELOPROLIFERATIVE NEOPLASMS",
    "MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
    "NON-HODGKIN LYMPHOMA": "LYMPHOMA",
    'NON HODGKIN LYMPHOMA': "LYMPHOMA",
    "HODGKIN LYMPHOMA": "LYMPHOMA",
    "BETA THALASSEMIA": "RED CELL DISORDERS",
    'BETA THALESSEMIA': "RED CELL DISORDERS",
    "ALPHA THALASSEMIA": "RED CELL DISORDERS",
    "ALPHA THALESSEMIA": "RED CELL DISORDERS",
    "ALPHA THALSSEMIA": "RED CELL DISORDERS",
    "HEREDITARY SPHEROCYTOSIS": "RED CELL DISORDERS",
    "SICKLE CELL DISEASE": "RED CELL DISORDERS",
    "APLASTIC ANEMIA": "BMF SYNDROMES",
    "FANCONI ANEMIA": "BMF SYNDROMES",
    "DYSKERATOSIS CONGENITA": "BMF SYNDROMES",
    'DYSKERATOSIS CONGENTIA': "BMF SYNDROMES",
    "CHRONIC GRANULOMATOUS DISEASE": "IMMUNE DISORDERS",
    "COMBINED VARIABLE IMMUNODEFICIENCY": "IMMUNE DISORDERS",
    "SCID": "IMMUNE DISORDERS",
    ## check this one
    "X-LINKED HYPERGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
    '-LINKED HYPERGAMMAGLOBULINEMIA': "IMMUNE DISORDERS",
    '-LINKED HYPER IGM SYNDROME': "IMMUNE DISORDERS",
    "HYPOGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
    ## check this one
    "GLANZMANN": "OTHER",
    'GLANZMANN THROMBASTHENIA': "OTHER",
    "CLL": "OTHER",
    "PNH": "OTHER",
    "HLH": "OTHER",
    "LANGERHANS CELL HISTIOCYTOSIS": "OTHER",
    "BLASTIC PLASMACYTOID DENDRITIC CELL NEOPLASM": "OTHER",
    'BLASTIC PLASMACYTOID DENDRITRIC CELL NEOPLASM': "OTHER",
    "B-ALL": "ALL",
    "BALL": "ALL",
    "TALL": "ALL",
    "T-ALL": "ALL",
    "AML": "AML",
    "ACUTE MYELOID LEUKEMIA": "AML",
}

# 0: nonmalignant; 1: malignant
# NOTE(review): MULTIPLE MYELOMA, PLASMA CELL LEUKEMIA, NEUROBLASTOMA and
# MEDULLOBLASTOMA are labelled 0 here, and "CLL" is grouped to OTHER (0) above
# while 'CHRONIC LYMPHOCYTIC LEUKEMIA' is 1. Confirm with the clinical team;
# the values are left exactly as they were.
MALIGNANT_MAP = {
    'AML': 1,
    'RED CELL DISORDERS': 0,
    'AMYLOIDOSIS': 0,
    'BMF SYNDROMES': 0,
    'ALL': 1,
    'OTHER': 0,
    'IMMUNE DISORDERS': 0,
    'CHRONIC LYMPHOCYTIC LEUKEMIA': 1,
    # note: CML is malignant; not sure about MYELOPROLIFERATIVE DISORDER & MYELOFIBROSIS
    'MYELOPROLIFERATIVE NEOPLASMS': 1,
    'HEMOPHAGOCYTIC LYMPHOHISTIOCYTOSIS (HLH)': 0,
    'LYMPHOMA': 1,
    'MYELODYSPLASTIC SYNDROME': 1,
    'MEDULLOBLASTOMA': 0,
    'MULTIPLE MYELOMA': 0,
    'NEUROBLASTOMA': 0,
    'PAROXYSMAL NOCTURNAL HEMOGLOBINURIA': 0,
    'PLASMA CELL LEUKEMIA': 0,
}

HLA_MATCHING_MAP = {
    "12 OF 12": "FULL",
    "10 OF 10": "FULL",
    "8 OF 8": "FULL",  # not full?
    "9 OF 10": "PARTIAL",
    "8 OF 10": "PARTIAL",
    "PARTIALLY MATCHED": "PARTIAL",
    "7 OF 10": "HAPLOIDENTICAL",
    "6 OF 12": "HAPLOIDENTICAL",
    "6 OF 10": "HAPLOIDENTICAL",
    "5 OF 10": "HAPLOIDENTICAL",
    # confirm if the following are all haploidentical
    "5 OF 8": "HAPLOIDENTICAL",
    "4 OF 6": "HAPLOIDENTICAL",
}

# --- NEW: additional columns for Pro model / survival ---
SURVIVAL_DATE_COLS = [
    "HSCT_date",
    "Last_followup_date",
    "Date_of_death",
    "Date of first diagnosis/BMBx date",
]
ALLOWED_DONOR_TYPES = {"MRD", "MUD", "HAPLO", "MMRD", "MMUD", "CORD", "OTHER", UNKNOWN_TOKEN}
ALLOWED_COND_INTENSITY = {"MAC", "RIC", "NMA", UNKNOWN_TOKEN}
ALLOWED_PROPH_CAT = {"CNI_BASED", "PTCY_BASED", "ATG_BASED", "TCD", "OTHER", UNKNOWN_TOKEN}

DONOR_TYPE_MAP = {
    # normalize common variants
    "HAPLOIDENTICAL": "HAPLO",
    "HAPLO-IDENTICAL": "HAPLO",
    "HAPLO ID": "HAPLO",
    "MATCHED RELATED": "MRD",
    "MATCHED UNRELATED": "MUD",
    "MISMATCHED RELATED": "MMRD",
    "MISMATCHED UNRELATED": "MMUD",
    "UCB": "CORD",
    "UMBILICAL CORD": "CORD",
    "CORD BLOOD": "CORD",
}
COND_INTENSITY_MAP = {
    "MYELOABLATIVE": "MAC",
    "REDUCED INTENSITY": "RIC",
    "NON-MYELOABLATIVE": "NMA",
    "NON MYELOABLATIVE": "NMA",
}
PROPH_CAT_MAP = {
    "CNI BASED": "CNI_BASED",
    "CNI-BASED": "CNI_BASED",
    "PTCY BASED": "PTCY_BASED",
    "PTCY-BASED": "PTCY_BASED",
    "ATG BASED": "ATG_BASED",
    "ATG-BASED": "ATG_BASED",
}

# Element-wise DataFrame mapper: DataFrame.map where it exists (pandas >= 2.1),
# otherwise the older applymap, so no deprecation warning is raised.
_apply_elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap


def _holds_lists(series) -> bool:
    """Return True when the first value of *series* is a list (False if empty)."""
    return len(series) > 0 and isinstance(series.iloc[0], list)


def load_train_features():
    """Return ``(train_features, cat_features)`` expected by the trained model.

    ``train_features`` is the ordered list of model input columns: twelve base
    columns, then the one-hot HLA allele columns (without the ``_X`` unknown
    columns), then the PreHSCT and first-prophylaxis drug one-hot columns.
    ``cat_features`` is the subset of base columns treated as categorical.
    The order of both lists is significant and is kept exactly as authored.
    """
    def names(prefix, alleles):
        return [f"{prefix}_{allele}" for allele in alleles]

    c_alleles = (1, 2, 3, 4, 5, 6, 7, 8, 12, 14, 15, 16, 17, 18, 38, 49, 50, "X")
    dr_alleles = (*range(1, 18), "X")
    dq_alleles = (1, 2, 3, 4, 5, 6, 7, 11, 15, 16, 301, "X")

    # Define features
    HLA_sub12 = (
        # Recepient - HLA-A
        names("R_HLA_A", (1, 2, 3, 4, 7, 8, 11, 12, 20, 23, 24, 25, 26, 29,
                          30, 31, 32, 33, 34, 66, 68, 69, 74, "X"))
        # Recepient - HLA-B
        + names("R_HLA_B", (7, 8, 13, 14, 15, 18, 23, 24, 27, 35, 37, 38, 39,
                            40, 41, 42, 44, 45, 46, 49, 50, 51, 52, 53, 55, 56,
                            57, 58, 73, 81, "X"))
        # Recepient - HLA-C / HLA-DR / HLA-DQ
        + names("R_HLA_C", c_alleles)
        + names("R_HLA_DR", dr_alleles)
        + names("R_HLA_DQ", dq_alleles)
        # Donor - HLA-A (allele 7 intentionally sits after 69, as authored)
        + names("D_HLA_A", (1, 2, 3, 8, 11, 12, 23, 24, 25, 26, 29, 30, 31,
                            32, 33, 34, 66, 68, 69, 7, 74, "X"))
        # Donor - HLA-B
        + names("D_HLA_B", (7, 8, 13, 14, 15, 17, 18, 23, 24, 27, 35, 37, 38,
                            39, 40, 41, 42, 44, 45, 48, 49, 50, 51, 52, 53, 55,
                            56, 57, 58, 73, 81, "X"))
        # Donor - HLA-C / HLA-DR / HLA-DQ
        + names("D_HLA_C", c_alleles)
        + names("D_HLA_DR", dr_alleles)
        + names("D_HLA_DQ", dq_alleles)
    )
    HLA_sub12_without_X = [i for i in HLA_sub12 if "_X" not in i]

    prehsct_onehot = [
        'PreHSCT_ALEMTUZUMAB', 'PreHSCT_ATG', 'PreHSCT_BEAM', 'PreHSCT_BUSULFAN',
        'PreHSCT_CAMPATH', 'PreHSCT_CARMUSTINE', 'PreHSCT_CLOFARABINE',
        'PreHSCT_CYCLOPHOSPHAMIDE', 'PreHSCT_CYCLOSPORIN', 'PreHSCT_CYTARABINE',
        'PreHSCT_ETOPOSIDE', 'PreHSCT_FLUDARABINE', 'PreHSCT_GEMCITABINE',
        'PreHSCT_MELPHALAN', 'PreHSCT_MTX', 'PreHSCT_OTHER', 'PreHSCT_RANIMUSTINE',
        'PreHSCT_REDUCEDCONDITIONING', 'PreHSCT_RITUXIMAB', 'PreHSCT_SIROLIMUS',
        'PreHSCT_TBI', 'PreHSCT_THIOTEPA', 'PreHSCT_TREOSULFAN', 'PreHSCT_UA',
        'PreHSCT_VORNOSTAT',
    ]
    first_prophylaxis_onehot = [
        'First_GVHD_prophylaxis_ABATACEPT', 'First_GVHD_prophylaxis_ALEMTUZUMAB',
        'First_GVHD_prophylaxis_ATG', 'First_GVHD_prophylaxis_CYCLOPHOSPHAMIDE',
        'First_GVHD_prophylaxis_CYCLOSPORIN', 'First_GVHD_prophylaxis_IMATINIB',
        'First_GVHD_prophylaxis_LEFLUNOMIDE', 'First_GVHD_prophylaxis_MMF',
        'First_GVHD_prophylaxis_MTX', 'First_GVHD_prophylaxis_NONE',
        'First_GVHD_prophylaxis_RUXOLITINIB', 'First_GVHD_prophylaxis_SIROLIMUS',
        'First_GVHD_prophylaxis_STEROID', 'First_GVHD_prophylaxis_TAC',
    ]

    base_features = [
        'Recepient_gender',
        'R_Age_at_transplant_cutoff18',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'D_Age_at_transplant_cutoff18',
        'Age_Gap_R_D',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Number of lines of Rx before HSCT',
        'Source of cells',
        'Donor_relation to recepient',
    ]
    train_features = (
        base_features + HLA_sub12_without_X + prehsct_onehot + first_prophylaxis_onehot
    )

    # Categorical features
    cat_features = [
        'Recepient_gender',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Source of cells',
        'Donor_relation to recepient',
    ]
    return train_features, cat_features


def load_dataset(file_path: str) -> pd.DataFrame:
    """Load the CSV at *file_path* and drop columns whose values are all missing.

    The second physical row of the file is used as the header (``header=1``).
    """
    df = pd.read_csv(file_path, header=1)
    return df.dropna(axis=1, how="all")


def normalize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize string values across the dataset.

    Steps, in order:
    - apply STRING_NORMALIZATION_MAP as regex replacements (this also turns
      NaN into UNKNOWN_TOKEN);
    - replace the word "UK" with UNKNOWN_TOKEN in every column whose name does
      not contain "Nationality";
    - replace the word "NA" with UNKNOWN_TOKEN in every column whose name does
      not contain "HLA";
    - upper-case and strip every string cell.

    Returns a new DataFrame; the input frame is not modified.
    """
    # Apply global string replacements
    df = df.replace(STRING_NORMALIZATION_MAP, regex=True)

    # "UK" is a real value in nationality columns, so those are skipped.
    non_nationality_cols = [col for col in df.columns if "Nationality" not in col]
    df[non_nationality_cols] = df[non_nationality_cols].replace(
        {r"(?i)\buk\b": UNKNOWN_TOKEN}, regex=True
    )

    # HLA columns use "NA" inside allele pairs and are handled separately.
    non_hla_cols = [col for col in df.columns if "HLA" not in col]
    df[non_hla_cols] = df[non_hla_cols].replace(
        {r"(?i)\bna\b": UNKNOWN_TOKEN}, regex=True
    )

    # Upper-case and strip in a single pass over the frame.
    return _apply_elementwise(
        df, lambda x: x.upper().strip() if isinstance(x, str) else x
    )


def clean_blood_group_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Remove all whitespace from the given blood group columns (in place).

    Columns not present in *df* are skipped. Missing values stay missing
    instead of being turned into the literal string "nan".
    """
    for col in columns:
        if col in df.columns:
            present = df[col].notna()
            cleaned = df[col].astype(str).str.replace(r"\s+", "", regex=True)
            df[col] = cleaned.where(present, np.nan)
    return df


def standardize_hla_matching(df: pd.DataFrame) -> pd.DataFrame:
    """Map "HLA match ratio" values through HLA_MATCHING_MAP, if the column exists."""
    if "HLA match ratio" in df.columns:
        df["HLA match ratio"] = df["HLA match ratio"].replace(HLA_MATCHING_MAP, regex=False)
    return df


def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Clean every column whose name contains "R_HLA" or "D_HLA".

    For each such column ``col``:
    1. split the "a&b" value (or two-element list) into ``col1`` / ``col2``;
    2. coerce both to numbers; anything non-numeric and the value 0 become
       missing;
    3. sort the pair numerically (missing values last);
    4. write integers as strings and missing as UNKNOWN_TOKEN;
    5. recombine into ``col`` as "a&b".

    The frame is modified in place and returned.
    """
    # Padding function to ensure 2 elements, filling with 'NA'.
    # Used for Individual_Predictions
    def pad_list(val):
        if not isinstance(val, list):
            val = []
        return (val + ['NA', 'NA'])[:2]

    # Snapshot the names first: the loop adds new "...1"/"...2" columns.
    hla_columns = [col for col in df.columns if "R_HLA" in col or "D_HLA" in col]

    for col in hla_columns:
        split_cols = [f"{col}1", f"{col}2"]

        if _holds_lists(df[col]):
            df[col] = df[col].apply(pad_list)
            df[split_cols] = pd.DataFrame(df[col].tolist(), index=df.index)
        else:
            as_text = df[col].astype(str)  # ensures .str works
            as_text = as_text.replace({"NA": "NA&NA", "NAN": "NA&NA", "NONE": "NA&NA"})
            halves = as_text.str.split("&", n=1, expand=True)
            # Guarantee exactly two columns even when no value contains "&"
            # (or the frame is empty).
            halves = halves.reindex(columns=[0, 1])
            df[split_cols] = halves

        # Coercion turns every non-numeric marker (NA, N/A, X, B1, blanks,
        # None, ...) into NaN; zeros are treated as missing as well.
        pair = df[split_cols].apply(pd.to_numeric, errors='coerce')
        pair = pair.replace(0, np.nan)

        # Sort values numerically; NaN sorts last.
        ordered = np.sort(pair.to_numpy(dtype=float), axis=1)

        # Convert numbers to integer strings, missing to 'X'
        labels = pd.DataFrame(ordered, index=df.index, columns=split_cols)
        labels = _apply_elementwise(
            labels, lambda x: str(int(x)) if pd.notna(x) else UNKNOWN_TOKEN
        )
        df[split_cols] = labels

        # Recombine cleaned values
        df[col] = (
            labels[split_cols[0]].astype(str) + "&" + labels[split_cols[1]].astype(str)
        )

    return df


def cast_as_int_if_possible(x):
    """Return ``int(x)`` when the conversion is lossless, otherwise *x* unchanged.

    For example '5' -> 5 and 5.0 -> 5, while '5.5', 5.5, 'abc' and None are
    returned as given.
    """
    try:
        i = int(x)
        # Only return int if conversion is lossless (e.g. avoid 5.5 -> 5)
        if float(x) == i:
            return i
    except (TypeError, ValueError, OverflowError):
        pass
    return x


def HLA_unique_alleles(df, HLA_col1, HLA_col2):
    """Return the sorted distinct string values found in either allele column.

    The strings "nan", "None" and "" are reported as UNKNOWN_TOKEN.
    """
    u1 = df[HLA_col1].astype(str).unique()
    u2 = df[HLA_col2].astype(str).unique()
    unique_set = set(u1).union(set(u2))
    unique_set = {UNKNOWN_TOKEN if v in {"nan", "None", ""} else v for v in unique_set}
    return sorted(unique_set)


def expand_HLA_cols_(df, HLA_col1, HLA_col2):
    """One-hot encode an allele pair into ``<locus>_<allele>`` 0/1 columns.

    The locus prefix is *HLA_col1* without its last character (e.g. "R_HLA_A"
    from "R_HLA_A1"). A column is 1 when either allele column equals that
    allele. UNKNOWN_TOKEN gets no column. Values are compared as strings, the
    same form used to discover the alleles, so numeric columns match too.
    """
    alleles = [
        u for u in HLA_unique_alleles(df, HLA_col1, HLA_col2) if u != UNKNOWN_TOKEN
    ]
    col_name = HLA_col1[:-1]  # get "R_HLA_A" from "R_HLA_A1"
    first = df[HLA_col1].astype(str)
    second = df[HLA_col2].astype(str)
    for allele in alleles:
        df[f"{col_name}_{allele}"] = ((first == allele) | (second == allele)).astype(int)
    return df


def expand_HLA_cols(df):
    """One-hot encode all recipient and donor HLA loci (A, B, C, DR, DQ)."""
    for side in ("R", "D"):
        for locus in ("A", "B", "C", "DR", "DQ"):
            df = expand_HLA_cols_(
                df,
                HLA_col1=f"{side}_HLA_{locus}1",
                HLA_col2=f"{side}_HLA_{locus}2",
            )
    return df


def correct_nationalities(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Standardize nationality names in *column* using NATIONALITY_CORRECTIONS."""
    df[column] = df[column].replace(NATIONALITY_CORRECTIONS)
    return df


def correct_indiv_drug_name(drug_list):
    """Apply DRUG_SPELLING_CORRECTIONS to each drug name in one cell.

    - A string is split on space, "/" and "+" (separators are kept), each
      token is corrected, and the pieces are joined back into a string.
    - A list is corrected element by element and returned as a list, so
      list-typed cells stay lists for the later one-hot step.
    - Anything else (NaN, None, numbers) is returned unchanged.
    """
    def fix(part):
        if not isinstance(part, str):
            return part
        token = part.strip()
        if token and token not in {'/', '+', ' '}:
            return DRUG_SPELLING_CORRECTIONS.get(token, token)
        return part

    # Check the container types first: pd.isna() on a list returns an array,
    # whose truth value is ambiguous.
    if isinstance(drug_list, list):
        return [fix(part) for part in drug_list]
    if isinstance(drug_list, str):
        return ''.join(fix(part) for part in re.split(r'([ /+])', drug_list))
    return drug_list


def correct_drug_name_in_list(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Correct drug spellings in every cell of *column*, preserving separators."""
    df[column] = df[column].apply(correct_indiv_drug_name)
    return df


def standardize_compound_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Normalize "a + b, c" style cells to a sorted "A/B/C" form.

    For each listed column that exists and does not hold lists:
    1. remove all whitespace;
    2. turn "+" and "," into "/";
    3. split on "/", drop empty parts, sort alphabetically and re-join.
    """
    for col in columns:
        if col in df.columns and not _holds_lists(df[col]):
            # "+" must be matched literally; the regex default of
            # Series.str.replace differs between pandas versions.
            df[col] = (
                df[col]
                .str.replace(r"\s+", "", regex=True)
                .str.replace("+", "/", regex=False)
                .str.replace(",", "/", regex=False)
            )
            # Split, remove empty parts, sort, and join
            df[col] = df[col].apply(
                lambda x: "/".join(sorted(p.strip() for p in x.split("/") if p.strip()))
                if isinstance(x, str)
                else x
            )
    return df


def standardize_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Map gender codes to labels and infer donor gender from the relationship.

    Donor gender is overwritten whenever the donor relation is one of the
    gendered relations below. Must run before correct_donor_relationships,
    which collapses those relations into categories.
    """
    # Apply gender mapping
    df["Donor_gender"] = df["Donor_gender"].replace(GENDER_MAP)
    df["Recepient_gender"] = df["Recepient_gender"].replace(GENDER_MAP)

    # Infer gender from relationship
    gender_by_relation = {
        "BROTHER": "MALE",
        "SISTER": "FEMALE",
        "FATHER": "MALE",
        "MOTHER": "FEMALE",
        "SON": "MALE",
        "DAUGHTER": "FEMALE",
        "UNCLE": "MALE",
        "AUNT": "FEMALE",
    }
    for relationship, gender in gender_by_relation.items():
        mask = df["Donor_relation to recepient"] == relationship
        df.loc[mask, "Donor_gender"] = gender
    return df


def correct_donor_relationships(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse donor relations into categories using RELATION_CORRECTIONS (regex)."""
    return df.replace({"Donor_relation to recepient": RELATION_CORRECTIONS}, regex=True)


def handle_self_donor_consistency(df: pd.DataFrame) -> pd.DataFrame:
    """Mark and validate rows where the donor relation is "SELF".

    1. Sets every R_HLA/D_HLA column of those rows to 'SELF&SELF'.
    2. Checks recipient and donor gender, blood group and DOB agree.

    Raises:
        AssertionError: if any of the three checks fails.

    NOTE(review): process_hla_columns later coerces HLA values to numbers, so
    'SELF&SELF' ends up as UNKNOWN_TOKEN pairs. Confirm that is intended.
    """
    self_mask = df["Donor_relation to recepient"] == "SELF"

    # Set HLA values for self-donors
    hla_cols = [col for col in df.columns if "R_HLA" in col or "D_HLA" in col]
    df.loc[self_mask, hla_cols] = "SELF&SELF"

    # Verify demographic consistency. Raised explicitly (rather than with an
    # assert statement) so the checks still run under ``python -O``.
    if not df.loc[self_mask, "Recepient_gender"].equals(df.loc[self_mask, "Donor_gender"]):
        raise AssertionError("Recepient/Donor gender mismatch for self-donors")
    if not df.loc[self_mask, "Recepient_Blood group before HSCT"].equals(
        df.loc[self_mask, "D_Blood group"]
    ):
        raise AssertionError("Blood group mismatch for self-donors")
    same_dob = (
        df.loc[self_mask, "Recepient_DOB"].values == df.loc[self_mask, "Donor_DOB"].values
    )
    if not same_dob.all():
        raise AssertionError("DOB mismatch for self-donors")
    return df


def safe_extract_year(date_val):
    """Return the year of *date_val* as an int, or UNKNOWN_TOKEN.

    Accepts pandas/NumPy datetimes and "d/m/Y"-style strings (the text after
    the last "/" must be all digits). Missing values, strings containing
    "YEAR", strings with fewer than three "/"-separated parts and every other
    type yield UNKNOWN_TOKEN.
    """
    if pd.isna(date_val):
        return UNKNOWN_TOKEN
    if isinstance(date_val, (pd.Timestamp, np.datetime64)):
        try:
            return int(pd.to_datetime(date_val).year)
        except (ValueError, TypeError, OverflowError):
            return UNKNOWN_TOKEN
    if not isinstance(date_val, str) or date_val == UNKNOWN_TOKEN:
        return UNKNOWN_TOKEN
    if "YEAR" in date_val:
        return UNKNOWN_TOKEN
    parts = date_val.split("/")
    if len(parts) < 3:
        return UNKNOWN_TOKEN
    year_part = parts[-1].strip()
    try:
        return int(year_part) if year_part.isdigit() else UNKNOWN_TOKEN
    except (ValueError, TypeError):
        return UNKNOWN_TOKEN


def extract_year(df: pd.DataFrame, column_name) -> pd.DataFrame:
    """Add ``<column_name>_Year`` holding the year of each value (or UNKNOWN_TOKEN)."""
    df[column_name + "_Year"] = df[column_name].apply(safe_extract_year)
    return df


def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
    """Compute year-based ages from the ``*_Year`` columns.

    Adds:
    1. R_Age_at_transplant = HSCT_date_Year - Recepient_DOB_Year
    2. D_Age_at_transplant = HSCT_date_Year - Donor_DOB_Year
    3. Age_Gap_R_D         = Recepient_DOB_Year - Donor_DOB_Year

    Any value that cannot be converted to int gives UNKNOWN_TOKEN.

    NOTE(review): Age_Gap_R_D here equals donor age minus recipient age,
    whereas calculate_ages_from_dates computes recipient age minus donor age.
    Confirm which sign the trained model expects.
    """
    def year_diff(row, earlier_col, later_col):
        try:
            return int(row[later_col]) - int(row[earlier_col])
        except (TypeError, ValueError):
            return UNKNOWN_TOKEN

    targets = {
        "R_Age_at_transplant": ("Recepient_DOB_Year", "HSCT_date_Year"),
        "D_Age_at_transplant": ("Donor_DOB_Year", "HSCT_date_Year"),
        "Age_Gap_R_D": ("Donor_DOB_Year", "Recepient_DOB_Year"),
    }
    for new_col, (earlier_col, later_col) in targets.items():
        df[new_col] = df.apply(
            lambda row, a=earlier_col, b=later_col: year_diff(row, a, b), axis=1
        )
    return df


# Utility Function: Split and One-Hot Encode Drug Regimens
def split_and_one_hot_encode(df, column_name, prefix):
    """One-hot encode a multi-drug column into ``<prefix>_<drug>`` columns.

    If the column does not already hold lists, each cell is split on "/" into
    a list of stripped, non-empty tokens (missing cells become an empty list);
    the column itself is replaced by those lists. Labels that are blank after
    stripping get no output column.

    Returns a new DataFrame: *df* with the indicator columns appended.
    """
    if not _holds_lists(df[column_name]):
        df[column_name] = df[column_name].fillna("").apply(
            lambda x: [t.strip() for t in x.split("/") if t.strip()] if x else []
        )

    mlb = MultiLabelBinarizer()
    matrix = mlb.fit_transform(df[column_name])
    # Select matrix columns and labels with the same mask so their counts
    # always agree.
    keep = np.array([bool(str(label).strip()) for label in mlb.classes_], dtype=bool)
    encoded_df = pd.DataFrame(
        matrix[:, keep],
        columns=[
            f"{prefix}_{str(label).strip()}"
            for label, kept in zip(mlb.classes_, keep)
            if kept
        ],
        index=df.index,
    )
    return pd.concat([df, encoded_df], axis=1)


# Normalize Blood Groups (Remove +/-)
def merge_blood_groups(df, column, new_col):
    """
    Removes '+' and '-' from blood group values.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column name to normalize
        new_col (str): New column name for cleaned values

    Returns:
        pd.DataFrame: Updated dataframe (missing values stay NaN)
    """
    df[new_col] = df[column].apply(
        lambda x: re.sub(r'[+-]', '', x) if pd.notnull(x) else np.nan
    )
    return df


def binarize_age(df, age_col, cutoff, new_col):
    """
    Binarizes age column based on a cutoff (1 when age >= cutoff, else 0).
    Non-numeric values and missing ages are left as-is.

    Args:
        df (pd.DataFrame): Input dataframe
        age_col (str): Column name containing age
        cutoff (int): Age cutoff
        new_col (str): New binary column name

    Returns:
        pd.DataFrame: Updated dataframe
    """
    def binarize_or_keep(val):
        # A float NaN compares False against anything, which would silently
        # label a missing age as "below cutoff"; keep it missing instead.
        if isinstance(val, (float, np.floating)) and np.isnan(val):
            return val
        try:
            return int(val >= cutoff)
        except TypeError:
            return val  # Leave strings or non-numeric values unchanged

    df[new_col] = df[age_col].apply(binarize_or_keep)
    return df


# Create Composite Gender & Relation Columns
def add_gender_relation_features(df):
    """
    Creates new columns combining donor relation with recepient and donor genders
    by string concatenation.

    Returns:
        pd.DataFrame: Updated dataframe
    """
    relation = df["Donor_relation to recepient"]
    recipient_part = " R_" + df["Recepient_gender"]
    donor_part = " D_" + df["Donor_gender"]

    df["Relation_and_Recepient_Gender"] = relation + recipient_part
    df["Relation_and_Donor_Gender"] = relation + donor_part
    df["Relation_and_Recepient_and_Donor_Gender"] = relation + recipient_part + donor_part
    return df


# Nationality-Based Groupings
def apply_nationality_groupings(df, column, grouping_dicts):
    """
    Applies multiple groupings based on nationality.

    Values of ``column`` missing from a mapping are copied through unchanged.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column to group by
        grouping_dicts (dict): Dictionary of {new_col_name: mapping_dict}

    Returns:
        pd.DataFrame: Updated dataframe
    """
    for new_col, mapping in grouping_dicts.items():
        df[new_col] = df[column].replace(mapping)
    return df


# Group and Binarize Diagnosis
def group_and_binarize_diagnosis(df, original_col, group_map, malignant_map):
    """
    Groups diagnosis into categories and flags as malignant or not.

    Adds ``<original_col>_Grouped`` and ``<original_col>_Malignant``. Values
    absent from a mapping are copied through unchanged, so the malignant
    column can still contain group names for unmapped groups.

    Args:
        df (pd.DataFrame): Input dataframe
        original_col (str): Original diagnosis column
        group_map (dict): Mapping of diagnoses to groups
        malignant_map (dict): Mapping of groups to binary malignancy label

    Returns:
        pd.DataFrame: Updated dataframe
    """
    grouped_col = f"{original_col}_Grouped"
    malignant_col = f"{original_col}_Malignant"

    df[grouped_col] = df[original_col].replace(group_map)
    df[malignant_col] = df[grouped_col].replace(malignant_map)
    return df


# Function to check if a column contains any list
def is_list_column(col):
    """Return True if any value in *col* is a list."""
    return any(isinstance(val, list) for val in col)


def parse_date_columns(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Parse the listed columns (those present) as day-first datetimes.

    Unparseable values become NaT.
    """
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], dayfirst=True, errors="coerce")
    return df


def standardize_simple_category(
    df: pd.DataFrame,
    col: str,
    mapping: dict,
    allowed: set,
    unknown_token: str = UNKNOWN_TOKEN,
) -> pd.DataFrame:
    """Standardize one categorical column: normalize -> map -> validate.

    Values are stringified, stripped and upper-cased; missing markers become
    *unknown_token*; *mapping* is applied; anything not in *allowed* becomes
    "OTHER" when "OTHER" is allowed, else *unknown_token*. Does nothing if
    *col* is absent.
    """
    if col not in df.columns:
        return df

    # Ensure strings
    values = df[col].astype(str).str.strip().str.upper()

    # Replace known "unknown" markers, including the string forms of NaN,
    # None, pd.NA and NaT produced by astype(str).
    missing_markers = ("NAN", "NONE", "NA", "N/A", "<NA>", "NAT")
    values = values.replace(dict.fromkeys(missing_markers, unknown_token))

    # Apply mapping dictionary (after uppercase)
    values = values.replace(mapping)

    fallback = "OTHER" if "OTHER" in allowed else unknown_token

    # Keep only allowed; everything else -> OTHER (or unknown_token)
    def _clean(v: str) -> str:
        if v in allowed:
            return v
        if v in {"", unknown_token}:
            return unknown_token
        return fallback

    df[col] = values.apply(_clean)
    return df


def coerce_event_column(df: pd.DataFrame) -> pd.DataFrame:
    """Create an integer ``Event_clean`` column.

    Precedence:
    - 1 if Date_of_death is present;
    - else 0 if Last_followup_date is present;
    - else the existing Event column coerced to a number (YES/Y/TRUE/DEAD -> 1,
      NO/N/FALSE/ALIVE -> 0);
    - else 0.

    The date columns are tested with notna()/isna(), so they are expected to
    be parsed already (missing = NaT/NaN).
    """
    if "Event" in df.columns:
        # Coerce typical formats: "1", "0", "YES/NO", etc.
        tmp = df["Event"].astype(str).str.strip().str.upper()
        tmp = tmp.replace({
            "YES": "1", "Y": "1", "TRUE": "1", "DEAD": "1",
            "NO": "0", "N": "0", "FALSE": "0", "ALIVE": "0",
            "NAN": "",
        })
        df["Event_clean"] = pd.to_numeric(tmp, errors="coerce")
    else:
        df["Event_clean"] = np.nan

    has_death = "Date_of_death" in df.columns

    # Override using death date if available (strongest truth source)
    if has_death:
        df.loc[df["Date_of_death"].notna(), "Event_clean"] = 1

    # If no death date but follow-up date exists, assume censored.
    # Without a Date_of_death column every row counts as "no death date".
    if "Last_followup_date" in df.columns:
        followed_up = df["Last_followup_date"].notna()
        if has_death:
            followed_up &= df["Date_of_death"].isna()
        df.loc[followed_up, "Event_clean"] = 0

    # Final fill: unknown -> 0 (conservative censoring)
    df["Event_clean"] = df["Event_clean"].fillna(0).astype(int)
    return df


def derive_os_time_days(df: pd.DataFrame) -> pd.DataFrame:
    """Create ``OS_time_days``: days from HSCT_date to death or last follow-up.

    With both end-date columns present, the death date is used where
    Event_clean == 1 and the follow-up date otherwise; if Event_clean is
    absent, the death date is used wherever it exists. With only one end-date
    column, that column is used. Negative durations become NaN. The frame is
    returned unchanged when HSCT_date or both end-date columns are missing.
    """
    if "HSCT_date" not in df.columns:
        return df

    def as_dates(col):
        # No-op for columns that are already datetime.
        return pd.to_datetime(df[col], dayfirst=True, errors="coerce")

    # Need HSCT_date parsed
    if not pd.api.types.is_datetime64_any_dtype(df["HSCT_date"]):
        df["HSCT_date"] = as_dates("HSCT_date")

    has_death = "Date_of_death" in df.columns
    has_followup = "Last_followup_date" in df.columns

    # Choose end date
    if has_death and has_followup:
        death = as_dates("Date_of_death")
        if "Event_clean" in df.columns:
            use_death = df["Event_clean"].eq(1)
        else:
            use_death = death.notna()
        end_date = death.where(use_death, as_dates("Last_followup_date"))
    elif has_death:
        end_date = as_dates("Date_of_death")
    elif has_followup:
        end_date = as_dates("Last_followup_date")
    else:
        return df

    days = (end_date - df["HSCT_date"]).dt.days
    # Clean impossible/negative values
    df["OS_time_days"] = days.where(~(days < 0))
    return df


def calculate_ages_from_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate recepient/donor age at HSCT from full dates.

    Parses HSCT_date, Recepient_DOB and Donor_DOB (those present) as day-first
    datetimes, then adds R_Age_at_transplant, D_Age_at_transplant (days /
    365.25) and Age_Gap_R_D (recipient age minus donor age), each rounded and
    stored as nullable Int64.

    NOTE(review): calculate_ages defines Age_Gap_R_D with the opposite sign
    (recipient birth year minus donor birth year). Confirm which convention
    the trained model uses.
    """
    # Ensure datetime
    for c in ["HSCT_date", "Recepient_DOB", "Donor_DOB"]:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], dayfirst=True, errors="coerce")

    if "HSCT_date" in df.columns:
        for dob_col, age_col in (
            ("Recepient_DOB", "R_Age_at_transplant"),
            ("Donor_DOB", "D_Age_at_transplant"),
        ):
            if dob_col in df.columns:
                df[age_col] = (df["HSCT_date"] - df[dob_col]).dt.days / 365.25

    if "R_Age_at_transplant" in df.columns and "D_Age_at_transplant" in df.columns:
        df["Age_Gap_R_D"] = df["R_Age_at_transplant"] - df["D_Age_at_transplant"]

    # Round ages to int for compatibility with the rest of the pipeline
    for c in ["R_Age_at_transplant", "D_Age_at_transplant", "Age_Gap_R_D"]:
        if c in df.columns:
            df[c] = df[c].round().astype("Int64")  # keeps NA
    return df


def preprocess_pipeline(df) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Run the full preprocessing pipeline on a raw cohort frame.

    Stages:
    1. Drop all-missing columns and tidy column names
    2. Parse date columns, normalize strings
    3. Data corrections (nationality, drugs, gender, relations)
    4. HLA processing and one-hot expansion
    5. Ages, survival variables and remaining feature engineering
    6. Align to the trained model's feature columns

    Returns:
        ``(df, df_model)``: the fully processed frame, and that frame
        re-indexed to exactly the columns of ``load_train_features()[0]``
        (columns not produced by the pipeline are filled with 0).
    """
    df = df.dropna(axis=1, how="all")

    # Special column processing
    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()
    # Remove spaces from HLA columns
    df.columns = [
        re.sub(r"\s+", "", col) if "_HLA" in col else col
        for col in df.columns
    ]

    # NEW: parse survival/date columns early (before normalize_strings)
    df = parse_date_columns(df, SURVIVAL_DATE_COLS)

    # String handling
    df = normalize_strings(df)
    df = clean_blood_group_columns(df, BLOOD_GROUP_COLS)

    # Data corrections
    df = correct_nationalities(df, "Recepient_Nationality")
    df = correct_drug_name_in_list(df, "PreHSCT conditioning regimen+/-ATG+/-TBI")
    df = correct_drug_name_in_list(df, "First_GVHD prophylaxis")
    df = standardize_compound_columns(
        df,
        ["PreHSCT conditioning regimen+/-ATG+/-TBI", "First_GVHD prophylaxis"],
    )
    df = standardize_gender(df)
    df = correct_donor_relationships(df)
    if "SELF" in df["Donor_relation to recepient"].unique():
        df = handle_self_donor_consistency(df)

    # --- NEW: standardize new Pro model categorical columns ---
    df = standardize_simple_category(df, "Donor_type", DONOR_TYPE_MAP, ALLOWED_DONOR_TYPES)
    df = standardize_simple_category(
        df, "Conditioning_intensity", COND_INTENSITY_MAP, ALLOWED_COND_INTENSITY
    )
    df = standardize_simple_category(
        df, "GVHD_Prophylaxis_Cat", PROPH_CAT_MAP, ALLOWED_PROPH_CAT
    )

    # HLA processing
    df = standardize_hla_matching(df)
    df = process_hla_columns(df)
    df = expand_HLA_cols(df)

    # Extract years
    df = extract_year(df, "HSCT_date")
    df = extract_year(df, "Recepient_DOB")
    df = extract_year(df, "Donor_DOB")
    df = extract_year(df, "Date of first diagnosis/BMBx date")
    df = calculate_ages_from_dates(df)

    # Final missing value handling: every non-datetime column gets
    # UNKNOWN_TOKEN in place of missing values. Datetime detection uses the
    # pandas helper because np.issubdtype raises on nullable dtypes such as
    # the Int64 age columns created just above.
    datetime_cols = {
        c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])
    }
    for c in df.columns:
        if c in datetime_cols:
            continue
        missing = df[c].isna()
        if missing.any():
            # Cast to object first: an Int64 column cannot hold the "X" token.
            df[c] = df[c].astype(object).where(~missing, UNKNOWN_TOKEN)

    # --- NEW: survival-ready variables for Cox ---
    # Re-parse so the survival date columns are datetime again even if an
    # earlier step left strings in them.
    df = parse_date_columns(df, SURVIVAL_DATE_COLS)
    df = coerce_event_column(df)
    df = derive_os_time_days(df)

    # One-hot encode multi-drug regimen columns
    df = split_and_one_hot_encode(df, 'PreHSCT conditioning regimen+/-ATG+/-TBI', 'PreHSCT')
    df = split_and_one_hot_encode(df, 'First_GVHD prophylaxis', 'First_GVHD_prophylaxis')

    # Normalize blood groups
    df = merge_blood_groups(
        df,
        "Recepient_Blood group before HSCT",
        "Recepient_Blood group before HSCT_MergePlusMinus",
    )
    df = merge_blood_groups(df, "D_Blood group", "D_Blood group_MergePlusMinus")

    # Binarize ages
    for age_col in ("R_Age_at_transplant", "D_Age_at_transplant"):
        for cutoff in (16, 18):
            df = binarize_age(df, age_col, cutoff, f"{age_col}_cutoff{cutoff}")

    # Gender/Relation features
    df = add_gender_relation_features(df)

    # Group nationalities
    df = apply_nationality_groupings(df, 'Recepient_Nationality', groupings)

    # Group and binarize diagnosis
    df = group_and_binarize_diagnosis(
        df, 'Hematological Diagnosis', DIAGNOSIS_GROUP_MAP, MALIGNANT_MAP
    )

    df = df.replace(UNKNOWN_TOKEN, np.nan)

    # Dropping single-valued columns (df.loc[:, df.nunique() > 1]) is disabled:
    # it fails with "unhashable type: list" on the list-valued regimen
    # columns. is_list_column() exists to skip such columns if it is revived.

    # Add one-hot columns the model was trained on but this dataset lacks.
    train_features, _ = load_train_features()
    one_hot_markers = ("_HLA", "First_GVHD_prophylaxis_", "PreHSCT_")
    for feature in train_features:
        if feature not in df.columns and any(m in feature for m in one_hot_markers):
            df[feature] = 0

    df_model = df.reindex(columns=train_features, fill_value=0)

    return df, df_model


if __name__ == "__main__":
    import sys

    # An optional first CLI argument overrides the default input path.
    default_csv = "/home/muhammadridzuan/2025_GVHD/2024_GVHD_SSMC/GVHD_Intel_data_MBZUAI_1.2.csv"
    df_raw = load_dataset(sys.argv[1] if len(sys.argv) > 1 else default_csv)
    _, df_model = preprocess_pipeline(df_raw)
    df_model.to_csv("preprocessed_gvhd_data.csv", index=False)