Spaces:
Running
Running
| import numpy as np | |
| import pandas as pd | |
| import re | |
| from sklearn.preprocessing import MultiLabelBinarizer | |
# Constants
# Sentinel token used throughout the pipeline for unknown/missing values.
UNKNOWN_TOKEN = "X"
# Day/month/year layout of date strings in the source data.
DATE_FORMAT = '%d/%m/%Y'
# Blood-group columns that need whitespace stripped (see clean_blood_group_columns).
BLOOD_GROUP_COLS = ["D_Blood group", "Recepient_Blood group before HSCT"]
# Raw nationality spellings / country names -> canonical demonyms.
NATIONALITY_CORRECTIONS = {
    "AFGHANISTAN": "AFGHAN",
    "ALGERIA": "ALGERIAN",
    "EMARATI": "EMIRATI",
    "UAE": "EMIRATI",
    "PHILIPPINO": "FILIPINO",
    "JORDAN": "JORDANIAN",
    "JORDANI": "JORDANIAN",
    "PAKISTAN": "PAKISTANI",
    "PAKISTANII": "PAKISTANI",
    "PALESTINE": "PALESTINIAN",
    "PALESTENIAN": "PALESTINIAN",
    "USA": "AMERICAN",
}
# 1. Regional Grouping (Geography-Based)
# Canonical nationality -> geographic region.
# NOTE(review): a few keys are country names rather than demonyms
# ('KUWAIT', 'MAURITANIA', 'FIJI') -- presumably matching the raw data; confirm.
REGIONAL_GROUPING = {
    # Middle East
    'EMIRATI': 'Middle East',
    'OMANI': 'Middle East',
    'SAUDI': 'Middle East',
    'KUWAIT': 'Middle East',
    'JORDANIAN': 'Middle East',
    'LEBANESE': 'Middle East',
    'IRAQI': 'Middle East',
    'SYRIAN': 'Middle East',
    'YEMENI': 'Middle East',
    'PALESTINIAN': 'Middle East',
    'BAHRAINI': 'Middle East',
    'LIBYAN': 'Middle East',
    # North Africa
    'EGYPTIAN': 'North Africa',
    'SUDANESE': 'North Africa',
    'ALGERIAN': 'North Africa',
    'MOROCCAN': 'North Africa',
    'MAURITANIA': 'North Africa',
    'COMORAN': 'North Africa',
    # South Asia
    'INDIAN': 'South Asia',
    'PAKISTANI': 'South Asia',
    'BANGLADESHI': 'South Asia',
    'SRI LANKAN': 'South Asia',
    'AFGHAN': 'South Asia',
    # Southeast Asia
    'FILIPINO': 'Southeast Asia',
    'INDONESIAN': 'Southeast Asia',
    # East Africa
    'ETHIOPIAN': 'East Africa',
    'SOMALI': 'East Africa',
    'ERITREAN': 'East Africa',
    # Central Asia
    'UZBEKISTANI': 'Central Asia',
    # Western Nations / Oceania / Americas
    'AMERICAN': 'Western',
    'BRITISH': 'Western',
    'NEW ZEALANDER': 'Oceania',
    'FIJI': 'Oceania'
}
# 2. Cultural-Linguistic Grouping
# Canonical nationality -> cultural/linguistic cluster (same key set as
# REGIONAL_GROUPING so the two feature columns stay aligned).
CULTURAL_GROUPING = {
    'EMIRATI': 'Arab',
    'OMANI': 'Arab',
    'SAUDI': 'Arab',
    'KUWAIT': 'Arab',
    'JORDANIAN': 'Arab',
    'LEBANESE': 'Arab',
    'IRAQI': 'Arab',
    'SYRIAN': 'Arab',
    'YEMENI': 'Arab',
    'PALESTINIAN': 'Arab',
    'BAHRAINI': 'Arab',
    'LIBYAN': 'Arab',
    'EGYPTIAN': 'Arab',
    'SUDANESE': 'Arab-African',
    'ALGERIAN': 'Arab',
    'MOROCCAN': 'Arab',
    'MAURITANIA': 'Arab',
    'COMORAN': 'Arab-African',
    'INDIAN': 'South Asian',
    'PAKISTANI': 'South Asian',
    'BANGLADESHI': 'South Asian',
    'SRI LANKAN': 'South Asian',
    'AFGHAN': 'South Asian',
    'FILIPINO': 'Southeast Asian',
    'INDONESIAN': 'Southeast Asian',
    'ETHIOPIAN': 'East African',
    'SOMALI': 'East African',
    'ERITREAN': 'East African',
    'UZBEKISTANI': 'Central Asian',
    'AMERICAN': 'Western/English-speaking',
    'BRITISH': 'Western/English-speaking',
    'NEW ZEALANDER': 'Western/English-speaking',
    'FIJI': 'Pacific Islander'
}
# 3. World Bank Income Grouping
# Canonical nationality -> World Bank income tier of the corresponding country.
# NOTE(review): tiers are revised annually by the World Bank -- record which
# fiscal-year classification this snapshot is based on.
INCOME_GROUPING = {
    'EMIRATI': 'High income',
    'OMANI': 'High income',
    'SAUDI': 'High income',
    'KUWAIT': 'High income',
    'JORDANIAN': 'Upper-middle income',
    'LEBANESE': 'Upper-middle income',
    'IRAQI': 'Upper-middle income',
    'SYRIAN': 'Low income',
    'YEMENI': 'Low income',
    'PALESTINIAN': 'Lower-middle income',
    'BAHRAINI': 'High income',
    'LIBYAN': 'Upper-middle income',
    'EGYPTIAN': 'Lower-middle income',
    'SUDANESE': 'Low income',
    'ALGERIAN': 'Lower-middle income',
    'MOROCCAN': 'Lower-middle income',
    'MAURITANIA': 'Low income',
    'COMORAN': 'Low income',
    'INDIAN': 'Lower-middle income',
    'PAKISTANI': 'Lower-middle income',
    'BANGLADESHI': 'Lower-middle income',
    'SRI LANKAN': 'Lower-middle income',
    'AFGHAN': 'Low income',
    'FILIPINO': 'Lower-middle income',
    'INDONESIAN': 'Lower-middle income',
    'ETHIOPIAN': 'Low income',
    'SOMALI': 'Low income',
    'ERITREAN': 'Low income',
    'UZBEKISTANI': 'Lower-middle income',
    'AMERICAN': 'High income',
    'BRITISH': 'High income',
    'NEW ZEALANDER': 'High income',
    'FIJI': 'Upper-middle income'
}
# 4. WHO Regional Office Grouping
# Canonical nationality -> WHO regional office
# (EMRO = Eastern Mediterranean, AFRO = Africa, SEARO = South-East Asia,
#  WPRO = Western Pacific, EURO = Europe, AMRO = Americas).
WHO_REGION_GROUPING = {
    'EMIRATI': 'EMRO',
    'OMANI': 'EMRO',
    'SAUDI': 'EMRO',
    'KUWAIT': 'EMRO',
    'JORDANIAN': 'EMRO',
    'LEBANESE': 'EMRO',
    'IRAQI': 'EMRO',
    'SYRIAN': 'EMRO',
    'YEMENI': 'EMRO',
    'PALESTINIAN': 'EMRO',
    'BAHRAINI': 'EMRO',
    'LIBYAN': 'EMRO',
    'EGYPTIAN': 'EMRO',
    'SUDANESE': 'EMRO',
    'ALGERIAN': 'AFRO',
    'MOROCCAN': 'EMRO',
    'MAURITANIA': 'AFRO',
    'COMORAN': 'AFRO',
    'INDIAN': 'SEARO',
    'PAKISTANI': 'EMRO',
    'BANGLADESHI': 'SEARO',
    'SRI LANKAN': 'SEARO',
    'AFGHAN': 'EMRO',
    'FILIPINO': 'WPRO',
    'INDONESIAN': 'SEARO',
    'ETHIOPIAN': 'AFRO',
    'SOMALI': 'EMRO',
    'ERITREAN': 'AFRO',
    'UZBEKISTANI': 'EURO',
    'AMERICAN': 'AMRO',
    'BRITISH': 'EURO',
    'NEW ZEALANDER': 'WPRO',
    'FIJI': 'WPRO'
}
# Output-column name -> nationality mapping; consumed by
# apply_nationality_groupings() in the preprocessing pipeline.
groupings = {
    'Recepient_Nationality_Geographical': REGIONAL_GROUPING,
    'Recepient_Nationality_Cultural': CULTURAL_GROUPING,
    'Recepient_Nationality_Regional_Income': INCOME_GROUPING,
    'Recepient_Nationality_Regional_WHO': WHO_REGION_GROUPING
}
# FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
# Exact-match corrections for misspelled / abbreviated drug tokens in the
# regimen columns (applied token-by-token by correct_indiv_drug_name).
# NOTE(review): "CY" is mapped to CYCLOSPORIN, but in HSCT regimens "CY"
# frequently abbreviates CYCLOPHOSPHAMIDE -- confirm against source records.
DRUG_SPELLING_CORRECTIONS = {
    "CYCLOSPOPRIN": "CYCLOSPORIN",
    "CYCLOSPRIN": "CYCLOSPORIN",
    "CYCLOSPOROIN": "CYCLOSPORIN",
    "CY": "CYCLOSPORIN",
    "TAC": "TACROLIMUS",
    "MTX": "METHOTREXATE",
    "BUDESONIDE": "STEROID",
    "STEROIDS": "STEROID",
    "ATG.": "ATG",
    "FLUDARABINIE": "FLUDARABINE",
    "FLUDRABINE": "FLUDARABINE",
    "BUSULPHAN": "BUSULFAN",
    "MEPHALAN": "MELPHALAN",
    "GEMCITABIBE": "GEMCITABINE",
}
# Numeric (and stringified-numeric) gender codes -> labels; code 2 = unknown.
GENDER_MAP = {
    0: "MALE", 1: "FEMALE", 2: UNKNOWN_TOKEN,
    "0": "MALE", "1": "FEMALE", "2": UNKNOWN_TOKEN
}
# Case-insensitive regex patterns collapsing donor relations into buckets
# (applied with regex=True by correct_donor_relationships).
# NOTE(review): these are substring patterns -- "GRANDFATHER" would also match
# "(?i)FATHER" and "GRANDSON" would match "(?i)SON"; confirm the raw data
# contains no such values before relying on this.
RELATION_CORRECTIONS = {
    r"(?i)BROTHER": "SIBLING",
    r"(?i)SISTER": "SIBLING",
    r"(?i)FATHER": "FIRST DEGREE RELATIVE",
    r"(?i)MOTHER": "FIRST DEGREE RELATIVE",
    r"(?i)SON": "FIRST DEGREE RELATIVE",
    r"(?i)DAUGHTER": "FIRST DEGREE RELATIVE",
    r"(?i)COUSIN": "SECOND DEGREE RELATIVE",
    r"(?i)UNCLE": "SECOND DEGREE RELATIVE",
    r"(?i)AUNT": "SECOND DEGREE RELATIVE",
    r"(?i)other": UNKNOWN_TOKEN
}
# Frame-wide find/replace table applied with regex=True in normalize_strings:
# unknown-value variants -> UNKNOWN_TOKEN, plus spelling/case fixes.
# NOTE(review): "1o" -> "10" replaces that substring wherever it occurs, and
# "(?i)Umbilical Cord" also matches inside "Umbilical Cord blood" -- the net
# result appears correct (both end as "UMBILICAL CORD") but verify on real data.
STRING_NORMALIZATION_MAP = {
    r"(?i)unknown": UNKNOWN_TOKEN, r"(?i)unkown": UNKNOWN_TOKEN,
    r"(?i)Unknwon": UNKNOWN_TOKEN, np.nan: UNKNOWN_TOKEN,
    r"(?i)\bMale\b": "MALE", r"(?i)\bFemale\b": "FEMALE",
    "1o": "10", r"(?i)Umbilical Cord": "UMBILICAL CORD",
    r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
    r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
}
# Exact-match mapping of individual hematological diagnoses (including common
# misspellings seen in the data) onto coarse diagnosis groups.
DIAGNOSIS_GROUP_MAP = {
    "MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
    "CML": "MYELOPROLIFERATIVE NEOPLASMS",
    "MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
    "NON-HODGKIN LYMPHOMA": "LYMPHOMA",
    'NON HODGKIN LYMPHOMA': "LYMPHOMA",
    "HODGKIN LYMPHOMA": "LYMPHOMA",
    "BETA THALASSEMIA": "RED CELL DISORDERS",
    'BETA THALESSEMIA': "RED CELL DISORDERS",
    "ALPHA THALASSEMIA": "RED CELL DISORDERS",
    "ALPHA THALESSEMIA": "RED CELL DISORDERS",
    "ALPHA THALSSEMIA": "RED CELL DISORDERS",
    "HEREDITARY SPHEROCYTOSIS": "RED CELL DISORDERS",
    "SICKLE CELL DISEASE": "RED CELL DISORDERS",
    "APLASTIC ANEMIA": "BMF SYNDROMES",
    "FANCONI ANEMIA": "BMF SYNDROMES",
    "DYSKERATOSIS CONGENITA": "BMF SYNDROMES",
    'DYSKERATOSIS CONGENTIA': "BMF SYNDROMES",
    "CHRONIC GRANULOMATOUS DISEASE": "IMMUNE DISORDERS",
    "COMBINED VARIABLE IMMUNODEFICIENCY": "IMMUNE DISORDERS",
    "SCID": "IMMUNE DISORDERS",
    # NOTE(review): the '-LINKED ...' keys below look like values whose leading
    # "X" was already replaced by an earlier unknown-token substitution -- confirm.
    "X-LINKED HYPERGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
    '-LINKED HYPERGAMMAGLOBULINEMIA': "IMMUNE DISORDERS",
    '-LINKED HYPER IGM SYNDROME': "IMMUNE DISORDERS",
    "HYPOGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
    # NOTE(review): grouping of the entries below as "OTHER" was flagged for
    # clinical review in the original code ("check this one").
    "GLANZMANN": "OTHER",
    'GLANZMANN THROMBASTHENIA': "OTHER",
    "CLL": "OTHER",
    "PNH": "OTHER",
    "HLH": "OTHER",
    "LANGERHANS CELL HISTIOCYTOSIS": "OTHER",
    "BLASTIC PLASMACYTOID DENDRITIC CELL NEOPLASM": "OTHER",
    'BLASTIC PLASMACYTOID DENDRITRIC CELL NEOPLASM': "OTHER",
    "B-ALL": "ALL",
    "BALL": "ALL",
    "TALL": "ALL",
    "T-ALL": "ALL",
    "AML": "AML",
    "ACUTE MYELOID LEUKEMIA": "AML"
}
# Diagnosis group -> malignancy flag (0 = nonmalignant, 1 = malignant).
# NOTE(review): some 0 entries (e.g. MULTIPLE MYELOMA, PLASMA CELL LEUKEMIA,
# MEDULLOBLASTOMA, NEUROBLASTOMA) are clinically malignant -- confirm whether
# this flag encodes something narrower than biological malignancy.
MALIGNANT_MAP = {
    'AML': 1,
    'RED CELL DISORDERS': 0,
    'AMYLOIDOSIS': 0,
    'BMF SYNDROMES': 0,
    'ALL': 1,
    'OTHER': 0,
    'IMMUNE DISORDERS': 0,
    'CHRONIC LYMPHOCYTIC LEUKEMIA': 1,
    'MYELOPROLIFERATIVE NEOPLASMS': 1,  # note: CML is malignant; not sure about MYELOPROLIFERATIVE DISORDER & MYELOFIBROSIS
    'HEMOPHAGOCYTIC LYMPHOHISTIOCYTOSIS (HLH)': 0,
    'LYMPHOMA': 1,
    'MYELODYSPLASTIC SYNDROME': 1,
    'MEDULLOBLASTOMA': 0,
    'MULTIPLE MYELOMA': 0,
    'NEUROBLASTOMA': 0,
    'PAROXYSMAL NOCTURNAL HEMOGLOBINURIA': 0,
    'PLASMA CELL LEUKEMIA': 0
}
# Raw "n OF m" HLA match-ratio strings -> FULL / PARTIAL / HAPLOIDENTICAL
# categories (exact match, no regex; see standardize_hla_matching).
HLA_MATCHING_MAP = {
    "12 OF 12": "FULL",
    "10 OF 10": "FULL",
    "8 OF 8": "FULL",  # NOTE(review): flagged in original code ("not full?") -- confirm clinically
    "9 OF 10": "PARTIAL",
    "8 OF 10": "PARTIAL",
    "PARTIALLY MATCHED": "PARTIAL",
    "7 OF 10": "HAPLOIDENTICAL",
    "6 OF 12": "HAPLOIDENTICAL",
    "6 OF 10": "HAPLOIDENTICAL",
    "5 OF 10": "HAPLOIDENTICAL",
    # confirm if the following are all haploidentical
    "5 OF 8": "HAPLOIDENTICAL",
    "4 OF 6": "HAPLOIDENTICAL",
}
def load_train_features():
    """Return the model feature lists.

    Returns:
        tuple[list[str], list[str]]:
            - train_features: base demographic/clinical columns plus one-hot
              HLA allele columns (unknown-allele "_X" columns excluded) and
              one-hot regimen/prophylaxis columns.
            - cat_features: the subset of train_features that are categorical.
    """
    # One-hot HLA allele columns produced by expand_HLA_cols for recipient (R_)
    # and donor (D_) at loci A, B, C, DR, DQ; "_X" marks the unknown allele.
    HLA_sub12 = [
        # Recipient - HLA-A
        'R_HLA_A_1', 'R_HLA_A_2', 'R_HLA_A_3', 'R_HLA_A_4', 'R_HLA_A_7', 'R_HLA_A_8',
        'R_HLA_A_11', 'R_HLA_A_12', 'R_HLA_A_20', 'R_HLA_A_23', 'R_HLA_A_24', 'R_HLA_A_25',
        'R_HLA_A_26', 'R_HLA_A_29', 'R_HLA_A_30', 'R_HLA_A_31', 'R_HLA_A_32', 'R_HLA_A_33',
        'R_HLA_A_34', 'R_HLA_A_66', 'R_HLA_A_68', 'R_HLA_A_69', 'R_HLA_A_74', 'R_HLA_A_X',
        # Recipient - HLA-B
        'R_HLA_B_7', 'R_HLA_B_8', 'R_HLA_B_13', 'R_HLA_B_14', 'R_HLA_B_15', 'R_HLA_B_18',
        'R_HLA_B_23', 'R_HLA_B_24', 'R_HLA_B_27', 'R_HLA_B_35', 'R_HLA_B_37', 'R_HLA_B_38',
        'R_HLA_B_39', 'R_HLA_B_40', 'R_HLA_B_41', 'R_HLA_B_42', 'R_HLA_B_44', 'R_HLA_B_45',
        'R_HLA_B_46', 'R_HLA_B_49', 'R_HLA_B_50', 'R_HLA_B_51', 'R_HLA_B_52', 'R_HLA_B_53',
        'R_HLA_B_55', 'R_HLA_B_56', 'R_HLA_B_57', 'R_HLA_B_58', 'R_HLA_B_73', 'R_HLA_B_81',
        'R_HLA_B_X',
        # Recipient - HLA-C
        'R_HLA_C_1', 'R_HLA_C_2', 'R_HLA_C_3', 'R_HLA_C_4', 'R_HLA_C_5', 'R_HLA_C_6',
        'R_HLA_C_7', 'R_HLA_C_8', 'R_HLA_C_12', 'R_HLA_C_14', 'R_HLA_C_15', 'R_HLA_C_16',
        'R_HLA_C_17', 'R_HLA_C_18', 'R_HLA_C_38', 'R_HLA_C_49', 'R_HLA_C_50', 'R_HLA_C_X',
        # Recipient - HLA-DR
        'R_HLA_DR_1', 'R_HLA_DR_2', 'R_HLA_DR_3', 'R_HLA_DR_4', 'R_HLA_DR_5', 'R_HLA_DR_6',
        'R_HLA_DR_7', 'R_HLA_DR_8', 'R_HLA_DR_9', 'R_HLA_DR_10', 'R_HLA_DR_11', 'R_HLA_DR_12',
        'R_HLA_DR_13', 'R_HLA_DR_14', 'R_HLA_DR_15', 'R_HLA_DR_16', 'R_HLA_DR_17', 'R_HLA_DR_X',
        # Recipient - HLA-DQ
        'R_HLA_DQ_1', 'R_HLA_DQ_2', 'R_HLA_DQ_3', 'R_HLA_DQ_4', 'R_HLA_DQ_5', 'R_HLA_DQ_6',
        'R_HLA_DQ_7', 'R_HLA_DQ_11', 'R_HLA_DQ_15', 'R_HLA_DQ_16', 'R_HLA_DQ_301', 'R_HLA_DQ_X',
        # Donor - HLA-A
        'D_HLA_A_1', 'D_HLA_A_2', 'D_HLA_A_3', 'D_HLA_A_8', 'D_HLA_A_11', 'D_HLA_A_12',
        'D_HLA_A_23', 'D_HLA_A_24', 'D_HLA_A_25', 'D_HLA_A_26', 'D_HLA_A_29', 'D_HLA_A_30',
        'D_HLA_A_31', 'D_HLA_A_32', 'D_HLA_A_33', 'D_HLA_A_34', 'D_HLA_A_66', 'D_HLA_A_68',
        'D_HLA_A_69', 'D_HLA_A_7', 'D_HLA_A_74', 'D_HLA_A_X',
        # Donor - HLA-B
        'D_HLA_B_7', 'D_HLA_B_8', 'D_HLA_B_13', 'D_HLA_B_14', 'D_HLA_B_15', 'D_HLA_B_17',
        'D_HLA_B_18', 'D_HLA_B_23', 'D_HLA_B_24', 'D_HLA_B_27', 'D_HLA_B_35', 'D_HLA_B_37',
        'D_HLA_B_38', 'D_HLA_B_39', 'D_HLA_B_40', 'D_HLA_B_41', 'D_HLA_B_42', 'D_HLA_B_44',
        'D_HLA_B_45', 'D_HLA_B_48', 'D_HLA_B_49', 'D_HLA_B_50', 'D_HLA_B_51', 'D_HLA_B_52',
        'D_HLA_B_53', 'D_HLA_B_55', 'D_HLA_B_56', 'D_HLA_B_57', 'D_HLA_B_58', 'D_HLA_B_73',
        'D_HLA_B_81', 'D_HLA_B_X',
        # Donor - HLA-C
        'D_HLA_C_1', 'D_HLA_C_2', 'D_HLA_C_3', 'D_HLA_C_4', 'D_HLA_C_5', 'D_HLA_C_6',
        'D_HLA_C_7', 'D_HLA_C_8', 'D_HLA_C_12', 'D_HLA_C_14', 'D_HLA_C_15', 'D_HLA_C_16',
        'D_HLA_C_17', 'D_HLA_C_18', 'D_HLA_C_38', 'D_HLA_C_49', 'D_HLA_C_50', 'D_HLA_C_X',
        # Donor - HLA-DR
        'D_HLA_DR_1', 'D_HLA_DR_2', 'D_HLA_DR_3', 'D_HLA_DR_4', 'D_HLA_DR_5', 'D_HLA_DR_6',
        'D_HLA_DR_7', 'D_HLA_DR_8', 'D_HLA_DR_9', 'D_HLA_DR_10', 'D_HLA_DR_11', 'D_HLA_DR_12',
        'D_HLA_DR_13', 'D_HLA_DR_14', 'D_HLA_DR_15', 'D_HLA_DR_16', 'D_HLA_DR_17', 'D_HLA_DR_X',
        # Donor - HLA-DQ
        'D_HLA_DQ_1', 'D_HLA_DQ_2', 'D_HLA_DQ_3', 'D_HLA_DQ_4', 'D_HLA_DQ_5', 'D_HLA_DQ_6',
        'D_HLA_DQ_7', 'D_HLA_DQ_11', 'D_HLA_DQ_15', 'D_HLA_DQ_16', 'D_HLA_DQ_301', 'D_HLA_DQ_X'
    ]
    # Drop the unknown-allele indicator columns from the model inputs.
    HLA_sub12_without_X = [i for i in HLA_sub12 if "_X" not in i]
    # One-hot columns produced by split_and_one_hot_encode for the
    # pre-HSCT conditioning regimen.
    prehsct_onehot = [
        'PreHSCT_ALEMTUZUMAB',
        'PreHSCT_ATG',
        'PreHSCT_BEAM',
        'PreHSCT_BUSULFAN',
        'PreHSCT_CAMPATH',
        'PreHSCT_CARMUSTINE',
        'PreHSCT_CLOFARABINE',
        'PreHSCT_CYCLOPHOSPHAMIDE',
        'PreHSCT_CYCLOSPORIN',
        'PreHSCT_CYTARABINE',
        'PreHSCT_ETOPOSIDE',
        'PreHSCT_FLUDARABINE',
        'PreHSCT_GEMCITABINE',
        'PreHSCT_MELPHALAN',
        'PreHSCT_MTX',
        'PreHSCT_OTHER',
        'PreHSCT_RANIMUSTINE',
        'PreHSCT_REDUCEDCONDITIONING',
        'PreHSCT_RITUXIMAB',
        'PreHSCT_SIROLIMUS',
        'PreHSCT_TBI',
        'PreHSCT_THIOTEPA',
        'PreHSCT_TREOSULFAN',
        'PreHSCT_UA',
        'PreHSCT_VORNOSTAT',
    ]
    # One-hot columns for the first GvHD prophylaxis regimen.
    first_prophylaxis_onehot = [
        'First_GVHD_prophylaxis_ABATACEPT',
        'First_GVHD_prophylaxis_ALEMTUZUMAB',
        'First_GVHD_prophylaxis_ATG',
        'First_GVHD_prophylaxis_CYCLOPHOSPHAMIDE',
        'First_GVHD_prophylaxis_CYCLOSPORIN',
        'First_GVHD_prophylaxis_IMATINIB',
        'First_GVHD_prophylaxis_LEFLUNOMIDE',
        'First_GVHD_prophylaxis_MMF',
        'First_GVHD_prophylaxis_MTX',
        'First_GVHD_prophylaxis_NONE',
        'First_GVHD_prophylaxis_RUXOLITINIB',
        'First_GVHD_prophylaxis_SIROLIMUS',
        'First_GVHD_prophylaxis_STEROID',
        'First_GVHD_prophylaxis_TAC',
    ]
    # Plain list concatenation (the original wrapped this in `[[...]][0]`,
    # which was a no-op).
    train_features = [
        'Recipient_gender',
        'R_Age_at_transplant_cutoff18',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'D_Age_at_transplant_cutoff18',
        'Age_Gap_R_D',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Number of lines of Rx before HSCT',
        'Source of cells',
        'Donor_relation to recipient',
    ] + HLA_sub12_without_X + prehsct_onehot + first_prophylaxis_onehot
    # Categorical features (subset of train_features).
    cat_features = [
        'Recipient_gender',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Source of cells',
        'Donor_relation to recipient',
    ]
    return train_features, cat_features
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read the raw CSV (second physical line is the header row) and discard
    any column that is entirely empty."""
    raw = pd.read_csv(file_path, header=1)
    return raw.dropna(axis=1, how="all")
def normalize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize string values across the dataset:
    - Replace variations of unknown/NA with the consistent UNKNOWN_TOKEN
    - Correct common misspellings and abbreviations
    - Upper-case and strip every string cell
    """
    # Frame-wide regex substitutions (unknown variants, spelling fixes, NaN).
    df = df.replace(STRING_NORMALIZATION_MAP, regex=True)
    # "UK" means unknown everywhere except nationality columns (where it is a country).
    outside_nationality = [col for col in df.columns if "Nationality" not in col]
    df[outside_nationality] = df[outside_nationality].replace(
        {r"(?i)\buk\b": UNKNOWN_TOKEN}, regex=True
    )
    # "NA" means missing everywhere except HLA columns (where it is handled later).
    outside_hla = [col for col in df.columns if "HLA" not in col]
    df[outside_hla] = df[outside_hla].replace(
        {r"(?i)\bna\b": UNKNOWN_TOKEN}, regex=True
    )
    # Upper-case then strip each string cell in a single pass
    # (equivalent to the two separate passes of the original).
    def _canonical_text(cell):
        return cell.upper().strip() if isinstance(cell, str) else cell
    return df.applymap(_canonical_text)
def clean_blood_group_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Strip every whitespace character out of the given blood-group columns."""
    for blood_col in columns:
        cleaned = df[blood_col].str.replace(r"\s+", "", regex=True)
        df[blood_col] = cleaned
    return df
def standardize_hla_matching(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse raw 'n OF m' HLA match-ratio strings into the
    FULL / PARTIAL / HAPLOIDENTICAL categories (exact matches only)."""
    ratio_col = 'HLA match ratio'
    df[ratio_col] = df[ratio_col].replace(HLA_MATCHING_MAP, regex=False)
    return df
def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and process HLA columns by:
    1. Splitting combined "a&b" HLA values into two separate columns
    2. Standardizing missing value representation
    3. Sorting the two allele values numerically
    4. Recombining cleaned values back into "a&b" form

    Works on every column whose name contains "R_HLA" or "D_HLA"; adds the
    split columns "<col>1"/"<col>2" as a side effect and rewrites "<col>".
    """
    # Padding function to ensure exactly 2 elements, filling with 'NA'.
    # Used when the column already holds lists (Individual_Predictions path).
    def pad_list(val):
        if not isinstance(val, list):
            val = []
        return (val + ['NA', 'NA'])[:2]
    hla_columns = [col for col in df.columns if "R_HLA" in col or "D_HLA" in col]
    for col in hla_columns:
        # A bare "NA" stands for both alleles missing; rewrite so the "&" split
        # below yields two cells instead of one.
        df[col] = df[col].replace({"NA": "NA&NA"})
        split_cols = [f"{col}1", f"{col}2"]
        # The first cell's type decides whether the column holds "a&b" strings
        # or already-tokenised lists (assumes the column is homogeneous -- TODO confirm).
        if type(df[col].iloc[0]) != list:
            df[split_cols] = df[col].str.split("&", expand=True)
        elif type(df[col].iloc[0]) == list:
            df[col] = df[col].apply(pad_list)
            df[split_cols] = pd.DataFrame(df[col].tolist(), index=df.index)
        # Standardize missing values.
        # NOTE(review): "B1" is treated as missing -- presumably a known
        # data-entry artifact; confirm against the source sheet.
        missing_indicators = {" ", "NA", "N/A", UNKNOWN_TOKEN, "''", '""', "", "B1", None}
        df[split_cols] = df[split_cols].replace(missing_indicators, np.nan)
        # Convert to numeric; anything non-numeric becomes NaN, and allele 0
        # is also treated as missing.
        df[split_cols] = df[split_cols].apply(pd.to_numeric, errors='coerce')
        df[split_cols] = df[split_cols].replace(0, np.nan)
        # Sort the two alleles numerically so "2&1" and "1&2" are equal
        # (np.sort pushes NaN to the end of each row).
        df[split_cols] = np.sort(df[split_cols], axis=1)
        # Render numbers as plain integer strings; missing becomes UNKNOWN_TOKEN.
        df[split_cols] = df[split_cols].applymap(lambda x: str(int(x)) if pd.notna(x) else UNKNOWN_TOKEN)
        # Recombine the cleaned pair back into the canonical "a&b" form.
        df[col] = df[split_cols].astype(str).agg("&".join, axis=1)
    return df
def cast_as_int_if_possible(x):
    """Return ``x`` as an int when the conversion is lossless, else ``x`` unchanged.

    Examples: "5" -> 5, 7.0 -> 7, "5.5" -> "5.5", "abc" -> "abc", None -> None.
    """
    try:
        i = int(x)
        # Only return int if conversion is lossless (e.g., avoid '5.5' -> 5).
        if float(x) == i:
            return i
    # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; these are the only errors int()/float() raise here.
    except (TypeError, ValueError, OverflowError):
        pass
    return x
def HLA_unique_alleles(df, HLA_col1, HLA_col2):
    """Return the sorted set of allele labels seen in either of two HLA columns.

    Values are first cast to int when lossless so "1" and 1 collapse to one
    entry, NaN becomes UNKNOWN_TOKEN, and everything is stringified before
    sorting (so ordering is lexicographic: "10" sorts before "2").
    """
    col1_values = [cast_as_int_if_possible(val) for val in df[HLA_col1].unique()]
    col2_values = [cast_as_int_if_possible(val) for val in df[HLA_col2].unique()]
    unique_set = set(col1_values).union(set(col2_values))
    # Replace NaN with the unknown token and stringify for a stable sort.
    unique_set = {(UNKNOWN_TOKEN if pd.isna(item) else str(item)) for item in unique_set}
    # (Removed the leftover debug `print` that wrote the full set to stdout
    # on every call -- 20 calls per pipeline run.)
    return sorted(unique_set)
def expand_HLA_cols_(df, HLA_col1, HLA_col2):
    """One-hot encode allele presence across one pair of HLA columns.

    For every allele seen in either column, adds '<locus>_<allele>' set to 1
    on rows where either column carries that allele, else 0.
    """
    allele_labels = HLA_unique_alleles(df, HLA_col1, HLA_col2)
    locus_name = HLA_col1[:-1]  # e.g. "R_HLA_A1" -> "R_HLA_A"
    for allele in allele_labels:
        indicator = f"{locus_name}_{allele}"
        present = (df[HLA_col1] == allele) | (df[HLA_col2] == allele)
        df[indicator] = 0
        df.loc[present, indicator] = 1
    return df
def expand_HLA_cols(df):
    """Run the one-hot HLA expansion for recipient (R) and donor (D) across
    all five loci (A, B, C, DR, DQ), in the same order as before."""
    for side in ("R", "D"):
        for locus in ("A", "B", "C", "DR", "DQ"):
            df = expand_HLA_cols_(
                df,
                HLA_col1=f"{side}_HLA_{locus}1",
                HLA_col2=f"{side}_HLA_{locus}2",
            )
    return df
def correct_nationalities(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Map raw nationality spellings onto canonical demonyms; values absent
    from NATIONALITY_CORRECTIONS pass through unchanged."""
    df[column] = df[column].map(lambda value: NATIONALITY_CORRECTIONS.get(value, value))
    return df
def correct_indiv_drug_name(drug_list):
    """Correct misspelled drug tokens in one regimen entry.

    Accepts either a compound string ("CY+MTX / ATG.") or a pre-split list of
    tokens. Separators (space, '/', '+') are kept in place; every other token
    is looked up in DRUG_SPELLING_CORRECTIONS. Returns the reassembled string.
    """
    if isinstance(drug_list, str):
        # Split on separators but capture them so the string can be rebuilt.
        parts = re.split(r'([ /+])', drug_list)
    elif isinstance(drug_list, list):
        parts = drug_list
    else:
        # Non-string, non-list values (e.g. NaN) pass through untouched.
        # The original code left `parts` unbound here and raised NameError.
        return drug_list
    corrected_parts = []
    for part in parts:
        token = part.strip()
        # Drug tokens get corrected; separators are kept verbatim.
        if token and token not in {'/', '+'}:
            corrected_parts.append(DRUG_SPELLING_CORRECTIONS.get(token, token))
        else:
            corrected_parts.append(part)
    return ''.join(corrected_parts)
def correct_drug_name_in_list(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Run correct_indiv_drug_name over every entry of *column*, preserving
    the original separators within each entry."""
    df[column] = df[column].map(correct_indiv_drug_name)
    return df
def standardize_compound_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Canonicalize compound "A/B/C" regimen values:
    1. Remove all whitespace
    2. Normalise '+' and ',' separators to '/'
    3. Sort the components alphabetically

    Columns that are absent, or already tokenised into lists, are skipped.
    """
    def _canonical(value):
        if not isinstance(value, str):
            return value
        return "/".join(sorted(p for p in value.split("/") if p))

    for col in columns:
        if col not in df.columns or type(df[col].iloc[0]) == list:
            continue
        cleaned = (
            df[col]
            .str.replace(r"\s+", "", regex=True)
            .str.replace("+", "/")
            .str.replace(",", "/")
        )
        df[col] = cleaned.apply(_canonical)
    return df
def standardize_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise recipient/donor gender codes via GENDER_MAP, then backfill
    donor gender from relations that unambiguously imply it."""
    for gender_col in ("Donor_gender", "Recipient_gender"):
        df[gender_col] = df[gender_col].replace(GENDER_MAP)
    # Relations that determine the donor's gender outright.
    relation_to_gender = {
        "BROTHER": "MALE", "SISTER": "FEMALE",
        "FATHER": "MALE", "MOTHER": "FEMALE",
        "SON": "MALE", "DAUGHTER": "FEMALE",
        "UNCLE": "MALE", "AUNT": "FEMALE",
    }
    for relation, inferred_gender in relation_to_gender.items():
        matches = df["Donor_relation to recipient"] == relation
        df.loc[matches, "Donor_gender"] = inferred_gender
    return df
def correct_donor_relationships(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse specific donor relations (brother, aunt, ...) into the
    SIBLING / FIRST- / SECOND-DEGREE buckets defined by RELATION_CORRECTIONS."""
    relation_col = "Donor_relation to recipient"
    return df.replace({relation_col: RELATION_CORRECTIONS}, regex=True)
def handle_self_donor_consistency(df: pd.DataFrame) -> pd.DataFrame:
    """
    Enforce consistency for autologous ("SELF") donors:
    1. Force every HLA cell on those rows to 'SELF&SELF'
    2. Assert that recipient and donor gender, blood group and DOB agree
    """
    is_self = df["Donor_relation to recipient"] == "SELF"
    # Every HLA column (recipient and donor side alike).
    hla_cols = [c for c in df.columns if "R_HLA" in c or "D_HLA" in c]
    df.loc[is_self, hla_cols] = "SELF&SELF"
    # (recipient column, donor column, failure message) triples to verify.
    consistency_checks = (
        ("Recipient_gender", "Donor_gender",
         "Recipient/Donor gender mismatch for self-donors"),
        ("Recepient_Blood group before HSCT", "D_Blood group",
         "Blood group mismatch for self-donors"),
        ("Recepient_DOB", "Donor_DOB",
         "DOB mismatch for self-donors"),
    )
    for recipient_col, donor_col, message in consistency_checks:
        assert df.loc[is_self, recipient_col].equals(df.loc[is_self, donor_col]), message
    return df
def safe_extract_year(date_str: str):
    """
    Extract the year from a 'dd/mm/yyyy' string.

    Returns the year as an int when parseable, otherwise UNKNOWN_TOKEN
    (non-strings, free-text ages like "35 YEAR OLD", too few '/' parts,
    or a non-digit year field).
    """
    if not isinstance(date_str, str) or date_str == UNKNOWN_TOKEN:
        return UNKNOWN_TOKEN
    try:
        # Free-text ages such as "35 YEAR OLD" are not dates.
        if "YEAR" in date_str:
            return UNKNOWN_TOKEN
        pieces = date_str.split("/")
        if len(pieces) < 3:
            return UNKNOWN_TOKEN
        candidate = pieces[-1].strip()
        if candidate.isdigit():
            return int(candidate)
        return UNKNOWN_TOKEN
    except (ValueError, TypeError):
        # e.g. digit-like Unicode that int() still rejects.
        return UNKNOWN_TOKEN
def extract_year(df: pd.DataFrame, column_name) -> pd.DataFrame:
    """Add a '<column>_Year' column holding the year parsed from each date
    string (UNKNOWN_TOKEN where unparseable)."""
    year_col = f"{column_name}_Year"
    df[year_col] = df[column_name].map(safe_extract_year)
    return df
def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive, in whole years:
    1. R_Age_at_transplant (HSCT year - recipient birth year)
    2. D_Age_at_transplant (HSCT year - donor birth year)
    3. Age_Gap_R_D (recipient birth year - donor birth year)

    Rows whose year tokens are not numeric get UNKNOWN_TOKEN.
    """
    # Ensure the *_Year helper columns exist (extract_year is idempotent).
    for date_col in ("HSCT_date", "Recepient_DOB", "Donor_DOB"):
        df = extract_year(df, date_col)

    def _year_difference(row, earlier_col, later_col):
        # Any non-numeric year (e.g. UNKNOWN_TOKEN) propagates as UNKNOWN_TOKEN.
        try:
            return int(row[later_col]) - int(row[earlier_col])
        except (TypeError, ValueError):
            return UNKNOWN_TOKEN

    derived_columns = {
        "R_Age_at_transplant": ("Recepient_DOB_Year", "HSCT_date_Year"),
        "D_Age_at_transplant": ("Donor_DOB_Year", "HSCT_date_Year"),
        "Age_Gap_R_D": ("Donor_DOB_Year", "Recepient_DOB_Year"),
    }
    for new_col, (earlier_col, later_col) in derived_columns.items():
        df[new_col] = df.apply(
            lambda row, e=earlier_col, l=later_col: _year_difference(row, e, l),
            axis=1,
        )
    return df
# Utility Function: Split and One-Hot Encode Drug Regimens
def split_and_one_hot_encode(df, column_name, prefix):
    """
    Split a '/'-separated column into token lists and append one-hot columns.

    Args:
        df (pd.DataFrame): Input dataframe
        column_name (str): Column to process (left as token lists afterwards)
        prefix (str): Prefix for the generated indicator columns

    Returns:
        pd.DataFrame: DataFrame with the one-hot columns concatenated on.
    """
    # Tokenise only if not already done (first cell's type decides).
    if not isinstance(df[column_name].iloc[0], list):
        df[column_name] = df[column_name].fillna("").apply(
            lambda x: re.split(r'[/]', x) if x else []
        )
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(df[column_name])
    onehot = pd.DataFrame(
        indicator_matrix,
        columns=[f"{prefix}_{token.strip()}" for token in binarizer.classes_],
        index=df.index,
    )
    return pd.concat([df, onehot], axis=1)
# Normalize Blood Groups (Remove +/-)
def merge_blood_groups(df, column, new_col):
    """
    Copy *column* into *new_col* with the Rh sign ('+'/'-') stripped off.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column holding raw blood groups (e.g. "A+")
        new_col (str): Destination column for the ABO-only value

    Returns:
        pd.DataFrame: Updated dataframe (missing values stay NaN).
    """
    def _strip_rh(value):
        if pd.isnull(value):
            return np.nan
        return re.sub(r'[+-]', '', value)
    df[new_col] = df[column].apply(_strip_rh)
    return df
def binarize_age(df, age_col, cutoff, new_col):
    """
    Add *new_col*: 1 where age >= cutoff, 0 where below; non-numeric values
    (e.g. the unknown token) pass through unchanged.

    Args:
        df (pd.DataFrame): Input dataframe
        age_col (str): Column containing ages
        cutoff (int): Age threshold
        new_col (str): Name of the binary output column

    Returns:
        pd.DataFrame: Updated dataframe
    """
    def _flag(value):
        try:
            return int(value >= cutoff)
        except TypeError:
            # Strings cannot be ordered against an int -- keep them as-is.
            return value
    df[new_col] = df[age_col].map(_flag)
    return df
# Create Composite Gender & Relation Columns
def add_gender_relation_features(df):
    """
    Add composite categorical columns combining the donor relation with the
    recipient gender, the donor gender, and both.

    Returns:
        pd.DataFrame: Updated dataframe
    """
    relation = df["Donor_relation to recipient"]
    recipient_part = " R_" + df["Recipient_gender"]
    donor_part = " D_" + df["Donor_gender"]
    df["Relation_and_Recipient_Gender"] = relation + recipient_part
    df["Relation_and_Donor_Gender"] = relation + donor_part
    df["Relation_and_Recipient_and_Donor_Gender"] = relation + recipient_part + donor_part
    return df
# Nationality-Based Groupings
def apply_nationality_groupings(df, column, grouping_dicts):
    """
    Add one derived column per grouping, mapping *column*'s values through it.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Source column (canonical nationality)
        grouping_dicts (dict): {new_col_name: mapping_dict}

    Returns:
        pd.DataFrame: Updated dataframe. Values absent from a mapping are
        carried over unchanged (replace semantics).
    """
    for new_col, mapping in grouping_dicts.items():
        df[new_col] = df[column].map(lambda value, m=mapping: m.get(value, value))
    return df
# Group and Binarize Diagnosis
def group_and_binarize_diagnosis(df, original_col, group_map, malignant_map):
    """
    Add '<col>_Grouped' (diagnosis -> group) and '<col>_Malignant'
    (group -> 0/1 flag) derived from a diagnosis column.

    Args:
        df (pd.DataFrame): Input dataframe
        original_col (str): Original diagnosis column
        group_map (dict): Diagnosis -> group
        malignant_map (dict): Group -> binary malignancy label

    Returns:
        pd.DataFrame: Updated dataframe. Unmapped values pass through unchanged.
    """
    grouped_col = f"{original_col}_Grouped"
    malignant_col = f"{original_col}_Malignant"
    df[grouped_col] = df[original_col].map(lambda v: group_map.get(v, v))
    df[malignant_col] = df[grouped_col].map(lambda v: malignant_map.get(v, v))
    return df
# Function to check if a column contains any list
def is_list_column(col):
    """Return True when any element of the iterable is a Python list."""
    for value in col:
        if isinstance(value, list):
            return True
    return False
def preprocess_pipeline(df) -> pd.DataFrame:
    """
    Full preprocessing pipeline over an already-loaded raw DataFrame:
    1. Initial cleaning (drop empty columns, tidy column names)
    2. String normalization
    3. Special column processing (blood groups, regimens, genders, relations)
    4. HLA cleanup, splitting and one-hot expansion
    5. Feature engineering (years/ages, one-hot regimens, groupings)

    Note: expects a DataFrame, not a file path -- load with load_dataset first.
    The step order matters (e.g. normalize_strings must run before the
    drug-name corrections, and fillna(UNKNOWN_TOKEN) before one-hot encoding).
    """
    df = df.dropna(axis=1, how="all")
    # Special column processing
    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()
    # Remove internal spaces from HLA column names so later substring
    # matching on "R_HLA"/"D_HLA" works reliably
    df.columns = [
        re.sub(r"\s+", "", col) if "_HLA" in col else col
        for col in df.columns
    ]
    # String handling
    df = normalize_strings(df)
    df = clean_blood_group_columns(df, BLOOD_GROUP_COLS)
    # Data corrections
    df = correct_nationalities(df, "Recepient_Nationality")
    df = correct_drug_name_in_list(df, "PreHSCT conditioning regimen+/-ATG+/-TBI")
    df = correct_drug_name_in_list(df, "First_GVHD prophylaxis")
    # df = correct_drug_name_in_list(df, "Post HSCT regimen")
    df = standardize_compound_columns(
        df,
        ["PreHSCT conditioning regimen+/-ATG+/-TBI", "First_GVHD prophylaxis"]
    )
    df = standardize_gender(df)
    df = correct_donor_relationships(df)
    # Autologous-donor consistency only applies when SELF rows exist
    if "SELF" in df["Donor_relation to recipient"].unique():
        df = handle_self_donor_consistency(df)
    # HLA processing
    df = standardize_hla_matching(df)
    df = process_hla_columns(df)
    df = expand_HLA_cols(df)
    # Extract years (calculate_ages re-runs the first three -- harmless,
    # extract_year is idempotent)
    df = extract_year(df, "HSCT_date")
    df = extract_year(df, "Recepient_DOB")
    df = extract_year(df, "Donor_DOB")
    df = extract_year(df, "Date of first diagnosis/BMBx date")
    df = calculate_ages(df)
    # Final missing value handling (reverted to NaN again further below)
    df = df.fillna(UNKNOWN_TOKEN)
    # One-hot encode multi-drug regimen columns
    df = split_and_one_hot_encode(df, 'PreHSCT conditioning regimen+/-ATG+/-TBI', 'PreHSCT')
    df = split_and_one_hot_encode(df, 'First_GVHD prophylaxis', 'First_GVHD_prophylaxis')
    # df = split_and_one_hot_encode(df, 'Post HSCT regimen', 'PostHSCT')
    # Normalize blood groups (drop the Rh +/- sign)
    df = merge_blood_groups(df, "Recepient_Blood group before HSCT", "Recepient_Blood group before HSCT_MergePlusMinus")
    df = merge_blood_groups(df, "D_Blood group", "D_Blood group_MergePlusMinus")
    # Binarize ages at the 16- and 18-year cutoffs
    df = binarize_age(df, "R_Age_at_transplant", 16, "R_Age_at_transplant_cutoff16")
    df = binarize_age(df, "R_Age_at_transplant", 18, "R_Age_at_transplant_cutoff18")
    df = binarize_age(df, "D_Age_at_transplant", 16, "D_Age_at_transplant_cutoff16")
    df = binarize_age(df, "D_Age_at_transplant", 18, "D_Age_at_transplant_cutoff18")
    # Gender/Relation composite features
    df = add_gender_relation_features(df)
    # Group nationalities into region/culture/income/WHO columns
    df = apply_nationality_groupings(df, 'Recepient_Nationality', groupings)
    # Group and binarize diagnosis
    df = group_and_binarize_diagnosis(df, 'Hematological Diagnosis', DIAGNOSIS_GROUP_MAP, MALIGNANT_MAP)
    # Convert the unknown token back to NaN for downstream consumers
    df = df.replace(UNKNOWN_TOKEN, np.nan)
    # Drop columns with only one unique value -- disabled: nunique() raises
    # on unhashable list-valued cells
    # df = df.loc[:, df.nunique() > 1] # get unhashable type list error..
    # # Keep columns that either:
    # # - Are not list-type and have more than one unique value
    # # - Are list-type (skip them from processing)
    # df = df.loc[:, [
    #     is_list_column(df[col]) or df[col].nunique(dropna=False) > 1
    #     for col in df.columns
    # ]]
    # df = df.drop(columns=["First_GVHD_prophylaxis_MTX", "PreHSCT_MTX"], errors='ignore')
    # Add zero-filled columns for one-hot features that the training schema
    # expects but which this particular dataset did not produce
    for feature in load_train_features()[0]:
        if ("_HLA" in feature or "First_GVHD_prophylaxis_" in feature or "PreHSCT_" in feature) and feature not in df.columns:
            df[feature] = 0
    return df
if __name__ == "__main__":
    # preprocess_pipeline expects a DataFrame (its first step calls df.dropna),
    # so the CSV must be loaded first. The previous version passed the path
    # string directly, which raised AttributeError immediately.
    raw_df = load_dataset(
        "/home/muhammadridzuan/2025_GVHD/2024_GVHD_SSMC/GVHD_Intel_data_MBZUAI_1.2.csv"
    )
    processed_data = preprocess_pipeline(raw_df)
    processed_data.to_csv("preprocessed_gvhd_data.csv", index=False)