# GVHD_Prediction / src / preprocess_utils.py
# Author: mridzuan — commit fbb35c6 ("extract years from dates")
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
# Constants
UNKNOWN_TOKEN = "X"  # sentinel used throughout for unknown/missing values
DATE_FORMAT = '%d/%m/%Y'  # day-first date layout expected in the raw CSV
# Blood-group columns that need whitespace stripped (e.g. "A +" -> "A+")
BLOOD_GROUP_COLS = ["D_Blood group", "Recepient_Blood group before HSCT"]
# Map raw country names / misspellings to canonical nationality adjectives.
# Keys reflect forms actually observed in the data (assumed — TODO confirm).
NATIONALITY_CORRECTIONS = {
"AFGHANISTAN": "AFGHAN",
"ALGERIA": "ALGERIAN",
"EMARATI": "EMIRATI",
"UAE": "EMIRATI",
"PHILIPPINO": "FILIPINO",
"JORDAN": "JORDANIAN",
"JORDANI": "JORDANIAN",
"PAKISTAN": "PAKISTANI",
"PAKISTANII": "PAKISTANI",
"PALESTINE": "PALESTINIAN",
"PALESTENIAN": "PALESTINIAN",
"USA": "AMERICAN",
}
# 1. Regional Grouping (Geography-Based)
# Nationality -> geographic region. Keys use the corrected nationality forms;
# 'KUWAIT', 'MAURITANIA' and 'FIJI' are country names rather than adjectives,
# presumably matching the raw data as-is — TODO confirm against the CSV.
REGIONAL_GROUPING = {
# Middle East
'EMIRATI': 'Middle East',
'OMANI': 'Middle East',
'SAUDI': 'Middle East',
'KUWAIT': 'Middle East',
'JORDANIAN': 'Middle East',
'LEBANESE': 'Middle East',
'IRAQI': 'Middle East',
'SYRIAN': 'Middle East',
'YEMENI': 'Middle East',
'PALESTINIAN': 'Middle East',
'BAHRAINI': 'Middle East',
'LIBYAN': 'Middle East',
# North Africa
'EGYPTIAN': 'North Africa',
'SUDANESE': 'North Africa',
'ALGERIAN': 'North Africa',
'MOROCCAN': 'North Africa',
'MAURITANIA': 'North Africa',
# NOTE(review): Comoros is usually grouped with East Africa — confirm intent
'COMORAN': 'North Africa',
# South Asia
'INDIAN': 'South Asia',
'PAKISTANI': 'South Asia',
'BANGLADESHI': 'South Asia',
'SRI LANKAN': 'South Asia',
'AFGHAN': 'South Asia',
# Southeast Asia
'FILIPINO': 'Southeast Asia',
'INDONESIAN': 'Southeast Asia',
# East Africa
'ETHIOPIAN': 'East Africa',
'SOMALI': 'East Africa',
'ERITREAN': 'East Africa',
# Central Asia
'UZBEKISTANI': 'Central Asia',
# Western Nations / Oceania / Americas
'AMERICAN': 'Western',
'BRITISH': 'Western',
'NEW ZEALANDER': 'Oceania',
'FIJI': 'Oceania'
}
# 2. Cultural-Linguistic Grouping
# Nationality -> cultural/linguistic bucket (coarser ancestry proxy than region).
CULTURAL_GROUPING = {
'EMIRATI': 'Arab',
'OMANI': 'Arab',
'SAUDI': 'Arab',
'KUWAIT': 'Arab',
'JORDANIAN': 'Arab',
'LEBANESE': 'Arab',
'IRAQI': 'Arab',
'SYRIAN': 'Arab',
'YEMENI': 'Arab',
'PALESTINIAN': 'Arab',
'BAHRAINI': 'Arab',
'LIBYAN': 'Arab',
'EGYPTIAN': 'Arab',
'SUDANESE': 'Arab-African',
'ALGERIAN': 'Arab',
'MOROCCAN': 'Arab',
'MAURITANIA': 'Arab',
'COMORAN': 'Arab-African',
'INDIAN': 'South Asian',
'PAKISTANI': 'South Asian',
'BANGLADESHI': 'South Asian',
'SRI LANKAN': 'South Asian',
'AFGHAN': 'South Asian',
'FILIPINO': 'Southeast Asian',
'INDONESIAN': 'Southeast Asian',
'ETHIOPIAN': 'East African',
'SOMALI': 'East African',
'ERITREAN': 'East African',
'UZBEKISTANI': 'Central Asian',
'AMERICAN': 'Western/English-speaking',
'BRITISH': 'Western/English-speaking',
'NEW ZEALANDER': 'Western/English-speaking',
'FIJI': 'Pacific Islander'
}
# 3. World Bank Income Grouping
# Nationality -> World Bank income classification of the home country.
# NOTE(review): these classifications change yearly — record the vintage used.
INCOME_GROUPING = {
'EMIRATI': 'High income',
'OMANI': 'High income',
'SAUDI': 'High income',
'KUWAIT': 'High income',
'JORDANIAN': 'Upper-middle income',
'LEBANESE': 'Upper-middle income',
'IRAQI': 'Upper-middle income',
'SYRIAN': 'Low income',
'YEMENI': 'Low income',
'PALESTINIAN': 'Lower-middle income',
'BAHRAINI': 'High income',
'LIBYAN': 'Upper-middle income',
'EGYPTIAN': 'Lower-middle income',
'SUDANESE': 'Low income',
'ALGERIAN': 'Lower-middle income',
'MOROCCAN': 'Lower-middle income',
'MAURITANIA': 'Low income',
'COMORAN': 'Low income',
'INDIAN': 'Lower-middle income',
'PAKISTANI': 'Lower-middle income',
'BANGLADESHI': 'Lower-middle income',
'SRI LANKAN': 'Lower-middle income',
'AFGHAN': 'Low income',
'FILIPINO': 'Lower-middle income',
'INDONESIAN': 'Lower-middle income',
'ETHIOPIAN': 'Low income',
'SOMALI': 'Low income',
'ERITREAN': 'Low income',
'UZBEKISTANI': 'Lower-middle income',
'AMERICAN': 'High income',
'BRITISH': 'High income',
'NEW ZEALANDER': 'High income',
'FIJI': 'Upper-middle income'
}
# 4. WHO Regional Office Grouping
# Nationality -> WHO regional office (EMRO/AFRO/SEARO/WPRO/EURO/AMRO).
WHO_REGION_GROUPING = {
'EMIRATI': 'EMRO',
'OMANI': 'EMRO',
'SAUDI': 'EMRO',
'KUWAIT': 'EMRO',
'JORDANIAN': 'EMRO',
'LEBANESE': 'EMRO',
'IRAQI': 'EMRO',
'SYRIAN': 'EMRO',
'YEMENI': 'EMRO',
'PALESTINIAN': 'EMRO',
'BAHRAINI': 'EMRO',
'LIBYAN': 'EMRO',
'EGYPTIAN': 'EMRO',
'SUDANESE': 'EMRO',
'ALGERIAN': 'AFRO',
'MOROCCAN': 'EMRO',
'MAURITANIA': 'AFRO',
'COMORAN': 'AFRO',
'INDIAN': 'SEARO',
'PAKISTANI': 'EMRO',
'BANGLADESHI': 'SEARO',
'SRI LANKAN': 'SEARO',
'AFGHAN': 'EMRO',
'FILIPINO': 'WPRO',
'INDONESIAN': 'SEARO',
'ETHIOPIAN': 'AFRO',
'SOMALI': 'EMRO',
'ERITREAN': 'AFRO',
'UZBEKISTANI': 'EURO',
'AMERICAN': 'AMRO',
'BRITISH': 'EURO',
'NEW ZEALANDER': 'WPRO',
'FIJI': 'WPRO'
}
# Output column name -> nationality mapping, consumed by
# apply_nationality_groupings() in preprocess_pipeline().
groupings = {
'Recepient_Nationality_Geographical': REGIONAL_GROUPING,
'Recepient_Nationality_Cultural': CULTURAL_GROUPING,
'Recepient_Nationality_Regional_Income': INCOME_GROUPING,
'Recepient_Nationality_Regional_WHO': WHO_REGION_GROUPING
}
# Spelling/abbreviation fixes for drug tokens in regimen strings
# (used for both conditioning and first-GVHD-prophylaxis columns).
DRUG_SPELLING_CORRECTIONS = {
"CYCLOSPOPRIN": "CYCLOSPORIN",
"CYCLOSPRIN": "CYCLOSPORIN",
"CYCLOSPOROIN": "CYCLOSPORIN",
"CY": "CYCLOSPORIN",
"TAC": "TACROLIMUS",
"MTX": "METHOTREXATE",
"BUDESONIDE": "STEROID",
"STEROIDS": "STEROID",
"ATG.": "ATG",
"FLUDARABINIE": "FLUDARABINE",
"FLUDRABINE":"FLUDARABINE",
"BUSULPHAN": "BUSULFAN",
"MEPHALAN": "MELPHALAN",
"GEMCITABIBE": "GEMCITABINE",
}
# Numeric gender codes (int or str) -> labels; 2/"2" means unknown.
GENDER_MAP = {
0: "MALE", 1: "FEMALE", 2: UNKNOWN_TOKEN,
"0": "MALE", "1": "FEMALE", "2": UNKNOWN_TOKEN
}
# Case-insensitive regex substitutions collapsing specific relatives into
# SIBLING / FIRST / SECOND degree buckets. Applied with regex=True, so these
# match anywhere inside the cell value.
RELATION_CORRECTIONS = {
r"(?i)BROTHER": "SIBLING",
r"(?i)SISTER": "SIBLING",
r"(?i)FATHER": "FIRST DEGREE RELATIVE",
r"(?i)MOTHER": "FIRST DEGREE RELATIVE",
r"(?i)SON": "FIRST DEGREE RELATIVE",
r"(?i)DAUGHTER": "FIRST DEGREE RELATIVE",
r"(?i)COUSIN": "SECOND DEGREE RELATIVE",
r"(?i)UNCLE": "SECOND DEGREE RELATIVE",
r"(?i)AUNT": "SECOND DEGREE RELATIVE",
r"(?i)other": UNKNOWN_TOKEN
}
# Global regex substitutions: unknown-value spellings -> UNKNOWN_TOKEN, plus
# spelling/case fixes. np.nan is also mapped to the unknown token here.
STRING_NORMALIZATION_MAP = {
r"(?i)unknown": UNKNOWN_TOKEN, r"(?i)unkown": UNKNOWN_TOKEN,
r"(?i)Unknwon": UNKNOWN_TOKEN, np.nan: UNKNOWN_TOKEN,
r"(?i)\bMale\b": "MALE", r"(?i)\bFemale\b": "FEMALE",
"1o": "10", r"(?i)Umbilical Cord": "UMBILICAL CORD",
r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
}
# Raw hematological diagnosis (incl. observed misspellings) -> diagnosis group.
# Diagnoses not listed here pass through unchanged.
DIAGNOSIS_GROUP_MAP = {
"MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
"CML": "MYELOPROLIFERATIVE NEOPLASMS",
"MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
"NON-HODGKIN LYMPHOMA": "LYMPHOMA",
'NON HODGKIN LYMPHOMA': "LYMPHOMA",
"HODGKIN LYMPHOMA": "LYMPHOMA",
"BETA THALASSEMIA": "RED CELL DISORDERS",
'BETA THALESSEMIA': "RED CELL DISORDERS",
"ALPHA THALASSEMIA": "RED CELL DISORDERS",
"ALPHA THALESSEMIA": "RED CELL DISORDERS",
"ALPHA THALSSEMIA": "RED CELL DISORDERS",
"HEREDITARY SPHEROCYTOSIS": "RED CELL DISORDERS",
"SICKLE CELL DISEASE": "RED CELL DISORDERS",
"APLASTIC ANEMIA": "BMF SYNDROMES",
"FANCONI ANEMIA": "BMF SYNDROMES",
"DYSKERATOSIS CONGENITA": "BMF SYNDROMES",
'DYSKERATOSIS CONGENTIA': "BMF SYNDROMES",
"CHRONIC GRANULOMATOUS DISEASE": "IMMUNE DISORDERS",
"COMBINED VARIABLE IMMUNODEFICIENCY": "IMMUNE DISORDERS",
"SCID": "IMMUNE DISORDERS",
# TODO(review): confirm grouping of the hyper-IgM variants below; the
# leading '-LINKED' keys look like truncated 'X-LINKED' data entries.
"X-LINKED HYPERGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
'-LINKED HYPERGAMMAGLOBULINEMIA': "IMMUNE DISORDERS",
'-LINKED HYPER IGM SYNDROME': "IMMUNE DISORDERS",
"HYPOGAMMAGLOBULINEMIA": "IMMUNE DISORDERS",
# TODO(review): confirm Glanzmann belongs in OTHER rather than a platelet group
"GLANZMANN": "OTHER",
'GLANZMANN THROMBASTHENIA': "OTHER",
"CLL": "OTHER",
"PNH": "OTHER",
"HLH": "OTHER",
"LANGERHANS CELL HISTIOCYTOSIS": "OTHER",
"BLASTIC PLASMACYTOID DENDRITIC CELL NEOPLASM": "OTHER",
'BLASTIC PLASMACYTOID DENDRITRIC CELL NEOPLASM': "OTHER",
"B-ALL": "ALL",
"BALL": "ALL",
"TALL": "ALL",
"T-ALL": "ALL",
"AML": "AML",
"ACUTE MYELOID LEUKEMIA": "AML"
}
# Diagnosis group -> malignancy flag: 0 = non-malignant, 1 = malignant.
MALIGNANT_MAP = {
'AML': 1,
'RED CELL DISORDERS': 0,
'AMYLOIDOSIS': 0,
'BMF SYNDROMES': 0,
'ALL': 1,
'OTHER': 0,
'IMMUNE DISORDERS': 0,
'CHRONIC LYMPHOCYTIC LEUKEMIA': 1,
'MYELOPROLIFERATIVE NEOPLASMS': 1, # note: CML is malignant; not sure about MYELOPROLIFERATIVE DISORDER & MYELOFIBROSIS
'HEMOPHAGOCYTIC LYMPHOHISTIOCYTOSIS (HLH)': 0,
'LYMPHOMA': 1,
'MYELODYSPLASTIC SYNDROME': 1,
'MEDULLOBLASTOMA': 0,
'MULTIPLE MYELOMA': 0,
'NEUROBLASTOMA': 0,
'PAROXYSMAL NOCTURNAL HEMOGLOBINURIA': 0,
'PLASMA CELL LEUKEMIA': 0
}
# Raw HLA match ratio strings -> FULL / PARTIAL / HAPLOIDENTICAL categories.
# Exact-value replacement (regex=False in standardize_hla_matching).
HLA_MATCHING_MAP = {
"12 OF 12": "FULL",
"10 OF 10": "FULL",
"8 OF 8": "FULL", # TODO(review): confirm 8/8 counts as FULL in this protocol
"9 OF 10": "PARTIAL",
"8 OF 10": "PARTIAL",
"PARTIALLY MATCHED": "PARTIAL",
"7 OF 10": "HAPLOIDENTICAL",
"6 OF 12": "HAPLOIDENTICAL",
"6 OF 10": "HAPLOIDENTICAL",
"5 OF 10": "HAPLOIDENTICAL",
# TODO(review): confirm the following are all haploidentical
"5 OF 8": "HAPLOIDENTICAL",
"4 OF 6": "HAPLOIDENTICAL",
}
def load_train_features():
    """
    Return the model's feature specification.

    Returns:
        tuple[list[str], list[str]]:
            - train_features: base demographic/clinical features, the one-hot
              HLA allele columns (with the unknown '_X' columns excluded),
              and the one-hot conditioning/prophylaxis drug columns.
            - cat_features: subset of train_features that is categorical.
    """
    # One-hot HLA allele columns produced by expand_HLA_cols()
    HLA_sub12 = [
        # Recipient - HLA-A
        'R_HLA_A_1', 'R_HLA_A_2', 'R_HLA_A_3', 'R_HLA_A_4', 'R_HLA_A_7', 'R_HLA_A_8',
        'R_HLA_A_11', 'R_HLA_A_12', 'R_HLA_A_20', 'R_HLA_A_23', 'R_HLA_A_24', 'R_HLA_A_25',
        'R_HLA_A_26', 'R_HLA_A_29', 'R_HLA_A_30', 'R_HLA_A_31', 'R_HLA_A_32', 'R_HLA_A_33',
        'R_HLA_A_34', 'R_HLA_A_66', 'R_HLA_A_68', 'R_HLA_A_69', 'R_HLA_A_74', 'R_HLA_A_X',
        # Recipient - HLA-B
        'R_HLA_B_7', 'R_HLA_B_8', 'R_HLA_B_13', 'R_HLA_B_14', 'R_HLA_B_15', 'R_HLA_B_18',
        'R_HLA_B_23', 'R_HLA_B_24', 'R_HLA_B_27', 'R_HLA_B_35', 'R_HLA_B_37', 'R_HLA_B_38',
        'R_HLA_B_39', 'R_HLA_B_40', 'R_HLA_B_41', 'R_HLA_B_42', 'R_HLA_B_44', 'R_HLA_B_45',
        'R_HLA_B_46', 'R_HLA_B_49', 'R_HLA_B_50', 'R_HLA_B_51', 'R_HLA_B_52', 'R_HLA_B_53',
        'R_HLA_B_55', 'R_HLA_B_56', 'R_HLA_B_57', 'R_HLA_B_58', 'R_HLA_B_73', 'R_HLA_B_81',
        'R_HLA_B_X',
        # Recipient - HLA-C
        'R_HLA_C_1', 'R_HLA_C_2', 'R_HLA_C_3', 'R_HLA_C_4', 'R_HLA_C_5', 'R_HLA_C_6',
        'R_HLA_C_7', 'R_HLA_C_8', 'R_HLA_C_12', 'R_HLA_C_14', 'R_HLA_C_15', 'R_HLA_C_16',
        'R_HLA_C_17', 'R_HLA_C_18', 'R_HLA_C_38', 'R_HLA_C_49', 'R_HLA_C_50', 'R_HLA_C_X',
        # Recipient - HLA-DR
        'R_HLA_DR_1', 'R_HLA_DR_2', 'R_HLA_DR_3', 'R_HLA_DR_4', 'R_HLA_DR_5', 'R_HLA_DR_6',
        'R_HLA_DR_7', 'R_HLA_DR_8', 'R_HLA_DR_9', 'R_HLA_DR_10', 'R_HLA_DR_11', 'R_HLA_DR_12',
        'R_HLA_DR_13', 'R_HLA_DR_14', 'R_HLA_DR_15', 'R_HLA_DR_16', 'R_HLA_DR_17', 'R_HLA_DR_X',
        # Recipient - HLA-DQ
        'R_HLA_DQ_1', 'R_HLA_DQ_2', 'R_HLA_DQ_3', 'R_HLA_DQ_4', 'R_HLA_DQ_5', 'R_HLA_DQ_6',
        'R_HLA_DQ_7', 'R_HLA_DQ_11', 'R_HLA_DQ_15', 'R_HLA_DQ_16', 'R_HLA_DQ_301', 'R_HLA_DQ_X',
        # Donor - HLA-A
        'D_HLA_A_1', 'D_HLA_A_2', 'D_HLA_A_3', 'D_HLA_A_8', 'D_HLA_A_11', 'D_HLA_A_12',
        'D_HLA_A_23', 'D_HLA_A_24', 'D_HLA_A_25', 'D_HLA_A_26', 'D_HLA_A_29', 'D_HLA_A_30',
        'D_HLA_A_31', 'D_HLA_A_32', 'D_HLA_A_33', 'D_HLA_A_34', 'D_HLA_A_66', 'D_HLA_A_68',
        'D_HLA_A_69', 'D_HLA_A_7', 'D_HLA_A_74', 'D_HLA_A_X',
        # Donor - HLA-B
        'D_HLA_B_7', 'D_HLA_B_8', 'D_HLA_B_13', 'D_HLA_B_14', 'D_HLA_B_15', 'D_HLA_B_17',
        'D_HLA_B_18', 'D_HLA_B_23', 'D_HLA_B_24', 'D_HLA_B_27', 'D_HLA_B_35', 'D_HLA_B_37',
        'D_HLA_B_38', 'D_HLA_B_39', 'D_HLA_B_40', 'D_HLA_B_41', 'D_HLA_B_42', 'D_HLA_B_44',
        'D_HLA_B_45', 'D_HLA_B_48', 'D_HLA_B_49', 'D_HLA_B_50', 'D_HLA_B_51', 'D_HLA_B_52',
        'D_HLA_B_53', 'D_HLA_B_55', 'D_HLA_B_56', 'D_HLA_B_57', 'D_HLA_B_58', 'D_HLA_B_73',
        'D_HLA_B_81', 'D_HLA_B_X',
        # Donor - HLA-C
        'D_HLA_C_1', 'D_HLA_C_2', 'D_HLA_C_3', 'D_HLA_C_4', 'D_HLA_C_5', 'D_HLA_C_6',
        'D_HLA_C_7', 'D_HLA_C_8', 'D_HLA_C_12', 'D_HLA_C_14', 'D_HLA_C_15', 'D_HLA_C_16',
        'D_HLA_C_17', 'D_HLA_C_18', 'D_HLA_C_38', 'D_HLA_C_49', 'D_HLA_C_50', 'D_HLA_C_X',
        # Donor - HLA-DR
        'D_HLA_DR_1', 'D_HLA_DR_2', 'D_HLA_DR_3', 'D_HLA_DR_4', 'D_HLA_DR_5', 'D_HLA_DR_6',
        'D_HLA_DR_7', 'D_HLA_DR_8', 'D_HLA_DR_9', 'D_HLA_DR_10', 'D_HLA_DR_11', 'D_HLA_DR_12',
        'D_HLA_DR_13', 'D_HLA_DR_14', 'D_HLA_DR_15', 'D_HLA_DR_16', 'D_HLA_DR_17', 'D_HLA_DR_X',
        # Donor - HLA-DQ
        'D_HLA_DQ_1', 'D_HLA_DQ_2', 'D_HLA_DQ_3', 'D_HLA_DQ_4', 'D_HLA_DQ_5', 'D_HLA_DQ_6',
        'D_HLA_DQ_7', 'D_HLA_DQ_11', 'D_HLA_DQ_15', 'D_HLA_DQ_16', 'D_HLA_DQ_301', 'D_HLA_DQ_X'
    ]
    # Drop the '*_X' (unknown-allele) indicator columns from the feature set
    HLA_sub12_without_X = [i for i in HLA_sub12 if "_X" not in i]
    # One-hot conditioning-regimen drug columns (from split_and_one_hot_encode)
    prehsct_onehot = [
        'PreHSCT_ALEMTUZUMAB',
        'PreHSCT_ATG',
        'PreHSCT_BEAM',
        'PreHSCT_BUSULFAN',
        'PreHSCT_CAMPATH',
        'PreHSCT_CARMUSTINE',
        'PreHSCT_CLOFARABINE',
        'PreHSCT_CYCLOPHOSPHAMIDE',
        'PreHSCT_CYCLOSPORIN',
        'PreHSCT_CYTARABINE',
        'PreHSCT_ETOPOSIDE',
        'PreHSCT_FLUDARABINE',
        'PreHSCT_GEMCITABINE',
        'PreHSCT_MELPHALAN',
        'PreHSCT_MTX',
        'PreHSCT_OTHER',
        'PreHSCT_RANIMUSTINE',
        'PreHSCT_REDUCEDCONDITIONING',
        'PreHSCT_RITUXIMAB',
        'PreHSCT_SIROLIMUS',
        'PreHSCT_TBI',
        'PreHSCT_THIOTEPA',
        'PreHSCT_TREOSULFAN',
        'PreHSCT_UA',
        'PreHSCT_VORNOSTAT',
    ]
    # One-hot first-GVHD-prophylaxis drug columns
    first_prophylaxis_onehot = [
        'First_GVHD_prophylaxis_ABATACEPT',
        'First_GVHD_prophylaxis_ALEMTUZUMAB',
        'First_GVHD_prophylaxis_ATG',
        'First_GVHD_prophylaxis_CYCLOPHOSPHAMIDE',
        'First_GVHD_prophylaxis_CYCLOSPORIN',
        'First_GVHD_prophylaxis_IMATINIB',
        'First_GVHD_prophylaxis_LEFLUNOMIDE',
        'First_GVHD_prophylaxis_MMF',
        'First_GVHD_prophylaxis_MTX',
        'First_GVHD_prophylaxis_NONE',
        'First_GVHD_prophylaxis_RUXOLITINIB',
        'First_GVHD_prophylaxis_SIROLIMUS',
        'First_GVHD_prophylaxis_STEROID',
        'First_GVHD_prophylaxis_TAC',
    ]
    # Fix: plain list concatenation instead of the original `[[...] + ...][0]`
    # wrap-in-list-then-index idiom, which built a redundant outer list.
    train_features = [
        'Recipient_gender',
        'R_Age_at_transplant_cutoff18',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'D_Age_at_transplant_cutoff18',
        'Age_Gap_R_D',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Number of lines of Rx before HSCT',
        'Source of cells',
        'Donor_relation to recipient',
    ] + HLA_sub12_without_X + prehsct_onehot + first_prophylaxis_onehot
    # Categorical features
    cat_features = [
        'Recipient_gender',
        'Recepient_Nationality_Cultural',
        'Hematological Diagnosis_Grouped',
        'Recepient_Blood group before HSCT_MergePlusMinus',
        'Donor_gender',
        'D_Blood group_MergePlusMinus',
        'Source of cells',
        'Donor_relation to recipient',
    ]
    return train_features, cat_features
def load_dataset(file_path: str) -> pd.DataFrame:
    """
    Read the raw CSV (the real header sits on the second row, hence
    header=1) and drop every column that is entirely missing.
    """
    raw = pd.read_csv(file_path, header=1)
    return raw.dropna(axis=1, how="all")
def normalize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize string values across the dataset:
    - replace unknown/NA spellings (and NaN) with the unknown token
    - fix common misspellings and abbreviations
    - upper-case and strip whitespace from every string cell
    """
    # Global regex replacements (also maps np.nan -> UNKNOWN_TOKEN)
    df = df.replace(STRING_NORMALIZATION_MAP, regex=True)
    # 'UK' means unknown everywhere except nationality columns (United Kingdom)
    non_nationality = [c for c in df.columns if "Nationality" not in c]
    df[non_nationality] = df[non_nationality].replace(
        {r"(?i)\buk\b": UNKNOWN_TOKEN}, regex=True
    )
    # 'NA' is a valid allele token inside HLA columns, so skip those
    non_hla = [c for c in df.columns if "HLA" not in c]
    df[non_hla] = df[non_hla].replace(
        {r"(?i)\bna\b": UNKNOWN_TOKEN}, regex=True
    )

    def _tidy(value):
        # Upper-case then strip in a single elementwise pass
        return value.upper().strip() if isinstance(value, str) else value

    return df.applymap(_tidy)
def clean_blood_group_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Remove every whitespace character from the given blood-group columns
    (e.g. 'A +' -> 'A+'), mutating and returning `df`."""
    for blood_col in columns:
        cleaned = df[blood_col].str.replace(r"\s+", "", regex=True)
        df[blood_col] = cleaned
    return df
def standardize_hla_matching(df: pd.DataFrame) -> pd.DataFrame:
    """Map raw 'HLA match ratio' strings onto FULL/PARTIAL/HAPLOIDENTICAL
    via HLA_MATCHING_MAP; unmapped values pass through unchanged."""
    ratio_col = 'HLA match ratio'
    df[ratio_col] = df[ratio_col].map(lambda v: HLA_MATCHING_MAP.get(v, v))
    return df
def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and process HLA columns by:
    1. Splitting combined HLA values into separate columns
    2. Standardizing missing value representation
    3. Sorting allele values numerically
    4. Recombining cleaned values

    Handles two input layouts per column: '&'-joined strings (training CSV)
    or already-split lists (single-row prediction input).
    """
    # Padding function to ensure 2 elements, filling with 'NA'. Used for Individual_Predictions
    def pad_list(val):
        if not isinstance(val, list):
            val = []
        return (val + ['NA', 'NA'])[:2]
    hla_columns = [col for col in df.columns if "R_HLA" in col or "D_HLA" in col]
    # hla_columns = ['R_HLA_A', 'R_HLA_B', 'R_HLA_C', 'R_HLA_DR', 'R_HLA_DQ',
    #                'D_HLA_A', 'D_HLA_B', 'D_HLA_C', 'D_HLA_DR', 'D_HLA_DQ']
    for col in hla_columns:
        # Handle special NA representation so the '&' split still yields 2 parts
        df[col] = df[col].replace({"NA": "NA&NA"})
        # Split into two separate allele columns, e.g. 'R_HLA_A' -> 'R_HLA_A1'/'R_HLA_A2'
        split_cols = [f"{col}1", f"{col}2"]
        if type(df[col].iloc[0]) != list: # string layout, e.g. "2&24"
            df[split_cols] = df[col].str.split("&", expand=True)
        elif type(df[col].iloc[0]) == list: # list layout: pad to exactly 2 entries
            df[col] = df[col].apply(pad_list)
            df[split_cols] = pd.DataFrame(df[col].tolist(), index=df.index)
        # Standardize missing values ('B1' presumably a known data-entry artifact — TODO confirm)
        missing_indicators = {" ", "NA", "N/A", UNKNOWN_TOKEN, "''", '""', "", "B1", None}
        df[split_cols] = df[split_cols].replace(missing_indicators, np.nan)
        # Convert to numeric and handle zeros (0 treated as missing allele)
        df[split_cols] = df[split_cols].apply(pd.to_numeric, errors='coerce')
        df[split_cols] = df[split_cols].replace(0, np.nan)
        # Sort values numerically so '24&2' and '2&24' canonicalize identically
        df[split_cols] = np.sort(df[split_cols], axis=1)
        # Convert numbers to integers, missing to the unknown token 'X'
        df[split_cols] = df[split_cols].applymap(lambda x: str(int(x)) if pd.notna(x) else UNKNOWN_TOKEN)
        # Recombine cleaned values into the original '&'-joined column
        df[col] = df[split_cols].astype(str).agg("&".join, axis=1)
    return df
def cast_as_int_if_possible(x):
    """
    Return `x` as an int when the conversion is lossless, otherwise return
    `x` unchanged: '5' -> 5, 7.0 -> 7, but '5.5', 7.5, 'abc', None pass through.
    """
    try:
        i = int(x)
        # Only return int if conversion is lossless (e.g., avoid converting '5.5' -> 5)
        if float(x) == i:
            return i
    except (ValueError, TypeError, OverflowError):
        # Fix: narrow exception tuple instead of a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit. OverflowError covers inf.
        pass
    return x
def HLA_unique_alleles(df, HLA_col1, HLA_col2):
    """
    Return the sorted set of allele labels appearing in either allele column.

    Numeric-looking values are normalized through cast_as_int_if_possible so
    '2' and 2 collapse to a single allele; NaN maps to the unknown token.
    Everything is stringified since the labels become column-name suffixes.
    """
    col1_vals = [cast_as_int_if_possible(val) for val in df[HLA_col1].unique()]
    col2_vals = [cast_as_int_if_possible(val) for val in df[HLA_col2].unique()]
    unique_set = set(col1_vals).union(set(col2_vals))
    # Replace NaN with the unknown token "X" and stringify the rest
    unique_set = {(UNKNOWN_TOKEN if pd.isna(item) else str(item)) for item in unique_set}
    # Fix: removed leftover debug print() that spammed stdout on every call
    return sorted(unique_set)
def expand_HLA_cols_(df, HLA_col1, HLA_col2):
    """One-hot encode the alleles of a single HLA locus given its two
    allele columns; adds one 0/1 column per observed allele."""
    alleles = HLA_unique_alleles(df, HLA_col1, HLA_col2)
    locus = HLA_col1[:-1]  # "R_HLA_A1" -> "R_HLA_A"
    for allele in alleles:
        # Row carries the allele if it appears in either allele slot
        present = (df[HLA_col1] == allele) | (df[HLA_col2] == allele)
        df[f"{locus}_{allele}"] = present.astype(int)
    return df
def expand_HLA_cols(df):
    """One-hot expand every recipient/donor HLA locus (A, B, C, DR, DQ)."""
    for side in ("R", "D"):
        for locus in ("A", "B", "C", "DR", "DQ"):
            base = f"{side}_HLA_{locus}"
            df = expand_HLA_cols_(df, HLA_col1=f"{base}1", HLA_col2=f"{base}2")
    return df
def correct_nationalities(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Canonicalize nationality names via NATIONALITY_CORRECTIONS;
    values not in the map are kept unchanged."""
    df[column] = df[column].map(lambda v: NATIONALITY_CORRECTIONS.get(v, v))
    return df
def correct_indiv_drug_name(drug_list):
    """
    Fix known drug-name misspellings/abbreviations inside one regimen entry.

    Accepts a raw string (split on ' ', '/' or '+' while keeping the
    separators so the original formatting is preserved) or an already-split
    list of tokens. Returns the corrected tokens re-joined into a string.

    Robustness fix: any other input type (e.g. NaN) is returned unchanged —
    previously `parts` was left undefined and the function raised NameError.
    """
    if isinstance(drug_list, str):
        # Split but keep the separators so they survive the re-join
        parts = re.split(r'([ /+])', drug_list)
    elif isinstance(drug_list, list):
        parts = drug_list
    else:
        return drug_list
    corrected_parts = []
    for part in parts:
        token = part.strip()
        if token and token not in {'/', '+'}:
            # Drug name: apply the spelling correction (identity if unknown)
            corrected_parts.append(DRUG_SPELLING_CORRECTIONS.get(token, token))
        else:
            # Separator (/, +, space) or empty fragment: keep verbatim
            corrected_parts.append(part)
    return ''.join(corrected_parts)
def correct_drug_name_in_list(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Standardize every regimen entry in `column` via
    correct_indiv_drug_name, preserving separators."""
    df[column] = df[column].map(correct_indiv_drug_name)
    return df
def standardize_compound_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Canonicalize compound (multi-drug) string columns so equivalent regimens
    compare equal: strip whitespace, unify '+'/',' separators to '/', then
    sort the components alphabetically. List-valued columns are skipped.
    """
    def canonical(value):
        if not isinstance(value, str):
            return value
        components = [piece for piece in value.split("/") if piece]
        return "/".join(sorted(components))

    for col in columns:
        if col not in df.columns or type(df[col].iloc[0]) == list:
            continue
        unified = (
            df[col]
            .str.replace(r"\s+", "", regex=True)
            .str.replace("+", "/")
            .str.replace(",", "/")
        )
        df[col] = unified.apply(canonical)
    return df
def standardize_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize numeric gender codes to labels and backfill donor gender
    from gendered relationship labels (e.g. BROTHER implies MALE)."""
    for gender_col in ("Donor_gender", "Recipient_gender"):
        df[gender_col] = df[gender_col].replace(GENDER_MAP)
    # A gendered relationship label determines the donor's gender
    relation_to_gender = {
        "BROTHER": "MALE", "SISTER": "FEMALE",
        "FATHER": "MALE", "MOTHER": "FEMALE",
        "SON": "MALE", "DAUGHTER": "FEMALE",
        "UNCLE": "MALE", "AUNT": "FEMALE",
    }
    for relation, inferred in relation_to_gender.items():
        matches = df["Donor_relation to recipient"] == relation
        df.loc[matches, "Donor_gender"] = inferred
    return df
def correct_donor_relationships(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse specific relations (brother, cousin, ...) into
    SIBLING / FIRST / SECOND degree buckets via regex substitution."""
    corrected = df.replace(
        {"Donor_relation to recipient": RELATION_CORRECTIONS}, regex=True
    )
    return corrected
def handle_self_donor_consistency(df: pd.DataFrame) -> pd.DataFrame:
    """
    For autologous ('SELF') donors:
    1. Overwrite every HLA column with the sentinel 'SELF&SELF'
    2. Assert recipient and donor demographics (gender, blood group, DOB) agree
    """
    is_self = df["Donor_relation to recipient"] == "SELF"
    hla_cols = [c for c in df.columns if "R_HLA" in c or "D_HLA" in c]
    df.loc[is_self, hla_cols] = "SELF&SELF"
    # Demographic sanity checks: these column pairs must be identical
    demographic_pairs = [
        ("Recipient_gender", "Donor_gender",
         "Recipient/Donor gender mismatch for self-donors"),
        ("Recepient_Blood group before HSCT", "D_Blood group",
         "Blood group mismatch for self-donors"),
        ("Recepient_DOB", "Donor_DOB",
         "DOB mismatch for self-donors"),
    ]
    for recipient_col, donor_col, message in demographic_pairs:
        assert df.loc[is_self, recipient_col].equals(
            df.loc[is_self, donor_col]
        ), message
    return df
def safe_extract_year(date_str: str) -> "int | str":
    """
    Safely extract the year from a 'DD/MM/YYYY'-style date string.

    Returns:
        int: the year, when the last '/'-separated field is all digits.
        str: UNKNOWN_TOKEN for non-strings, the unknown token itself,
             free-text ages like "35 YEAR OLD", or strings without at
             least three '/'-separated parts.
    """
    if not isinstance(date_str, str) or date_str == UNKNOWN_TOKEN:
        return UNKNOWN_TOKEN
    try:
        # Handle special cases like "35 YEAR OLD" — an age, not a date
        if "YEAR" in date_str:
            return UNKNOWN_TOKEN
        parts = date_str.split("/")
        if len(parts) < 3:
            return UNKNOWN_TOKEN
        # The year is the last '/'-separated field (day-first format)
        year_part = parts[-1].strip()
        return int(year_part) if year_part.isdigit() else UNKNOWN_TOKEN
    except (ValueError, TypeError):
        return UNKNOWN_TOKEN
def extract_year(df: pd.DataFrame, column_name) -> pd.DataFrame:
    """Append a '<column>_Year' column with the year parsed from each date."""
    year_col = f"{column_name}_Year"
    df[year_col] = df[column_name].map(safe_extract_year)
    return df
def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive, from the extracted year columns:
    1. Recipient age at transplant
    2. Donor age at transplant
    3. Age gap between recipient and donor (recipient year - donor year)
    Non-numeric years propagate as UNKNOWN_TOKEN.
    """
    for date_col in ("HSCT_date", "Recepient_DOB", "Donor_DOB"):
        df = extract_year(df, date_col)

    def _year_diff(row, earlier_col, later_col):
        # UNKNOWN_TOKEN (or any non-numeric year) yields the unknown token
        try:
            return int(row[later_col]) - int(row[earlier_col])
        except (TypeError, ValueError):
            return UNKNOWN_TOKEN

    df["R_Age_at_transplant"] = df.apply(
        _year_diff, axis=1, args=("Recepient_DOB_Year", "HSCT_date_Year")
    )
    df["D_Age_at_transplant"] = df.apply(
        _year_diff, axis=1, args=("Donor_DOB_Year", "HSCT_date_Year")
    )
    df["Age_Gap_R_D"] = df.apply(
        _year_diff, axis=1, args=("Donor_DOB_Year", "Recepient_DOB_Year")
    )
    return df
# Utility Function: Split and One-Hot Encode Drug Regimens
def split_and_one_hot_encode(df, column_name, prefix):
    """
    Split "/"-separated entries of `column_name` into token lists and append
    one-hot indicator columns named '{prefix}_{token}'.

    Args:
        df (pd.DataFrame): Input dataframe
        column_name (str): Name of the column to process
        prefix (str): Prefix for the resulting one-hot encoded columns
    Returns:
        pd.DataFrame: DataFrame with the one-hot encoded columns appended
    """
    # Already-split list values (single-row prediction input) are kept as-is
    if not isinstance(df[column_name].iloc[0], list):
        df[column_name] = df[column_name].fillna("").apply(
            lambda s: re.split(r'[/]', s) if s else []
        )
    binarizer = MultiLabelBinarizer()
    onehot = pd.DataFrame(
        binarizer.fit_transform(df[column_name]),
        columns=[f"{prefix}_{token.strip()}" for token in binarizer.classes_],
        index=df.index,
    )
    return pd.concat([df, onehot], axis=1)
# Normalize Blood Groups (Remove +/-)
def merge_blood_groups(df, column, new_col):
    """
    Copy `column` into `new_col` with the Rh sign ('+'/'-') stripped, so
    e.g. 'A+' and 'A-' both become 'A'. Missing values stay NaN.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column name to normalize
        new_col (str): New column name for the cleaned values
    Returns:
        pd.DataFrame: Updated dataframe
    """
    def strip_rh(value):
        if pd.isnull(value):
            return np.nan
        return re.sub(r'[+-]', '', value)

    df[new_col] = df[column].apply(strip_rh)
    return df
def binarize_age(df, age_col, cutoff, new_col):
    """
    Add `new_col` = 1 when age >= cutoff, else 0. Values that cannot be
    compared to the cutoff (e.g. the unknown token 'X') pass through as-is.

    Args:
        df (pd.DataFrame): Input dataframe
        age_col (str): Column containing the age
        cutoff (int): Age cutoff
        new_col (str): New binary column name
    Returns:
        pd.DataFrame: Updated dataframe
    """
    def flag(value):
        try:
            above_cutoff = value >= cutoff
        except TypeError:
            return value  # non-numeric value: keep unchanged
        return int(above_cutoff)

    df[new_col] = df[age_col].apply(flag)
    return df
# Create Composite Gender & Relation Columns
def add_gender_relation_features(df):
    """
    Add three composite categorical columns combining the donor relation
    with recipient gender, donor gender, and both.

    Returns:
        pd.DataFrame: Updated dataframe
    """
    relation = df["Donor_relation to recipient"]
    recipient_part = " R_" + df["Recipient_gender"]
    donor_part = " D_" + df["Donor_gender"]
    df["Relation_and_Recipient_Gender"] = relation + recipient_part
    df["Relation_and_Donor_Gender"] = relation + donor_part
    df["Relation_and_Recipient_and_Donor_Gender"] = relation + recipient_part + donor_part
    return df
# Nationality-Based Groupings
def apply_nationality_groupings(df, column, grouping_dicts):
    """
    For each (new_col, mapping) pair, add a column mapping nationalities in
    `column` to their group label; unmapped nationalities pass through.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column to group by
        grouping_dicts (dict): Dictionary of {new_col_name: mapping_dict}
    Returns:
        pd.DataFrame: Updated dataframe
    """
    for target_col, nationality_to_group in grouping_dicts.items():
        df[target_col] = df[column].replace(nationality_to_group)
    return df
# Group and Binarize Diagnosis
def group_and_binarize_diagnosis(df, original_col, group_map, malignant_map):
    """
    Add '<col>_Grouped' (diagnosis category) and '<col>_Malignant' (0/1 flag)
    columns. Diagnoses/groups missing from the maps pass through unchanged.

    Args:
        df (pd.DataFrame): Input dataframe
        original_col (str): Original diagnosis column
        group_map (dict): Mapping of diagnoses to groups
        malignant_map (dict): Mapping of groups to binary malignancy label
    Returns:
        pd.DataFrame: Updated dataframe
    """
    grouped = df[original_col].replace(group_map)
    df[f"{original_col}_Grouped"] = grouped
    df[f"{original_col}_Malignant"] = grouped.replace(malignant_map)
    return df
# Function to check if a column contains any list
def is_list_column(col):
    """Return True if any value in the iterable/Series is a Python list."""
    for value in col:
        if isinstance(value, list):
            return True
    return False
def preprocess_pipeline(df) -> pd.DataFrame:
    """
    Full preprocessing pipeline:
    1. Load and initial cleaning
    2. String normalization
    3. Special column processing
    4. Data corrections
    5. Feature engineering

    Args:
        df (pd.DataFrame): raw dataset as loaded by load_dataset() — this
            must already be a DataFrame, not a file path.
    Returns:
        pd.DataFrame: fully preprocessed dataframe, including the one-hot
        HLA/drug columns expected by load_train_features() (missing ones are
        added as all-zero columns at the end).

    NOTE: step order matters — e.g. fillna(UNKNOWN_TOKEN) must precede the
    one-hot encoding, and HLA splitting must precede expand_HLA_cols().
    """
    df = df.dropna(axis=1, how="all")
    # Special column processing
    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()
    # Remove spaces from HLA columns
    df.columns = [
        re.sub(r"\s+", "", col) if "_HLA" in col else col
        for col in df.columns
    ]
    # String handling
    df = normalize_strings(df)
    df = clean_blood_group_columns(df, BLOOD_GROUP_COLS)
    # Data corrections
    df = correct_nationalities(df, "Recepient_Nationality")
    df = correct_drug_name_in_list(df, "PreHSCT conditioning regimen+/-ATG+/-TBI")
    df = correct_drug_name_in_list(df, "First_GVHD prophylaxis")
    # df = correct_drug_name_in_list(df, "Post HSCT regimen")
    df = standardize_compound_columns(
        df,
        ["PreHSCT conditioning regimen+/-ATG+/-TBI", "First_GVHD prophylaxis"]
    )
    df = standardize_gender(df)
    df = correct_donor_relationships(df)
    # Self-donor consistency only applies when autologous rows are present
    if "SELF" in df["Donor_relation to recipient"].unique():
        df = handle_self_donor_consistency(df)
    # HLA processing: standardize match ratio, split/clean alleles, one-hot expand
    df = standardize_hla_matching(df)
    df = process_hla_columns(df)
    df = expand_HLA_cols(df)
    # Extract years (calculate_ages re-extracts the first three; harmless but redundant)
    df = extract_year(df, "HSCT_date")
    df = extract_year(df, "Recepient_DOB")
    df = extract_year(df, "Donor_DOB")
    df = extract_year(df, "Date of first diagnosis/BMBx date")
    df = calculate_ages(df)
    # Final missing value handling
    df = df.fillna(UNKNOWN_TOKEN)
    # One-hot encode multi-drug regimen columns
    df = split_and_one_hot_encode(df, 'PreHSCT conditioning regimen+/-ATG+/-TBI', 'PreHSCT')
    df = split_and_one_hot_encode(df, 'First_GVHD prophylaxis', 'First_GVHD_prophylaxis')
    # df = split_and_one_hot_encode(df, 'Post HSCT regimen', 'PostHSCT')
    # Normalize blood groups (drop the Rh +/- sign)
    df = merge_blood_groups(df, "Recepient_Blood group before HSCT", "Recepient_Blood group before HSCT_MergePlusMinus")
    df = merge_blood_groups(df, "D_Blood group", "D_Blood group_MergePlusMinus")
    # Binarize ages at the 16- and 18-year cutoffs
    df = binarize_age(df, "R_Age_at_transplant", 16, "R_Age_at_transplant_cutoff16")
    df = binarize_age(df, "R_Age_at_transplant", 18, "R_Age_at_transplant_cutoff18")
    df = binarize_age(df, "D_Age_at_transplant", 16, "D_Age_at_transplant_cutoff16")
    df = binarize_age(df, "D_Age_at_transplant", 18, "D_Age_at_transplant_cutoff18")
    # Gender/Relation features
    df = add_gender_relation_features(df)
    # Group nationalities
    df = apply_nationality_groupings(df, 'Recepient_Nationality', groupings)
    # Group and binarize diagnosis
    df = group_and_binarize_diagnosis(df, 'Hematological Diagnosis', DIAGNOSIS_GROUP_MAP, MALIGNANT_MAP)
    # Restore NaN so downstream tooling sees real missing values
    df = df.replace(UNKNOWN_TOKEN, np.nan)
    # Drop columns with only one unique value
    # df = df.loc[:, df.nunique() > 1] # get unhashable type list error..
    # # Keep columns that either:
    # # - Are not list-type and have more than one unique value
    # # - Are list-type (skip them from processing)
    # df = df.loc[:, [
    #     is_list_column(df[col]) or df[col].nunique(dropna=False) > 1
    #     for col in df.columns
    # ]]
    # df = df.drop(columns=["First_GVHD_prophylaxis_MTX", "PreHSCT_MTX"], errors='ignore')
    # Add columns for new dfs for features that exist in the original dataset but not in the new one
    for feature in load_train_features()[0]:
        if ("_HLA" in feature or "First_GVHD_prophylaxis_" in feature or "PreHSCT_" in feature) and feature not in df.columns:
            df[feature] = 0
    return df
if __name__ == "__main__":
    # Fix: preprocess_pipeline expects a DataFrame (its first step is
    # df.dropna(...)), so the CSV must be loaded first — passing the raw
    # path string raised AttributeError.
    raw_df = load_dataset(
        "/home/muhammadridzuan/2025_GVHD/2024_GVHD_SSMC/GVHD_Intel_data_MBZUAI_1.2.csv"
    )
    processed_data = preprocess_pipeline(raw_df)
    processed_data.to_csv("preprocessed_gvhd_data.csv", index=False)