Spaces:
Sleeping
Sleeping
add HLA_MATCHING_MAP
Browse files- src/preprocess_utils.py +36 -11
src/preprocess_utils.py
CHANGED
|
@@ -22,7 +22,7 @@ NATIONALITY_CORRECTIONS = {
|
|
| 22 |
"USA": "AMERICAN",
|
| 23 |
}
|
| 24 |
# 1. Regional Grouping (Geography-Based)
|
| 25 |
-
|
| 26 |
# Middle East
|
| 27 |
'EMIRATI': 'Middle East',
|
| 28 |
'OMANI': 'Middle East',
|
|
@@ -72,7 +72,7 @@ regional_grouping = {
|
|
| 72 |
}
|
| 73 |
|
| 74 |
# 2. Cultural-Linguistic Grouping
|
| 75 |
-
|
| 76 |
'EMIRATI': 'Arab',
|
| 77 |
'OMANI': 'Arab',
|
| 78 |
'SAUDI': 'Arab',
|
|
@@ -109,7 +109,7 @@ cultural_grouping = {
|
|
| 109 |
}
|
| 110 |
|
| 111 |
# 3. World Bank Income Grouping
|
| 112 |
-
|
| 113 |
'EMIRATI': 'High income',
|
| 114 |
'OMANI': 'High income',
|
| 115 |
'SAUDI': 'High income',
|
|
@@ -146,7 +146,7 @@ income_grouping = {
|
|
| 146 |
}
|
| 147 |
|
| 148 |
# 4. WHO Regional Office Grouping
|
| 149 |
-
|
| 150 |
'EMIRATI': 'EMRO',
|
| 151 |
'OMANI': 'EMRO',
|
| 152 |
'SAUDI': 'EMRO',
|
|
@@ -182,10 +182,10 @@ who_region_grouping = {
|
|
| 182 |
'FIJI': 'WPRO'
|
| 183 |
}
|
| 184 |
groupings = {
|
| 185 |
-
'Recepient_Nationality_Geographical':
|
| 186 |
-
'Recepient_Nationality_Cultural':
|
| 187 |
-
'Recepient_Nationality_Regional_Income':
|
| 188 |
-
'Recepient_Nationality_Regional_WHO':
|
| 189 |
}
|
| 190 |
|
| 191 |
# FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
|
|
@@ -229,7 +229,7 @@ STRING_NORMALIZATION_MAP = {
|
|
| 229 |
r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
|
| 230 |
r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
|
| 231 |
}
|
| 232 |
-
|
| 233 |
"MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
|
| 234 |
"CML": "MYELOPROLIFERATIVE NEOPLASMS",
|
| 235 |
"MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
|
|
@@ -276,7 +276,7 @@ diagnosis_group_map = {
|
|
| 276 |
}
|
| 277 |
|
| 278 |
# 0: non-malignant; 1: malignant
|
| 279 |
-
|
| 280 |
'AML': 1,
|
| 281 |
'RED CELL DISORDERS': 0,
|
| 282 |
'AMYLOIDOSIS': 0,
|
|
@@ -296,6 +296,25 @@ malignant_map = {
|
|
| 296 |
'PLASMA CELL LEUKEMIA': 0
|
| 297 |
}
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
def load_train_features():
|
| 300 |
# Define features
|
| 301 |
HLA_sub12 = [
|
|
@@ -474,6 +493,12 @@ def clean_blood_group_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
|
| 474 |
df[col] = df[col].str.replace(r"\s+", "", regex=True)
|
| 475 |
return df
|
| 476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 478 |
"""
|
| 479 |
Clean and process HLA columns by:
|
|
@@ -524,7 +549,6 @@ def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 524 |
|
| 525 |
return df
|
| 526 |
|
| 527 |
-
|
| 528 |
def cast_as_int_if_possible(x):
|
| 529 |
try:
|
| 530 |
i = int(x)
|
|
@@ -897,6 +921,7 @@ def preprocess_pipeline(df) -> pd.DataFrame:
|
|
| 897 |
df = handle_self_donor_consistency(df)
|
| 898 |
|
| 899 |
# HLA processing
|
|
|
|
| 900 |
df = process_hla_columns(df)
|
| 901 |
df = expand_HLA_cols(df)
|
| 902 |
|
|
|
|
| 22 |
"USA": "AMERICAN",
|
| 23 |
}
|
| 24 |
# 1. Regional Grouping (Geography-Based)
|
| 25 |
+
REGIONAL_GROUPING = {
|
| 26 |
# Middle East
|
| 27 |
'EMIRATI': 'Middle East',
|
| 28 |
'OMANI': 'Middle East',
|
|
|
|
| 72 |
}
|
| 73 |
|
| 74 |
# 2. Cultural-Linguistic Grouping
|
| 75 |
+
CULTURAL_GROUPING = {
|
| 76 |
'EMIRATI': 'Arab',
|
| 77 |
'OMANI': 'Arab',
|
| 78 |
'SAUDI': 'Arab',
|
|
|
|
| 109 |
}
|
| 110 |
|
| 111 |
# 3. World Bank Income Grouping
|
| 112 |
+
INCOME_GROUPING = {
|
| 113 |
'EMIRATI': 'High income',
|
| 114 |
'OMANI': 'High income',
|
| 115 |
'SAUDI': 'High income',
|
|
|
|
| 146 |
}
|
| 147 |
|
| 148 |
# 4. WHO Regional Office Grouping
|
| 149 |
+
WHO_REGION_GROUPING = {
|
| 150 |
'EMIRATI': 'EMRO',
|
| 151 |
'OMANI': 'EMRO',
|
| 152 |
'SAUDI': 'EMRO',
|
|
|
|
| 182 |
'FIJI': 'WPRO'
|
| 183 |
}
|
| 184 |
# Output column name -> nationality lookup table used to derive that column.
# NOTE: the keys keep the existing 'Recepient' spelling; changing it would
# change the dictionary keys that consumers of this mapping look up.
groupings = {
    'Recepient_Nationality_Geographical': REGIONAL_GROUPING,
    'Recepient_Nationality_Cultural': CULTURAL_GROUPING,
    'Recepient_Nationality_Regional_Income': INCOME_GROUPING,
    'Recepient_Nationality_Regional_WHO': WHO_REGION_GROUPING,
}
|
| 190 |
|
| 191 |
# FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
|
|
|
|
| 229 |
r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
|
| 230 |
r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
|
| 231 |
}
|
| 232 |
+
DIAGNOSIS_GROUP_MAP = {
|
| 233 |
"MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
|
| 234 |
"CML": "MYELOPROLIFERATIVE NEOPLASMS",
|
| 235 |
"MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
# 0: non-malignant; 1: malignant
|
| 279 |
+
MALIGNANT_MAP = {
|
| 280 |
'AML': 1,
|
| 281 |
'RED CELL DISORDERS': 0,
|
| 282 |
'AMYLOIDOSIS': 0,
|
|
|
|
| 296 |
'PLASMA CELL LEUKEMIA': 0
|
| 297 |
}
|
| 298 |
|
| 299 |
+
# Raw 'HLA match ratio' label -> coarse match category.
# Keys are literal labels; standardize_hla_matching() applies this table with
# regex=False, so only cells that equal a key exactly are rewritten.
HLA_MATCHING_MAP = {
    # Fully matched
    "12 OF 12": "FULL",
    "10 OF 10": "FULL",
    "8 OF 8": "FULL",  # TODO: confirm that 8 OF 8 should be treated as FULL

    # Partially matched
    "9 OF 10": "PARTIAL",
    "8 OF 10": "PARTIAL",
    "PARTIALLY MATCHED": "PARTIAL",

    # Haploidentical
    "7 OF 10": "HAPLOIDENTICAL",
    "6 OF 12": "HAPLOIDENTICAL",
    "6 OF 10": "HAPLOIDENTICAL",
    "5 OF 10": "HAPLOIDENTICAL",

    # TODO: confirm that the following are all haploidentical
    "5 OF 8": "HAPLOIDENTICAL",
    "4 OF 6": "HAPLOIDENTICAL",
}
|
| 317 |
+
|
| 318 |
def load_train_features():
|
| 319 |
# Define features
|
| 320 |
HLA_sub12 = [
|
|
|
|
| 493 |
df[col] = df[col].str.replace(r"\s+", "", regex=True)
|
| 494 |
return df
|
| 495 |
|
| 496 |
+
|
| 497 |
+
def standardize_hla_matching(df: pd.DataFrame, mapping=None) -> pd.DataFrame:
    """
    Collapse raw 'HLA match ratio' labels into standardized categories.

    Cells of the 'HLA match ratio' column that are exactly equal to a key of
    the mapping are replaced by the corresponding value. Every other cell,
    including missing values, is left as it is.

    Args:
        df: DataFrame containing an 'HLA match ratio' column. The column is
            overwritten on this object (no copy is made).
        mapping: Optional {raw label: category} dict. When omitted, the
            module-level HLA_MATCHING_MAP is used.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If df has no 'HLA match ratio' column.
    """
    if mapping is None:
        mapping = HLA_MATCHING_MAP

    # regex=False: keys are literal labels compared against the whole cell.
    df['HLA match ratio'] = df['HLA match ratio'].replace(mapping, regex=False)
    return df
|
| 501 |
+
|
| 502 |
def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 503 |
"""
|
| 504 |
Clean and process HLA columns by:
|
|
|
|
| 549 |
|
| 550 |
return df
|
| 551 |
|
|
|
|
| 552 |
def cast_as_int_if_possible(x):
|
| 553 |
try:
|
| 554 |
i = int(x)
|
|
|
|
| 921 |
df = handle_self_donor_consistency(df)
|
| 922 |
|
| 923 |
# HLA processing
|
| 924 |
+
df = standardize_hla_matching(df)
|
| 925 |
df = process_hla_columns(df)
|
| 926 |
df = expand_HLA_cols(df)
|
| 927 |
|