mridzuan committed on
Commit
e989a63
·
1 Parent(s): 11ee773

add HLA_MATCHING_MAP

Browse files
Files changed (1) hide show
  1. src/preprocess_utils.py +36 -11
src/preprocess_utils.py CHANGED
@@ -22,7 +22,7 @@ NATIONALITY_CORRECTIONS = {
22
  "USA": "AMERICAN",
23
  }
24
  # 1. Regional Grouping (Geography-Based)
25
- regional_grouping = {
26
  # Middle East
27
  'EMIRATI': 'Middle East',
28
  'OMANI': 'Middle East',
@@ -72,7 +72,7 @@ regional_grouping = {
72
  }
73
 
74
  # 2. Cultural-Linguistic Grouping
75
- cultural_grouping = {
76
  'EMIRATI': 'Arab',
77
  'OMANI': 'Arab',
78
  'SAUDI': 'Arab',
@@ -109,7 +109,7 @@ cultural_grouping = {
109
  }
110
 
111
  # 3. World Bank Income Grouping
112
- income_grouping = {
113
  'EMIRATI': 'High income',
114
  'OMANI': 'High income',
115
  'SAUDI': 'High income',
@@ -146,7 +146,7 @@ income_grouping = {
146
  }
147
 
148
  # 4. WHO Regional Office Grouping
149
- who_region_grouping = {
150
  'EMIRATI': 'EMRO',
151
  'OMANI': 'EMRO',
152
  'SAUDI': 'EMRO',
@@ -182,10 +182,10 @@ who_region_grouping = {
182
  'FIJI': 'WPRO'
183
  }
184
  groupings = {
185
- 'Recepient_Nationality_Geographical': regional_grouping,
186
- 'Recepient_Nationality_Cultural': cultural_grouping,
187
- 'Recepient_Nationality_Regional_Income': income_grouping,
188
- 'Recepient_Nationality_Regional_WHO': who_region_grouping
189
  }
190
 
191
  # FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
@@ -229,7 +229,7 @@ STRING_NORMALIZATION_MAP = {
229
  r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
230
  r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
231
  }
232
- diagnosis_group_map = {
233
  "MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
234
  "CML": "MYELOPROLIFERATIVE NEOPLASMS",
235
  "MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
@@ -276,7 +276,7 @@ diagnosis_group_map = {
276
  }
277
 
278
  # # 0 nonmalignant; 1: malignant
279
- malignant_map = {
280
  'AML': 1,
281
  'RED CELL DISORDERS': 0,
282
  'AMYLOIDOSIS': 0,
@@ -296,6 +296,25 @@ malignant_map = {
296
  'PLASMA CELL LEUKEMIA': 0
297
  }
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  def load_train_features():
300
  # Define features
301
  HLA_sub12 = [
@@ -474,6 +493,12 @@ def clean_blood_group_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
474
  df[col] = df[col].str.replace(r"\s+", "", regex=True)
475
  return df
476
 
 
 
 
 
 
 
477
  def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
478
  """
479
  Clean and process HLA columns by:
@@ -524,7 +549,6 @@ def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
524
 
525
  return df
526
 
527
-
528
  def cast_as_int_if_possible(x):
529
  try:
530
  i = int(x)
@@ -897,6 +921,7 @@ def preprocess_pipeline(df) -> pd.DataFrame:
897
  df = handle_self_donor_consistency(df)
898
 
899
  # HLA processing
 
900
  df = process_hla_columns(df)
901
  df = expand_HLA_cols(df)
902
 
 
22
  "USA": "AMERICAN",
23
  }
24
  # 1. Regional Grouping (Geography-Based)
25
+ REGIONAL_GROUPING = {
26
  # Middle East
27
  'EMIRATI': 'Middle East',
28
  'OMANI': 'Middle East',
 
72
  }
73
 
74
  # 2. Cultural-Linguistic Grouping
75
+ CULTURAL_GROUPING = {
76
  'EMIRATI': 'Arab',
77
  'OMANI': 'Arab',
78
  'SAUDI': 'Arab',
 
109
  }
110
 
111
  # 3. World Bank Income Grouping
112
+ INCOME_GROUPING = {
113
  'EMIRATI': 'High income',
114
  'OMANI': 'High income',
115
  'SAUDI': 'High income',
 
146
  }
147
 
148
  # 4. WHO Regional Office Grouping
149
+ WHO_REGION_GROUPING = {
150
  'EMIRATI': 'EMRO',
151
  'OMANI': 'EMRO',
152
  'SAUDI': 'EMRO',
 
182
  'FIJI': 'WPRO'
183
  }
184
  groupings = {
185
+ 'Recepient_Nationality_Geographical': REGIONAL_GROUPING,
186
+ 'Recepient_Nationality_Cultural': CULTURAL_GROUPING,
187
+ 'Recepient_Nationality_Regional_Income': INCOME_GROUPING,
188
+ 'Recepient_Nationality_Regional_WHO': WHO_REGION_GROUPING
189
  }
190
 
191
  # FIRST_GVHD_PROPHYLAXIS_CORRECTIONS
 
229
  r"(?i)Umbilical Cord blood": "UMBILICAL CORD",
230
  r"(?i)Bone Marrow": "BONE MARROW", "MDS": "MYELODYSPLASTIC SYNDROME"
231
  }
232
+ DIAGNOSIS_GROUP_MAP = {
233
  "MYELOPROLIFERATIVE DISORDER": "MYELOPROLIFERATIVE NEOPLASMS",
234
  "CML": "MYELOPROLIFERATIVE NEOPLASMS",
235
  "MYELOFIBROSIS": "MYELOPROLIFERATIVE NEOPLASMS",
 
276
  }
277
 
278
  # # 0 nonmalignant; 1: malignant
279
+ MALIGNANT_MAP = {
280
  'AML': 1,
281
  'RED CELL DISORDERS': 0,
282
  'AMYLOIDOSIS': 0,
 
296
  'PLASMA CELL LEUKEMIA': 0
297
  }
298
 
299
+ HLA_MATCHING_MAP = {
300
+ "12 OF 12": "FULL",
301
+ "10 OF 10": "FULL",
302
+ "8 OF 8": "FULL", # not full?
303
+
304
+ "9 OF 10": "PARTIAL",
305
+ "8 OF 10": "PARTIAL",
306
+ "PARTIALLY MATCHED": "PARTIAL",
307
+
308
+ "7 OF 10": "HAPLOIDENTICAL",
309
+ "6 OF 12": "HAPLOIDENTICAL",
310
+ "6 OF 10": "HAPLOIDENTICAL",
311
+ "5 OF 10": "HAPLOIDENTICAL",
312
+
313
+ # TODO: confirm that the following are all haploidentical
314
+ "5 OF 8": "HAPLOIDENTICAL",
315
+ "4 OF 6": "HAPLOIDENTICAL",
316
+ }
317
+
318
  def load_train_features():
319
  # Define features
320
  HLA_sub12 = [
 
493
  df[col] = df[col].str.replace(r"\s+", "", regex=True)
494
  return df
495
 
496
+
497
+ def standardize_hla_matching(df: pd.DataFrame) -> pd.DataFrame:
498
+ # Map HLA matching values to standardized format
499
+ df['HLA match ratio'] = df['HLA match ratio'].replace(HLA_MATCHING_MAP, regex=False)
500
+ return df
501
+
502
  def process_hla_columns(df: pd.DataFrame) -> pd.DataFrame:
503
  """
504
  Clean and process HLA columns by:
 
549
 
550
  return df
551
 
 
552
  def cast_as_int_if_possible(x):
553
  try:
554
  i = int(x)
 
921
  df = handle_self_donor_consistency(df)
922
 
923
  # HLA processing
924
+ df = standardize_hla_matching(df)
925
  df = process_hla_columns(df)
926
  df = expand_HLA_cols(df)
927