mridzuan committed on
Commit
fbb35c6
·
1 Parent(s): 0e1554b

extract years from dates

Browse files
Files changed (1) hide show
  1. src/preprocess_utils.py +13 -4
src/preprocess_utils.py CHANGED
@@ -721,6 +721,10 @@ def safe_extract_year(date_str: str) -> str:
721
  except (ValueError, TypeError):
722
  return UNKNOWN_TOKEN
723
 
 
 
 
 
724
  def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
725
  """
726
  Calculate:
@@ -729,9 +733,9 @@ def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
729
  3. Age gap between recipient and donor
730
  """
731
  # Extract years safely
732
- df["Recepient_DOB_Year"] = df["Recepient_DOB"].apply(safe_extract_year)
733
- df["Donor_DOB_Year"] = df["Donor_DOB"].apply(safe_extract_year)
734
- df["HSCT_date_Year"] = df["HSCT_date"].apply(safe_extract_year)
735
 
736
  # Calculate ages with safe conversion
737
  def calculate_age_diff(row, dob_col, transplant_col):
@@ -924,7 +928,12 @@ def preprocess_pipeline(df) -> pd.DataFrame:
924
  df = process_hla_columns(df)
925
  df = expand_HLA_cols(df)
926
 
927
- # Feature engineering
 
 
 
 
 
928
  df = calculate_ages(df)
929
 
930
  # Final missing value handling
 
721
  except (ValueError, TypeError):
722
  return UNKNOWN_TOKEN
723
 
724
+ def extract_year(df: pd.DataFrame, column_name) -> pd.DataFrame:
725
+ df[column_name + "_Year"] = df[column_name].apply(safe_extract_year)
726
+ return df
727
+
728
  def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
729
  """
730
  Calculate:
 
733
  3. Age gap between recipient and donor
734
  """
735
  # Extract years safely
736
+ df = extract_year(df, "HSCT_date")
737
+ df = extract_year(df, "Recepient_DOB")
738
+ df = extract_year(df, "Donor_DOB")
739
 
740
  # Calculate ages with safe conversion
741
  def calculate_age_diff(row, dob_col, transplant_col):
 
928
  df = process_hla_columns(df)
929
  df = expand_HLA_cols(df)
930
 
931
+ # Extract years
932
+ df = extract_year(df, "HSCT_date")
933
+ df = extract_year(df, "Recepient_DOB")
934
+ df = extract_year(df, "Donor_DOB")
935
+ df = extract_year(df, "Date of first diagnosis/BMBx date")
936
+
937
  df = calculate_ages(df)
938
 
939
  # Final missing value handling