Spaces:
Sleeping
Sleeping
extract years from dates
Browse files
- src/preprocess_utils.py +13 -4
src/preprocess_utils.py
CHANGED
|
@@ -721,6 +721,10 @@ def safe_extract_year(date_str: str) -> str:
|
|
| 721 |
except (ValueError, TypeError):
|
| 722 |
return UNKNOWN_TOKEN
|
| 723 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
|
| 725 |
"""
|
| 726 |
Calculate:
|
|
@@ -729,9 +733,9 @@ def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 729 |
3. Age gap between recipient and donor
|
| 730 |
"""
|
| 731 |
# Extract years safely
|
| 732 |
-
df
|
| 733 |
-
df
|
| 734 |
-
df
|
| 735 |
|
| 736 |
# Calculate ages with safe conversion
|
| 737 |
def calculate_age_diff(row, dob_col, transplant_col):
|
|
@@ -924,7 +928,12 @@ def preprocess_pipeline(df) -> pd.DataFrame:
|
|
| 924 |
df = process_hla_columns(df)
|
| 925 |
df = expand_HLA_cols(df)
|
| 926 |
|
| 927 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
df = calculate_ages(df)
|
| 929 |
|
| 930 |
# Final missing value handling
|
|
|
|
| 721 |
except (ValueError, TypeError):
|
| 722 |
return UNKNOWN_TOKEN
|
| 723 |
|
| 724 |
+
def extract_year(df: pd.DataFrame, column_name) -> pd.DataFrame:
    """
    Add a year column derived from ``column_name``.

    The new column is named ``column_name + "_Year"`` and holds the result
    of applying ``safe_extract_year`` to the values of ``column_name``.

    The frame is modified in place; the same object is returned so calls
    can be written as ``df = extract_year(df, ...)``.
    """
    year_column = column_name + "_Year"
    extracted_years = df[column_name].apply(safe_extract_year)
    df[year_column] = extracted_years
    return df
|
| 727 |
+
|
| 728 |
def calculate_ages(df: pd.DataFrame) -> pd.DataFrame:
|
| 729 |
"""
|
| 730 |
Calculate:
|
|
|
|
| 733 |
3. Age gap between recipient and donor
|
| 734 |
"""
|
| 735 |
# Extract years safely
|
| 736 |
+
df = extract_year(df, "HSCT_date")
|
| 737 |
+
df = extract_year(df, "Recepient_DOB")
|
| 738 |
+
df = extract_year(df, "Donor_DOB")
|
| 739 |
|
| 740 |
# Calculate ages with safe conversion
|
| 741 |
def calculate_age_diff(row, dob_col, transplant_col):
|
|
|
|
| 928 |
df = process_hla_columns(df)
|
| 929 |
df = expand_HLA_cols(df)
|
| 930 |
|
| 931 |
+
# Extract years
|
| 932 |
+
df = extract_year(df, "HSCT_date")
|
| 933 |
+
df = extract_year(df, "Recepient_DOB")
|
| 934 |
+
df = extract_year(df, "Donor_DOB")
|
| 935 |
+
df = extract_year(df, "Date of first diagnosis/BMBx date")
|
| 936 |
+
|
| 937 |
df = calculate_ages(df)
|
| 938 |
|
| 939 |
# Final missing value handling
|