| | """ |
| | Derive features from lab tests for 2 models: |
| | Parallel model 1: uses both hospital and community exacerbation events |
| | Parallel model 2: uses only hospital exacerbation events |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import sys |
| | import os |
| | import model_h |
| | import ggc.preprocessing.labs as labs_preprocessing |
| | import yaml |
| |
|
| |
|
def calc_lab_metric(lab_df, data, lab_name, metric, weigh_data_by_recency=False):
    """
    Calculate a metric for one laboratory test and merge it onto the main dataframe.

    Args:
        lab_df (pd.DataFrame): dataframe containing the labs to be used in the
            calculation. Must contain the columns 'StudyId', 'IndexDate',
            'TimeSinceLab' and the column named by lab_name. It is not modified.
        data (pd.DataFrame): main dataframe onto which the calculated columns are
            merged (left join on 'StudyId' and 'IndexDate').
        lab_name (str): name of the lab column the metric is calculated on.
        metric (str): name of metric to be calculated. The possible metrics are:
            'MaxLifetime': maximum value of the lab per StudyId/IndexDate over all
                rows of lab_df.
            'MinLifetime': minimum value of the lab per StudyId/IndexDate over all
                rows of lab_df.
            'Max1Year': maximum value of the lab within 365 days of the index date.
            'Min1Year': minimum value of the lab within 365 days of the index date.
            'Latest': the non-missing lab value with the smallest TimeSinceLab
                within 365 days of the index date.
        weigh_data_by_recency (bool): option to weigh data based on how recent it is.
            Older observations are decreased or increased towards the median. Only
            applied when metric is 'Latest'. Defaults to False.

    Returns:
        pd.DataFrame: data with the calculated columns added. Every added column is
            named metric + original column name (e.g. 'Min1YearAlbumin'). For
            'Latest' the frame handed to the merge also carries 'TimeSinceLab'
            (-> 'LatestTimeSinceLab') unless model_h.weigh_features_by_recency
            removes it -- TODO confirm against that helper.

    Raises:
        ValueError: if metric is not one of the supported metric names.
    """
    group_keys = ["StudyId", "IndexDate"]
    max_metrics = ("MaxLifetime", "Max1Year")
    min_metrics = ("MinLifetime", "Min1Year")
    one_year_metrics = ("Max1Year", "Min1Year", "Latest")

    # Fail early: an unknown metric would otherwise only surface as an opaque
    # KeyError in the final merge.
    if metric not in max_metrics + min_metrics + ("Latest",):
        raise ValueError("Unsupported metric: {!r}".format(metric))

    # Work on a copy so nothing below writes into (a view of) the caller's frame.
    labs_calc = lab_df[group_keys + ["TimeSinceLab", lab_name]].copy()

    # Restrict to labs taken within one year of the index date
    if metric in one_year_metrics:
        labs_calc = labs_calc[labs_calc["TimeSinceLab"] <= 365]

    if metric == "Latest":
        # Rows without a value for this lab cannot be the latest observation.
        # Dropping them (rather than back-filling the sorted frame) guarantees a
        # patient with no value in the window never inherits the next patient's
        # value or recency.
        labs_calc = labs_calc.dropna(subset=[lab_name])
        labs_calc = labs_calc.sort_values(
            by=group_keys + ["TimeSinceLab"], ascending=True
        )
        # Smallest TimeSinceLab first, so the first row per group is the most recent
        labs_calc = labs_calc.drop_duplicates(subset=group_keys, keep="first")
        if weigh_data_by_recency:
            median_val = labs_calc[lab_name].median()
            labs_calc = model_h.weigh_features_by_recency(
                df=labs_calc,
                feature=lab_name,
                feature_recency_days="TimeSinceLab",
                median_value=median_val,
                decay_rate=0.001,
            )
        labs_calc = labs_calc.set_index(group_keys)
    else:
        grouped = labs_calc.groupby(group_keys)
        labs_calc = grouped.max() if metric in max_metrics else grouped.min()
        # The aggregated TimeSinceLab is not the recency of the selected lab value,
        # so it is dropped for both the max and the min metrics.
        labs_calc = labs_calc.drop(columns=["TimeSinceLab"])

    # Prefix the value columns with the metric name and merge onto the main frame
    labs_calc = labs_calc.add_prefix(metric)
    labs_calc = labs_calc.reset_index()
    data = data.merge(labs_calc, on=group_keys, how="left")
    return data
| |
|
| |
|
# Load the training configuration
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Model type is used in the log, input and output file names
model_type = config["model_settings"]["model_type"]

# Send everything printed from here on (including the output of
# clean_labs_data(print_log=True)) to a per-model log file. The previous stdout
# is restored and the file closed in the finally clause, even on failure.
log = open("./training/logging/process_labs_" + model_type + ".log", "w")
original_stdout = sys.stdout
sys.stdout = log
try:
    # Dataset to process
    data_to_process = config["model_settings"]["data_to_process"]

    # Load patient labels (one row per StudyId/IndexDate) and patient details.
    # NOTE(review): the forward validation labels are always read from the
    # hosp_comm file regardless of model_type -- confirm this is intended.
    if data_to_process == "forward_val":
        data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
        patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
    else:
        data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl")
        patient_details = pd.read_pickle("./data/patient_details.pkl")
    data = data[["StudyId", "IndexDate"]]
    patient_details = data.merge(
        patient_details[["StudyId", "PatientId"]],
        on="StudyId",
        how="left",
    )

    # Mapping between StudyId and SafeHavenID
    id_mapping = pd.read_pickle("./data/sh_to_studyid_mapping.pkl")

    # Blank out the SafeHavenID of SU125 so that no lab rows are linked to it
    # (presumably its mapping is unreliable -- TODO confirm). np.nan is used
    # because the np.NaN alias was removed in NumPy 2.0.
    id_mapping["SafeHavenID"] = np.where(
        id_mapping["StudyId"] == "SU125", np.nan, id_mapping["SafeHavenID"]
    )
    # data has one row per index date, so de-duplicate after the inner join
    id_mapping = id_mapping.merge(
        data[["StudyId"]], on="StudyId", how="inner"
    ).drop_duplicates()
    print(
        "Num patients with SafeHaven mapping: {} of {}".format(
            len(id_mapping), data.StudyId.nunique()
        )
    )

    # Add the SafeHavenID to the patient details
    patient_details = patient_details.merge(id_mapping, on="StudyId", how="left")

    # Start of the lookback window for each index date.
    # NOTE(review): LookbackStartDate is not referenced again in the visible
    # part of this script.
    patient_details["LookbackStartDate"] = patient_details[
        "IndexDate"
    ] - pd.DateOffset(days=config["model_settings"]["lookback_period"])

    # Read only the lab columns that are needed
    cols_to_use = [
        "SafeHavenID",
        "ClinicalCodeDescription",
        "QuantityUnit",
        "RangeHighValue",
        "RangeLowValue",
        "QuantityValue",
        "SampleDate",
    ]
    labs = pd.read_csv(
        config["inputs"]["raw_data_paths"]["labs"], usecols=cols_to_use
    )

    # Keep labs of patients in the cohort only. Rows without a SafeHavenID are
    # excluded explicitly: isin() and merge() both treat NaN as equal to NaN, so
    # such rows would otherwise be attached to patients whose id is missing.
    labs = labs[
        labs.SafeHavenID.notna() & labs.SafeHavenID.isin(patient_details.SafeHavenID)
    ]

    # Clean the labs of interest
    lookup_table = pd.read_csv(
        config["inputs"]["raw_data_paths"]["labs_lookup_table"]
    )
    tests_of_interest = [
        "Eosinophils",
        "Albumin",
        "Neutrophils",
        "White Blood Count",
        "Lymphocytes",
    ]
    labs_processed = labs_preprocessing.clean_labs_data(
        df=labs,
        tests_of_interest=tests_of_interest,
        units_lookup=lookup_table,
        print_log=True,
    )

    # Attach every lab to each StudyId/IndexDate of the patient and compute how
    # many days before the index date it was sampled.
    # NOTE(review): SampleDate is parsed as UTC, so the subtraction requires
    # IndexDate to be timezone-aware as well -- confirm.
    labs_processed = patient_details[["StudyId", "IndexDate", "SafeHavenID"]].merge(
        labs_processed, on="SafeHavenID", how="left"
    )
    labs_processed["SampleDate"] = pd.to_datetime(
        labs_processed["SampleDate"], utc=True
    )
    labs_processed["TimeSinceLab"] = (
        labs_processed["IndexDate"] - labs_processed["SampleDate"]
    ).dt.days

    # Only keep labs sampled on or before the index date
    labs_processed = labs_processed[labs_processed["TimeSinceLab"] >= 0]

    # One column per lab test, one row per StudyId/IndexDate/TimeSinceLab
    # (pivot_table's default aggregation averages repeated values)
    labs_processed = pd.pivot_table(
        labs_processed,
        values="QuantityValue",
        index=["StudyId", "IndexDate", "TimeSinceLab"],
        columns=["ClinicalCodeDescription"],
    )
    labs_processed = labs_processed.reset_index()

    # Neutrophil to lymphocyte ratio
    labs_processed["NeutLymphRatio"] = (
        labs_processed["Neutrophils"] / labs_processed["Lymphocytes"]
    )

    # Minimum albumin in the year before the index date
    data = calc_lab_metric(
        labs_processed, data, lab_name="Albumin", metric="Min1Year"
    )

    # Latest value of each lab, weighted by recency
    lab_names = [
        "NeutLymphRatio",
        "Albumin",
        "Eosinophils",
        "Neutrophils",
        "White Blood Count",
    ]
    for lab_name in lab_names:
        data = calc_lab_metric(
            labs_processed, data, lab_name, metric="Latest", weigh_data_by_recency=True
        )

    # Save the derived features
    os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
    if data_to_process == "forward_val":
        output_name = "labs_forward_val_" + model_type + ".pkl"
    else:
        output_name = "labs_" + model_type + ".pkl"
    data.to_pickle(os.path.join(config["outputs"]["processed_data_dir"], output_name))
finally:
    sys.stdout = original_stdout
    log.close()
| |
|