| | """ |
| | Derive features from comorbidities dataset for 2 models: |
| | Parallel model 1: uses both hospital and community exacerbation events |
| | Parallel model 2: uses only hospital exacerbation events |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import sys |
| | import os |
| | import yaml |
| | import model_h |
| |
|
# Load the run settings from the training config file.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Model variant being processed; it names the log file and is used further
# down to build the label and output pickle filenames.
model_type = config["model_settings"]["model_type"]

# Send everything printed from here on to a per-model log file.
# The log directory is created first so that open() does not fail with
# FileNotFoundError when the directory has not been set up yet (the output
# directory is created the same way before the result is saved).
log_path = "./training/logging/process_comorbidities_" + model_type + ".log"
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# NOTE(review): the handle stays open because sys.stdout points at it for the
# rest of the script; nothing in the visible script closes it or restores
# sys.stdout.
log = open(log_path, "w")
sys.stdout = log

# Dataset selector read from the config; the value "forward_val" switches the
# script to the forward-validation input and output files.
data_to_process = config["model_settings"]["data_to_process"]
| |
|
| | |
# Choose the label and patient-detail pickles for the requested dataset.
# Forward validation always reads the "hosp_comm" label file; otherwise the
# label file is selected by model_type.
if data_to_process == "forward_val":
    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
    details_path = "./data/patient_details_forward_val.pkl"
else:
    labels_path = "./data/patient_labels_" + model_type + ".pkl"
    details_path = "./data/patient_details.pkl"

exac_data = pd.read_pickle(labels_path)
patient_details = pd.read_pickle(details_path)

# Only the (StudyId, IndexDate) pairs are kept from the label table.
exac_data = exac_data[["StudyId", "IndexDate"]]

# Attach PatientId to every (StudyId, IndexDate) row. The left join keeps a
# label row even when no patient-detail row matches its StudyId.
study_to_patient = patient_details[["StudyId", "PatientId"]]
patient_details = exac_data.merge(study_to_patient, on="StudyId", how="left")
| |
|
# Read the raw comorbidities extract (pipe-delimited) from the configured path.
raw_comorbidities_path = config["inputs"]["raw_data_paths"]["comorbidities"]
comorbidities = pd.read_csv(raw_comorbidities_path, delimiter="|")

# Join the extract onto the (StudyId, IndexDate, PatientId) rows. With a left
# join, rows whose PatientId has no comorbidity record are kept with missing
# values in the comorbidity columns.
comorbidities = pd.merge(
    patient_details, comorbidities, on="PatientId", how="left"
)
| |
|
| | |
# Parse the submission timestamp as timezone-aware UTC.
# NOTE(review): the subtraction below presumably relies on IndexDate also
# being timezone-aware - confirm against the label pickle.
created_utc = pd.to_datetime(comorbidities["Created"], utc=True)
comorbidities["Created"] = created_utc

# Whole days between the comorbidity submission and the index date.
elapsed = comorbidities["IndexDate"] - comorbidities["Created"]
comorbidities["TimeSinceSubmission"] = elapsed.dt.days

# Keep only submissions made at least one whole day before the index date.
# Rows with a missing Created value (no comorbidity record matched in the
# join) compare as False here and are dropped as well.
submitted_before_index = comorbidities["TimeSinceSubmission"] > 0
comorbidities = comorbidities[submitted_before_index]
| |
|
| | |
| | |
# For each (StudyId, IndexDate) keep a single row: after sorting ascending by
# TimeSinceSubmission, the first row per pair is the submission closest to
# (and before) the index date.
comorbidities = comorbidities.sort_values(
    by=["StudyId", "IndexDate", "TimeSinceSubmission"]
).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
| |
|
| | |
# Every column that is not an identifier, date or derived helper column is
# treated as a comorbidity flag.
non_comorbidity_columns = {
    "PatientId",
    "Id",
    "StudyId",
    "IndexDate",
    "TimeSinceSubmission",
    "Created",
}
comorbidity_list = [
    column
    for column in comorbidities.columns
    if column not in non_comorbidity_columns
]
| |
|
| | |
# Turn the boolean flags into 1/0 and treat missing flags as 0 so the flag
# columns can be summed.
bool_mapping = {True: 1, False: 0}
numeric_flags = comorbidities[comorbidity_list].replace(bool_mapping)
comorbidities[comorbidity_list] = numeric_flags.fillna(0)

# Row-wise total of the flag columns (one total per StudyId/IndexDate row).
comorbidities["Comorbidities"] = comorbidities[comorbidity_list].sum(axis=1)
| |
|
| | |
# AsthmaOverlap is kept as its own column, so take it out of the list of flag
# columns that are about to be dropped (it is still included in the
# "Comorbidities" total computed before this step).
comorbidity_list.remove("AsthmaOverlap")

# Drop the remaining individual flags together with the helper columns that
# are no longer needed.
columns_to_drop = comorbidity_list + ["Id", "Created", "TimeSinceSubmission"]
comorbidities = comorbidities.drop(columns=columns_to_drop)
| |
|
| | |
# Replace the comorbidity total with a three-level category.
# NOTE(review): how the edges are closed (e.g. whether a total of exactly 0
# lands in "No comorbidities") is decided inside model_h.bin_numeric_column,
# which is not visible here - confirm against that helper.
comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ["No comorbidities", "1-2", "3+"]
comorbidity_totals = comorbidities["Comorbidities"]
comorbidities["Comorbidities"] = model_h.bin_numeric_column(
    col=comorbidity_totals,
    bins=comorb_bins,
    labels=comorb_labels,
)

# PatientId was only needed for the join with the raw extract.
comorbidities = comorbidities.drop(columns=["PatientId"])
| |
|
| | |
# Make sure the output directory exists before writing.
os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)

# The forward-validation run gets its own filename so it does not overwrite
# the regular output for the same model type.
if data_to_process == "forward_val":
    output_name = "comorbidities_forward_val_" + model_type + ".pkl"
else:
    output_name = "comorbidities_" + model_type + ".pkl"

output_path = os.path.join(config["outputs"]["processed_data_dir"], output_name)
comorbidities.to_pickle(output_path)
| |
|