""" Derive features from comorbidities dataset for 2 models: Parallel model 1: uses both hospital and community exacerbation events Parallel model 2: uses only hospital exacerbation events """ import numpy as np import pandas as pd import sys import os import yaml import model_h with open("./training/config.yaml", "r") as config: config = yaml.safe_load(config) # Specify which model to generate features for model_type = config["model_settings"]["model_type"] # Setup log file log = open("./training/logging/process_comorbidities_" + model_type + ".log", "w") sys.stdout = log # Dataset to process - set through config file data_to_process = config["model_settings"]["data_to_process"] # Load cohort data if data_to_process == "forward_val": exac_data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl") patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl") else: exac_data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl") patient_details = pd.read_pickle("./data/patient_details.pkl") exac_data = exac_data[["StudyId", "IndexDate"]] patient_details = exac_data.merge( patient_details[["StudyId", "PatientId"]], on="StudyId", how="left", ) comorbidities = pd.read_csv( config["inputs"]["raw_data_paths"]["comorbidities"], delimiter="|" ) comorbidities = patient_details.merge(comorbidities, on="PatientId", how="left") # Only keep records submitted before index date comorbidities["Created"] = pd.to_datetime(comorbidities["Created"], utc=True) comorbidities["TimeSinceSubmission"] = ( comorbidities["IndexDate"] - comorbidities["Created"] ).dt.days comorbidities = comorbidities[comorbidities["TimeSinceSubmission"] > 0] # If multiple records submitted for same patient keep the most recent record (in relation # to index date) comorbidities = comorbidities.sort_values( by=["StudyId", "IndexDate", "TimeSinceSubmission"] ) comorbidities = comorbidities.drop_duplicates( subset=["StudyId", "IndexDate"], keep="first" ) # Get list of comorbidities captured in the service comorbidity_list = list(comorbidities) comorbidity_list = [ e for e in comorbidity_list if e not in ("PatientId", "Id", "StudyId", "IndexDate", "TimeSinceSubmission", "Created") ] # Map True/False values to integers bool_mapping = {True: 1, False: 0} comorbidities[comorbidity_list] = ( comorbidities[comorbidity_list].replace(bool_mapping).fillna(0) ) # Get comorbidity counts for each patient comorbidities["Comorbidities"] = comorbidities[comorbidity_list].sum(axis=1) # Drop comorbidities columns from train data but retain AsthmaOverlap comorbidity_list.remove("AsthmaOverlap") comorbidities = comorbidities.drop(columns=comorbidity_list) comorbidities = comorbidities.drop(columns=["Id", "Created", "TimeSinceSubmission"]) # Bin number of comorbidities comorb_bins = [0, 1, 3, np.inf] comorb_labels = ["No comorbidities", "1-2", "3+"] comorbidities["Comorbidities"] = model_h.bin_numeric_column( col=comorbidities["Comorbidities"], bins=comorb_bins, labels=comorb_labels ) comorbidities = comorbidities.drop(columns=["PatientId"]) # Save data os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True) if data_to_process == "forward_val": comorbidities.to_pickle( os.path.join( config["outputs"]["processed_data_dir"], "comorbidities_forward_val_" + model_type + ".pkl", ) ) else: comorbidities.to_pickle( os.path.join( config["outputs"]["processed_data_dir"], "comorbidities_" + model_type + ".pkl", ) )