"""
Derive features from spirometry for 2 models:
    Parallel model 1: uses both hospital and community exacerbation events
    Parallel model 2: uses only hospital exacerbation events

The script joins the cohort labels (StudyId, IndexDate) to the raw COPD status
extract, converts the lung function fields to numeric values, bins FEV1 percent
predicted into severity categories and pickles the resulting table into the
processed data directory named in the config file.

Everything printed while the script runs is written to
./training/logging/process_spirometry_{model_type}.log instead of the console.
"""
import contextlib
import os
import sys

import numpy as np
import pandas as pd
import yaml

import model_h

with open("./training/config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]

# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]
is_forward_val = data_to_process == "forward_val"

# Setup log file. The directory is created up front so a fresh checkout does
# not fail on open().
log_path = "./training/logging/process_spirometry_" + model_type + ".log"
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# The context managers guarantee the log is flushed and closed and that
# sys.stdout is restored, even if a step below raises.
with open(log_path, "w", encoding="utf-8") as log, contextlib.redirect_stdout(log):
    # Load cohort data
    if is_forward_val:
        labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
        details_path = "./data/patient_details_forward_val.pkl"
    else:
        labels_path = "./data/patient_labels_" + model_type + ".pkl"
        details_path = "./data/patient_details.pkl"

    data = pd.read_pickle(labels_path)[["StudyId", "IndexDate"]]
    patient_details = data.merge(
        pd.read_pickle(details_path)[["StudyId", "PatientId"]],
        on="StudyId",
        how="left",
    )

    # Attach the raw COPD status record to every (StudyId, IndexDate) row
    copd_status = pd.read_csv(
        config["inputs"]["raw_data_paths"]["copd_status"], delimiter="|"
    )
    copd_status = patient_details.merge(copd_status, on="PatientId", how="left")

    # Days between the index date and the recorded lung function test.
    # NOTE(review): LungFunction_Date is parsed as UTC, so the subtraction
    # presumably relies on IndexDate being timezone-aware too - confirm.
    copd_status["LungFunction_Date"] = pd.to_datetime(
        copd_status["LungFunction_Date"], utc=True
    )
    copd_status["TimeSinceLungFunc"] = (
        copd_status["IndexDate"] - copd_status["LungFunction_Date"]
    ).dt.days

    # Rows with a missing lung function date compare False and are not counted
    n_recent = int((copd_status["TimeSinceLungFunc"] < 365).sum())
    print(
        "COPD Status Details: Number of patients with a lung function date < 1 year "
        "from index date: {} of {}".format(n_recent, len(patient_details))
    )

    # .copy() so the column assignments below act on an independent frame
    copd_status = copd_status[
        [
            "StudyId",
            "IndexDate",
            "RequiredAcuteNIV",
            "RequiredICUAdmission",
            "LungFunction_FEV1PercentPredicted",
            "LungFunction_FEV1Litres",
            "LungFunction_FEV1FVCRatio",
            "TimeSinceLungFunc",
        ]
    ].copy()

    # Map bool values (anything other than True/False becomes NaN)
    bool_mapping = {True: 1, False: 0}
    for flag_col in ("RequiredAcuteNIV", "RequiredICUAdmission"):
        copd_status[flag_col] = copd_status[flag_col].map(bool_mapping)

    # Convert columns in COPD Status to numeric; the percentage column carries
    # a literal "%" suffix that has to be stripped first
    copd_status["LungFunction_FEV1PercentPredicted"] = copd_status[
        "LungFunction_FEV1PercentPredicted"
    ].str.replace("%", "", regex=False)
    numeric_columns = [
        "LungFunction_FEV1PercentPredicted",
        "LungFunction_FEV1Litres",
        "LungFunction_FEV1FVCRatio",
        "TimeSinceLungFunc",
    ]
    for col in numeric_columns:
        copd_status[col] = pd.to_numeric(copd_status[col])

    # Bin patient spirometry at onboarding
    spirometry_bins = [0, 30, 50, 80, np.inf]
    spirometry_labels = ["Very severe", "Severe", "Moderate", "Mild"]
    copd_status["FEV1PercentPredicted"] = model_h.bin_numeric_column(
        col=copd_status["LungFunction_FEV1PercentPredicted"],
        bins=spirometry_bins,
        labels=spirometry_labels,
    )
    copd_status = copd_status.drop(columns=["LungFunction_FEV1PercentPredicted"])

    # Assign patients without spirometry in service data to the Mild category.
    # NOTE(review): bin_numeric_column presumably renders missing values as the
    # string "nan"; real NaN values are handled as well in case it does not.
    fev1_band = copd_status["FEV1PercentPredicted"]
    missing_band = fev1_band.isna() | (fev1_band == "nan")
    copd_status.loc[missing_band, "FEV1PercentPredicted"] = "Mild"

    # Save data
    processed_data_dir = config["outputs"]["processed_data_dir"]
    os.makedirs(processed_data_dir, exist_ok=True)
    file_prefix = "spirometry_forward_val_" if is_forward_val else "spirometry_"
    copd_status.to_pickle(
        os.path.join(processed_data_dir, file_prefix + model_type + ".pkl")
    )