"""
Derive features from spirometry for 2 models:
Parallel model 1: uses both hospital and community exacerbation events
Parallel model 2: uses only hospital exacerbation events
"""

import atexit
import os
import sys

import numpy as np
import pandas as pd
import yaml

import model_h

# Load the YAML configuration. The file handle gets its own name so it is not
# shadowed by the parsed configuration dictionary.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Model identifier from the config; it is used below to build the log, input
# and output file names.
model_type = config["model_settings"]["model_type"]

# Redirect everything printed from here on into a per-model log file. The
# directory is created first so a fresh checkout does not fail on open().
os.makedirs("./training/logging", exist_ok=True)
log = open("./training/logging/process_spirometry_" + model_type + ".log", "w")
sys.stdout = log


def _close_log():
    """Restore the interpreter's original stdout and close the log file."""
    sys.stdout = sys.__stdout__
    log.close()


# The redirect must stay active for the rest of the script, so the cleanup is
# deferred to interpreter exit instead of using a ``with`` block here.
atexit.register(_close_log)

# "forward_val" selects the forward-validation inputs/outputs further down;
# any other value selects the default ones.
data_to_process = config["model_settings"]["data_to_process"]

# Choose the label and patient-detail pickles for the requested data set.
if data_to_process == "forward_val":
    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
    details_path = "./data/patient_details_forward_val.pkl"
else:
    labels_path = "./data/patient_labels_" + model_type + ".pkl"
    details_path = "./data/patient_details.pkl"

data = pd.read_pickle(labels_path)
patient_details = pd.read_pickle(details_path)

# Reduce the labels to their key columns and attach each study's PatientId.
# After this, patient_details holds StudyId, IndexDate and PatientId, one row
# per label row (left join keeps every label row).
# NOTE(review): assumed to apply to both the forward_val and the default
# branch, since later steps need IndexDate either way - confirm.
data = data[["StudyId", "IndexDate"]]
study_to_patient = patient_details[["StudyId", "PatientId"]]
patient_details = pd.merge(data, study_to_patient, on="StudyId", how="left")

# Read the pipe-delimited COPD status extract from the path given in the config.
copd_status = pd.read_csv(
    config["inputs"]["raw_data_paths"]["copd_status"], delimiter="|"
)

# Left join onto the study rows so every StudyId/IndexDate row is kept even
# when the patient has no COPD status record.
copd_status = patient_details.merge(copd_status, on="PatientId", how="left")
copd_status["LungFunction_Date"] = pd.to_datetime(
    copd_status["LungFunction_Date"], utc=True
)

# Whole days between the lung function date and the index date.
# NOTE(review): LungFunction_Date is parsed as UTC-aware here, so IndexDate
# presumably is timezone-aware too - confirm against the label pickles.
copd_status["TimeSinceLungFunc"] = (
    copd_status["IndexDate"] - copd_status["LungFunction_Date"]
).dt.days

# The message is built from adjacent string literals rather than a backslash
# continuation inside one literal, which would embed the source indentation
# of the continuation line in the logged text.
recent_lung_func = copd_status["TimeSinceLungFunc"] < 365
print(
    "COPD Status Details: Number of patients with a lung function date < 1 year "
    "from index date: {} of {}".format(
        int(recent_lung_func.sum()), len(patient_details)
    )
)

# Keep only the identifiers and the spirometry-related fields. The explicit
# copy makes the later column assignments act on an independent frame.
copd_status = copd_status[
    [
        "StudyId",
        "IndexDate",
        "RequiredAcuteNIV",
        "RequiredICUAdmission",
        "LungFunction_FEV1PercentPredicted",
        "LungFunction_FEV1Litres",
        "LungFunction_FEV1FVCRatio",
        "TimeSinceLungFunc",
    ]
].copy()

# Encode the two boolean flags as 1/0. Values that are not keys of the mapping
# (for example missing values) come out of Series.map as NaN.
bool_mapping = {True: 1, False: 0}
for flag_col in ("RequiredAcuteNIV", "RequiredICUAdmission"):
    copd_status[flag_col] = copd_status[flag_col].map(bool_mapping)

# Strip the percent sign from FEV1 % predicted. The column is converted to
# text first so this also works when it was parsed as numeric (the .str
# accessor would raise) or holds a mix of strings and numbers (the .str
# methods would turn the non-string entries into NaN). Entries that were
# missing to begin with are restored to missing afterwards.
fev1_pct_raw = copd_status["LungFunction_FEV1PercentPredicted"]
fev1_pct_text = fev1_pct_raw.astype(str).str.replace("%", "", regex=False)
copd_status["LungFunction_FEV1PercentPredicted"] = fev1_pct_text.where(
    fev1_pct_raw.notna()
)

# Convert every remaining measurement column to a numeric dtype;
# pd.to_numeric raises if a value cannot be parsed.
for col in copd_status.drop(
    columns=["StudyId", "IndexDate", "RequiredAcuteNIV", "RequiredICUAdmission"]
).columns:
    copd_status[col] = pd.to_numeric(copd_status[col])

# Bin edges for FEV1 % predicted and the severity label given to each band.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ["Very severe", "Severe", "Moderate", "Mild"]

# Replace the numeric FEV1 % predicted column with its banded counterpart.
# NOTE(review): edge inclusivity is decided inside model_h.bin_numeric_column,
# which is not visible here - confirm how boundary values are assigned.
fev1_severity = model_h.bin_numeric_column(
    col=copd_status["LungFunction_FEV1PercentPredicted"],
    bins=spirometry_bins,
    labels=spirometry_labels,
)
copd_status["FEV1PercentPredicted"] = fev1_severity
copd_status = copd_status.drop(columns=["LungFunction_FEV1PercentPredicted"])

# Rows whose band is the literal string "nan" are assigned to "Mild".
# NOTE(review): presumably the helper returns strings, so missing or
# out-of-range values surface as "nan" - verify.
unbanded_rows = copd_status["FEV1PercentPredicted"] == "nan"
copd_status.loc[unbanded_rows, "FEV1PercentPredicted"] = "Mild"

# Write the processed spirometry features to the configured output directory,
# creating it if needed. The file name carries the model type and, for
# forward validation, a "forward_val" marker.
output_dir = config["outputs"]["processed_data_dir"]
os.makedirs(output_dir, exist_ok=True)

if data_to_process == "forward_val":
    output_name = "spirometry_forward_val_" + model_type + ".pkl"
else:
    output_name = "spirometry_" + model_type + ".pkl"

copd_status.to_pickle(os.path.join(output_dir, output_name))