copd-model-h / training /process_spirometry.py
IamGrooooot's picture
Initial Upload
000de75
"""
Derive features from spirometry for 2 models:
Parallel model 1: uses both hospital and community exacerbation events
Parallel model 2: uses only hospital exacerbation events
"""
import numpy as np
import pandas as pd
import sys
import os
import yaml
import model_h
# Read the pipeline settings. The file handle gets its own name so that it is
# not confused with the parsed settings kept in ``config``.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Which model variant features are being generated for
model_settings = config["model_settings"]
model_type = model_settings["model_type"]

# Redirect everything printed from here on into a per-model log file.
# NOTE(review): the handle is never closed and sys.stdout is never restored;
# the script relies on interpreter shutdown to flush the log.
log_path = "./training/logging/process_spirometry_" + model_type + ".log"
log = open(log_path, "w")
sys.stdout = log

# Which dataset to process (set through the config file); the value
# "forward_val" selects the forward validation inputs further down.
data_to_process = model_settings["data_to_process"]
# Choose the cohort label and patient detail files for the requested dataset.
# Forward validation always reads the "hosp_comm" labels, whatever the model
# type; every other dataset reads the labels matching ``model_type``.
if data_to_process == "forward_val":
    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
    details_path = "./data/patient_details_forward_val.pkl"
else:
    labels_path = "./data/patient_labels_" + model_type + ".pkl"
    details_path = "./data/patient_details.pkl"

# NOTE: unpickling can execute arbitrary code, so these files must come from
# a trusted earlier step of the pipeline.
data = pd.read_pickle(labels_path)
patient_details = pd.read_pickle(details_path)

# Only the study identifier and index date are needed from the labels
data = data[["StudyId", "IndexDate"]]

# Look up PatientId for each cohort row; the left join keeps every cohort row
# even when no patient detail record matches its StudyId.
patient_details = pd.merge(
    data,
    patient_details[["StudyId", "PatientId"]],
    on="StudyId",
    how="left",
)
# Load the raw COPD status extract (pipe-delimited; location comes from config)
copd_status_path = config["inputs"]["raw_data_paths"]["copd_status"]
copd_status = pd.read_csv(copd_status_path, delimiter="|")

# Attach the COPD status columns to the cohort rows via PatientId; the left
# join keeps cohort rows that have no COPD status record.
copd_status = pd.merge(patient_details, copd_status, on="PatientId", how="left")

# Whole days between the index date and the recorded lung function date.
# NOTE(review): the lung function date is parsed as timezone-aware UTC, so the
# subtraction presumably expects IndexDate to be timezone-aware too - confirm.
lung_function_date = pd.to_datetime(copd_status["LungFunction_Date"], utc=True)
copd_status["LungFunction_Date"] = lung_function_date
copd_status["TimeSinceLungFunc"] = (
    copd_status["IndexDate"] - lung_function_date
).dt.days

# Log how many rows have lung function recorded within a year of the index
# date; rows without a lung function date compare as False and are not counted.
recent_lung_function = copd_status["TimeSinceLungFunc"] < 365
print(
    "COPD Status Details: Number of patients with a lung function date < 1 year "
    "from index date: {} of {}".format(
        int(recent_lung_function.sum()), len(patient_details)
    )
)
# Keep only the identifiers and the COPD status fields used as features
feature_columns = [
    "StudyId",
    "IndexDate",
    "RequiredAcuteNIV",
    "RequiredICUAdmission",
    "LungFunction_FEV1PercentPredicted",
    "LungFunction_FEV1Litres",
    "LungFunction_FEV1FVCRatio",
    "TimeSinceLungFunc",
]
copd_status = copd_status[feature_columns]

# Map bool values to 1/0. Series.map leaves NaN wherever the value is not a
# key of the mapping (e.g. rows with no COPD status record after the left merge).
bool_mapping = {True: 1, False: 0}
for flag_col in ("RequiredAcuteNIV", "RequiredICUAdmission"):
    copd_status[flag_col] = copd_status[flag_col].map(bool_mapping)

# Strip the percent sign so the column can be parsed as a number. "%" is
# matched literally (regex=False).
fev1_percent = copd_status["LungFunction_FEV1PercentPredicted"]
try:
    fev1_percent = fev1_percent.str.replace("%", "", regex=False)
except AttributeError:
    # The .str accessor raises AttributeError when the column holds no strings
    # (already parsed as numeric, or entirely missing). There is nothing to
    # strip in that case, so the values are kept unchanged.
    pass
copd_status["LungFunction_FEV1PercentPredicted"] = fev1_percent

# Convert the remaining COPD status columns to numeric. pd.to_numeric raises on
# values it cannot parse, so malformed input stops the script here.
non_numeric_columns = {
    "StudyId",
    "IndexDate",
    "RequiredAcuteNIV",
    "RequiredICUAdmission",
}
for col in [name for name in copd_status.columns if name not in non_numeric_columns]:
    copd_status[col] = pd.to_numeric(copd_status[col])
# Bin patient spirometry at onboarding: FEV1 percent predicted is cut at
# 30, 50 and 80 into four severity categories.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ["Very severe", "Severe", "Moderate", "Mild"]
fev1_severity = model_h.bin_numeric_column(
    col=copd_status["LungFunction_FEV1PercentPredicted"],
    bins=spirometry_bins,
    labels=spirometry_labels,
)

# Replace the raw percentage with its binned category
copd_status = copd_status.drop(columns=["LungFunction_FEV1PercentPredicted"])
copd_status["FEV1PercentPredicted"] = fev1_severity

# Assign patients without spirometry in service data to the Mild category.
# NOTE(review): comparing against the string "nan" assumes that
# model_h.bin_numeric_column renders missing values as that string - confirm
# against the helper.
missing_spirometry = copd_status["FEV1PercentPredicted"] == "nan"
copd_status.loc[missing_spirometry, "FEV1PercentPredicted"] = "Mild"
# Save data: make sure the output directory exists, then write the features
# to a pickle whose name reflects the dataset and the model type.
output_dir = config["outputs"]["processed_data_dir"]
os.makedirs(output_dir, exist_ok=True)

if data_to_process == "forward_val":
    output_name = "spirometry_forward_val_" + model_type + ".pkl"
else:
    output_name = "spirometry_" + model_type + ".pkl"

copd_status.to_pickle(os.path.join(output_dir, output_name))