File size: 3,724 Bytes

000de75

"""
Derive features from spirometry for 2 models:
    Parallel model 1: uses both hospital and community exacerbation events
    Parallel model 2: uses only hospital exacerbation events
"""

import atexit
import os
import sys

import numpy as np
import pandas as pd
import yaml

import model_h

# Load the pipeline configuration. The file handle gets its own name so it is
# not confused with the parsed configuration dictionary.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]

# Setup log file: from here on, everything written with print() goes to the log
_original_stdout = sys.stdout
log = open("./training/logging/process_spirometry_" + model_type + ".log", "w")
sys.stdout = log


def _close_log():
    """Restore the original stdout and close the log file."""
    sys.stdout = _original_stdout
    log.close()


# The redirect has to stay active for the rest of the script, so the clean-up
# is deferred to interpreter exit instead of leaving the handle open forever.
atexit.register(_close_log)

# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]

# Load cohort data: pick the label and patient-detail pickles for the dataset
# being processed, then read each of them once.
if data_to_process == "forward_val":
    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
    details_path = "./data/patient_details_forward_val.pkl"
else:
    labels_path = "./data/patient_labels_" + model_type + ".pkl"
    details_path = "./data/patient_details.pkl"
data = pd.read_pickle(labels_path)
patient_details = pd.read_pickle(details_path)

# Keep only the identifying columns of the labels and attach each row's
# PatientId; the left join keeps every label row.
data = data[["StudyId", "IndexDate"]]
study_to_patient = patient_details[["StudyId", "PatientId"]]
patient_details = data.merge(study_to_patient, on="StudyId", how="left")


# Read the pipe-delimited COPD status extract named in the config file
copd_status_path = config["inputs"]["raw_data_paths"]["copd_status"]
copd_status = pd.read_csv(copd_status_path, sep="|")

# Attach COPD status to every cohort row (left join keeps all cohort rows)
copd_status = patient_details.merge(copd_status, on="PatientId", how="left")

# Days between the index date and the recorded lung function date.
# NOTE(review): the subtraction needs IndexDate to be timezone-aware, like the
# UTC-parsed LungFunction_Date - confirm against the label files.
copd_status["LungFunction_Date"] = pd.to_datetime(
    copd_status["LungFunction_Date"], utc=True
)
time_since_lung_func = copd_status["IndexDate"] - copd_status["LungFunction_Date"]
copd_status["TimeSinceLungFunc"] = time_since_lung_func.dt.days

# Log how many rows have a lung function date less than 365 days before index
recent_lung_func_count = int((copd_status["TimeSinceLungFunc"] < 365).sum())
lung_func_summary = (
    "COPD Status Details: Number of patients with a lung function date < 1 year "
    "from index date: {} of {}"
)
print(lung_func_summary.format(recent_lung_func_count, len(patient_details)))

# Keep only the identifiers and the columns used as features
copd_status_columns = [
    "StudyId",
    "IndexDate",
    "RequiredAcuteNIV",
    "RequiredICUAdmission",
    "LungFunction_FEV1PercentPredicted",
    "LungFunction_FEV1Litres",
    "LungFunction_FEV1FVCRatio",
    "TimeSinceLungFunc",
]
copd_status = copd_status[copd_status_columns]

# Map bool values to 1/0; values that are neither True nor False (for example
# missing entries) come out of the mapping as NaN.
bool_mapping = {True: 1, False: 0}
for bool_col in ["RequiredAcuteNIV", "RequiredICUAdmission"]:
    copd_status[bool_col] = copd_status[bool_col].map(bool_mapping)

# Convert columns in COPD Status to numeric.
# The "%" suffix is only stripped when the column holds text: if it was already
# parsed as numeric (no "%" in the data, or every value missing) the .str
# accessor would raise AttributeError. regex=False makes the literal match
# explicit rather than relying on the version-dependent default.
fev1_percent = copd_status["LungFunction_FEV1PercentPredicted"]
if not pd.api.types.is_numeric_dtype(fev1_percent):
    copd_status["LungFunction_FEV1PercentPredicted"] = fev1_percent.str.replace(
        "%", "", regex=False
    )

# Every column apart from the identifiers and the mapped flags becomes numeric.
# Index.drop lists the remaining names without copying the whole frame.
non_numeric_cols = ["StudyId", "IndexDate", "RequiredAcuteNIV", "RequiredICUAdmission"]
for col in copd_status.columns.drop(non_numeric_cols):
    copd_status[col] = pd.to_numeric(copd_status[col])

# Bin patient spirometry at onboarding into four severity bands using the
# FEV1 percent-predicted cut points 30, 50 and 80.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ["Very severe", "Severe", "Moderate", "Mild"]
fev1_percent_predicted = copd_status["LungFunction_FEV1PercentPredicted"]
copd_status["FEV1PercentPredicted"] = model_h.bin_numeric_column(
    col=fev1_percent_predicted,
    bins=spirometry_bins,
    labels=spirometry_labels,
)

# The raw percentage is replaced by its binned version
copd_status = copd_status.drop(columns=["LungFunction_FEV1PercentPredicted"])

# Assign patients without spirometry in service data to the Mild category.
# NOTE(review): this matches the literal string "nan", so it presumably relies
# on model_h.bin_numeric_column returning string labels - confirm.
missing_spirometry = copd_status["FEV1PercentPredicted"] == "nan"
copd_status.loc[missing_spirometry, "FEV1PercentPredicted"] = "Mild"

# Save data: write the processed spirometry features to the configured output
# directory, creating it if needed. Forward-validation runs get their own
# file name so they do not overwrite the training features.
processed_data_dir = config["outputs"]["processed_data_dir"]
os.makedirs(processed_data_dir, exist_ok=True)

if data_to_process == "forward_val":
    output_filename = "spirometry_forward_val_" + model_type + ".pkl"
else:
    output_filename = "spirometry_" + model_type + ".pkl"

copd_status.to_pickle(os.path.join(processed_data_dir, output_filename))