# copd-model-h / training/process_labs.py
# Initial upload, commit 000de75 (IamGrooooot)
"""
Derive features from lab tests for 2 models:
Parallel model 1: uses both hospital and community exacerbation events
Parallel model 2: uses only hospital exacerbation events
"""
import numpy as np
import pandas as pd
import sys
import os
import model_h
import ggc.preprocessing.labs as labs_preprocessing
import yaml
def calc_lab_metric(lab_df, data, lab_name, metric, weigh_data_by_recency=False):
    """
    Calculate a summary metric for one lab test and merge it onto ``data``.

    Args:
        lab_df (pd.DataFrame): dataframe containing labs to be used in
            calculations. Must contain the columns "StudyId", "IndexDate",
            "TimeSinceLab" and ``lab_name``.
        data (pd.DataFrame): main dataframe to which the column containing the
            result from the lab calculation is merged (left join on
            ["StudyId", "IndexDate"]).
        lab_name (str): name of the lab column required for the metric
            calculation.
        metric (str): name of metric to be calculated. The possible metrics are:
            'MaxLifetime': maximum value of lab for patient within the entire
                dataset before their index date.
            'MinLifetime': minimum value of lab for patient within the entire
                dataset before their index date.
            'Max1Year': maximum value of lab within 1 year prior to index date.
            'Min1Year': minimum value of lab within 1 year prior to index date.
            'Latest': closest lab value prior to index date (within 1 year).
        weigh_data_by_recency (bool): option to weigh data based on how recent
            it is (only applied for the 'Latest' metric). Older observations
            are decreased or increased towards the median. Defaults to False.

    Returns:
        pd.DataFrame: the input ``data`` with an additional
        "<metric><lab_name>" column (plus "<metric>TimeSinceLab" for the
        'Latest' metric).

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    if metric not in {"MaxLifetime", "MinLifetime", "Max1Year", "Min1Year", "Latest"}:
        raise ValueError("Unknown metric: {!r}".format(metric))
    # Subset labs to the id/date columns plus the single requested lab
    labs_calc = lab_df[["StudyId", "IndexDate", "TimeSinceLab", lab_name]]
    # 1-year metrics (and 'Latest') only consider labs taken within 365 days
    # of the index date
    if metric in ("Max1Year", "Min1Year", "Latest"):
        labs_calc = labs_calc[labs_calc["TimeSinceLab"] <= 365]
    if metric == "Latest":
        # Order so the most recent observation comes first per patient/date
        labs_calc = labs_calc.sort_values(
            by=["StudyId", "IndexDate", "TimeSinceLab"], ascending=True
        )
        # Blank the lag of missing lab values so bfill pulls forward the lag
        # belonging to the first non-missing observation
        labs_calc["TimeSinceLab"] = np.where(
            labs_calc[lab_name].isna(), np.nan, labs_calc["TimeSinceLab"]
        )
        # NOTE(review): bfill is applied frame-wide, so a patient with no
        # valid labs can inherit the next patient's value - confirm intended.
        labs_calc = labs_calc.bfill()
        labs_calc = labs_calc.drop_duplicates(
            subset=["StudyId", "IndexDate"], keep="first"
        )
        if weigh_data_by_recency:
            # Shrink stale observations towards the cohort median
            median_val = labs_calc[lab_name].median()
            labs_calc = model_h.weigh_features_by_recency(
                df=labs_calc,
                feature=lab_name,
                feature_recency_days="TimeSinceLab",
                median_value=median_val,
                decay_rate=0.001,
            )
        labs_calc = labs_calc.set_index(["StudyId", "IndexDate"])
    else:
        # Extreme-value metrics: aggregate per patient and index date
        grouped = labs_calc.groupby(["StudyId", "IndexDate"])
        if metric in ("MaxLifetime", "Max1Year"):
            labs_calc = grouped.max()
        else:
            labs_calc = grouped.min()
        # Drop the (aggregated, meaningless) lag column for every extreme
        # metric; previously it was dropped only on the Min path, leaking
        # e.g. "MaxLifetimeTimeSinceLab" into the output.
        labs_calc = labs_calc.drop(columns=["TimeSinceLab"])
    # Prefix the new column(s) with the metric name and merge onto main df
    labs_calc = labs_calc.add_prefix(metric)
    labs_calc = labs_calc.reset_index()
    data = data.merge(labs_calc, on=["StudyId", "IndexDate"], how="left")
    return data
# Load run configuration (model type, paths, lookback window, etc.)
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]
# Redirect stdout to a per-model log file. NOTE: the handle is deliberately
# left open for the lifetime of the script so later prints keep working.
log = open("./training/logging/process_labs_" + model_type + ".log", "w")
sys.stdout = log
# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]
# Load cohort data (the forward-validation cohort lives in dedicated files)
if data_to_process == "forward_val":
    data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
    patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
else:
    data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl")
    patient_details = pd.read_pickle("./data/patient_details.pkl")
data = data[["StudyId", "IndexDate"]]
# One row per cohort (StudyId, IndexDate) with the patient identifier attached
patient_details = data.merge(
    patient_details[["StudyId", "PatientId"]],
    on="StudyId",
    how="left",
)
# Read mapping between StudyId and SafeHavenID
id_mapping = pd.read_pickle("./data/sh_to_studyid_mapping.pkl")
# Remove mapping for patient SU125 as the mapping for this patient is incorrect.
# Use np.nan (the np.NaN alias was removed in NumPy 2.0).
id_mapping["SafeHavenID"] = np.where(
    id_mapping["StudyId"] == "SU125", np.nan, id_mapping["SafeHavenID"]
)
# Restrict the mapping to patients present in the cohort
id_mapping = id_mapping.merge(
    data[["StudyId"]], on="StudyId", how="inner"
).drop_duplicates()
print(
    "Num patients with SafeHaven mapping: {} of {}".format(
        len(id_mapping), data.StudyId.nunique()
    )
)
# Add column with SafeHavenID to main df
patient_details = patient_details.merge(id_mapping, on="StudyId", how="left")
# Calculate the lookback start date. Will need this to aggreggate data for model
# features
patient_details["LookbackStartDate"] = patient_details["IndexDate"] - pd.DateOffset(
    days=config["model_settings"]["lookback_period"]
)
############################################################################
# Derive features from labs
############################################################################
# Columns required from the raw labs extract, in the format expected by the
# ggc labs-preprocessing package
raw_lab_cols = [
    "SafeHavenID",
    "ClinicalCodeDescription",
    "QuantityUnit",
    "RangeHighValue",
    "RangeLowValue",
    "QuantityValue",
    "SampleDate",
]
labs = pd.read_csv(config["inputs"]["raw_data_paths"]["labs"], usecols=raw_lab_cols)
# Keep only labs belonging to patients in the cohort
labs = labs[labs.SafeHavenID.isin(patient_details.SafeHavenID)]
# Clean and standardise the lab values using the shared units lookup table
units_lookup = pd.read_csv(config["inputs"]["raw_data_paths"]["labs_lookup_table"])
labs_processed = labs_preprocessing.clean_labs_data(
    df=labs,
    tests_of_interest=[
        "Eosinophils",
        "Albumin",
        "Neutrophils",
        "White Blood Count",
        "Lymphocytes",
    ],
    units_lookup=units_lookup,
    print_log=True,
)
# Attach cohort identifiers so each lab row carries its StudyId/IndexDate
labs_processed = patient_details[["StudyId", "IndexDate", "SafeHavenID"]].merge(
    labs_processed, on="SafeHavenID", how="left"
)
# Days between the lab sample and the patient's index date
labs_processed["SampleDate"] = pd.to_datetime(labs_processed["SampleDate"], utc=True)
labs_processed["TimeSinceLab"] = (
    labs_processed["IndexDate"] - labs_processed["SampleDate"]
).dt.days
# Only keep labs performed before IndexDate
labs_processed = labs_processed[labs_processed["TimeSinceLab"] >= 0]
# Wide format: one column per lab test
labs_processed = pd.pivot_table(
    labs_processed,
    values="QuantityValue",
    index=["StudyId", "IndexDate", "TimeSinceLab"],
    columns=["ClinicalCodeDescription"],
).reset_index()
# Derived feature: neutrophil/lymphocyte ratio
labs_processed["NeutLymphRatio"] = (
    labs_processed["Neutrophils"] / labs_processed["Lymphocytes"]
)
# Lowest albumin over the year before the index date
data = calc_lab_metric(labs_processed, data, lab_name="Albumin", metric="Min1Year")
# Most recent (recency-weighted) value of each lab of interest
for lab in [
    "NeutLymphRatio",
    "Albumin",
    "Eosinophils",
    "Neutrophils",
    "White Blood Count",
]:
    data = calc_lab_metric(
        labs_processed, data, lab, metric="Latest", weigh_data_by_recency=True
    )
# Save the derived features; forward-validation runs get a distinct filename
os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
file_stub = "labs_forward_val_" if data_to_process == "forward_val" else "labs_"
data.to_pickle(
    os.path.join(
        config["outputs"]["processed_data_dir"],
        file_stub + model_type + ".pkl",
    )
)