stormid
/

copd-model-h

Tabular Classification

patient-reported-outcomes

Model card Files Files and versions

copd-model-h / training /process_spirometry.py

IamGrooooot's picture

Initial Upload

000de75 3 days ago

history blame contribute delete

3.72 kB

	"""
	Derive features from spirometry for 2 models:
	Parallel model 1: uses both hospital and community exacerbation events
	Parallel model 2: uses only hospital exacerbation events
	"""

	import atexit
	import os
	import sys

	import numpy as np
	import pandas as pd
	import yaml

	import model_h

	# Load the training configuration. The file handle has its own name so it
	# is not shadowed by the parsed content while the `with` block is active.
	with open("./training/config.yaml", "r") as config_file:
	    config = yaml.safe_load(config_file)

	# Specify which model to generate features for
	model_type = config["model_settings"]["model_type"]

	# Setup log file: from here on everything printed by the script is written
	# to a per-model log file instead of the console.
	log_dir = "./training/logging"
	# open(..., "w") does not create missing parent directories, so make sure
	# the log directory exists before opening the file.
	os.makedirs(log_dir, exist_ok=True)
	log = open(log_dir + "/process_spirometry_" + model_type + ".log", "w")
	sys.stdout = log


	def _close_log():
	    """Restore the original stdout and close the log file."""
	    sys.stdout = sys.__stdout__
	    log.close()


	# The redirect has to stay active for the rest of the script, so the handle
	# is released explicitly at interpreter exit instead of being left open.
	atexit.register(_close_log)

	# Dataset to process - set through config file
	data_to_process = config["model_settings"]["data_to_process"]

	# Load cohort data: pick the label and patient-detail pickles that belong to
	# the requested dataset. Forward validation always uses the hosp_comm labels;
	# otherwise the labels of the configured model type are used.
	if data_to_process == "forward_val":
	    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
	    details_path = "./data/patient_details_forward_val.pkl"
	else:
	    labels_path = "./data/patient_labels_" + model_type + ".pkl"
	    details_path = "./data/patient_details.pkl"
	data = pd.read_pickle(labels_path)
	patient_details = pd.read_pickle(details_path)

	# Reduce the labels to one (StudyId, IndexDate) pair per row and attach the
	# PatientId, which is the key used to join the raw COPD status data.
	data = data[["StudyId", "IndexDate"]]
	study_to_patient = patient_details[["StudyId", "PatientId"]]
	patient_details = data.merge(study_to_patient, on="StudyId", how="left")


	# Read the pipe-delimited COPD status extract.
	# The separator is written as a raw string: "\|" in a normal string literal
	# is an invalid escape sequence, which Python reports with a warning at
	# compile time. Because the separator is longer than one character pandas
	# treats it as a regular expression, which only the python parser supports,
	# so the engine is named explicitly instead of relying on the fallback.
	copd_status = pd.read_csv(
	    config["inputs"]["raw_data_paths"]["copd_status"],
	    delimiter=r"\|",
	    engine="python",
	)

	# Attach the COPD status records to every (StudyId, IndexDate, PatientId)
	# row; rows without a matching PatientId are kept with missing values.
	copd_status = patient_details.merge(copd_status, on="PatientId", how="left")

	# Parse the lung function date as a timezone-aware (UTC) timestamp.
	# NOTE(review): the subtraction below presumably relies on IndexDate being
	# timezone-aware as well - confirm against the label pickles.
	copd_status["LungFunction_Date"] = pd.to_datetime(
	    copd_status["LungFunction_Date"], utc=True
	)

	# Whole days between the lung function measurement and the index date
	# (positive when the measurement was taken before the index date).
	copd_status["TimeSinceLungFunc"] = (
	    copd_status["IndexDate"] - copd_status["LungFunction_Date"]
	).dt.days

	# Rows with a missing interval compare as False and are not counted; rows
	# with a negative interval are counted, exactly as before.
	n_recent_lung_func = int((copd_status["TimeSinceLungFunc"] < 365).sum())

	# The message is built from adjacent string literals: a backslash
	# continuation inside a single literal would embed the indentation of the
	# continued source line in the logged text.
	print(
	    "COPD Status Details: Number of patients with a lung function date < 1 year "
	    "from index date: {} of {}".format(n_recent_lung_func, len(patient_details))
	)
	# Keep only the identifiers, the two Required* flags, the spirometry
	# measurements and the derived interval since the last lung function test.
	columns_to_keep = [
	    "StudyId",
	    "IndexDate",
	    "RequiredAcuteNIV",
	    "RequiredICUAdmission",
	    "LungFunction_FEV1PercentPredicted",
	    "LungFunction_FEV1Litres",
	    "LungFunction_FEV1FVCRatio",
	    "TimeSinceLungFunc",
	]
	copd_status = copd_status[columns_to_keep]

	# Map bool values to 1/0; any value that is not a key of the mapping
	# (for example a missing entry) becomes NaN.
	bool_mapping = {True: 1, False: 0}
	for flag_column in ("RequiredAcuteNIV", "RequiredICUAdmission"):
	    copd_status[flag_column] = copd_status[flag_column].map(bool_mapping)

	# Convert columns in COPD Status to numeric.
	# The percent-predicted values carry a "%" sign, which is stripped first so
	# that the strings can be parsed as numbers.
	percent_predicted_text = copd_status["LungFunction_FEV1PercentPredicted"]
	copd_status["LungFunction_FEV1PercentPredicted"] = percent_predicted_text.str.replace(
	    "%", ""
	)

	# Every column except the identifiers and the two mapped flags is parsed.
	non_numeric_columns = [
	    "StudyId",
	    "IndexDate",
	    "RequiredAcuteNIV",
	    "RequiredICUAdmission",
	]
	for column_name in list(copd_status.columns.drop(non_numeric_columns)):
	    copd_status[column_name] = pd.to_numeric(copd_status[column_name])

	# Bin patient spirometry at onboarding: the FEV1 percent-predicted value is
	# replaced by a severity label using the thresholds 30, 50 and 80.
	# NOTE(review): which bin edge is inclusive depends on
	# model_h.bin_numeric_column, which is not visible here - confirm.
	spirometry_bins = [0, 30, 50, 80, np.inf]
	spirometry_labels = ["Very severe", "Severe", "Moderate", "Mild"]
	fev1_percent_predicted = copd_status["LungFunction_FEV1PercentPredicted"]
	copd_status["FEV1PercentPredicted"] = model_h.bin_numeric_column(
	    col=fev1_percent_predicted,
	    bins=spirometry_bins,
	    labels=spirometry_labels,
	)
	# The raw numeric column is no longer needed once the label exists.
	copd_status = copd_status.drop(columns=["LungFunction_FEV1PercentPredicted"])

	# Assign patients without spirometry in service data to the Mild category.
	# Missing values are matched through the string "nan"; presumably the
	# binning helper returns its labels as strings - verify in model_h.
	missing_spirometry = copd_status["FEV1PercentPredicted"] == "nan"
	copd_status.loc[missing_spirometry, "FEV1PercentPredicted"] = "Mild"

	# Save data: the processed features are pickled into the configured output
	# directory (created on demand). Forward-validation runs get their own file
	# name so they do not overwrite the training features of the same model.
	processed_data_dir = config["outputs"]["processed_data_dir"]
	os.makedirs(processed_data_dir, exist_ok=True)

	if data_to_process == "forward_val":
	    output_name = "spirometry_forward_val_" + model_type + ".pkl"
	else:
	    output_name = "spirometry_" + model_type + ".pkl"

	copd_status.to_pickle(os.path.join(processed_data_dir, output_name))