stormid
/

copd-model-h

Tabular Classification

patient-reported-outcomes

Model card Files Files and versions

copd-model-h / training /process_demographics.py

IamGrooooot's picture

Initial Upload

000de75 2 days ago

history blame contribute delete

2.23 kB

	"""
	Derive features from demographics for 2 models:
	Parallel model 1: uses both hospital and community exacerbation events
	Parallel model 2: uses only hospital exacerbation events
	"""

	import atexit
	import os
	import sys

	import numpy as np
	import pandas as pd
	import yaml

	import model_h

	# Load the pipeline settings. The file handle gets its own name so it is
	# not confused with the parsed mapping bound to `config`.
	with open("./training/config.yaml", "r") as config_file:
	    config = yaml.safe_load(config_file)

	# Specify which model to generate features for
	model_type = config["model_settings"]["model_type"]

	# Setup log file: everything printed from here on goes to the log.
	# Create the log directory first so a fresh checkout does not fail on open().
	os.makedirs("./training/logging", exist_ok=True)
	log = open("./training/logging/process_demographics_" + model_type + ".log", "w")
	sys.stdout = log


	def _restore_stdout():
	    """Put the original stdout back and close the log file at exit."""
	    # Restore first so anything printed by later exit handlers does not
	    # hit a closed file.
	    sys.stdout = sys.__stdout__
	    log.close()


	# Guarantees the log is flushed and closed even if the script fails part-way.
	atexit.register(_restore_stdout)

	# Dataset to process - set through config file
	data_to_process = config["model_settings"]["data_to_process"]

	# Load cohort data: pick the label and patient-detail pickles that belong
	# to the dataset being processed, then read both.
	if data_to_process == "forward_val":
	    labels_path = "./data/patient_labels_forward_val_hosp_comm.pkl"
	    details_path = "./data/patient_details_forward_val.pkl"
	else:
	    labels_path = "./data/patient_labels_" + model_type + ".pkl"
	    details_path = "./data/patient_details.pkl"
	data = pd.read_pickle(labels_path)
	patient_details = pd.read_pickle(details_path)

	# Left-join the labels onto the patient details by StudyId.
	# NOTE(review): only the key column is selected from patient_details, so
	# this join brings in no additional columns - confirm whether further
	# demographic columns were meant to be included here.
	study_ids = patient_details[["StudyId"]]
	data = data.merge(study_ids, how="left", on="StudyId")

	# Calculate age in whole years at the index date. A year is approximated
	# as 365 days, so leap days are not accounted for.
	data["DateOfBirth"] = pd.to_datetime(data["DateOfBirth"], utc=True)
	days_since_birth = (data["IndexDate"] - data["DateOfBirth"]).dt.days
	data["Age"] = np.floor(days_since_birth / 365)
	# The raw date of birth is not kept in the output
	data = data.drop(columns=["DateOfBirth"])

	# Bin patient age into decade-style bands.
	# NOTE(review): how the bin edges are treated (inclusive/exclusive) is
	# decided inside model_h.bin_numeric_column - confirm it matches the labels.
	age_labels = ["<50", "50-59", "60-69", "70-79", "80+"]
	age_bins = [0, 50, 60, 70, 80, np.inf]
	data["AgeBinned"] = model_h.bin_numeric_column(
	    col=data["Age"],
	    bins=age_bins,
	    labels=age_labels,
	)

	# Smoking status: TODO

	# Encode the M/F sex column as a binary indicator (F -> 1, M -> 0) and
	# replace the original column with it.
	sex_mapping = {"F": 1, "M": 0}
	data["Sex_F"] = data["Sex"].map(sex_mapping)
	data = data.drop(columns="Sex")

	# Save data: write the processed demographics to the configured output
	# directory, creating it if needed. Forward-validation runs get their own
	# file name so they do not overwrite the training output.
	processed_dir = config["outputs"]["processed_data_dir"]
	os.makedirs(processed_dir, exist_ok=True)
	if data_to_process == "forward_val":
	    output_name = "demographics_forward_val_" + model_type + ".pkl"
	else:
	    output_name = "demographics_" + model_type + ".pkl"
	data.to_pickle(os.path.join(processed_dir, output_name))