# copd-model-h / training/combine_features.py
# (Hugging Face page header: uploaded by IamGrooooot, "Initial Upload", commit 000de75)
"""Script that combines features, performs encoding of categorical features, and imputation.

Combines the demographics, exacerbation history, comorbidities, spirometry, labs, and
PRO datasets. Splits the dataset into train/test if the data_to_process specified in
config.yaml is not "forward_val". Performs encoding of categorical features and
imputation of missing values. Two versions of the data are saved: imputed and
non-imputed dataframes.
"""
import pandas as pd
import numpy as np
import os
import sys
import yaml
import json
import joblib
import encoding
import imputation
# Load the pipeline configuration (avoid shadowing the file handle with the dict).
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
# Model variant to build features for (set through config.yaml).
model_type = config["model_settings"]["model_type"]
# Redirect all subsequent prints into a per-model log file.
log = open("./training/logging/combine_features_{}.log".format(model_type), "w")
sys.stdout = log
# Which dataset split to process ("forward_val" or the train/test data).
data_to_process = config["model_settings"]["data_to_process"]
############################################################################
# Combine features
############################################################################
# Each processed feature set is stored in the processed-data directory as
# "<name>_<model_type>.pkl", or "<name>_forward_val_<model_type>.pkl" for the
# forward-validation split. The original code duplicated six read_pickle calls
# per branch; a single loader keyed on the split removes that duplication.
_FEATURE_SETS = [
    "demographics",
    "exac_history",
    "comorbidities",
    "spirometry",
    "labs",
    "pros",
]


def _load_feature_set(name):
    """Load one processed feature dataframe for the configured split and model.

    Parameters
    ----------
    name : str
        Base name of the feature set (e.g. "demographics").

    Returns
    -------
    pandas.DataFrame
        The unpickled feature dataframe.
    """
    if data_to_process == "forward_val":
        filename = "{}_forward_val_{}.pkl".format(name, model_type)
    else:
        filename = "{}_{}.pkl".format(name, model_type)
    return pd.read_pickle(
        os.path.join(config["outputs"]["processed_data_dir"], filename)
    )


# Load cohort data (same module-level names as before, for downstream code).
demographics, exac_history, comorbidities, spirometry, labs, pros = (
    _load_feature_set(name) for name in _FEATURE_SETS
)
# Left-join every feature set onto demographics on the patient/date key, so
# each row keeps its demographics record even when other features are missing.
data_combined = demographics
for feature_frame in (exac_history, comorbidities, spirometry, labs, pros):
    data_combined = data_combined.merge(
        feature_frame, on=["StudyId", "IndexDate"], how="left"
    )
# Log basic dataset characteristics (goes to the redirected log file).
index_dates = data_combined["IndexDate"]
print("Data date range", index_dates.min(), index_dates.max())
print("Mean age", data_combined["Age"].mean())
print("Sex Female:", data_combined["Sex_F"].value_counts())
if data_to_process != "forward_val":
    # Load the patient-id partitions produced by the cohort-splitting step.
    cohort_dir = config["outputs"]["cohort_info_dir"]
    train_ids = pd.read_pickle(
        os.path.join(cohort_dir, "train_ids_{}.pkl".format(model_type))
    )
    test_ids = pd.read_pickle(
        os.path.join(cohort_dir, "test_ids_{}.pkl".format(model_type))
    )
    # NOTE(review): fold_patients is loaded but not referenced below —
    # presumably used by later steps; confirm before removing.
    fold_patients = np.load(
        os.path.join(cohort_dir, "fold_patients_{}.npy".format(model_type)),
        allow_pickle=True,
    )
    # Partition the combined data by patient id, then sort each split into a
    # stable (StudyId, IndexDate) order with a fresh index.
    train_data = (
        data_combined[data_combined["StudyId"].isin(train_ids)]
        .sort_values(by=["StudyId", "IndexDate"])
        .reset_index(drop=True)
    )
    test_data = (
        data_combined[data_combined["StudyId"].isin(test_ids)]
        .sort_values(by=["StudyId", "IndexDate"])
        .reset_index(drop=True)
    )
    # Persist both splits for the downstream encoding/imputation steps.
    train_data.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "train_combined_{}.pkl".format(model_type),
        )
    )
    test_data.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "test_combined_{}.pkl".format(model_type),
        )
    )
else:
    # Forward-validation data is saved unsplit.
    data_combined.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "forward_val_combined_{}.pkl".format(model_type),
        )
    )