"""Combine feature sets, encode categorical features and impute missing values.

The demographics, exacerbation history, comorbidities, spirometry, labs and pros
datasets are merged into a single table keyed on StudyId and IndexDate. The merged
table is split into training and test sets unless the data_to_process value in
config.yaml is forward_val. Categorical features are encoded and missing values are
imputed. Two versions of the data are saved: imputed and not imputed dataframes.
"""
import pandas as pd
import numpy as np
import os
import sys
import yaml
import json
import joblib
import encoding
import imputation

# Read the pipeline settings that drive everything below
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]

# Setup log file - everything printed from here on is written to it
log = open("./training/logging/combine_features_" + model_type + ".log", "w")
sys.stdout = log

# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]

############################################################################
# Combine features
############################################################################


def _read_processed(feature_set):
    """Load the pickled table of one feature set for the configured dataset.

    Forward validation files carry a "forward_val" infix between the feature
    set name and the model type; all other datasets use the plain file name.
    """
    if data_to_process == "forward_val":
        file_name = "{}_forward_val_{}.pkl".format(feature_set, model_type)
    else:
        file_name = "{}_{}.pkl".format(feature_set, model_type)
    return pd.read_pickle(
        os.path.join(config["outputs"]["processed_data_dir"], file_name)
    )


# Load cohort data
demographics = _read_processed("demographics")
exac_history = _read_processed("exac_history")
comorbidities = _read_processed("comorbidities")
spirometry = _read_processed("spirometry")
labs = _read_processed("labs")
pros = _read_processed("pros")

# Left-join every feature table onto demographics, one after the other, so the
# rows of the demographics table define the rows of the combined dataset
data_combined = demographics
for feature_table in (exac_history, comorbidities, spirometry, labs, pros):
    data_combined = data_combined.merge(
        feature_table, on=["StudyId", "IndexDate"], how="left"
    )

# Print dataset info
index_dates = data_combined["IndexDate"]
print("Data date range", index_dates.min(), index_dates.max())
print("Mean age", data_combined["Age"].mean())
print("Sex Female:", data_combined["Sex_F"].value_counts())

if data_to_process == "forward_val":
    # Forward validation data is not split - save the combined table as is
    data_combined.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "forward_val_combined_{}.pkl".format(model_type),
        )
    )
else:
    # Load training and test ids
    cohort_info_dir = config["outputs"]["cohort_info_dir"]
    train_ids = pd.read_pickle(
        os.path.join(cohort_info_dir, "train_ids_{}.pkl".format(model_type))
    )
    test_ids = pd.read_pickle(
        os.path.join(cohort_info_dir, "test_ids_{}.pkl".format(model_type))
    )
    fold_patients = np.load(
        os.path.join(cohort_info_dir, "fold_patients_{}.npy".format(model_type)),
        allow_pickle=True,
    )

    # Split data into training and test sets by patient id, ordering each set by
    # patient and index date and renumbering its rows from zero
    sort_keys = ["StudyId", "IndexDate"]
    in_train = data_combined["StudyId"].isin(train_ids)
    in_test = data_combined["StudyId"].isin(test_ids)
    train_data = (
        data_combined[in_train].sort_values(by=sort_keys).reset_index(drop=True)
    )
    test_data = (
        data_combined[in_test].sort_values(by=sort_keys).reset_index(drop=True)
    )

    # Save data
    processed_data_dir = config["outputs"]["processed_data_dir"]
    train_data.to_pickle(
        os.path.join(processed_data_dir, "train_combined_{}.pkl".format(model_type))
    )
    test_data.to_pickle(
        os.path.join(processed_data_dir, "test_combined_{}.pkl".format(model_type))
    )