"""Script that combines features, performs encoding of categorical features and imputation.

The demographics, exacerbation history, comorbidities, spirometry, labs, and pro datasets
are combined. The combined dataset is split if the data_to_process specified in
config.yaml is not forward_val. Performs encoding of categorical features and imputation
of missing values. Two versions of the data are saved: imputed and not-imputed dataframes.
"""
| |
|
| | import pandas as pd |
| | import numpy as np |
| | import os |
| | import sys |
| | import yaml |
| | import json |
| | import joblib |
| | import encoding |
| | import imputation |
| |
|
| |
|
# Load the pipeline configuration. The file handle gets its own name so that it
# is not rebound to the parsed settings inside the ``with`` block.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

model_settings = config["model_settings"]

# The model type is embedded in the log filename and in the data filenames used
# by the rest of the script.
model_type = model_settings["model_type"]

# Redirect stdout to a per-model log file (truncated on every run by mode "w"),
# so subsequent print() output is written to that file.
# NOTE(review): the handle is not closed anywhere in this section — confirm it is
# closed (and stdout restored) later in the file.
log_path = "./training/logging/combine_features_" + model_type + ".log"
log = open(log_path, "w")
sys.stdout = log

# Dataset variant to process; the script branches on whether this equals
# "forward_val".
data_to_process = model_settings["data_to_process"]
| |
|
| | |
| | |
| | |
| |
|
| | |
# Pick the filename templates for the six feature tables. The forward-validation
# files carry an extra "forward_val" tag; otherwise the two sets are parallel.
# The order of each tuple matches the unpacking order below.
if data_to_process == "forward_val":
    feature_file_templates = (
        "demographics_forward_val_{}.pkl",
        "exac_history_forward_val_{}.pkl",
        "comorbidities_forward_val_{}.pkl",
        "spirometry_forward_val_{}.pkl",
        "labs_forward_val_{}.pkl",
        "pros_forward_val_{}.pkl",
    )
else:
    feature_file_templates = (
        "demographics_{}.pkl",
        "exac_history_{}.pkl",
        "comorbidities_{}.pkl",
        "spirometry_{}.pkl",
        "labs_{}.pkl",
        "pros_{}.pkl",
    )

# All feature tables are read from the processed-data directory named in the
# config; each template is filled in with the model type.
feature_input_dir = config["outputs"]["processed_data_dir"]
demographics, exac_history, comorbidities, spirometry, labs, pros = (
    pd.read_pickle(os.path.join(feature_input_dir, template.format(model_type)))
    for template in feature_file_templates
)
| |
|
# Build one wide table: start from demographics and left-join every other feature
# table on the (StudyId, IndexDate) pair, so each demographics row is kept even
# when a feature table has no matching row.
merge_keys = ["StudyId", "IndexDate"]
data_combined = demographics
for feature_table in (exac_history, comorbidities, spirometry, labs, pros):
    data_combined = data_combined.merge(feature_table, on=merge_keys, how="left")
| |
|
| | |
# Print a short summary of the combined data: IndexDate range, mean of the Age
# column, and the value counts of the Sex_F column.
index_dates = data_combined["IndexDate"]
print("Data date range", index_dates.min(), index_dates.max())
print("Mean age", data_combined["Age"].mean())
print("Sex Female:", data_combined["Sex_F"].value_counts())
| |
|
if data_to_process == "forward_val":
    # Forward-validation data is not split: save the combined table as-is.
    forward_val_path = os.path.join(
        config["outputs"]["processed_data_dir"],
        "forward_val_combined_{}.pkl".format(model_type),
    )
    data_combined.to_pickle(forward_val_path)
else:
    # Load the previously saved train/test patient ids and the fold assignments.
    cohort_dir = config["outputs"]["cohort_info_dir"]
    train_ids = pd.read_pickle(
        os.path.join(cohort_dir, "train_ids_{}.pkl".format(model_type))
    )
    test_ids = pd.read_pickle(
        os.path.join(cohort_dir, "test_ids_{}.pkl".format(model_type))
    )
    # NOTE(review): fold_patients is loaded but not referenced in this section —
    # presumably consumed later in the file; verify.
    fold_patients = np.load(
        os.path.join(cohort_dir, "fold_patients_{}.npy".format(model_type)),
        allow_pickle=True,
    )

    # Assign rows to train/test by StudyId membership, then order each subset by
    # patient and index date with a fresh 0..n-1 index.
    sort_columns = ["StudyId", "IndexDate"]
    train_data = (
        data_combined[data_combined["StudyId"].isin(train_ids)]
        .sort_values(by=sort_columns)
        .reset_index(drop=True)
    )
    test_data = (
        data_combined[data_combined["StudyId"].isin(test_ids)]
        .sort_values(by=sort_columns)
        .reset_index(drop=True)
    )

    # Save both subsets to the processed-data directory.
    split_output_dir = config["outputs"]["processed_data_dir"]
    train_data.to_pickle(
        os.path.join(split_output_dir, "train_combined_{}.pkl".format(model_type))
    )
    test_data.to_pickle(
        os.path.join(split_output_dir, "test_combined_{}.pkl".format(model_type))
    )