"""Script that performs encoding of categorical features and imputation. Performs encoding of categorical features, and imputation of missing values. After encoding and imputation are performed, features are dropped. Two versions of the data is saved: imputed and not imputed dataframes. """ import pandas as pd import numpy as np import os import sys import yaml import json import joblib import encoding import imputation with open("./training/config.yaml", "r") as config: config = yaml.safe_load(config) # Specify which model to generate features for model_type = config["model_settings"]["model_type"] # Setup log file log = open("./training/logging/encode_and_impute_" + model_type + ".log", "w") sys.stdout = log # Dataset to process - set through config file data_to_process = config["model_settings"]["data_to_process"] # Load data data = pd.read_pickle( os.path.join( config["outputs"]["processed_data_dir"], "{}_combined_{}.pkl".format(data_to_process, model_type), ) ) ############################################################################ # Target encode categorical data ############################################################################ categorical_cols = [ "LatestSymptomDiaryQ8", "LatestSymptomDiaryQ9", "LatestSymptomDiaryQ10", "DaysSinceLastExac", "AgeBinned", "Comorbidities", "FEV1PercentPredicted", ] # Multiple types of nans present in data ('nan' and np.NaN). Convert all these to 'nan' for # categorical columns for categorical_col in categorical_cols: data[categorical_col] = data[categorical_col].replace(np.nan, "nan") if data_to_process == "train": # Get target encodings for entire train set target_encodings = encoding.get_target_encodings( train_data=data, cols_to_encode=categorical_cols, target_col="ExacWithin3Months", smooth="auto", ) train_encoded = encoding.apply_target_encodings( data=data, cols_to_encode=categorical_cols, encodings=target_encodings, drop_categorical_cols=False, ) json.dump( target_encodings, open("./data/artifacts/target_encodings_" + model_type + ".json", "w"), ) # K-fold target encode # Get info on which patients belong to which fold fold_patients = np.load( os.path.join( config["outputs"]["cohort_info_dir"], "fold_patients_{}.npy".format(model_type), ), allow_pickle=True, ) train_encoded_cv, target_encodings = encoding.kfold_target_encode( df=data, fold_ids=fold_patients, cols_to_encode=categorical_cols, id_col="StudyId", target="ExacWithin3Months", smooth="auto", drop_categorical_cols=False, ) # Drop categorical cols except for AgeBinned as it is needed in imputation step categorical_cols.remove("AgeBinned") train_encoded = train_encoded.drop(columns=categorical_cols) train_encoded_cv = train_encoded_cv.drop(columns=categorical_cols) if (data_to_process == "test") | (data_to_process == "forward_val"): # Encode test set/forward val set based on entire train set target_encodings = json.load( open("./data/artifacts/target_encodings_" + model_type + ".json") ) test_encoded = encoding.apply_target_encodings( data=data, cols_to_encode=categorical_cols, encodings=target_encodings, drop_categorical_cols=False, ) # Drop categorical cols except for AgeBinned as it is needed in imputation step categorical_cols.remove("AgeBinned") test_encoded = test_encoded.drop(columns=categorical_cols) ############################################################################ # Impute missing data ############################################################################ cols_to_ignore = [ "StudyId", "PatientId", "IndexDate", "ExacWithin3Months", "HospExacWithin3Months", "CommExacWithin3Months", "Age", "Sex_F", "SafeHavenID", "AgeBinned", ] if data_to_process == "train": # Impute entire train set not_imputed_train = train_encoded.copy() cols_to_impute = train_encoded.drop(columns=cols_to_ignore).columns imputer = imputation.get_imputer( train_data=train_encoded, cols_to_impute=cols_to_impute, average_type="median", cols_to_groupby=["AgeBinned", "Sex_F"], ) imputed_train = imputation.apply_imputer( data=train_encoded, cols_to_impute=cols_to_impute, imputer=imputer, cols_to_groupby=["AgeBinned", "Sex_F"], ) joblib.dump(imputer, "./data/artifacts/imputer_" + model_type + ".pkl") # K-fold impute not_imputed_train_cv = train_encoded_cv.copy() imputed_train_cv = imputation.kfold_impute( df=train_encoded, fold_ids=fold_patients, cols_to_impute=cols_to_impute, average_type="median", cols_to_groupby=["AgeBinned", "Sex_F"], id_col="StudyId", ) df_columns = imputed_train.columns.tolist() if (data_to_process == "test") | (data_to_process == "forward_val"): not_imputed_test = test_encoded.copy() cols_to_impute = test_encoded.drop(columns=cols_to_ignore).columns # Impute test set/forward val set based on entire train set imputer = joblib.load("./data/artifacts/imputer_" + model_type + ".pkl") imputed_test = imputation.apply_imputer( data=test_encoded, cols_to_impute=cols_to_impute, imputer=imputer, cols_to_groupby=["AgeBinned", "Sex_F"], ) df_columns = imputed_test.columns.tolist() ############################################################################ # Reduce feature space ############################################################################ cols_to_drop_startswith = ( "DiffLatest", "Var", "LatestEQ5D", "TotalEngagement", "Age", "NumHosp", "Required", "LungFunction", "EngagementCAT", "LatestSymptomDiary", "LatestAlbumin", "LatestEosinophils", "LatestNeutrophils", "LatestWhite Blood Count", ) additional_cols_to_drop = [ "PatientId", "SafeHavenID", "Sex_F", "NumCommExacPrior6mo", "AsthmaOverlap", "TimeSinceLungFunc", "LatestNeutLymphRatio", "EngagementEQ5DTW1", "EngagementMRCTW1", "LatestMRCQ1", "WeekAvgCATQ1", "WeekAvgCATQ3", "WeekAvgCATQ4", "WeekAvgCATQ5", "WeekAvgCATQ6", "WeekAvgCATQ7", "WeekAvgCATQ8", "WeekAvgSymptomDiaryQ1", "WeekAvgSymptomDiaryQ3", "WeekAvgSymptomDiaryScore", "EngagementSymptomDiaryTW1", "ScaledSumSymptomDiaryQ3TW1", # "Comorbidities_te", ] cols_to_drop = [] cols_to_drop.extend( [item for item in df_columns if item.startswith(cols_to_drop_startswith)] ) cols_to_drop.extend(additional_cols_to_drop) if data_to_process == "train": imputed_train = imputed_train.drop(columns=cols_to_drop) not_imputed_train = not_imputed_train.drop(columns=cols_to_drop) imputed_train_cv = imputed_train_cv.drop(columns=cols_to_drop) not_imputed_train_cv = not_imputed_train_cv.drop(columns=cols_to_drop) if (data_to_process == "test") | (data_to_process == "forward_val"): imputed_test = imputed_test.drop(columns=cols_to_drop) not_imputed_test = not_imputed_test.drop(columns=cols_to_drop) ############################################################################ # Save data ############################################################################ os.makedirs(config["outputs"]["model_input_data_dir"], exist_ok=True) if data_to_process == "train": imputed_train.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_imputed_{}.pkl".format(data_to_process, model_type), ) ) not_imputed_train.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_not_imputed_{}.pkl".format(data_to_process, model_type), ) ) imputed_train_cv.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_imputed_cv_{}.pkl".format(data_to_process, model_type), ) ) not_imputed_train_cv.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_not_imputed_cv_{}.pkl".format(data_to_process, model_type), ) ) if (data_to_process == "test") | (data_to_process == "forward_val"): imputed_test.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_imputed_{}.pkl".format(data_to_process, model_type), ) ) not_imputed_test.to_pickle( os.path.join( config["outputs"]["model_input_data_dir"], "{}_not_imputed_{}.pkl".format(data_to_process, model_type), ) )