| | """Script that performs encoding of categorical features and imputation. |
| | |
| | Performs encoding of categorical features, and imputation of missing values. After encoding |
| | and imputation are performed, features are dropped. Two versions of the data is saved: |
| | imputed and not imputed dataframes. |
| | """ |
| |
|
| | import pandas as pd |
| | import numpy as np |
| | import os |
| | import sys |
| | import yaml |
| | import json |
| | import joblib |
| | import encoding |
| | import imputation |
| |
|
| |
|
| | with open("./training/config.yaml", "r") as config: |
| | config = yaml.safe_load(config) |
| |
|
| | |
| | model_type = config["model_settings"]["model_type"] |
| |
|
| | |
| | log = open("./training/logging/encode_and_impute_" + model_type + ".log", "w") |
| | sys.stdout = log |
| |
|
| | |
| | data_to_process = config["model_settings"]["data_to_process"] |
| |
|
| | |
| | data = pd.read_pickle( |
| | os.path.join( |
| | config["outputs"]["processed_data_dir"], |
| | "{}_combined_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| |
|
| | |
| | |
| | |
| |
|
# Categorical features that will be target-encoded below.
categorical_cols = [
    "LatestSymptomDiaryQ8",
    "LatestSymptomDiaryQ9",
    "LatestSymptomDiaryQ10",
    "DaysSinceLastExac",
    "AgeBinned",
    "Comorbidities",
    "FEV1PercentPredicted",
]

# Represent missing values as the literal string "nan" so that they form
# their own category during target encoding.
for col in categorical_cols:
    data[col] = data[col].replace(np.nan, "nan")
| |
|
| | if data_to_process == "train": |
| | |
| | target_encodings = encoding.get_target_encodings( |
| | train_data=data, |
| | cols_to_encode=categorical_cols, |
| | target_col="ExacWithin3Months", |
| | smooth="auto", |
| | ) |
| | train_encoded = encoding.apply_target_encodings( |
| | data=data, |
| | cols_to_encode=categorical_cols, |
| | encodings=target_encodings, |
| | drop_categorical_cols=False, |
| | ) |
| | json.dump( |
| | target_encodings, |
| | open("./data/artifacts/target_encodings_" + model_type + ".json", "w"), |
| | ) |
| |
|
| | |
| | |
| | fold_patients = np.load( |
| | os.path.join( |
| | config["outputs"]["cohort_info_dir"], |
| | "fold_patients_{}.npy".format(model_type), |
| | ), |
| | allow_pickle=True, |
| | ) |
| | train_encoded_cv, target_encodings = encoding.kfold_target_encode( |
| | df=data, |
| | fold_ids=fold_patients, |
| | cols_to_encode=categorical_cols, |
| | id_col="StudyId", |
| | target="ExacWithin3Months", |
| | smooth="auto", |
| | drop_categorical_cols=False, |
| | ) |
| |
|
| | |
| | categorical_cols.remove("AgeBinned") |
| | train_encoded = train_encoded.drop(columns=categorical_cols) |
| | train_encoded_cv = train_encoded_cv.drop(columns=categorical_cols) |
| |
|
| | if (data_to_process == "test") | (data_to_process == "forward_val"): |
| | |
| | target_encodings = json.load( |
| | open("./data/artifacts/target_encodings_" + model_type + ".json") |
| | ) |
| | test_encoded = encoding.apply_target_encodings( |
| | data=data, |
| | cols_to_encode=categorical_cols, |
| | encodings=target_encodings, |
| | drop_categorical_cols=False, |
| | ) |
| |
|
| | |
| | categorical_cols.remove("AgeBinned") |
| | test_encoded = test_encoded.drop(columns=categorical_cols) |
| |
|
| | |
| | |
| | |
| |
|
# Columns excluded from imputation: identifiers, outcome/target columns and
# the demographic columns used as imputation group keys.
cols_to_ignore = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
    "Age",
    "Sex_F",
    "SafeHavenID",
    "AgeBinned",
]
| |
|
| | if data_to_process == "train": |
| | |
| | not_imputed_train = train_encoded.copy() |
| | cols_to_impute = train_encoded.drop(columns=cols_to_ignore).columns |
| |
|
| | imputer = imputation.get_imputer( |
| | train_data=train_encoded, |
| | cols_to_impute=cols_to_impute, |
| | average_type="median", |
| | cols_to_groupby=["AgeBinned", "Sex_F"], |
| | ) |
| | imputed_train = imputation.apply_imputer( |
| | data=train_encoded, |
| | cols_to_impute=cols_to_impute, |
| | imputer=imputer, |
| | cols_to_groupby=["AgeBinned", "Sex_F"], |
| | ) |
| | joblib.dump(imputer, "./data/artifacts/imputer_" + model_type + ".pkl") |
| |
|
| | |
| | not_imputed_train_cv = train_encoded_cv.copy() |
| | imputed_train_cv = imputation.kfold_impute( |
| | df=train_encoded, |
| | fold_ids=fold_patients, |
| | cols_to_impute=cols_to_impute, |
| | average_type="median", |
| | cols_to_groupby=["AgeBinned", "Sex_F"], |
| | id_col="StudyId", |
| | ) |
| |
|
| | df_columns = imputed_train.columns.tolist() |
| |
|
| | if (data_to_process == "test") | (data_to_process == "forward_val"): |
| | not_imputed_test = test_encoded.copy() |
| | cols_to_impute = test_encoded.drop(columns=cols_to_ignore).columns |
| |
|
| | |
| | imputer = joblib.load("./data/artifacts/imputer_" + model_type + ".pkl") |
| | imputed_test = imputation.apply_imputer( |
| | data=test_encoded, |
| | cols_to_impute=cols_to_impute, |
| | imputer=imputer, |
| | cols_to_groupby=["AgeBinned", "Sex_F"], |
| | ) |
| |
|
| | df_columns = imputed_test.columns.tolist() |
| |
|
| | |
| | |
| | |
# Feature families excluded from the model input, matched by column-name
# prefix (str.startswith accepts the whole tuple at once).
cols_to_drop_startswith = (
    "DiffLatest",
    "Var",
    "LatestEQ5D",
    "TotalEngagement",
    "Age",
    "NumHosp",
    "Required",
    "LungFunction",
    "EngagementCAT",
    "LatestSymptomDiary",
    "LatestAlbumin",
    "LatestEosinophils",
    "LatestNeutrophils",
    "LatestWhite Blood Count",
)

# Individually named columns excluded from the model input.
additional_cols_to_drop = [
    "PatientId",
    "SafeHavenID",
    "Sex_F",
    "NumCommExacPrior6mo",
    "AsthmaOverlap",
    "TimeSinceLungFunc",
    "LatestNeutLymphRatio",
    "EngagementEQ5DTW1",
    "EngagementMRCTW1",
    "LatestMRCQ1",
    "WeekAvgCATQ1",
    "WeekAvgCATQ3",
    "WeekAvgCATQ4",
    "WeekAvgCATQ5",
    "WeekAvgCATQ6",
    "WeekAvgCATQ7",
    "WeekAvgCATQ8",
    "WeekAvgSymptomDiaryQ1",
    "WeekAvgSymptomDiaryQ3",
    "WeekAvgSymptomDiaryScore",
    "EngagementSymptomDiaryTW1",
    "ScaledSumSymptomDiaryQ3TW1",
]

# Resolve the prefix matches against the actual columns, then append the
# explicitly named ones.
cols_to_drop = [
    col for col in df_columns if col.startswith(cols_to_drop_startswith)
] + additional_cols_to_drop
| |
|
| | if data_to_process == "train": |
| | imputed_train = imputed_train.drop(columns=cols_to_drop) |
| | not_imputed_train = not_imputed_train.drop(columns=cols_to_drop) |
| | imputed_train_cv = imputed_train_cv.drop(columns=cols_to_drop) |
| | not_imputed_train_cv = not_imputed_train_cv.drop(columns=cols_to_drop) |
| | if (data_to_process == "test") | (data_to_process == "forward_val"): |
| | imputed_test = imputed_test.drop(columns=cols_to_drop) |
| | not_imputed_test = not_imputed_test.drop(columns=cols_to_drop) |
| |
|
| | |
| | |
| | |
| | os.makedirs(config["outputs"]["model_input_data_dir"], exist_ok=True) |
| |
|
| | if data_to_process == "train": |
| | imputed_train.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_imputed_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| | not_imputed_train.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_not_imputed_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| | imputed_train_cv.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_imputed_cv_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| | not_imputed_train_cv.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_not_imputed_cv_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| |
|
| | if (data_to_process == "test") | (data_to_process == "forward_val"): |
| | imputed_test.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_imputed_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| | not_imputed_test.to_pickle( |
| | os.path.join( |
| | config["outputs"]["model_input_data_dir"], |
| | "{}_not_imputed_{}.pkl".format(data_to_process, model_type), |
| | ) |
| | ) |
| |
|