"""Script that performs encoding of categorical features and imputation.

Performs target encoding of categorical features and imputation of missing values. After
encoding and imputation, unused features are dropped to reduce the feature space. Two
versions of the data are saved: an imputed and a not-imputed dataframe.
"""

import json
import os
import sys

import joblib
import numpy as np
import pandas as pd
import yaml

import encoding
import imputation

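# Config keys used by this script (as referenced below): model_settings.model_type,
# model_settings.data_to_process, outputs.processed_data_dir, outputs.cohort_info_dir
# and outputs.model_input_data_dir.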
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]
# Set up log file and redirect stdout to it
log = open("./training/logging/encode_and_impute_" + model_type + ".log", "w")
sys.stdout = log
# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]
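
# Sanity check: the branches below only handle these three dataset names
assert data_to_process in ("train", "test", "forward_val"), data_to_process
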
# Load data
data = pd.read_pickle(
    os.path.join(
        config["outputs"]["processed_data_dir"],
        "{}_combined_{}.pkl".format(data_to_process, model_type),
    )
)

############################################################################
# Target encode categorical data
############################################################################
categorical_cols = [
    "LatestSymptomDiaryQ8",
    "LatestSymptomDiaryQ9",
    "LatestSymptomDiaryQ10",
    "DaysSinceLastExac",
    "AgeBinned",
    "Comorbidities",
    "FEV1PercentPredicted",
]

# Missing values appear in two forms in the data: the string 'nan' and np.nan. Normalise
# them all to the string 'nan' so the encoder treats them as a single category rather
# than as distinct values.
for categorical_col in categorical_cols:
    data[categorical_col] = data[categorical_col].replace(np.nan, "nan")

if data_to_process == "train":
    # Get target encodings for entire train set
    target_encodings = encoding.get_target_encodings(
        train_data=data,
        cols_to_encode=categorical_cols,
        target_col="ExacWithin3Months",
        smooth="auto",
    )
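    # For reference: smoothed target encoding replaces each category c with a blend of
    # the per-category target mean and the global target mean, along the lines of
    #   te(c) = (n_c * mean_c + m * global_mean) / (n_c + m),
    # where n_c is the category count and m a smoothing weight. smooth="auto" presumably
    # lets encoding.get_target_encodings pick m from the data (sklearn-style convention).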
    train_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )
    # Save the encodings so the test/forward validation runs can reuse them
    with open("./data/artifacts/target_encodings_" + model_type + ".json", "w") as f:
        json.dump(target_encodings, f)
    # K-fold target encode
    # Get info on which patients belong to which fold
    fold_patients = np.load(
        os.path.join(
            config["outputs"]["cohort_info_dir"],
            "fold_patients_{}.npy".format(model_type),
        ),
        allow_pickle=True,
    )
    train_encoded_cv, target_encodings = encoding.kfold_target_encode(
        df=data,
        fold_ids=fold_patients,
        cols_to_encode=categorical_cols,
        id_col="StudyId",
        target="ExacWithin3Months",
        smooth="auto",
        drop_categorical_cols=False,
    )
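    # Out-of-fold encoding: each fold's rows are (presumably) encoded with statistics
    # computed on the remaining folds only, so the CV copy of the data carries no
    # target leakage when used for cross-validated model selection.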
    # Drop categorical cols except for AgeBinned as it is needed in imputation step
    categorical_cols.remove("AgeBinned")
    train_encoded = train_encoded.drop(columns=categorical_cols)
    train_encoded_cv = train_encoded_cv.drop(columns=categorical_cols)

if data_to_process in ("test", "forward_val"):
    # Encode test set/forward val set based on entire train set
    with open("./data/artifacts/target_encodings_" + model_type + ".json") as f:
        target_encodings = json.load(f)
    test_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )
    # Drop categorical cols except for AgeBinned as it is needed in imputation step
    categorical_cols.remove("AgeBinned")
    test_encoded = test_encoded.drop(columns=categorical_cols)

############################################################################
# Impute missing data
############################################################################
cols_to_ignore = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
    "Age",
    "Sex_F",
    "SafeHavenID",
    "AgeBinned",
]
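# The list above is excluded from imputation: it holds identifiers, outcome/target
# columns, and demographic fields, including the AgeBinned/Sex_F variables used below
# as group-by keys.
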
if data_to_process == "train":
    # Impute entire train set
    not_imputed_train = train_encoded.copy()
    cols_to_impute = train_encoded.drop(columns=cols_to_ignore).columns
    imputer = imputation.get_imputer(
        train_data=train_encoded,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
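    # Group-wise imputation: missing values are (presumably) filled with the median of
    # each (AgeBinned, Sex_F) group rather than a single global median, so e.g. a
    # missing lab value for an older female patient is filled from similar patients.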
    imputed_train = imputation.apply_imputer(
        data=train_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    joblib.dump(imputer, "./data/artifacts/imputer_" + model_type + ".pkl")
    # K-fold impute
    not_imputed_train_cv = train_encoded_cv.copy()
    # Impute the CV copy fold by fold; kfold_impute presumably fits a separate imputer
    # per fold so the CV estimates stay free of leakage from the held-out fold
    imputed_train_cv = imputation.kfold_impute(
        df=train_encoded_cv,
        fold_ids=fold_patients,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
        id_col="StudyId",
    )
    df_columns = imputed_train.columns.tolist()

if data_to_process in ("test", "forward_val"):
    not_imputed_test = test_encoded.copy()
    cols_to_impute = test_encoded.drop(columns=cols_to_ignore).columns
    # Impute test set/forward val set based on entire train set
    imputer = joblib.load("./data/artifacts/imputer_" + model_type + ".pkl")
    imputed_test = imputation.apply_imputer(
        data=test_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    df_columns = imputed_test.columns.tolist()

############################################################################
# Reduce feature space
############################################################################
cols_to_drop_startswith = (
    "DiffLatest",
    "Var",
    "LatestEQ5D",
    "TotalEngagement",
    "Age",
    "NumHosp",
    "Required",
    "LungFunction",
    "EngagementCAT",
    "LatestSymptomDiary",
    "LatestAlbumin",
    "LatestEosinophils",
    "LatestNeutrophils",
    "LatestWhite Blood Count",
)
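# Note: str.startswith accepts a tuple of prefixes and returns True if any of them
# matches, so the single list comprehension below filters df_columns against every
# prefix above at once.
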
additional_cols_to_drop = [
    "PatientId",
    "SafeHavenID",
    "Sex_F",
    "NumCommExacPrior6mo",
    "AsthmaOverlap",
    "TimeSinceLungFunc",
    "LatestNeutLymphRatio",
    "EngagementEQ5DTW1",
    "EngagementMRCTW1",
    "LatestMRCQ1",
    "WeekAvgCATQ1",
    "WeekAvgCATQ3",
    "WeekAvgCATQ4",
    "WeekAvgCATQ5",
    "WeekAvgCATQ6",
    "WeekAvgCATQ7",
    "WeekAvgCATQ8",
    "WeekAvgSymptomDiaryQ1",
    "WeekAvgSymptomDiaryQ3",
    "WeekAvgSymptomDiaryScore",
    "EngagementSymptomDiaryTW1",
    "ScaledSumSymptomDiaryQ3TW1",
    # "Comorbidities_te",
]
cols_to_drop = []
cols_to_drop.extend(
    [item for item in df_columns if item.startswith(cols_to_drop_startswith)]
)
cols_to_drop.extend(additional_cols_to_drop)

if data_to_process == "train":
    imputed_train = imputed_train.drop(columns=cols_to_drop)
    not_imputed_train = not_imputed_train.drop(columns=cols_to_drop)
    imputed_train_cv = imputed_train_cv.drop(columns=cols_to_drop)
    not_imputed_train_cv = not_imputed_train_cv.drop(columns=cols_to_drop)
if data_to_process in ("test", "forward_val"):
    imputed_test = imputed_test.drop(columns=cols_to_drop)
    not_imputed_test = not_imputed_test.drop(columns=cols_to_drop)

############################################################################
# Save data
############################################################################
os.makedirs(config["outputs"]["model_input_data_dir"], exist_ok=True)
if data_to_process == "train":
    imputed_train.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_imputed_{}.pkl".format(data_to_process, model_type),
        )
    )
    not_imputed_train.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_not_imputed_{}.pkl".format(data_to_process, model_type),
        )
    )
    imputed_train_cv.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_imputed_cv_{}.pkl".format(data_to_process, model_type),
        )
    )
    not_imputed_train_cv.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_not_imputed_cv_{}.pkl".format(data_to_process, model_type),
        )
    )
if data_to_process in ("test", "forward_val"):
    imputed_test.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_imputed_{}.pkl".format(data_to_process, model_type),
        )
    )
    not_imputed_test.to_pickle(
        os.path.join(
            config["outputs"]["model_input_data_dir"],
            "{}_not_imputed_{}.pkl".format(data_to_process, model_type),
        )
    )