"""
TRAIN
Impute any null data, save ethnicity info for each ID and scale
final dataset
"""
import json
import joblib
import pandas as pd
import numpy as np
from numpy import savetxt
from sklearn.preprocessing import MinMaxScaler
from utils.reduction import calc_ds_med
# Demographic grouping keys; imputation medians are computed per (age_bin, sex_bin)
demo_cols = ['age_bin', 'sex_bin']
# Elapsed-time ("days since ...") features; held as timedeltas until converted
# to whole days in main(), and imputed via their per-group '_med' columns
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']
# 2-year-median lab-result columns that may contain nulls; filled in main()
# with the median of each demographic group
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
'c_reactive_protein_med_2yr', 'chloride_med_2yr',
'creatinine_med_2yr', 'eosinophils_med_2yr',
'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
'mch_med_2yr', 'mean_cell_volume_med_2yr',
'monocytes_med_2yr', 'neutrophils_med_2yr',
'platelets_med_2yr', 'potassium_med_2yr',
'red_blood_count_med_2yr', 'sodium_med_2yr',
'total_bilirubin_med_2yr', 'urea_med_2yr',
'white_blood_count_med_2yr', 'neut_lymph_med_2yr']
# Columns dropped from the dataset before scaling (not used for modelling)
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
'sex_bin', 'marital_status', 'age_bin',
'days_since_copd_resp_med', 'days_since_adm_med',
'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
'simd_quintile']
def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 quantile bins and save the edges for the test data
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column holding the
        (truncated) upper edge of each row's age decile
    """
    # Compute the decile edges once (the previous version called qcut twice,
    # discarding the first result entirely)
    _, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True)
    # Label each age with the upper edge of its decile; include_lowest=True
    # matches qcut's handling of the minimum age
    binned = pd.cut(df['age'], bins=edges, labels=edges[1:],
                    include_lowest=True)
    df['age_bin'] = binned.astype(int)
    # Save edges so the test data can be bucketed identically
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')
    return df
def calc_df_med(df, data_path):
    """
    Calculate per-demographic-group medians and attach days_since medians
    --------
    :param df: dataframe to update; must contain 'SafeHavenID', 'eoy', the
        demo_cols grouping keys and the ds_cols columns
    :param data_path: path to generated data; the median table is pickled
        here for imputing the test set
    :return: dataframe with the days_since median columns ('*_med') joined on
        (age_bin, sex_bin); the actual null filling happens in the caller
    """
    # Calculate median for all columns except SafeHavenID, year and ds_cols
    all_cols = df.columns
    all_cols = all_cols.drop(['SafeHavenID', 'eoy'])
    df_median = df[all_cols].groupby(demo_cols).median()
    # Calculate medians for ds_cols with the project-specific reducer
    # NOTE(review): assumes calc_ds_med yields column names that don't collide
    # with df_median's, otherwise the join below would raise — confirm
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)
    # Join ds_cols medians to median table and original dataframe
    df_median = df_median.join(ds_med)
    # Save medians for imputing testing data
    df_median.to_pickle(data_path + 'medians.pkl')
    # Suffix with '_med' only after pickling, so the saved table keeps the
    # original column names while the joined columns are distinguishable
    ds_med.columns += '_med'
    df = df.join(ds_med, on=demo_cols)
    return df
def ds_fill_5year_train(df, col):
    """
    Fill days_since_X nulls for patients with under 5 years in the dataset
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Patients present for fewer than 5 years get the column-wide maximum
    # substituted for any missing value
    under_five = df['ggc_years'] < 5
    missing = df[col].isna()
    df.loc[under_five & missing, col] = df[col].max()
    return df
def scale_data_train(df, data_path, scaler):
    """
    Min-max scale the final dataset, leaving identifier columns untouched
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])
    # Fit on the training data; the fitted scaler is persisted for the test set
    scaled_values = scaler.fit_transform(df[feature_cols].to_numpy())
    scaled_frame = pd.DataFrame(scaled_values, columns=feature_cols)
    identifiers = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = identifiers.join(scaled_frame)
    # Save the scaler for testing
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')
    return df_final
def main():
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']
    # Load the merged training data produced upstream
    df = pd.read_pickle(data_path + 'merged_train.pkl')
    # Bucket ages and persist the bin edges for the test pipeline
    df = calc_age_bins_train(df, data_path)
    # Attach per-demographic medians used below for imputation
    df = calc_df_med(df, data_path)
    # Impute missing lab values with the median of each demographic group
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda grp: grp.fillna(grp.median()))
    # Persist the column maxima so the test pipeline can impute the same way
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    # Fill the days_since columns, then convert timedeltas to whole days
    one_day = np.timedelta64(1, 'D')
    for ds_col in ds_cols:
        df = ds_fill_5year_train(df, ds_col)
        df[ds_col] = df[ds_col].fillna(df[ds_col + '_med'])
        df[ds_col] = (df[ds_col] / one_day).astype(int)
    # Snapshot the imputed (pre-scaling) dataset
    df.to_pickle(data_path + 'filled_train.pkl')
    # Remove columns not used for modelling
    df = df.drop(cols2drop, axis=1)
    # Min-max scale everything except the identifier columns
    df_final = scale_data_train(df, data_path, MinMaxScaler())
    # Save final dataset
    df_final.to_pickle(data_path + 'min_max_train.pkl')
# Run the pipeline only when executed as a script, not on import
if __name__ == '__main__':
    main()