"""
TRAIN
Impute any null data, save ethnicity info for each ID and scale
final dataset
"""
import json

import joblib
import numpy as np
import pandas as pd
from numpy import savetxt
from sklearn.preprocessing import MinMaxScaler

from utils.reduction import calc_ds_med

# Grouping keys used for every per-group median calculation
demo_cols = ['age_bin', 'sex_bin']

# "Days since event" columns; these get their own median/max handling
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Columns whose nulls are filled with the per-group median
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr', 'mch_med_2yr',
             'mean_cell_volume_med_2yr', 'monocytes_med_2yr',
             'neutrophils_med_2yr', 'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr']

# Columns removed before scaling (not used for modelling)
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'sex_bin', 'marital_status', 'age_bin',
             'days_since_copd_resp_med', 'days_since_adm_med',
             'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
             'simd_quintile']


def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 bins and save results for median filling test data
    --------
    :param df: dataframe to be updated (must contain an 'age' column)
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column
    """
    # First pass: only the quantile edges are needed, so the binned
    # values are discarded
    _, ed = pd.qcut(df['age'], q=10, precision=0, retbins=True)

    # Second pass: bin again, labelling each bucket with its upper edge
    categories, edges = pd.qcut(
        df['age'], q=10, precision=0, retbins=True, labels=ed[1:])
    df['age_bin'] = categories.astype(int)

    # Save bin edges for test data
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')

    return df


def calc_df_med(df, data_path):
    """
    Calculate the medians for all columns in the dataset
    --------
    :param df: dataframe to update
    :param data_path: path to generated data
    :return: dataframe with the days_since median columns (suffixed
        '_med') added; the full median table is written to
        'medians.pkl' under data_path
    """
    # Calculate median for all columns except SafeHavenID, year and ds_cols.
    # ds_cols are excluded here because their medians come from
    # calc_ds_med below; keeping them would produce duplicate column
    # names when the two tables are joined.
    all_cols = df.columns.drop(['SafeHavenID', 'eoy'] + ds_cols)
    df_median = df[all_cols].groupby(demo_cols).median()

    # Calculate medians for ds_cols
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)

    # Join ds_cols medians to median table
    df_median = df_median.join(ds_med)

    # Save medians for imputing testing data
    df_median.to_pickle(data_path + 'medians.pkl')

    # Rename and add to original dataframe
    ds_med.columns += '_med'
    df = df.join(ds_med, on=demo_cols)

    return df


def ds_fill_5year_train(df, col):
    """
    Fill days_since_X columns where patient has been in the dataset less
    than 5 years
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :return: dataframe with column nulls filled (using the column
        maximum) where patient has ggc_years < 5
    """
    under_5_years = df['ggc_years'] < 5
    col_max = df[col].max()
    df.loc[under_5_years, col] = df.loc[under_5_years, col].fillna(col_max)

    return df


def scale_data_train(df, data_path, scaler):
    """
    Min-max scale final dataset
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    # Identifier columns are carried through unscaled
    id_cols = ['SafeHavenID', 'eoy']
    feature_cols = df.columns.drop(id_cols)

    data_scaled = scaler.fit_transform(df[feature_cols].to_numpy())
    df_scaled = pd.DataFrame(data_scaled, columns=feature_cols)

    # Reset the index so the positional index of df_scaled lines up
    df_final = df[id_cols].reset_index(drop=True).join(df_scaled)

    # Save the scaler for testing
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')

    return df_final


def main():
    """
    Run the training imputation and scaling pipeline: bin ages, compute
    and save medians, fill nulls, then min-max scale and save the result
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    # Read in combined data
    df = pd.read_pickle(data_path + 'merged_train.pkl')

    # Calculate age bins
    df = calc_age_bins_train(df, data_path)

    # Calculate medians for each column for imputation
    df = calc_df_med(df, data_path)

    # Fill null columns with the per-group median. transform() always
    # returns a result aligned to df's index, whereas the index of an
    # apply() result depends on the pandas version (group_keys default)
    # and can misalign on assignment.
    df[null_cols] = df.groupby(demo_cols)[null_cols].transform(
        lambda s: s.fillna(s.median()))

    # Fill null days_since columns
    # assumes the days_since columns hold timedeltas - TODO confirm
    day = np.timedelta64(1, 'D')
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_train(df, col)
        df[col] = df[col].fillna(df[col + '_med'])
        df[col] = (df[col] / day).astype(int)

    # Save processed data before scaling
    df.to_pickle(data_path + 'filled_train.pkl')

    # Drop non-modelling columns
    df = df.drop(cols2drop, axis=1)

    # Initialize scaler
    scaler = MinMaxScaler()

    # Scale final dataset
    df_final = scale_data_train(df, data_path, scaler)

    # Save final dataset
    df_final.to_pickle(data_path + 'min_max_train.pkl')


# NOTE(review): this runs the pipeline on import as well as on direct
# execution; left unguarded to preserve existing behaviour.
main()