"""
TRAIN
Impute any null data, save ethnicity info for each ID and scale
final dataset
"""
import json
import joblib
import pandas as pd
import numpy as np
from numpy import savetxt
from sklearn.preprocessing import MinMaxScaler
from utils.reduction import calc_ds_med
# Demographic grouping keys; imputation medians are computed per (age_bin, sex_bin)
demo_cols = ['age_bin', 'sex_bin']
# Elapsed-time ("days since ...") features; held as timedeltas until converted
# to whole days in main(), and imputed via their per-group '_med' columns
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']
# 2-year-median lab-result columns that may contain nulls; filled in main()
# with the median of each demographic group
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
'c_reactive_protein_med_2yr', 'chloride_med_2yr',
'creatinine_med_2yr', 'eosinophils_med_2yr',
'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
'mch_med_2yr', 'mean_cell_volume_med_2yr',
'monocytes_med_2yr', 'neutrophils_med_2yr',
'platelets_med_2yr', 'potassium_med_2yr',
'red_blood_count_med_2yr', 'sodium_med_2yr',
'total_bilirubin_med_2yr', 'urea_med_2yr',
'white_blood_count_med_2yr', 'neut_lymph_med_2yr']
# Columns dropped from the dataset before scaling (not used for modelling)
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
'sex_bin', 'marital_status', 'age_bin',
'days_since_copd_resp_med', 'days_since_adm_med',
'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
'simd_quintile']
def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 quantile bins and save the edges for the test data
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column holding the
        (truncated) upper edge of each row's age decile
    """
    # Compute the decile edges once (the previous version called qcut twice,
    # discarding the first result entirely)
    _, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True)
    # Label each age with the upper edge of its decile; include_lowest=True
    # matches qcut's handling of the minimum age
    binned = pd.cut(df['age'], bins=edges, labels=edges[1:],
                    include_lowest=True)
    df['age_bin'] = binned.astype(int)
    # Save edges so the test data can be bucketed identically
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')
    return df
def calc_df_med(df, data_path):
    """
    Calculate per-demographic-group medians and attach days_since medians
    --------
    :param df: dataframe to update; must contain 'SafeHavenID', 'eoy', the
        demo_cols grouping keys and the ds_cols columns
    :param data_path: path to generated data; the median table is pickled
        here for imputing the test set
    :return: dataframe with the days_since median columns ('*_med') joined on
        (age_bin, sex_bin); the actual null filling happens in the caller
    """
    # Calculate median for all columns except SafeHavenID, year and ds_cols
    all_cols = df.columns
    all_cols = all_cols.drop(['SafeHavenID', 'eoy'])
    df_median = df[all_cols].groupby(demo_cols).median()
    # Calculate medians for ds_cols with the project-specific reducer
    # NOTE(review): assumes calc_ds_med yields column names that don't collide
    # with df_median's, otherwise the join below would raise — confirm
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)
    # Join ds_cols medians to median table and original dataframe
    df_median = df_median.join(ds_med)
    # Save medians for imputing testing data
    df_median.to_pickle(data_path + 'medians.pkl')
    # Suffix with '_med' only after pickling, so the saved table keeps the
    # original column names while the joined columns are distinguishable
    ds_med.columns += '_med'
    df = df.join(ds_med, on=demo_cols)
    return df
def ds_fill_5year_train(df, col):
    """
    Fill days_since_X nulls for patients with under 5 years in the dataset
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Patients present for fewer than 5 years get the column-wide maximum
    # substituted for any missing value
    under_five = df['ggc_years'] < 5
    missing = df[col].isna()
    df.loc[under_five & missing, col] = df[col].max()
    return df
def scale_data_train(df, data_path, scaler):
    """
    Min-max scale the final dataset, leaving identifier columns untouched
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])
    # Fit on the training data; the fitted scaler is persisted for the test set
    scaled_values = scaler.fit_transform(df[feature_cols].to_numpy())
    scaled_frame = pd.DataFrame(scaled_values, columns=feature_cols)
    identifiers = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = identifiers.join(scaled_frame)
    # Save the scaler for testing
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')
    return df_final
def main():
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']
    # Load the merged training data produced upstream
    df = pd.read_pickle(data_path + 'merged_train.pkl')
    # Bucket ages and persist the bin edges for the test pipeline
    df = calc_age_bins_train(df, data_path)
    # Attach per-demographic medians used below for imputation
    df = calc_df_med(df, data_path)
    # Impute missing lab values with the median of each demographic group
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda grp: grp.fillna(grp.median()))
    # Persist the column maxima so the test pipeline can impute the same way
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    # Fill the days_since columns, then convert timedeltas to whole days
    one_day = np.timedelta64(1, 'D')
    for ds_col in ds_cols:
        df = ds_fill_5year_train(df, ds_col)
        df[ds_col] = df[ds_col].fillna(df[ds_col + '_med'])
        df[ds_col] = (df[ds_col] / one_day).astype(int)
    # Snapshot the imputed (pre-scaling) dataset
    df.to_pickle(data_path + 'filled_train.pkl')
    # Remove columns not used for modelling
    df = df.drop(cols2drop, axis=1)
    # Min-max scale everything except the identifier columns
    df_final = scale_data_train(df, data_path, MinMaxScaler())
    # Save final dataset
    df_final.to_pickle(data_path + 'min_max_train.pkl')
# Run the pipeline only when executed as a script, not on import
if __name__ == '__main__':
    main()