"""
TESTING
Impute any null data, save ethnicity info for each ID and scale
final dataset
NB: This script can be used for merged receiver, scale up or testing data
"""
import json
import sys
import joblib
import pandas as pd
import numpy as np
from numpy import loadtxt
# days_since_X columns: stored as timedeltas, later converted to whole days;
# nulls for patients with < 5 years of history are filled with training maxima
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']
# Columns that may contain nulls; remaining nulls are filled with the training
# median for the row's age-bin/sex-bin label (ds_cols are repeated here so any
# nulls left after the 5-year fill are also median-filled)
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
'c_reactive_protein_med_2yr', 'chloride_med_2yr',
'creatinine_med_2yr', 'eosinophils_med_2yr',
'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
'mch_med_2yr', 'mean_cell_volume_med_2yr',
'monocytes_med_2yr', 'neutrophils_med_2yr',
'platelets_med_2yr', 'potassium_med_2yr',
'red_blood_count_med_2yr', 'sodium_med_2yr',
'total_bilirubin_med_2yr', 'urea_med_2yr',
'white_blood_count_med_2yr', 'neut_lymph_med_2yr',
'days_since_copd_resp', 'days_since_adm', 'days_since_rescue']
# Non-modelling columns dropped before scaling (IDs/labels/demographic bins)
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
'marital_status', 'label', 'simd_vigintile', 'simd_decile',
'simd_quintile', 'sex_bin']
def calc_age_bins_test(df, data_path):
    """
    Load the age-bin edges computed on the training set and assign each
    testing row to its training bin.
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data (trailing separator expected)
    :return: updated dataframe with an integer 'age_bin' column
    """
    ed = loadtxt(data_path + 'age_bins_train.csv', delimiter=',')
    # Cut using the *training* edges. The previous pd.qcut call recomputed
    # deciles on the testing data and only reused the training edges as
    # labels, so test rows could land in different bins than in training.
    # Clip ages into the training range so out-of-range test ages still
    # receive the first/last bin instead of becoming NaN.
    ages = df['age'].clip(ed[0], ed[-1])
    categories = pd.cut(ages, bins=ed, labels=ed[1:], include_lowest=True)
    df['age_bin'] = categories.astype(int)
    return df
def create_label(df):
    """
    Build a combined age-bin/sex-bin label column, e.g. '51_0'.
    --------
    :param df: dataframe containing 'age_bin' and 'sex_bin' columns
    :return: dataframe with 'label' added and 'age_bin' removed
    """
    age_part = df['age_bin'].astype(str)
    sex_part = df['sex_bin'].astype(str)
    df['label'] = age_part + '_' + sex_part
    return df.drop('age_bin', axis=1)
def fill_nulls(label, df, medians):
    """
    Fill any null values in testing/REC/SUP data with median values from
    training data.
    --------
    :param label: string label containing age and sex bin values, e.g. '51_0'
    for a male patient in the less than 51 age bin
    :param df: dataframe
    :param medians: dataframe of training set medians for each label and
    column
    :return: filled dataframe for specified label
    """
    meds = medians[medians['label'] == label].iloc[0]
    # Copy explicitly: assigning into a boolean-indexed slice raises
    # SettingWithCopyWarning and may silently fail to update the frame.
    df_2_fill = df[df['label'] == label].copy()
    for col in null_cols:
        df_2_fill[col] = df_2_fill[col].fillna(meds[col])
    return df_2_fill
def ds_fill_5year_test(df, col, max_vals):
    """
    Fill days_since_X columns where patient has been in the dataset less than
    5 years
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :param max_vals: series with columns and their max value from training
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Only rows with under five years of history get the training maximum;
    # longer-tenured patients keep their nulls for the median fill later.
    short_tenure = df['ggc_years'] < 5
    filled = df.loc[short_tenure, col].fillna(max_vals[col])
    df.loc[short_tenure, col] = filled
    return df
def scale_data_test(df, scaler):
    """
    Min-max scale final dataset
    -----
    :param df: dataframe to be scaled
    :param scaler: fitted scaler object (exposes .transform) to apply to df
    :return: scaled dataset for modelling, ID columns left unscaled
    """
    id_cols = ['SafeHavenID', 'eoy']
    feature_cols = df.columns.drop(id_cols)
    # Transform only the feature columns, then re-attach the identifiers.
    scaled_values = scaler.transform(df[feature_cols].to_numpy())
    scaled_frame = pd.DataFrame(scaled_values, columns=feature_cols)
    ids = df[id_cols].reset_index(drop=True)
    return ids.join(scaled_frame)
def main():
    """
    Impute nulls in merged receiver/scale-up/testing data using training-set
    statistics, then min-max scale and save the final modelling dataset.

    Usage: python <script> <data_type>
    where <data_type> selects which merged_<data_type>.pkl file to process.
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    # Get generated data_path
    data_path = config['model_data_path']
    # Get datatype from cmd line; fail with a usage message rather than
    # an IndexError when the argument is missing
    if len(sys.argv) < 2:
        sys.exit('Usage: python <script> <data_type>')
    data_type = sys.argv[1]
    # Read in data
    df = pd.read_pickle(data_path + 'merged_' + data_type + '.pkl')
    # Load training age groups and apply to data
    df = calc_age_bins_test(df, data_path)
    # Load in training median for each age-bin/sex-bin labelled group
    df_medians = pd.read_pickle(data_path + 'medians.pkl')
    df_medians = df_medians.reset_index()
    df_medians = create_label(df_medians)
    df = create_label(df)
    labels = df_medians['label']
    # Fill null days_since columns for patient with ggc_years < 5
    max_vals = pd.read_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_test(df, col, max_vals)
    # Fill remaining nulls using training medians, one labelled group at a time
    df_filled = pd.concat([fill_nulls(x, df, df_medians) for x in labels])
    # Convert ds_cols timedeltas to whole days as int
    for col in ds_cols:
        day = np.timedelta64(1, 'D')
        df_filled[col] = (df_filled[col] / day).astype(int)
    # Save processed data before scaling
    df_filled.to_pickle(data_path + 'filled_' + data_type + '.pkl')
    # Drop non-modelling columns
    df_filled = df_filled.drop(cols2drop, axis=1)
    # Load in min-max scaler from training set
    scaler = joblib.load(data_path + 'min_max_scaler_train.pkl')
    df_filled = scale_data_test(df_filled, scaler)
    # Save final dataset
    df_filled.to_pickle(data_path + 'min_max_' + data_type + '.pkl')


# Guard the entry point so importing this module has no side effects
if __name__ == '__main__':
    main()