File size: 5,490 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
TESTING
Impute any null data, save ethnicity info for each ID and scale
final dataset

NB: This script can be used for merged receiver, scale up or testing data
"""
import json
import sys
import joblib
import pandas as pd
import numpy as np
from numpy import loadtxt


# Day-count columns stored as timedeltas in the merged data; these are
# converted to integer days near the end of main().
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Columns whose nulls are imputed from the training-set medians (per
# age-bin/sex-bin label) in fill_nulls().  Mostly 2-year median lab
# results, plus the day-count columns above.
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr',
             'days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Non-modelling columns dropped in main() before scaling (identifiers,
# demographics and the imputation label are not model features).
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'marital_status', 'label', 'simd_vigintile', 'simd_decile',
             'simd_quintile', 'sex_bin']


def calc_age_bins_test(df, data_path):
    """
    Load the age-bin edges computed on the training set and assign each
    row of the testing data to its training-set age bin.
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path prefix to generated data, containing
        'age_bins_train.csv' (the bin edges saved during training,
        len(edges) == n_bins + 1)
    :return: updated dataframe with an integer 'age_bin' column
    """
    edges = loadtxt(data_path + 'age_bins_train.csv', delimiter=',')
    # BUG FIX: the previous implementation used pd.qcut(q=10), which
    # recomputed decile edges from the *testing* data and only used the
    # training edges as labels.  pd.cut bins by the training edges
    # themselves, so test rows land in the same bins as training rows.
    # Ages outside the training range are clipped so every row gets a bin
    # instead of NaN.
    clipped = df['age'].clip(lower=edges[0], upper=edges[-1])
    categories = pd.cut(clipped, bins=edges, labels=edges[1:],
                        include_lowest=True)
    df['age_bin'] = categories.astype(int)

    return df


def create_label(df):
    """
    Build a combined '<age_bin>_<sex_bin>' label for each row and drop
    the now-redundant age-bin column.
    --------
    :param df: dataframe containing 'age_bin' and 'sex_bin' columns
    :return: dataframe with a 'label' column added and 'age_bin' removed
    """
    age_part = df['age_bin'].astype(str)
    sex_part = df['sex_bin'].astype(str)
    df['label'] = age_part.str.cat(sex_part, sep='_')

    return df.drop(columns='age_bin')


def fill_nulls(label, df, medians):
    """
    Fill any null values in testing/REC/SUP data with median values from
    training data.
    --------
    :param label: string label containing age and sex bin values, e.g. '51_0'
        for a male patient in the less than 51 age bin
    :param df: dataframe
    :param medians: dataframe of training set medians for each label and
        column
    :return: filled dataframe for specified label
    """
    meds = medians[medians['label'] == label].iloc[0]
    # Copy the slice so we fill a real frame rather than a view of `df`:
    # the previous per-column loop assigned into a slice (chained
    # assignment), which raises SettingWithCopyWarning and can silently
    # fail to write back on newer pandas.
    df_2_fill = df[df['label'] == label].copy()
    # fillna with a Series is aligned on column names, so this fills
    # every null column in one vectorised call.
    df_2_fill[null_cols] = df_2_fill[null_cols].fillna(meds[null_cols])

    return df_2_fill


def ds_fill_5year_test(df, col, max_vals):
    """
    Fill days_since_X columns where patient has been in the dataset less
    than 5 years.
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :param max_vals: series with columns and their max value from training
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Patients present for under five years may simply have no recorded
    # event yet; substitute the training-set maximum for their nulls.
    short_stay = df['ggc_years'] < 5
    replacement = df.loc[short_stay, col].fillna(max_vals[col])
    df.loc[short_stay, col] = replacement

    return df


def scale_data_test(df, scaler):
    """
    Min-max scale final dataset.
    -----
    :param df: dataframe to be scaled
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    # Identifier columns are carried through untouched; every other
    # column goes through the scaler fitted on the training set.
    id_cols = ['SafeHavenID', 'eoy']
    feature_cols = df.columns.drop(id_cols)
    scaled_values = scaler.transform(df[feature_cols].to_numpy())
    scaled_df = pd.DataFrame(scaled_values, columns=feature_cols)
    ids = df[id_cols].reset_index(drop=True)

    return ids.join(scaled_df)


def main():
    """
    Command-line entry point.

    Loads the merged dataset named on the command line, imputes nulls
    using training-set statistics (age/sex-binned medians and column
    maxima), converts the days_since columns from timedeltas to integer
    days, then min-max scales the result with the training scaler.

    Usage: python <script> <data_type>
    where <data_type> selects 'merged_<data_type>.pkl' under the
    configured model_data_path.
    """

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Get generated data_path
    data_path = config['model_data_path']

    # Get datatype from cmd line
    data_type = sys.argv[1]

    # Read in data
    df = pd.read_pickle(data_path + 'merged_' + data_type + '.pkl')

    # Load training age groups and apply to data
    df = calc_age_bins_test(df, data_path)

    # Load in training median for each age-bin/sex-bin labelled group
    df_medians = pd.read_pickle(data_path + 'medians.pkl')
    df_medians = df_medians.reset_index()
    df_medians = create_label(df_medians)
    df = create_label(df)
    labels = df_medians['label']

    # Fill null days_since columns for patient with ggc_years < 5
    max_vals = pd.read_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_test(df, col, max_vals)

    # Fill remaining nulls using training medians; one filled frame per
    # age/sex label, concatenated back into the full dataset
    df_filled = pd.concat([fill_nulls(x, df, df_medians) for x in labels])

    # Convert ds_cols from timedelta to whole days (int)
    day = np.timedelta64(1, 'D')
    for col in ds_cols:
        df_filled[col] = (df_filled[col] / day).astype(int)

    # Save processed data before scaling
    df_filled.to_pickle(data_path + 'filled_' + data_type + '.pkl')

    # Drop non-modelling columns
    df_filled = df_filled.drop(cols2drop, axis=1)

    # Load in min-max scaler from training set
    scaler = joblib.load(data_path + 'min_max_scaler_train.pkl')
    df_filled = scale_data_test(df_filled, scaler)

    # Save final dataset
    df_filled.to_pickle(data_path + 'min_max_' + data_type + '.pkl')


# Guard the entry point so importing this module (e.g. for testing)
# does not run the whole pipeline as a side effect.
if __name__ == '__main__':
    main()