File size: 5,406 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
TRAIN
Impute any null data, save ethnicity info for each ID and scale
final dataset
"""
import json
import joblib
import pandas as pd
import numpy as np
from numpy import savetxt
from sklearn.preprocessing import MinMaxScaler
from utils.reduction import calc_ds_med


# Demographic grouping keys used for per-group median imputation
demo_cols = ['age_bin', 'sex_bin']

# Timedelta "days since last event" columns; handled separately from other
# numeric columns because their medians need custom logic (see calc_ds_med)
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Lab-result columns (2-year medians) that may contain nulls to be imputed
# with the per-demographic-group median
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr']

# Columns used only during preprocessing; dropped before scaling/modelling
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'sex_bin', 'marital_status', 'age_bin',
             'days_since_copd_resp_med', 'days_since_adm_med',
             'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
             'simd_quintile']


def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 equal-frequency bins and save results for median
    filling test data
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data
    :return: dataframe with an integer 'age_bin' column added, where each
        row is labelled with the (truncated) right edge of its age decile
    """
    # First pass only extracts the decile edges; the categorical itself is
    # discarded (the original code bound it to an unused variable and also
    # recomputed the edges redundantly on the second call).
    _, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True)

    # Second pass labels each row with the right-hand edge of its bin;
    # truncate the float edge labels to ints for modelling
    categories = pd.qcut(df['age'], q=10, precision=0, labels=edges[1:])
    df['age_bin'] = categories.astype(int)

    # Save the decile edges so the test pipeline can reuse identical bins
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')

    return df


def calc_df_med(df, data_path):
    """
    Calculate the medians for all columns in the dataset
    --------
    :param df: dataframe to update
    :param data_path: path to generated data
    :return: dataframe with null columns filled with median values and days_since
        median columns added to the dataframe
    """
    # Calculate median for all columns except SafeHavenID, year and ds_cols
    # NOTE(review): all_cols still contains ds_cols at this point, so the
    # groupby-median below includes them — confirm this is intended given
    # the separate calc_ds_med computation that follows.
    all_cols = df.columns
    all_cols = all_cols.drop(['SafeHavenID', 'eoy'])
    df_median = df[all_cols].groupby(demo_cols).median()

    # Calculate medians for ds_cols using the project helper; presumably
    # calc_ds_med returns one row of days_since medians per demographic
    # group — verify against utils.reduction
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)

    # Join ds_cols medians to median table and original dataframe
    df_median = df_median.join(ds_med)

    # Save medians for imputing testing data
    df_median.to_pickle(data_path + 'medians.pkl')

    # Rename and add to original dataframe; the '_med' suffix is applied
    # only after the pickle above, so the saved table keeps original names
    ds_med.columns += '_med'
    df = df.join(ds_med, on=demo_cols)

    return df


def ds_fill_5year_train(df, col):
    """
    Fill days_since_X columns where patient has been in the dataset less than
    5 years
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Patients with under five years of history get missing values replaced
    # by the column-wide maximum (effectively "as long ago as observed")
    short_history = df.ggc_years < 5
    fill_value = df[col].max()
    needs_fill = short_history & df[col].isna()
    df.loc[needs_fill, col] = fill_value

    return df


def scale_data_train(df, data_path, scaler):
    """
    Min-max scale final dataset
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    # Scale every column except the identifier and year columns
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])
    scaled_values = scaler.fit_transform(df[feature_cols].to_numpy())
    scaled_df = pd.DataFrame(scaled_values, columns=feature_cols)

    # Re-attach the unscaled id/year columns alongside the scaled features
    id_cols = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = id_cols.join(scaled_df)

    # Save the scaler for testing
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')

    return df_final


def main():
    """
    Run the training-data preparation pipeline: bin ages, impute nulls,
    convert days_since columns to integer day counts, and min-max scale
    the result. Reads paths from config.json and writes intermediate and
    final pickles under the configured model data path.
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    # Read in combined data
    df = pd.read_pickle(data_path + 'merged_train.pkl')

    # Calculate age bins (also saves the bin edges for the test pipeline)
    df = calc_age_bins_train(df, data_path)

    # Calculate medians for each column for imputation
    df = calc_df_med(df, data_path)

    # Fill null lab-result columns with the per-demographic-group median
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda x: x.fillna(x.median()))

    # Fill null days_since columns, then convert timedeltas to whole days
    day = np.timedelta64(1, 'D')
    # Save per-column maxima so the test pipeline can reuse the same fills
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        # Short-history patients: fill nulls with the column maximum
        df = ds_fill_5year_train(df, col)
        # Remaining nulls: fill with the demographic-group median
        df[col] = df[col].fillna(df[col + '_med'])
        df[col] = (df[col] / day).astype(int)

    # Save processed data before scaling
    df.to_pickle(data_path + 'filled_train.pkl')

    # Drop non-modelling columns
    df = df.drop(cols2drop, axis=1)

    # Initialize scaler
    scaler = MinMaxScaler()

    # Scale final dataset
    df_final = scale_data_train(df, data_path, scaler)

    # Save final dataset
    df_final.to_pickle(data_path + 'min_max_train.pkl')


# Guard the entry point so importing this module does not trigger the
# full pipeline (the original called main() unconditionally at import)
if __name__ == '__main__':
    main()