File size: 14,281 Bytes
12ac942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# required libraries
import re
import yaml
import pickle

import numpy as np
import pandas as pd

from lifelines import LogLogisticAFTFitter
from KaplanMeierEstimator import KaplanMeierEstimator

import warnings
warnings.filterwarnings('ignore')

class CanSave:
    '''Object of the Can-Save method for feature engineering.

    On construction the object reads the YAML config file, the table of ICD-10
    groups and the pickled survival models referenced by that config.
    The public method `feature_engineering` turns the EHR of one patient into
    a flat dict of features.
    '''
    def __load_object_by_pickle(self, path):
        '''Method to load and return the object deserialized from the pickle file `path`.'''
        # NOTE(review): unpickling can execute arbitrary code; the paths in the
        # config must only point to model files from a trusted source.
        with open(path, 'rb') as file:
            return pickle.load(file)

    def __load_config(self, config_path):
        '''Method to load the YAML config-file into `self.config`.'''
        with open(config_path, 'r') as file:
            self.config = yaml.safe_load(file)

    def __load_ICD10_groups(self):
        '''
            Method to load the table of ICD-10 groups into `self.groups`.
            Rows without 'left_code' or 'right_code' are dropped and only rows
            with 'selected' == 1 are kept.
        '''
        table = pd.read_excel(self.config['path_icd10_groups'])
        table = table.dropna(subset=['left_code', 'right_code'])
        self.groups = table[table['selected'] == 1]

    def __load_survival_models(self):
        '''
            Method to load YOUR trained survival models into `self.survival_models`.

            DISCLAIMER!
            We do not make trained models publicly available.
            Therefore, we only provide an example of how survival models can be trained.
            Example is located in the "Example_How_To_Train_Survival_Models.py".
        '''
        # Every model is stored in the config under the key 'path_<model name>'
        model_names = ('kaplan_meier_males', 'kaplan_meier_females', 'kaplan_meier_both', 'aft')
        self.survival_models = {
            name: self.__load_object_by_pickle(self.config[f'path_{name}'])
            for name in model_names
        }

    def __init__(self, CONFIG_PATH):
        '''Constructor: loads the config-file, the ICD-10 groups and the survival models.'''
        # Load config-file
        self.__load_config(CONFIG_PATH)
        # Load table of ICD-10 groups
        self.__load_ICD10_groups()
        # Load trained survival models
        self.__load_survival_models()

    def __check_russian_service_code(self, code):
        '''
            Method to normalize the medical service code used in the Russian Federation
            (e.g.: A04.16, B01.042).

            The code is upper-cased and split by '.'; only the first two parts are used.
            A two-character type part gets a '0' inserted after its letter ('A4' -> 'A04').
            The second part is cut (keeping its last characters) or left-padded with '0'
            to 2 characters for 'A' codes and to 3 characters for 'B' codes; a missing
            second part is treated as '00'.

            Raises ValueError if the type part contains neither 'A' nor 'B' or consists
            of a single character.
        '''
        code = str(code).upper()
        parts = code.split('.')
        first = parts[0]
        second = parts[1] if len(parts) > 1 else '00'

        if 'A' in first:
            length = 2
        elif 'B' in first:
            length = 3
        else:
            raise ValueError(f'Invalid medical code: {code}')

        # A single letter carries no type number (previously failed with IndexError)
        if len(first) < 2:
            raise ValueError(f'Invalid medical code: {code}')
        if len(first) < 3:
            first = first[0] + '0' + first[1]

        if len(second) > length:
            second = second[-length:]
        else:
            second = second.rjust(length, '0')

        return f'{first}.{second}'

    def __common_features(self, features, ehr, date_start, date_pred):
        '''
            Method to make common features.
            Adds calendar features of the prediction date, visit statistics and
            features of the ICD-10 diagnosis groups to `features` and returns it.
            `ehr` must be non-empty and contain the columns 'date', 'code', 'is_diagnose'.
        '''
        # MONTH OF THE PREDICTION
        features['month_pred'] = date_pred.month

        # DAY OF WEEK
        features['day_of_week'] = date_pred.isoweekday()

        # VISIT_NUM
        visit_num = len(ehr)
        features['visit_num'] = visit_num

        # PROPORTION OF DIAGNOSIS AND SERVICES
        diagnosis_prop = ehr['is_diagnose'].mean()
        features['diagnosis_prop'] = diagnosis_prop
        features['services_prop'] = 1.0 - diagnosis_prop

        first_visit = ehr['date'].min()
        last_visit = ehr['date'].max()

        # WEEKS AFTER FIRST VISIT
        features['weeks_after_first_visit'] = (date_pred - first_visit).days / 7.

        # WEEKS AFTER LAST VISIT
        features['weeks_after_last_visit'] = (date_pred - last_visit).days / 7.

        # AVG. WEEKS BETWEEN VISITS
        # NOTE(review): despite the name the value is in DAYS per visit (no division by 7).
        # Kept as is, because trained models may expect exactly this value - confirm.
        features['weeks_betw_visits_avg'] = (last_visit - first_visit).days / (visit_num + 1.0e-14)

        # VISITS IN LAST N MONTHS
        for month in [1, 3, 6, 9, 12, 15, 18, 21, 24]:
            start_dt = date_pred - pd.DateOffset(months=month)
            mask = (start_dt <= ehr['date']) & (ehr['date'] <= date_pred)
            features[f'visits_in_last_{month}_months'] = mask.sum()

        # DIAGNOSE GROUPS
        # explicit copy: new columns are added below and must not touch the caller's frame
        df = ehr[ehr['is_diagnose'] == 1].copy()
        df['code_short'] = df['code'].apply(lambda code: str(code).split('.')[0])

        group_bounds = list(zip(self.groups['left_code'], self.groups['right_code']))
        for left_mkb, right_mkb in group_bounds:
            mask = (left_mkb <= df['code_short']) & (df['code_short'] <= right_mkb)
            features[f'diagnose_group_{left_mkb}_{right_mkb}'] = mask.sum()

        # UNIQUE DIAGNOSIS
        features['unique_diagnosis'] = len(set(df['code_short']))

        # UNIQUE CLASSES OF DIAGNOSIS
        # NOTE: re.match returns None (=> AttributeError) for a code that does not start with A-Z
        df['code_class'] = df['code_short'].apply(lambda code: re.match('[A-Z]', str(code)).group(0))
        features['unique_classes_of_diagnosis'] = len(set(df['code_class']))

        # BINARY (1 - Diagnosis group has been in the EHR, 0 - has not been in the EHR)
        for left_mkb, right_mkb in group_bounds:
            count = features[f'diagnose_group_{left_mkb}_{right_mkb}']
            features[f'has_{left_mkb}_{right_mkb}'] = 1 if count > 0 else 0

        # WEEKS FROM LAST AND FIRST OCCURRENCE OF DIAGNOSIS GROUP
        # value used when the group has never occurred: the whole depth of the history
        weeks_of_history = (date_pred - date_start).days / 7.
        for left_mkb, right_mkb in group_bounds:
            # NOTE(review): the FULL code is compared here, while the group counters above
            # compare 'code_short'; e.g. 'C97.1' is greater than the right border 'C97'.
            # Kept as is, because trained models may expect exactly this value - confirm.
            mask = (left_mkb <= df['code']) & (df['code'] <= right_mkb)
            df_group = df[mask]

            if len(df_group) > 0:
                weeks_from_first = (date_pred - df_group['date'].min()).days / 7.
                weeks_from_last = (date_pred - df_group['date'].max()).days / 7.
            else:
                weeks_from_first = weeks_of_history
                weeks_from_last = weeks_of_history

            features[f'weeks_from_first_{left_mkb}_{right_mkb}'] = weeks_from_first
            features[f'weeks_from_last_{left_mkb}_{right_mkb}'] = weeks_from_last

        return features

    def __regional_features(self, features, ehr, date_start, date_pred):
        '''
            Method to make features adopted to some regional specific characteristics.
            This method should be adopted for your country.
            Reads features['month_pred'], adds climate features and counters of the
            medical service groups to `features` and returns it.
        '''
        month_pred = features['month_pred']

        # AVG. HUMIDITY AT MONTH FOR THE PREDICTION IN RUSSIA
        avg_humidity = {
            1: 88., 2: 84., 3: 74., 4: 65., 5: 64., 6: 68.,
            7: 72., 8: 69., 9: 77., 10: 81., 11: 88., 12: 91.
        }
        features['avg_humidity'] = avg_humidity[month_pred]

        # AVG. TEMPERATURE AT MONTH FOR THE PREDICTION IN RUSSIA
        avg_temperature = {
            1: -7.2, 2: -5.2, 3: -0.5, 4: 8.1, 5: 15.5, 6: 18.0,
            7: 20.3, 8: 18.9, 9: 13.3, 10: 5.7, 11: 1.3, 12: -3.4
        }
        features['avg_temperature'] = avg_temperature[month_pred]

        # SERVICE GROUPS (in according to ORDER 804n of the Ministry of Health of RUSSIA)
        # explicit copy: a new column is added below and must not touch the caller's frame
        df = ehr[ehr['is_diagnose'] == 0].copy()
        df['code_mod'] = df['code'].apply(self.__check_russian_service_code)

        # NOTE(review): str.contains treats the pattern as a regular expression, so '.'
        # matches ANY character (e.g. '.01' also matches 'A01.16'), and the match is a
        # substring match, not an exact one. Kept as is, because trained models may
        # expect exactly these counters - confirm.

        # type of medical service
        codes = [f'A{num:02d}.' for num in range(30+1)] + ['B01.', 'B02.', 'B03.', 'B04.', 'B05.']
        for val in codes:
            features[f'service_group_{val}XX'] = df['code_mod'].str.contains(val).sum()

        # anatomical and functional area of medical service & list of medical specialties
        group1 = [f'.{num:02d}' for num in range(1, 30+1)]
        group2 = [f'.{num:03d}' for num in range(1, 70+1)]
        for val in group1 + group2:
            features[f'service_group_Axx{val}'] = df['code_mod'].str.contains(val).sum()

        return features

    def __add_survival_features(self, features, prefix, horizons, survivals, s_age):
        '''
            Method to add, for every pair (horizon, S) of `horizons` and `survivals`,
            the features '<prefix>_S(AGE+<horizon>M)' = S and
            '<prefix>_|S(AGE+<horizon>M)-S(AGE)|' = |S - s_age|.
        '''
        for horizon, s in zip(horizons, survivals):
            # Survival prediction S(t) at the t = AGE + horizon
            features[f'{prefix}_S(AGE+{horizon}M)'] = s
            # Difference |S(t+horizon) - S(t)|, where t = AGE
            features[f'{prefix}_|S(AGE+{horizon}M)-S(AGE)|'] = abs(s - s_age)

    def __survival_analysis_features(self, features):
        '''
            Method to make features based on the trained survival models.
            Reads features['sex'], features['actual_age'] and the covariates of the
            AFT model from `features`, adds the survival features and returns it.
        '''
        # Get sociodemographics parameters of the patient
        sex = features['sex']
        age = features['actual_age']
        H = list(range(0, 24+1))    # horizons in months

        # Make features based on the fitted Kaplan-Meier estimator (Males & Females)
        model = self.survival_models['kaplan_meier_both']
        s_age = model(age)
        self.__add_survival_features(
            features, 'KaplanMeier_BOTH', H, (model(age + horizon/12.) for horizon in H), s_age
        )

        # Make sex-oriented features based on the fitted Kaplan-Meier estimators (Males or Females)
        model = self.survival_models['kaplan_meier_males' if sex == 1 else 'kaplan_meier_females']
        s_age = model(age)
        self.__add_survival_features(
            features, 'KaplanMeier_SEX', H, (model(age + horizon/12.) for horizon in H), s_age
        )

        # Make features based on the trained AFT model
        model = self.survival_models['aft']['model']
        covariates = self.survival_models['aft']['covariates']

        df = pd.DataFrame([{key: features[key] for key in covariates}])
        times = [age + val/12. for val in H]
        survivals = model.predict_survival_function(df, times=times)

        # column 0 is the single patient (row 0 of df); its rows follow `times`
        curve = survivals[0]
        self.__add_survival_features(features, 'AFT', H, curve.to_numpy(), curve.iloc[0])

        return features

    def feature_engineering(self,
            sex:        str,            # 'F' - FEMALE, 'M' - MALE
            birth_date: str,            # 'YYYY-MM-DD'
            ehr:        pd.DataFrame,   # Example is located in './EHR/id_26.csv'
            date_pred:  str,            # 'YYYY-MM-DD'
            deep_weeks: int             # > 0
    ):
        '''
            Method to make feature engineering for the Can-Save method.
            sex:              str      # 'F' - FEMALE, 'M' - MALE
            birth_date:       str      # 'YYYY-MM-DD'
            ehr:         pd.DataFrame  # Example is located in './EHR/id_26.csv'
            date_pred:        str      # 'YYYY-MM-DD'
            deep_weeks:       int      # deep_weeks > 0

            Only the medical events from the last `deep_weeks` weeks before `date_pred`
            (both borders included) are used. The passed `ehr` is not modified.
            Returns the dict {feature name: value}.
            Raises KeyError if `ehr` has no column 'date', 'code' or 'is_diagnose'.
        '''
        # Check the presence of the required columns in the EHR
        required_columns = {'date', 'code', 'is_diagnose'}
        missing_columns = required_columns.difference(ehr.columns)
        if missing_columns:
            raise KeyError(f'There are missing columns: {missing_columns}.')

        # Prepare dates (assign returns a new frame => the caller's EHR stays untouched)
        ehr = ehr.assign(date=pd.to_datetime(ehr['date'], format='%Y-%m-%d'))
        date_pred = pd.to_datetime([date_pred], format='%Y-%m-%d')[0]
        date_start = date_pred - pd.DateOffset(weeks=deep_weeks)

        # Get sociodemographic parameters
        # NOTE: substring test => any value containing a capital 'M' is treated as MALE
        sex = 1 if 'M' in sex else 0
        birth_date = pd.to_datetime([birth_date], format='%Y-%m-%d')[0]

        # Select medical events in the certain period
        in_period = (date_start <= ehr['date']) & (ehr['date'] <= date_pred)
        ehr = ehr[in_period]

        # if EHR is empty, then we use a special medical service (placeholder) => EHR is always not empty
        if len(ehr) == 0:
            ehr = pd.DataFrame({'date': [date_pred], 'code': ['A00.00'], 'is_diagnose': [0]})

        # Form the list of features for risk estimation
        days_per_year = (365 * 3 + 366) / 4.
        features = {
            'sex': sex,                                                     # SEX (0 - FEMALE, 1 - MALE)
            'actual_age': (date_pred - birth_date).days / days_per_year,    # ACTUAL AGE
        }

        # Make common features
        features = self.__common_features(features, ehr, date_start, date_pred)

        # Make features adopted to some regional specific characteristics
        features = self.__regional_features(features, ehr, date_start, date_pred)

        # Make features based on the trained survival models
        features = self.__survival_analysis_features(features)

        return features

# entry point: example of the feature engineering for one patient
if __name__ == '__main__':
    # Make new object for feature engineering
    config_path = './CONFIG_CanSave.yaml'
    cs = CanSave(CONFIG_PATH=config_path)
    # help() prints the documentation itself and returns None,
    # so it must not be wrapped into print() (that printed a stray "None")
    help(cs)

    # Load the patient's EHR (the sex and the birth date are taken from its first row)
    path_ehr = './EHR/id_26.csv'
    ehr = pd.read_csv(path_ehr, sep=';').set_index('patient_id')
    sex = ehr['sex'].iloc[0]
    birth_date = ehr['birth_date'].iloc[0]

    # Make feature engineering for the risk prediction
    features = cs.feature_engineering(
        sex         = sex,              # sex of the patient
        birth_date  = birth_date,       # birth date of the patient
        ehr         = ehr,              # Electronic Health Records of the patient
        date_pred   = '2022-01-01',     # date of the risk estimation
        deep_weeks  = 108               # deep of the EHR's history (in weeks)
    )