File size: 14,281 Bytes
12ac942 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 |
# required libraries
import re
import yaml
import pickle
import numpy as np
import pandas as pd
from lifelines import LogLogisticAFTFitter
from KaplanMeierEstimator import KaplanMeierEstimator
import warnings
warnings.filterwarnings('ignore')
class CanSave:
'''Object of the Can-Save method for feature engineering.'''
def __load_object_by_pickle(self, path):
'''Method to load a deserializated file.'''
loaded_obj = pickle.load(open(path, 'rb'))
return loaded_obj
def __load_config(self, config_path):
'''Method to load config-file.'''
with open(config_path, 'r') as file:
self.config = yaml.safe_load(file)
def __load_ICD10_groups(self):
'''Method to load the table of ICD-10 groups.'''
path_icd10_groups = self.config['path_icd10_groups']
groups = pd.read_excel(path_icd10_groups).dropna(subset=['left_code', 'right_code'])
groups = groups[groups['selected'] == 1]
self.groups = groups
def __load_survival_models(self):
'''Method to load YOUR trained survival models.'''
'''
DISCLAIMER!
We do not make trained models publicly available.
Therefore, we only provide an example of how survival models can be trained.
Example is located in the "Example_How_To_Train_Survival_Models.py".
'''
# Load trained survival models
kaplan_meier_males = self.__load_object_by_pickle(self.config['path_kaplan_meier_males'])
kaplan_meier_females = self.__load_object_by_pickle(self.config['path_kaplan_meier_females'])
kaplan_meier_both = self.__load_object_by_pickle(self.config['path_kaplan_meier_both'])
aft = self.__load_object_by_pickle(self.config['path_aft'])
# Form dict of survival models
self.survival_models = {
'kaplan_meier_males': kaplan_meier_males,
'kaplan_meier_females': kaplan_meier_females,
'kaplan_meier_both': kaplan_meier_both,
'aft': aft,
}
def __init__(self, CONFIG_PATH):
'''Constructor.'''
# Load config-file
self.__load_config(CONFIG_PATH)
# Load table of ICD-10 groups
self.__load_ICD10_groups()
# Load trained survival models
self.__load_survival_models()
def __check_russian_service_code(self, code):
'''Method to check the medical service code used in the Russian Federation (e.g.: A04.16, B01.042).'''
code = str(code).upper()
mas = code.split('.')
first = mas[0]
second = mas[1] if len(mas) > 1 else '00'
if 'A' in first:
length = 2
elif 'B' in first:
length = 3
else:
raise ValueError(f'Invalid medical code: {code}')
if len(first) < 3:
first = first[0] + '0' + first[1]
if len(second) == length:
pass
elif len(second) > length:
second = second[-length:]
else:
while len(second) != length:
second = '0' + second
code = '{}.{}'.format(first, second)
return code
def __common_features(self, features, ehr, date_start, date_pred):
'''Method to make common features.'''
# MONTH OF THE PREDICTION
month_pred = date_pred.month
features['month_pred'] = month_pred
# DAY OF WEEK
features['day_of_week'] = date_pred.isoweekday()
# VISIT_NUM
visit_num = len(ehr)
features['visit_num'] = visit_num
# PROPORTION OF DIAGNOSIS AND SERVICES
features['diagnosis_prop'] = ehr['is_diagnose'].mean()
features['services_prop'] = 1.0 - ehr['is_diagnose'].mean()
# WEEKS AFTER FIRST VISIT
features['weeks_after_first_visit'] = (date_pred - ehr['date'].min()).days / 7.
# WEEKS AFTER LAST VISIT
features['weeks_after_last_visit'] = (date_pred - ehr['date'].max()).days / 7.
# AVG. WEEKS BETWEEN VISITS
features['weeks_betw_visits_avg'] = (ehr['date'].max() - ehr['date'].min()).days / (visit_num + 1.0e-14)
# VISITS IN LAST N MONTHS
for month in [1, 3, 6, 9, 12, 15, 18, 21, 24]:
end_dt = date_pred
start_dt = date_pred - pd.DateOffset(months=month)
mask = (start_dt <= ehr['date']) & (ehr['date'] <= end_dt)
new_col = f'visits_in_last_{month}_months'
features[new_col] = mask.sum()
# DIAGNOSE GROUPS
mask = ehr['is_diagnose'] == 1
df = ehr[mask]
df['code_short'] = df['code'].apply(lambda code: str(code).split('.')[0])
for left_mkb, right_mkb in zip(self.groups['left_code'], self.groups['right_code']):
mask = (left_mkb <= df['code_short']) & (df['code_short'] <= right_mkb)
new_col = 'diagnose_group_{}_{}'.format(left_mkb, right_mkb)
features[new_col] = mask.sum()
# UNIQUE DIAGNOSIS
features['unique_diagnosis'] = len(set(df['code_short']))
# UNIQUE CLASSES OF DIAGNOSIS
df['code_class'] = df['code_short'].apply(lambda code: re.match('[A-Z]', str(code)).group(0))
features['unique_classes_of_diagnosis'] = len(set(df['code_class']))
# BINARY (1 - Diagnosis group has been in the EHR, 0 - has not been in the EHR)
for left_mkb, right_mkb in zip(self.groups['left_code'], self.groups['right_code']):
feature = f'diagnose_group_{left_mkb}_{right_mkb}'
val = features[feature]
new_col = f'has_{left_mkb}_{right_mkb}'
features[new_col] = 1 if val > 0 else 0
# WEEKS FROM LAST AND FIRST OCCURRENCE OF DIAGNOSIS GROUP
for left_mkb, right_mkb in zip(self.groups['left_code'], self.groups['right_code']):
mask = (left_mkb <= df['code']) & (df['code'] <= right_mkb)
df_group = df[mask]
new_col = f'weeks_from_first_{left_mkb}_{right_mkb}'
if len(df_group) > 0:
features[new_col] = (date_pred - df_group['date'].min()).days / 7.
else:
features[new_col] = (date_pred - date_start).days / 7.
new_col = f'weeks_from_last_{left_mkb}_{right_mkb}'
if len(df_group) > 0:
features[new_col] = (date_pred - df_group['date'].max()).days / 7.
else:
features[new_col] = (date_pred - date_start).days / 7.
return features
def __regional_features(self, features, ehr, date_start, date_pred):
'''
Method to make features adopted to some regional specific characteristics.
This method should be adopted for your country.
'''
# AVG. HUMIDITY AT MONTH FOR THE PREDICTION IN RUSSIA
avg_humidity = {
1: 88., 2: 84., 3: 74., 4: 65., 5: 64., 6: 68.,
7: 72., 8: 69., 9: 77., 10: 81., 11: 88., 12: 91.
}
month_pred = features['month_pred']
features['avg_humidity'] = avg_humidity[month_pred]
# AVG. TEMPERATURE AT MONTH FOR THE PREDICTION IN RUSSIA
avg_temperature = {
1: -7.2, 2: -5.2, 3: -0.5, 4: 8.1, 5: 15.5, 6: 18.0,
7: 20.3, 8: 18.9, 9: 13.3, 10: 5.7, 11: 1.3, 12: -3.4
}
features['avg_temperature'] = avg_temperature[month_pred]
# SERVICE GROUPS (in according to ORDER 804n of the Ministry of Health of RUSSIA)
mask = ehr['is_diagnose'] == 0
df = ehr[mask]
df['code_mod'] = df['code'].apply(self.__check_russian_service_code)
# type of medical service
codes = [f'A0{num}.' if num < 10 else f'A{num}.' for num in range(30+1)] + ['B01.','B02.','B03.','B04.','B05.']
for val in codes:
mask = df['code_mod'].str.contains(val)
new_col = 'service_group_{}XX'.format(val)
features[new_col] = mask.sum()
# anatomical and functional area of medical service & list of medical specialties
group1 = [f'.0{num}' if num < 10 else f'.{num}' for num in range(1, 30+1)]
group2 = [f'.00{num}' if num < 10 else f'.0{num}' for num in range(1, 70+1)]
codes = group1 + group2
for val in codes:
mask = df['code_mod'].str.contains(val)
new_col = 'service_group_Axx{}'.format(val)
features[new_col] = mask.sum()
return features
def __survival_analysis_features(self, features):
'''Method to make features based on the trained survival models.'''
# Get sociodemographics parameters of the patient
sex = features['sex']
age = features['actual_age']
H = list(range(0, 24+1))
# Make features based on the fitted Kaplan-Meier estimators (Males & Females)
model = self.survival_models['kaplan_meier_both']
s_age = model(age)
for horizon in H:
# Survival risk prediction S(t) at the t = AGE
t = age + horizon/12.
s = model(t)
new_col = f'KaplanMeier_BOTH_S(AGE+{horizon}M)'
features[new_col] = s
# Difference |S(t+horizon) - S(t)|, where t = AGE
ds = s - s_age
new_col = f'KaplanMeier_BOTH_|S(AGE+{horizon}M)-S(AGE)|'
features[new_col] = abs(ds)
# Make sex-oriented features based on the fitted Kaplan-Meier estimators (Males and Females)
model = self.survival_models['kaplan_meier_males'] if sex == 1 else self.survival_models['kaplan_meier_females']
s_age = model(age)
for horizon in H:
# Survival risk prediction S(t) at the t = AGE
t = age + horizon/12.
s = model(t)
new_col = f'KaplanMeier_SEX_S(AGE+{horizon}M)'
features[new_col] = s
# Difference |S(t+horizon) - S(t)|, where t = AGE
ds = s - s_age
new_col = f'KaplanMeier_SEX_|S(AGE+{horizon}M)-S(AGE)|'
features[new_col] = abs(ds)
# Make features based on the trained AFT model
model = self.survival_models['aft']['model']
covariates = self.survival_models['aft']['covariates']
df = pd.DataFrame([{key:features[key] for key in covariates}])
times = [age + val/12. for val in H]
survivals = model.predict_survival_function(df, times=times)
s_age = survivals[0].iloc[0]
for i in range(len(survivals)):
# Survival risk prediction S(t) at the t = AGE
horizon = H[i]
s = survivals[0].iloc[i]
new_col = f'AFT_S(AGE+{horizon}M)'
features[new_col] = s
# Difference |S(t+horizon) - S(t)|, where t = AGE
ds = s - s_age
new_col = f'AFT_|S(AGE+{horizon}M)-S(AGE)|'
features[new_col] = abs(ds)
return features
def feature_engineering(self,
sex: str, # 'F' - FEMALE, 'M' - MALE
birth_date: str, # 'YYYY-MM-DD'
ehr: pd.DataFrame, # Example is located in './EHR/id_26.csv'
date_pred: str, # 'YYYY-MM-DD'
deep_weeks: int # > 0
):
'''
Method to make feature engineering for the Can-Save method.
sex: str # 'F' - FEMALE, 'M' - MALE
birth_date: str # 'YYYY-MM-DD'
ehr: pd.DataFrame # Example is located in './EHR/id_26.csv'
date_pred: str # 'YYYY-MM-DD'
deep_weeks: int # deep_weeks > 0
'''
# Check the presence of the required columns in the EHR
set_of_required_columns = {'date', 'code', 'is_diagnose'}
missing_columns = set_of_required_columns.difference(set(ehr.columns))
if len(missing_columns) > 0:
raise KeyError(f'There are missing columns: {missing_columns}.')
# Prepare dates
ehr['date'] = pd.to_datetime(ehr['date'], format='%Y-%m-%d')
date_pred = pd.to_datetime([date_pred], format='%Y-%m-%d')[0]
date_start = date_pred - pd.DateOffset(weeks=deep_weeks)
# Get sociodemographic parameters
sex = 1 if 'M' in sex else 0
birth_date = pd.to_datetime([birth_date], format='%Y-%m-%d')[0]
# Select medical events in the certain period
mask = (date_start <= ehr['date']) & (ehr['date'] <= date_pred)
ehr = ehr[mask]
# if EHR is empty, then we add a special medical service (placeholder) => EHR is always not empty
if len(ehr) == 0:
ehr.loc[0] = {'date': date_pred, 'code': 'A00.00', 'is_diagnose': 0}
# Form the list of features for risk estimation
days_per_year = (365 * 3 + 366) / 4.
features = {
'sex': sex, # SEX (0 - FEMALE, 1 - MALE)
'actual_age': (date_pred - birth_date).days / days_per_year, # ACTUAL AGE
}
# Make common features
features = self.__common_features(features, ehr, date_start, date_pred)
# Make features adopted to some regional specific characteristics
features = self.__regional_features(features, ehr, date_start, date_pred)
# Make features based on the trained survival models
features = self.__survival_analysis_features(features)
return features
# entry point
if __name__ == '__main__':
# Make new object for feature engineering
config_path = './CONFIG_CanSave.yaml'
cs = CanSave(CONFIG_PATH=config_path)
print(help(cs))
# Load the patient's EHR
path_ehr = './EHR/id_26.csv'
ehr = pd.read_csv(path_ehr, sep=';').set_index('patient_id')
sex = ehr['sex'].iloc[0]
birth_date = ehr['birth_date'].iloc[0]
# Make feature engineering for the risk prediction
features = cs.feature_engineering(
sex = sex, # sex of the patient
birth_date = birth_date, # birth date of the patient
ehr = ehr, # Electronic Health Records of the patient
date_pred = '2022-01-01', # date of the risk estimation
deep_weeks = 108 # deep of the EHR's history (in weeks)
)
|