|
|
|
|
|
import re |
|
|
import yaml |
|
|
import pickle |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
|
|
|
from lifelines import LogLogisticAFTFitter |
|
|
from KaplanMeierEstimator import KaplanMeierEstimator |
|
|
|
|
|
import warnings |
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
class CanSave:
    '''
    Feature engineering for the Can-Save method.

    The constructor reads a YAML config-file and then loads the table of ICD-10
    groups and the pickled survival models whose paths are listed in that config.
    `feature_engineering` turns the EHR of one patient into a flat dict of features.
    '''

    def __load_object_by_pickle(self, path):
        '''
        Method to load a deserialized object from a pickle file.

        path: path of the pickle file.
        Returns the unpickled object.
        '''
        # NOTE: unpickling can execute arbitrary code, so `path` must point to a trusted file.
        # The context manager closes the file handle even when unpickling fails.
        with open(path, 'rb') as file:
            return pickle.load(file)

    def __load_config(self, config_path):
        '''Method to load the YAML config-file into `self.config`.'''
        with open(config_path, 'r') as file:
            self.config = yaml.safe_load(file)

    def __load_ICD10_groups(self):
        '''
        Method to load the table of ICD-10 groups into `self.groups`.

        The table is read from the Excel file at `config['path_icd10_groups']`.
        Rows without 'left_code' or 'right_code' are dropped and only rows
        with 'selected' == 1 are kept.
        '''
        path_icd10_groups = self.config['path_icd10_groups']
        groups = pd.read_excel(path_icd10_groups).dropna(subset=['left_code', 'right_code'])
        self.groups = groups[groups['selected'] == 1]

    def __load_survival_models(self):
        '''
        Method to load YOUR trained survival models into `self.survival_models`.

        DISCLAIMER!
        We do not make trained models publicly available.
        Therefore, we only provide an example of how survival models can be trained.
        Example is located in the "Example_How_To_Train_Survival_Models.py".
        '''
        # Every model is unpickled from the config entry named 'path_<model name>'.
        model_names = ['kaplan_meier_males', 'kaplan_meier_females', 'kaplan_meier_both', 'aft']
        self.survival_models = {
            name: self.__load_object_by_pickle(self.config[f'path_{name}'])
            for name in model_names
        }

    def __init__(self, CONFIG_PATH):
        '''
        Constructor.

        CONFIG_PATH: path of the YAML config-file.
        '''
        # The config must be loaded first: both loaders below read paths from it.
        self.__load_config(CONFIG_PATH)
        self.__load_ICD10_groups()
        self.__load_survival_models()

    def __check_russian_service_code(self, code):
        '''
        Method to check the medical service code used in the Russian Federation (e.g.: A04.16, B01.042).

        The code is upper-cased, a two-character section is zero-padded to three
        characters ('A4' -> 'A04') and the part after the first dot is brought to
        2 characters for 'A' codes or 3 characters for 'B' codes: shorter values are
        left-padded with zeros, longer values keep their last characters.
        A missing second part is treated as '00' before padding.

        Returns the normalized code as a string.
        Raises ValueError if the section contains neither 'A' nor 'B'
        or consists of a single character.
        '''
        code = str(code).upper()
        parts = code.split('.')
        first = parts[0]
        second = parts[1] if len(parts) > 1 else '00'

        if 'A' in first:
            length = 2
        elif 'B' in first:
            length = 3
        else:
            raise ValueError(f'Invalid medical code: {code}')

        if len(first) < 2:
            # A lone letter has no section number to pad; fail with the same
            # error as any other invalid code instead of a bare IndexError.
            raise ValueError(f'Invalid medical code: {code}')
        if len(first) < 3:
            first = first[0] + '0' + first[1]

        # Left-pad with zeros up to `length`, then keep the last `length` characters.
        second = second.rjust(length, '0')[-length:]

        return '{}.{}'.format(first, second)

    @staticmethod
    def __diagnosis_class(code):
        '''
        Return the leading upper-case latin letter of a diagnosis code.

        Raises ValueError if the code does not start with a letter from A to Z.
        '''
        match = re.match('[A-Z]', str(code))
        if match is None:
            raise ValueError(f'Invalid diagnosis code: {code}')
        return match.group(0)

    def __common_features(self, features, ehr, date_start, date_pred):
        '''
        Method to make common features.

        features: dict of features, it is updated in place and returned.
        ehr: non-empty pd.DataFrame with the columns 'date', 'code', 'is_diagnose'.
        date_start: left border of the observation window (default for the groups without records).
        date_pred: date of prediction.
        '''
        # Calendar features of the prediction date.
        features['month_pred'] = date_pred.month
        features['day_of_week'] = date_pred.isoweekday()

        # Volume and structure of the visits.
        visit_num = len(ehr)
        features['visit_num'] = visit_num

        diagnosis_prop = ehr['is_diagnose'].mean()
        features['diagnosis_prop'] = diagnosis_prop
        features['services_prop'] = 1.0 - diagnosis_prop

        first_visit = ehr['date'].min()
        last_visit = ehr['date'].max()
        features['weeks_after_first_visit'] = (date_pred - first_visit).days / 7.
        features['weeks_after_last_visit'] = (date_pred - last_visit).days / 7.

        # NOTE(review): despite the name the value is DAYS per visit (no division by 7).
        # It is kept as is because the feature values must not change silently.
        # The small constant only protects against the division by zero.
        features['weeks_betw_visits_avg'] = (last_visit - first_visit).days / (visit_num + 1.0e-14)

        # Number of records within the last N months (both borders are inclusive).
        for month in [1, 3, 6, 9, 12, 15, 18, 21, 24]:
            start_dt = date_pred - pd.DateOffset(months=month)
            in_window = (start_dt <= ehr['date']) & (ehr['date'] <= date_pred)
            features[f'visits_in_last_{month}_months'] = in_window.sum()

        # Diagnoses only. The explicit copy keeps the helper columns out of `ehr`.
        df = ehr[ehr['is_diagnose'] == 1].copy()
        df['code_short'] = df['code'].apply(lambda code: str(code).split('.')[0])

        group_bounds = list(zip(self.groups['left_code'], self.groups['right_code']))

        # Number of diagnoses per ICD-10 group (comparison by the part before the dot).
        for left_mkb, right_mkb in group_bounds:
            in_group = (left_mkb <= df['code_short']) & (df['code_short'] <= right_mkb)
            features['diagnose_group_{}_{}'.format(left_mkb, right_mkb)] = in_group.sum()

        features['unique_diagnosis'] = len(set(df['code_short']))

        df['code_class'] = df['code_short'].apply(self.__diagnosis_class)
        features['unique_classes_of_diagnosis'] = len(set(df['code_class']))

        # Binary flags of the groups.
        for left_mkb, right_mkb in group_bounds:
            count = features[f'diagnose_group_{left_mkb}_{right_mkb}']
            features[f'has_{left_mkb}_{right_mkb}'] = 1 if count > 0 else 0

        # Weeks from the first / the last diagnosis of each group.
        # A group without records gets the length of the whole observation window.
        weeks_default = (date_pred - date_start).days / 7.
        for left_mkb, right_mkb in group_bounds:
            # NOTE(review): here the FULL code is compared while the counters above use
            # 'code_short', so e.g. 'C97.1' is outside of the group 'C00'-'C97'.
            # It is kept as is because the feature values must not change silently.
            in_group = (left_mkb <= df['code']) & (df['code'] <= right_mkb)
            df_group = df[in_group]

            if len(df_group) > 0:
                weeks_from_first = (date_pred - df_group['date'].min()).days / 7.
                weeks_from_last = (date_pred - df_group['date'].max()).days / 7.
            else:
                weeks_from_first = weeks_default
                weeks_from_last = weeks_default

            features[f'weeks_from_first_{left_mkb}_{right_mkb}'] = weeks_from_first
            features[f'weeks_from_last_{left_mkb}_{right_mkb}'] = weeks_from_last

        return features

    def __regional_features(self, features, ehr, date_start, date_pred):
        '''
        Method to make features adopted to some regional specific characteristics.
        This method should be adopted for your country.

        features: dict of features (must contain 'month_pred'), it is updated in place and returned.
        ehr: pd.DataFrame with the columns 'code' and 'is_diagnose'.
        date_start, date_pred: are not used here, they keep the signature uniform.

        Raises ValueError if a service code is invalid.
        '''
        # Average values per calendar month (key is the number of the month).
        avg_humidity = {
            1: 88., 2: 84., 3: 74., 4: 65., 5: 64., 6: 68.,
            7: 72., 8: 69., 9: 77., 10: 81., 11: 88., 12: 91.
        }
        avg_temperature = {
            1: -7.2, 2: -5.2, 3: -0.5, 4: 8.1, 5: 15.5, 6: 18.0,
            7: 20.3, 8: 18.9, 9: 13.3, 10: 5.7, 11: 1.3, 12: -3.4
        }
        month_pred = features['month_pred']
        features['avg_humidity'] = avg_humidity[month_pred]
        features['avg_temperature'] = avg_temperature[month_pred]

        # Medical services only. The explicit copy keeps the helper column out of `ehr`.
        df = ehr[ehr['is_diagnose'] == 0].copy()
        df['code_mod'] = df['code'].apply(self.__check_russian_service_code)

        # NOTE(review): `str.contains` treats the pattern as a regular expression, so the
        # dot matches ANY character (e.g. '.01' also matches the section of 'A01.16').
        # It is kept as is because the feature values must not change silently;
        # pass `regex=False` if the literal matching is intended.

        # Number of services per section: 'A00.' ... 'A30.' and 'B01.' ... 'B05.'.
        sections = [f'A{num:02d}.' for num in range(30 + 1)] + ['B01.', 'B02.', 'B03.', 'B04.', 'B05.']
        for val in sections:
            features['service_group_{}XX'.format(val)] = df['code_mod'].str.contains(val).sum()

        # Number of services per second part: '.01' ... '.30' and '.001' ... '.070'.
        group1 = [f'.{num:02d}' for num in range(1, 30 + 1)]
        group2 = [f'.{num:03d}' for num in range(1, 70 + 1)]
        for val in group1 + group2:
            features['service_group_Axx{}'.format(val)] = df['code_mod'].str.contains(val).sum()

        return features

    def __kaplan_meier_features(self, features, model, label, age, horizons):
        '''
        Add S(AGE + horizon) and |S(AGE + horizon) - S(AGE)| of one Kaplan-Meier model.

        model: callable, it is called with the age in years.
        label: part of the feature names ('BOTH' or 'SEX').
        horizons: horizons in months.
        '''
        s_age = model(age)
        for horizon in horizons:
            s = model(age + horizon / 12.)
            features[f'KaplanMeier_{label}_S(AGE+{horizon}M)'] = s
            features[f'KaplanMeier_{label}_|S(AGE+{horizon}M)-S(AGE)|'] = abs(s - s_age)

    def __survival_analysis_features(self, features):
        '''
        Method to make features based on the trained survival models.

        features: dict of features, it must contain 'sex', 'actual_age' and every
                  covariate of the AFT model; it is updated in place and returned.
        '''
        sex = features['sex']
        age = features['actual_age']
        H = list(range(0, 24 + 1))  # horizons in months

        # Kaplan-Meier model fitted for both sexes.
        self.__kaplan_meier_features(features, self.survival_models['kaplan_meier_both'], 'BOTH', age, H)

        # Kaplan-Meier model of the patient's sex (1 is male).
        model_name = 'kaplan_meier_males' if sex == 1 else 'kaplan_meier_females'
        self.__kaplan_meier_features(features, self.survival_models[model_name], 'SEX', age, H)

        # AFT model: one row of covariates, the survival function is taken at AGE + horizon.
        model = self.survival_models['aft']['model']
        covariates = self.survival_models['aft']['covariates']

        df = pd.DataFrame([{key: features[key] for key in covariates}])
        times = [age + val / 12. for val in H]
        survivals = model.predict_survival_function(df, times=times)

        # Column 0 is the curve of the single row of `df`; its first value is S(AGE).
        curve = survivals[0].to_numpy()
        s_age = curve[0]
        for horizon, s in zip(H, curve):
            features[f'AFT_S(AGE+{horizon}M)'] = s
            features[f'AFT_|S(AGE+{horizon}M)-S(AGE)|'] = abs(s - s_age)

        return features

    def feature_engineering(self,
                            sex: str,
                            birth_date: str,
                            ehr: pd.DataFrame,
                            date_pred: str,
                            deep_weeks: int
                            ) -> dict:
        '''
        Method to make feature engineering for the Can-Save method.

        sex: str            # 'F' - FEMALE, 'M' - MALE
        birth_date: str     # 'YYYY-MM-DD'
        ehr: pd.DataFrame   # Example is located in './EHR/id_26.csv'
        date_pred: str      # 'YYYY-MM-DD'
        deep_weeks: int     # deep_weeks > 0

        Only the records within [date_pred - deep_weeks, date_pred] are used.
        The passed `ehr` is not modified.

        Returns the dict of features.
        Raises KeyError if `ehr` has no 'date', 'code' or 'is_diagnose' column,
        ValueError if a service code or a diagnosis code is invalid.
        '''
        set_of_required_columns = {'date', 'code', 'is_diagnose'}
        missing_columns = set_of_required_columns.difference(set(ehr.columns))
        if len(missing_columns) > 0:
            raise KeyError(f'There are missing columns: {missing_columns}.')

        # `assign` returns a new frame, so the caller's DataFrame is left untouched.
        ehr = ehr.assign(date=pd.to_datetime(ehr['date'], format='%Y-%m-%d'))
        date_pred = pd.to_datetime([date_pred], format='%Y-%m-%d')[0]
        date_start = date_pred - pd.DateOffset(weeks=deep_weeks)

        # 1 is male, 0 is female. The check of the prefix keeps 'F' / 'M' unchanged
        # and does not take 'FEMALE' for a male (it contains the letter 'M').
        sex = 1 if sex.strip().upper().startswith('M') else 0
        birth_date = pd.to_datetime([birth_date], format='%Y-%m-%d')[0]

        # Keep the observation window only (both borders are inclusive).
        in_window = (date_start <= ehr['date']) & (ehr['date'] <= date_pred)
        ehr = ehr[in_window].copy()

        if len(ehr) == 0:
            # No records in the window: a single placeholder service on the prediction
            # date keeps the aggregations below defined.
            ehr = pd.DataFrame([{'date': date_pred, 'code': 'A00.00', 'is_diagnose': 0}])

        days_per_year = (365 * 3 + 366) / 4.
        features = {
            'sex': sex,
            'actual_age': (date_pred - birth_date).days / days_per_year,
        }

        features = self.__common_features(features, ehr, date_start, date_pred)
        features = self.__regional_features(features, ehr, date_start, date_pred)
        features = self.__survival_analysis_features(features)

        return features
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    # Example of usage: build the feature extractor from the config-file.
    config_path = './CONFIG_CanSave.yaml'
    cs = CanSave(CONFIG_PATH=config_path)

    # help() prints the documentation itself and returns None,
    # so wrapping it into print() only adds a stray "None" line.
    help(cs)

    # EHR of one patient; sex and birth date are taken from its first row.
    path_ehr = './EHR/id_26.csv'
    ehr = pd.read_csv(path_ehr, sep=';').set_index('patient_id')
    sex = ehr['sex'].iloc[0]
    birth_date = ehr['birth_date'].iloc[0]

    features = cs.feature_engineering(
        sex=sex,
        birth_date=birth_date,
        ehr=ehr,
        date_pred='2022-01-01',
        deep_weeks=108,
    )
|
|
|