File size: 5,274 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Process SMR01 comorbidities data
--------
Clean and process comorbidities, tracking specific comorbidities and returning
the total number of comorbidities per patient per year
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import track_event
from utils.adm_common import initialize_adm_data, correct_stays
from utils.comorb_processing import diagnosis_mapping_lists


def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Flag comorbidities on every admission row.
    The six diagnosis description columns are checked against the mapping
    built from the supplied sheet, and one 0/1 column is written to df for
    each key in that mapping.
    --------
    :param df: pandas dataframe holding DIAG1Desc to DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the same dataframe, updated with one column per mapping key
    """
    print('Tracking comorbidities')

    # Build the lookup of comorbidity name -> mapping entries
    lookup = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # Snapshot of the diagnosis description columns only
    desc_cols = [
        'DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc',
        'DIAG4Desc', 'DIAG5Desc', 'DIAG6Desc',
    ]
    descriptions = df[desc_cols]

    # One indicator column per comorbidity: 1 if any description column
    # is flagged by track_event for that row, otherwise 0
    for name in lookup:
        terms = lookup[name]
        hits = descriptions.apply(lambda col: track_event(col, terms, False))
        df[name] = hits.any(axis=1).astype(int)

    return df


def fill_comorbidities(df, diag_names):
    """
    Fill comorbidities forward
    Every zero in the diagnosis columns is replaced by the closest non-zero
    value found in an earlier row of the same column. Zeros with nothing
    non-zero above them are left as zeros, and the row order of df is used
    as given.
    Implemented with mask/ffill because the ``method`` keyword of
    ``DataFrame.replace`` has been deprecated since pandas 2.1.
    --------
    :param df: dataframe of groupby values
    :param diag_names: list of diagnoses
    :return: updated dataframe (the diagnosis columns of df are overwritten)
    """
    flags = df[diag_names]
    is_zero = flags == 0

    # Blank out the zeros so ffill pulls the last non-zero value downwards
    carried = flags.mask(is_zero).ffill()

    # Only overwrite cells that were zero and have something to inherit;
    # every other cell keeps its original value
    filled = flags.mask(is_zero & carried.notna(), carried)

    # Masking upcasts integer columns to float, so restore original dtypes
    df[diag_names] = filled.astype(flags.dtypes.to_dict())

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    Each row gets the end-of-year date (month and day taken from eoy_date)
    in the calendar year of its dt_col value; when dt_col falls after that
    date the EOY is pushed to the following year. Rows whose dt_col is
    missing get a missing (NaT) EOY.
    --------
    :param df: dataframe
    :param dt_col: date column in dataframe (datetime dtype)
    :param eoy_date: EOY date from config; only its month and day are used
        to build the per-row dates
    :return: copy of df with a fresh 0..n-1 index and an EOY column added
    :raises ValueError: when the month/day does not exist in a row's year
        (e.g. 29 February in a non-leap year)
    """
    # Fresh positional index; this also means the caller's frame is not
    # modified when the new column is added
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    dates = df[dt_col]
    known = dates.notna()

    # .dt.year turns into floats once any date is missing, so use a
    # placeholder year for those rows and blank them out again below
    years = dates.dt.year.where(known, end_date.year).astype(int)
    parts = pd.DataFrame({'year': years,
                          'month': end_date.month,
                          'day': end_date.day})
    eoy = pd.to_datetime(parts).where(known)

    # Dates after this year's EOY belong to the year ending 12 months later
    late = dates > eoy
    df['eoy'] = eoy.mask(late, eoy + pd.DateOffset(years=1))

    return df


def add_yearly_stats(df):
    """
    Count comorbidities per patient per year
    Takes the last row of every (SafeHavenID, eoy) group and adds the
    remaining columns together row-wise.
    --------
    :param df: dataframe to update
    :return: single-column dataframe 'comorb_per_year' indexed by
        SafeHavenID and eoy
    """
    print('Adding comorbidity count per year')

    # Admission details and diagnosis descriptions are not part of the sum
    admission_cols = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP']
    description_cols = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc',
                        'DIAG4Desc', 'DIAG5Desc', 'DIAG6Desc']
    trimmed = df.drop(columns=admission_cols + description_cols)

    # Last entry per patient-year, then total across the columns left over
    latest = trimmed.groupby(['SafeHavenID', 'eoy']).last()
    totals = latest.sum(axis=1)

    return totals.to_frame(name='comorb_per_year')


def main():
    """
    Run the comorbidity processing steps in order: load the config and the
    admissions extract, flag comorbidities, carry them forward per patient,
    attach the EOY date, total per patient-year and pickle the result.
    """
    # Read settings
    with open('../../../config.json') as config_handle:
        config = json.load(config_handle)

    # Read admissions extract
    admissions = initialize_adm_data(
        config['extract_data_path'] + 'SMR01_Cohort3R.csv')

    # Fill null STAY data and combine transfer admissions
    admissions = correct_stays(admissions)

    # Prepare text data - strip whitespace from object (string) columns only
    def strip_text(column):
        if column.dtype == 'object':
            return column.str.strip()
        return column

    admissions = admissions.apply(strip_text)

    # Flag comorbidities using the diagnosis mapping workbook
    mapping_workbook = ("mappings/Comorbidity feature review for models & clin "
                        "summary update v2 May 2021.xlsx")
    mapping_sheet = 'Diagnosis category mapping3'
    diag_names = [
        'Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc', 'cog_imp',
        'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec', 'hypertension',
        'Cerebrovascula_dis', 'Diabetes_mel', 'Osteoporosis', 'frailty',
        'liver_dis', 'metastat_canc', 'headneck_canc', 'breast_canc',
        'gi_canc', 'other_canc', 'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
        'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum', 'DVT_PTE',
        'Lung_cancer', 'Bronchiectasis', 'Resp_fail',
    ]
    flagged = track_comorbidity(
        admissions, mapping_workbook, mapping_sheet, diag_names)

    # Order rows by admission date so the forward fill below runs in
    # chronological order
    flagged = flagged.sort_values('ADMDATE')
    flagged = flagged.reset_index(drop=True)

    # Carry comorbidity flags forward within each patient
    # NOTE(review): groupby.apply on a frame that still contains the
    # grouping column is deprecated in recent pandas - confirm against the
    # pandas version this project pins
    print('Filling comorbidities')
    per_patient = flagged.groupby('SafeHavenID')
    carried = per_patient.apply(fill_comorbidities, diag_names)

    # Add EOY column relative to the user-specified date
    with_eoy = add_eoy_column(carried, 'ADMDATE', config['date'])

    # Total comorbidities per patient per year
    yearly = add_yearly_stats(with_eoy)

    # Save data
    yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')


# Module-level call: the pipeline runs whenever this file is executed or
# imported
main()