File size: 4,846 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
Process SMR01 admission data
--------
Clean and process admission data while adding tracking for COPD and respiratory
admissions per year for each SafeHavenID
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import add_hist_adm_presc, first_patient_appearance
from utils.adm_common import (initialize_adm_data, correct_stays,
                              track_copd_resp)
from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity,
                                  search_diag)
from utils.adm_reduction import fill_missing_years, calc_adm_per_year


def process_ethnicity(df):
    """
    Determine a single cleaned ethnic group for each patient
    --------
    :param df: admission dataframe to be updated
    :return: admission dataframe with ethnicity cleaned and updated
    """
    print('Processing ethnicity')

    # Standardise the column name and strip stray whitespace
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df.eth_grp.str.strip()

    # Fill gaps within each patient's records (forward then backward);
    # anything still missing becomes 'Unknown'
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].apply(
        lambda grp: grp.ffill().bfill().fillna('Unknown'))

    # Map raw descriptions onto the 7 standard ethnic groups
    df['eth_grp'] = df['eth_grp'].map(convert_ethgrp_desc)

    # Keep the most frequently recorded ethnicity per SafeHavenID
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: name of a datetime column in the dataframe
    :param eoy_date: EOY date from config (anything pd.to_datetime accepts;
                     only its month and day are used)
    :return: updated df with an 'eoy' datetime column holding the first
             end-of-year date on or after each row's dt_col value
    """
    # Fresh 0..n-1 index so the boolean mask below aligns row-for-row
    df = df.reset_index(drop=True)

    # Month/day of the configured year-end
    end_date = pd.to_datetime(eoy_date)
    end_month = end_date.month
    end_day = end_date.day

    # Provisional EOY in the same calendar year as the event.
    # NOTE(review): raises ValueError if eoy_date is Feb 29 and an event
    # year is not a leap year — assumed not used in practice; confirm.
    df['eoy'] = pd.to_datetime(
        [date(y, end_month, end_day) for y in df[dt_col].dt.year])

    # Where the event falls after that year's EOY, roll forward one year
    # (DateOffset clips Feb 29 -> Feb 28, matching relativedelta)
    late = df[dt_col] > df['eoy']
    df.loc[late, 'eoy'] = df.loc[late, 'eoy'] + pd.DateOffset(years=1)

    return df


def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be updated
    :return: dataframe with feature values per year
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Mark every existing row as an admission event
    df['adm'] = 1

    # Insert placeholder rows for years in which a patient had no admissions
    df = (df.groupby('SafeHavenID')
            .apply(fill_missing_years)
            .reset_index(drop=True))

    # Add historical admission count columns per patient
    df = (df.groupby('SafeHavenID')
            .apply(add_hist_adm_presc, 'adm', 'ADMDATE')
            .reset_index(drop=True))

    # Collapse to a single row per patient-year
    df = calc_adm_per_year(df)

    # Return the feature columns in their final order
    final_cols = ['eth_grp', 'adm_per_year', 'total_hosp_days',
                  'mean_los', 'copd_per_year', 'resp_per_year',
                  'anxiety_depression_per_year', 'days_since_copd',
                  'days_since_resp', 'days_since_adm', 'adm_to_date',
                  'copd_to_date', 'resp_to_date', 'anxiety_depression_to_date',
                  'copd_date', 'resp_date', 'adm_date']

    return df[final_cols]


def main():
    """
    Run the SMR01 admission processing pipeline end to end.

    Loads the project config, cleans the raw admission extract, derives
    COPD / respiratory / anxiety-depression event flags, then writes both
    the per-event and per-year processed datasets to pickle files.
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = initialize_adm_data(adm_file)

    # Fill null STAY data and combine transfer admissions
    adm = correct_stays(adm)

    # Save first date in dataset
    data_path = config['model_data_path']
    first_patient_appearance(adm, 'ADMDATE', 'adm', data_path)

    # Process ethnicity data
    adm = process_ethnicity(adm)

    # Track COPD and respiratory events
    adm = track_copd_resp(adm)

    # Track anxiety event
    adm = search_diag(adm, 'anxiety_depression')

    # Select relevant columns
    reduced_cols = ['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
                    'resp_event', 'anxiety_depression_event']
    adm_reduced = adm[reduced_cols]

    # Save per event dataset
    adm_reduced.to_pickle(data_path + 'validation_adm_proc.pkl')

    # Add column relative to user-specified date
    adm_reduced = add_eoy_column(adm_reduced, 'ADMDATE', config['date'])

    # Extract yearly data
    adm_yearly = extract_yearly_data(adm_reduced)

    # Save data
    adm_yearly.to_pickle(data_path + 'adm_proc.pkl')


# Guard so importing this module (e.g. for testing) doesn't run the pipeline
if __name__ == '__main__':
    main()