File size: 3,961 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Utilities required across all processing scripts
"""
import pandas as pd
import numpy as np


def read_data(file, cols, types):
    """
    Load the requested columns of a CSV file using explicit column types
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, paired by position with cols
    :return: dataframe
    """
    # Pair every requested column with the dtype given at the same position
    dtype_by_col = {name: kind for name, kind in zip(cols, types)}

    return pd.read_csv(
        file,
        usecols=cols,
        encoding="cp1252",
        dtype=dtype_by_col,
    )


def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Record the earliest dated row per patient and pickle the result
    --------
    :param df: dataframe to check
    :param dt_col: date column to sort by
    :param typ: type of dataset being used (prefix of the output filename)
    :param data_path: path to data extracts (prefix joined by plain string
        concatenation, so it needs its own trailing separator)
    :return: None, dataframe with first dates saved
    """
    # After sorting by date, the first value in each patient group is the
    # earliest one
    ordered = df.sort_values(dt_col)
    earliest = ordered.groupby('SafeHavenID')[dt_col].first().reset_index()
    earliest.columns = ['SafeHavenID', 'first_adm']

    out_file = "".join([data_path, typ, '_first_dates.pkl'])
    earliest.to_pickle(out_file)


def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. adm/copd/resp/rescue
    --------
    :param df: dataframe to be updated (columns are added in place); must
        contain dt_col, the event flag column for typ and an 'eoy' date column
    :param typ: 'rescue', 'adm', 'copd' or 'resp' feature to be created.
        'rescue' reads the 'rescue_meds' flag, 'adm' reads 'adm', any other
        value reads '<typ>_event'
    :param dt_col: str date column name
    :return: updated dataframe with '<typ>_date' (date of the most recent
        flagged row at or before each row, in row order) and
        'days_since_<typ>' (days between 'eoy' and that date) added
    """
    if typ == 'rescue':
        event_col = 'rescue_meds'
    elif typ == 'adm':
        event_col = 'adm'
    else:
        event_col = typ + '_event'
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Vectorised replacement for a row-wise apply: keep the date only on rows
    # whose event flag is truthy, then carry the last event date forward.
    # This also works on an empty frame and keeps the date column datetime
    # typed even when no row is flagged.
    has_event = df[event_col].astype(bool)
    df[date_col] = df[dt_col].where(has_event).ffill()

    if df[date_col].isna().all():
        # No event anywhere: there is nothing to measure the distance from
        df[days_col] = np.nan
    else:
        df[days_col] = (df['eoy'] - df[date_col]).dt.days

    return df


def track_event(x, desc, single):
    """
    Replace nulls with empty strings and flag which entries match desc
    --------
    :param x: features to track (must support fillna, e.g. a pandas Series)
    :param desc: str, or str list, to compare against
    :param single: boolean for checking against single description e.g.
        "COPD" True otherwise False
    :return: list of booleans, one per entry of x
    """
    filled = x.fillna('')

    if not single:
        # Respiratory: the whole entry must be one of the listed descriptions
        return [entry in desc for entry in filled]

    # COPD: the single description must appear inside the entry
    return [desc in entry for entry in filled]


def add_hist_adm_presc(df, typ, dt_col):
    """
    Historical features: add days-since columns and running event totals
    --------
    :param df: dataframe to be updated
    :param typ: type of data - 'adm' or 'presc'
    :param dt_col: string name of date column
    :return: updated dataframe with historical columns added
    """
    if typ != 'presc':
        # Admission-style data: days-since for each event type, then running
        # totals of the matching '<name>_event' flags
        for event in ('adm', 'copd', 'resp'):
            df = add_days_since_event(df, event, dt_col)
        for event in ('copd', 'resp', 'anxiety_depression'):
            df[event + '_to_date'] = df[event + '_event'].cumsum()
    else:
        # Prescribing data is put in date order before any history is built
        df = df.sort_values(dt_col).reset_index(drop=True)
        df = add_days_since_event(df, 'rescue', dt_col)
        running_totals = (
            ('rescue_meds', 'rescue_to_date'),
            ('anxiety_depression_presc', 'anxiety_depression_presc_to_date'),
        )
        for source_col, total_col in running_totals:
            df[total_col] = df[source_col].cumsum()

    # Running total of the column named after the dataset type itself
    total_name = typ + '_to_date'
    df[total_name] = df[typ].cumsum()

    return df


def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    :param cols: column names exposing the pandas .str accessor (e.g.
        df.columns)
    :param typ: type of dataset being updated. For 'presc', '+' and '-' are
        also treated as separators and runs of whitespace collapse to a
        single underscore; otherwise each space becomes one underscore
    :return: list of cleaned column names
    """
    print('Correcting column headers')

    if typ == 'presc':
        # regex=True must be explicit: from pandas 2.0 str.replace treats the
        # pattern as a literal string by default, so the character class
        # '[+-]' would otherwise never match a '+' or '-'
        lower_cols = cols.str.replace('[+-]', ' ', regex=True).str.lower()
        new_cols = ["_".join(col.split()) for col in lower_cols]
    else:
        # Plain literal replacement of each space
        new_cols = cols.str.lower().str.replace(
            ' ', '_', regex=False).tolist()

    return new_cols