File size: 3,961 Bytes
"""
Utilities required across all processing scripts
"""
import pandas as pd
import numpy as np
def read_data(file, cols, types):
    """
    Load selected columns of a CSV extract with explicit column types
    --------
    :param file: string filename (handed straight to pandas.read_csv)
    :param cols: string list of column names to load
    :param types: string list of column types, matched to cols by position
    :return: dataframe containing only the requested columns
    """
    # Pair every requested column with its dtype, position by position
    col_types = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=col_types,
        encoding="cp1252",
    )
def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Pickle the first date on which each patient appears in the dataset
    --------
    :param df: dataframe to check; must contain 'SafeHavenID' and dt_col
    :param dt_col: date column to sort by
    :param typ: type of dataset being used (prefix of the output filename)
    :param data_path: path to data extracts; concatenated as-is with the
        filename, so it must already end with a path separator
    :return: None; writes <data_path><typ>_first_dates.pkl with columns
        'SafeHavenID' and 'first_adm'
    """
    # Sorting first means the first value taken per patient is the earliest
    ordered = df.sort_values(dt_col)
    earliest = (
        ordered.groupby('SafeHavenID')[dt_col]
        .first()
        .to_frame()
        .reset_index()
    )
    earliest.columns = ['SafeHavenID', 'first_adm']

    out_file = data_path + typ + '_first_dates.pkl'
    earliest.to_pickle(out_file)
def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. copd/resp/rescue/adm
    --------
    Adds two columns to df: '<typ>_date', the date of the most recent row
    (in the frame's current order) flagged as an event, and
    'days_since_<typ>', the number of days between that date and the row's
    'eoy' value.

    :param df: dataframe to be updated (modified in place and returned);
        must contain dt_col, 'eoy' and the event column for typ
    :param typ: 'rescue' (reads 'rescue_meds'), 'adm' (reads 'adm'), or any
        other name such as 'copd' / 'resp' (reads '<typ>_event')
    :param dt_col: str date column name
    :return: updated dataframe with historical columns added
    """
    if typ == 'rescue':
        event_col = 'rescue_meds'
    elif typ == 'adm':
        event_col = 'adm'
    else:
        event_col = typ + '_event'
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Vectorised replacement for a row-wise apply: keep the date only on
    # rows whose event flag is truthy, then carry it forward. Unlike
    # df.apply(axis=1), this also works on an empty frame.
    is_event = df[event_col].astype(bool)
    df[date_col] = df[dt_col].where(is_event).ffill()

    if df[date_col].isna().all():
        # No event seen anywhere, so there is nothing to measure from
        df[days_col] = np.nan
    else:
        df[days_col] = (df.eoy - df[date_col]).dt.days
    return df
def track_event(x, desc, single):
    """
    Replace nulls with '' and flag which entries of x match a description
    --------
    :param x: features to track; must provide fillna (e.g. a pandas Series
        of strings)
    :param desc: a single description string when single is True,
        otherwise a collection of descriptions to compare against
    :param single: boolean for checking against single description e.g.
        "COPD" True otherwise False
    :return: list of booleans, one per entry of x
    """
    filled = x.fillna('')

    if single:
        # One description (e.g. COPD): flag entries that contain it
        return [desc in entry for entry in filled]

    # Several descriptions (e.g. respiratory): flag entries found in desc
    return [entry in desc for entry in filled]
def add_hist_adm_presc(df, typ, dt_col):
"""
Historical features: add days since and to-date features
--------
For typ == 'presc' the frame is sorted by dt_col and re-indexed, the
days-since-rescue columns are added, and running totals of 'rescue_meds'
and 'anxiety_depression_presc' are stored in 'rescue_to_date' and
'anxiety_depression_presc_to_date'.
For any other typ the days-since columns are added for 'adm', 'copd' and
'resp', and running totals of 'copd_event', 'resp_event' and
'anxiety_depression_event' are stored in '<name>_to_date'.
A running total of df[typ] is also stored in '<typ>_to_date' (see the note
on that statement below).

:param df: dataframe to be updated
:param typ: type of data - 'adm' or 'presc'
:param dt_col: string name of date column
:return: updated dataframe with historical columns added
"""
if typ == 'presc':
# Sort by date before the forward-fill / cumulative sums below
df = df.sort_values(dt_col).reset_index(drop=True)
df = add_days_since_event(df, 'rescue', dt_col)
df['rescue_to_date'] = df.rescue_meds.cumsum()
df['anxiety_depression_presc_to_date'] = df.anxiety_depression_presc.cumsum()
else:
# NOTE(review): this branch does not sort; presumably the caller passes
# the frame already ordered by dt_col - confirm
for col in ['adm', 'copd', 'resp']:
df = add_days_since_event(df, col, dt_col)
for col in ['copd', 'resp', 'anxiety_depression']:
df[col + '_to_date'] = df[col + '_event'].cumsum()
# Add counter for events to date
# NOTE(review): indentation is not visible in this copy, so it is unclear
# whether this statement runs for both branches or only the non-'presc'
# one - confirm. Wherever it runs, df must contain a column named typ.
df[typ + '_to_date'] = df[typ].cumsum()
return df
def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    For typ == 'presc', '+' and '-' are first turned into spaces and every
    run of whitespace then collapses to a single underscore. For any other
    typ each space is replaced by one underscore.

    :param cols: column names; must provide the pandas .str accessor
        (e.g. df.columns)
    :param typ: type of dataset being updated
    :return: list of cleaned column names
    """
    print('Correcting column headers')
    if typ == 'presc':
        # regex=True is required: from pandas 2.0 the default is a literal
        # match, which would leave '+' and '-' untouched
        spaced = cols.str.replace(r'[+-]', ' ', regex=True).str.lower()
        return ['_'.join(name.split()) for name in spaced]

    # Plain literal replacement, made explicit so it does not depend on the
    # pandas default
    return cols.str.lower().str.replace(' ', '_', regex=False).tolist()