IamGrooooot's picture
Model E: Unsupervised PCA + clustering risk stratification
53a6def
"""
Utilities required across all processing scripts
"""
import pandas as pd
import numpy as np
def read_data(file, cols, types):
    """
    Load selected columns of a CSV extract with explicit dtypes
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, matched to cols by position
    :return: dataframe
    """
    # Pair each requested column with its dtype by position
    dtype_by_col = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=dtype_by_col,
        encoding="cp1252",
    )
def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Record each patient's earliest date in the dataset and pickle the result
    --------
    :param df: dataframe to check
    :param dt_col: date column to sort by
    :param typ: type of dataset being used (prefix of the output filename)
    :param data_path: path to data extracts; joined to the filename by plain
        string concatenation
    :return: None, dataframe with first dates saved
    """
    # After sorting by date, the first non-null entry per patient is their
    # earliest recorded date
    dates_by_patient = df.sort_values(dt_col).groupby('SafeHavenID')[dt_col]
    earliest = dates_by_patient.first().to_frame().reset_index()
    earliest.columns = ['SafeHavenID', 'first_adm']

    out_file = data_path + typ + '_first_dates.pkl'
    earliest.to_pickle(out_file)
def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. adm/copd/resp/rescue
    --------
    Adds two columns to df in place: '<typ>_date', the date of the most
    recent row (in the current row order) flagged as an event, and
    'days_since_<typ>', the whole days between that date and the row's 'eoy'
    value. Rows before the first flagged event are left null.
    :param df: dataframe to be updated; must contain dt_col, 'eoy' and the
        event flag column ('rescue_meds' for 'rescue', 'adm' for 'adm',
        otherwise '<typ>_event')
    :param typ: 'rescue', 'adm', 'copd' or 'resp' feature to be created
    :param dt_col: str date column name
    :return: updated dataframe with historical column added
    """
    if typ == 'rescue':
        event_col = 'rescue_meds'
    elif typ == 'adm':
        event_col = 'adm'
    else:
        event_col = typ + '_event'
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Vectorised replacement for a row-wise apply: keep the date only on rows
    # whose flag is truthy, then carry the last event date forward. Unlike
    # apply(axis=1) this also works on an empty frame and keeps the date
    # column's dtype even when no event is flagged.
    is_event = df[event_col].astype(bool)
    df[date_col] = df[dt_col].where(is_event).ffill()

    if df[date_col].isna().all():
        # No event seen (or no rows at all): nothing to measure from
        df[days_col] = np.nan
    else:
        df[days_col] = (df['eoy'] - df[date_col]).dt.days
    return df
def track_event(x, desc, single):
    """
    Replace nulls with empty strings, then flag entries matching a description
    --------
    :param x: str list of features to track
    :param desc: str list to compare
    :param single: boolean for checking against single description e.g.
        "COPD" True otherwise False
    :return: tracked feature list (one boolean per entry of x)
    """
    filled = x.fillna('')
    if single:
        # COPD: one description, looked for inside each entry
        return [desc in entry for entry in filled]
    # Respiratory: each entry is looked up in the collection of descriptions
    return [entry in desc for entry in filled]
def add_hist_adm_presc(df, typ, dt_col):
    """
    Historical features: add days since and to-date features
    --------
    :param df: dataframe to be updated
    :param typ: type of data - 'adm' or 'presc'
    :param dt_col: string name of date column
    :return: updated dataframe with historical columns added
    """
    if typ != 'presc':
        # Admissions: days-since features first, then running event counts
        for event in ('adm', 'copd', 'resp'):
            df = add_days_since_event(df, event, dt_col)
        for prefix in ('copd', 'resp', 'anxiety_depression'):
            df[prefix + '_to_date'] = df[prefix + '_event'].cumsum()
    else:
        # Prescriptions: order by date so the running totals are chronological
        df = df.sort_values(dt_col).reset_index(drop=True)
        df = add_days_since_event(df, 'rescue', dt_col)
        df['rescue_to_date'] = df['rescue_meds'].cumsum()
        df['anxiety_depression_presc_to_date'] = (
            df['anxiety_depression_presc'].cumsum())

    # Running count of the dataset's own events to date
    # NOTE(review): assumes this applies to both branches, i.e. a column
    # named after typ exists for 'presc' as well - confirm against callers
    df[typ + '_to_date'] = df[typ].cumsum()
    return df
def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    For 'presc' data, '+' and '-' characters are first turned into spaces and
    every run of whitespace then becomes a single underscore.
    :param cols: string column names exposing the pandas .str accessor
        (e.g. df.columns)
    :param typ: type of dataset being updated
    :return: list of cleaned column names
    """
    print('Correcting column headers')
    if typ == 'presc':
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, under which the character class never matches and
        # '+'/'-' would be left in the names
        lower_cols = cols.str.replace(r'[+-]', ' ', regex=True).str.lower()
        new_cols = ['_'.join(col.split()) for col in lower_cols]
    else:
        # Plain single-space substitution; literal on every pandas version
        new_cols = (
            cols.str.lower().str.replace(' ', '_', regex=False).tolist())
    return new_cols