""" Utilities required across all processing scripts """ import pandas as pd import numpy as np def read_data(file, cols, types): """ Read in data source -------- :param file: string filename :param cols: string list of column names :param types: string list of column types :return: dataframe """ schema = dict(zip(cols, types)) df = pd.read_csv(file, usecols=cols, encoding="cp1252", dtype=schema) return df def first_patient_appearance(df, dt_col, typ, data_path): """ Save first appearance of patient in dataset -------- :param df: dataframe to check :param dt_col: date column to sort by :param typ: type of dataset being used :param data_path: path to data extracts :return: None, dataframe with first dates saved """ df = df.sort_values(dt_col) df_first = df.groupby('SafeHavenID')[dt_col].first() df_first = df_first.to_frame().reset_index() df_first.columns = ['SafeHavenID', 'first_adm'] df_first.to_pickle(data_path + typ + '_first_dates.pkl') def add_days_since_event(df, typ, dt_col): """ Historical features: add days since features e.g. copd/resp/rescue -------- :param df: dataframe to be updated :param typ: 'rescue', 'copd' or 'resp' feature to be created :param dt_col: str date column name :return: updated dataframe with historical column added """ if typ == 'rescue': event_col = 'rescue_meds' elif typ == 'adm': event_col = 'adm' else: event_col = typ + '_event' date_col = typ + '_date' days_col = 'days_since_' + typ df[date_col] = df.apply( lambda x: x[dt_col] if x[event_col] else np.nan, axis=1).ffill() if df[date_col].isna().all(): df[days_col] = np.nan else: df[days_col] = (df.eoy - df[date_col]).dt.days return df def track_event(x, desc, single): """ Fill nulls and search to see if x matches a description -------- :param x: str list of features to track :param desc: str list to compare :param single: boolean for checking against single description e.g. "COPD" True otherwise False :return: tracked feature list """ x = x.fillna('') # COPD only has single description to search if single: result = [desc in s for s in x] # Respiratory has a list of descriptions to search else: result = [s in desc for s in x] return result def add_hist_adm_presc(df, typ, dt_col): """ Historical features: add days since and to-date features -------- :param df: dataframe to be updated :param typ: type of data - 'adm' or 'presc' :param dt_col: string name of date column :return: updated dataframe with historical columns added """ if typ == 'presc': df = df.sort_values(dt_col).reset_index(drop=True) df = add_days_since_event(df, 'rescue', dt_col) df['rescue_to_date'] = df.rescue_meds.cumsum() df['anxiety_depression_presc_to_date'] = df.anxiety_depression_presc.cumsum() else: for col in ['adm', 'copd', 'resp']: df = add_days_since_event(df, col, dt_col) for col in ['copd', 'resp', 'anxiety_depression']: df[col + '_to_date'] = df[col + '_event'].cumsum() # Add counter for events to date df[typ + '_to_date'] = df[typ].cumsum() return df def correct_column_names(cols, typ): """ Convert column names to lower case and fill any spaces with underscores -------- :param cols: string list of column names :param typ: type of dataset being updated :return: cleaned column names """ print('Correcting column headers') if typ == 'presc': lower_cols = cols.str.replace('[+-]', ' ').str.lower() new_cols = ["_".join(col.split()) for col in lower_cols] else: new_cols = cols.str.lower().str.replace(' ', '_').tolist() return new_cols