| | """ |
| | Utilities required across all processing scripts |
| | """ |
| | import pandas as pd |
| | import numpy as np |
| |
|
| |
|
def read_data(file, cols, types):
    """
    Load selected columns of a cp1252-encoded CSV with explicit dtypes
    --------
    :param file: string filename (anything pd.read_csv accepts)
    :param cols: string list of column names to load
    :param types: string list of column types, paired with cols by position
    :return: dataframe holding only the requested columns
    """
    # Pair each requested column with the dtype at the same position
    column_types = {name: dtype for name, dtype in zip(cols, types)}
    return pd.read_csv(
        file, usecols=cols, encoding="cp1252", dtype=column_types)
| |
|
| |
|
def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Pickle the earliest dt_col value recorded for each SafeHavenID
    --------
    :param df: dataframe to check, must contain 'SafeHavenID' and dt_col
    :param dt_col: date column to sort by
    :param typ: type of dataset being used, becomes the file name prefix
    :param data_path: path to data extracts, joined to the file name by
        plain string concatenation
    :return: None, result is written to data_path + typ + '_first_dates.pkl'
    """
    ordered = df.sort_values(dt_col)
    # After sorting, the first value per patient is their earliest date
    earliest = (
        ordered.groupby('SafeHavenID')[dt_col]
        .first()
        .reset_index(name='first_adm')
    )
    out_file = data_path + typ + '_first_dates.pkl'
    earliest.to_pickle(out_file)
| |
|
| |
|
def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. adm/copd/resp/rescue
    --------
    :param df: dataframe to be updated; must contain dt_col, the event flag
        column for typ and an 'eoy' column the event date is subtracted from
    :param typ: 'rescue', 'adm', 'copd' or 'resp' feature to be created
    :param dt_col: str date column name
    :return: the same dataframe with '<typ>_date' (date of the most recent
        flagged row so far) and 'days_since_<typ>' columns added
    """
    # 'rescue' and 'adm' have their own flag columns; every other type
    # follows the '<typ>_event' naming pattern
    if typ == 'rescue':
        event_col = 'rescue_meds'
    elif typ == 'adm':
        event_col = 'adm'
    else:
        event_col = typ + '_event'
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Keep the date only where the flag is truthy, then carry the latest
    # event date forward. Vectorised instead of a row-wise apply so that an
    # empty dataframe is handled and the date dtype is preserved.
    is_event = df[event_col].astype(bool)
    df[date_col] = df[dt_col].where(is_event).ffill()

    if df[date_col].isna().all():
        # No event seen at all: nothing to measure the distance from
        df[days_col] = np.nan
    else:
        df[days_col] = (df['eoy'] - df[date_col]).dt.days

    return df
| |
|
| |
|
def track_event(x, desc, single):
    """
    Flag which entries of x match a description, treating nulls as ''
    --------
    :param x: series of strings to track (needs a fillna method)
    :param desc: single string when single is True, otherwise a collection
        of strings to compare against
    :param single: True to test whether desc occurs inside each entry
        e.g. "COPD"; False to test whether each entry is a member of desc
    :return: list of booleans, one per entry of x
    """
    values = x.fillna('')

    # Single description: substring search inside every entry
    if single:
        return [desc in value for value in values]

    # Several descriptions: membership test of every entry
    return [value in desc for value in values]
| |
|
| |
|
def add_hist_adm_presc(df, typ, dt_col):
    """
    Historical features: add days since and to-date features
    --------
    :param df: dataframe to be updated
    :param typ: type of data - 'adm' or 'presc'
    :param dt_col: string name of date column
    :return: updated dataframe with historical columns added
    """
    if typ == 'presc':
        # Prescriptions are sorted by date here before any running feature
        # is built; the other branch does no sorting of its own
        df = df.sort_values(dt_col).reset_index(drop=True)
        df = add_days_since_event(df, 'rescue', dt_col)
        # Running totals of the prescription flag columns
        df['rescue_to_date'] = df.rescue_meds.cumsum()
        df['anxiety_depression_presc_to_date'] = df.anxiety_depression_presc.cumsum()
    else:
        # Days-since features for each admission event type
        for col in ['adm', 'copd', 'resp']:
            df = add_days_since_event(df, col, dt_col)
        # Running totals of the '<col>_event' flag columns
        for col in ['copd', 'resp', 'anxiety_depression']:
            df[col + '_to_date'] = df[col + '_event'].cumsum()

    # Running total of the column named after typ.
    # NOTE(review): the original indentation was lost; this line is assumed
    # to run for both branches, which requires a 'presc' column when
    # typ == 'presc' - confirm against callers.
    df[typ + '_to_date'] = df[typ].cumsum()

    return df
| |
|
| |
|
def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    :param cols: pandas Index/Series of column names (needs the .str accessor)
    :param typ: type of dataset being updated; 'presc' additionally turns
        '+' and '-' into separators and collapses repeated whitespace
    :return: list of cleaned column names
    """
    print('Correcting column headers')

    if typ == 'presc':
        # regex=True must be explicit: since pandas 2.0 str.replace defaults
        # to literal matching, which would leave '+' and '-' untouched
        lower_cols = cols.str.replace('[+-]', ' ', regex=True).str.lower()
        # split() with no argument also collapses runs of whitespace
        new_cols = ["_".join(col.split()) for col in lower_cols]
    else:
        new_cols = (
            cols.str.lower().str.replace(' ', '_', regex=False).tolist()
        )

    return new_cols