File size: 3,961 Bytes

53a6def

"""
Utilities required across all processing scripts
"""
import pandas as pd
import numpy as np


def read_data(file, cols, types):
    """
    Load selected columns of a CSV file with explicit column types
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, matched to cols by position
    :return: dataframe holding only the requested columns
    """
    # Pair every requested column with its declared type
    dtype_by_col = {name: kind for name, kind in zip(cols, types)}

    return pd.read_csv(
        file,
        usecols=cols,
        dtype=dtype_by_col,
        encoding="cp1252",
    )


def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Pickle the earliest date on which each patient appears in the dataset
    --------
    :param df: dataframe to check; must contain 'SafeHavenID' and dt_col
    :param dt_col: date column to sort by
    :param typ: type of dataset being used (prefix of the output filename)
    :param data_path: path to data extracts; joined to the filename by plain
        string concatenation
    :return: None, the first dates are written to
        data_path + typ + '_first_dates.pkl'
    """
    ordered = df.sort_values(dt_col)

    # After sorting, the first value per patient is their earliest date
    earliest = (
        ordered.groupby('SafeHavenID')[dt_col]
        .first()
        .to_frame()
        .reset_index()
    )
    earliest.columns = ['SafeHavenID', 'first_adm']

    out_file = data_path + typ + '_first_dates.pkl'
    earliest.to_pickle(out_file)


def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. adm/copd/resp/rescue
    --------
    Adds two columns to df in place: '<typ>_date', the dt_col value of the
    most recent row (in the dataframe's current row order) whose event flag
    is truthy, and 'days_since_<typ>', the whole days between that date and
    the row's 'eoy' value. Rows before the first event are left null.

    :param df: dataframe to be updated; must contain dt_col, an 'eoy' column
        and the event flag column for typ
    :param typ: 'rescue' (flag column 'rescue_meds'), 'adm' (flag column
        'adm'), or any other prefix such as 'copd' or 'resp' (flag column
        '<typ>_event')
    :param dt_col: str date column name
    :return: updated dataframe with historical columns added
    """
    special_event_cols = {'rescue': 'rescue_meds', 'adm': 'adm'}
    event_col = special_event_cols.get(typ, typ + '_event')
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Vectorised: keep the date only on event rows, then carry it forward.
    # The mask is passed as a plain array so no index alignment takes place.
    is_event = df[event_col].astype(bool).to_numpy()
    df[date_col] = df[dt_col].where(is_event).ffill()

    if df[date_col].isna().all():
        # No event anywhere: nothing to measure from
        df[days_col] = np.nan
    else:
        df[days_col] = (df.eoy - df[date_col]).dt.days

    return df


def track_event(x, desc, single):
    """
    Replace nulls with empty strings and flag entries matching a description
    --------
    :param x: str list of features to track (must support .fillna)
    :param desc: single str description, or str list of descriptions
    :param single: True when desc is one description (e.g. "COPD") that is
        searched for inside each entry; False when desc is a collection
        each entry is looked up in
    :return: list of booleans, one per entry of x
    """
    filled = x.fillna('')

    if single:
        # COPD style: the one description must occur within the entry
        def matches(entry):
            return desc in entry
    else:
        # Respiratory style: the entry must be one of the descriptions
        def matches(entry):
            return entry in desc

    return list(map(matches, filled))


def add_hist_adm_presc(df, typ, dt_col):
    """
    Historical features: add days-since and running-total (to-date) columns
    --------
    :param df: dataframe to be updated
    :param typ: type of data - 'presc' selects the prescription features,
        any other value (e.g. 'adm') selects the admission features
    :param dt_col: string name of date column
    :return: updated dataframe with historical columns added
    """
    if typ != 'presc':
        # Admission data: days-since for each event kind, then running totals
        for event in ('adm', 'copd', 'resp'):
            df = add_days_since_event(df, event, dt_col)
        for prefix in ('copd', 'resp', 'anxiety_depression'):
            running_total = df[prefix + '_event'].cumsum()
            df[prefix + '_to_date'] = running_total
    else:
        # Prescription data is put in date order before accumulating
        ordered = df.sort_values(dt_col).reset_index(drop=True)
        df = add_days_since_event(ordered, 'rescue', dt_col)
        df['rescue_to_date'] = df.rescue_meds.cumsum()
        df['anxiety_depression_presc_to_date'] = (
            df.anxiety_depression_presc.cumsum()
        )

    # Running count of the dataset's own event column
    total_col = typ + '_to_date'
    df[total_col] = df[typ].cumsum()

    return df


def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    For 'presc' data, '+' and '-' characters are treated as spaces and every
    run of whitespace collapses to a single underscore. For any other data
    each space is replaced by an underscore one-for-one.

    :param cols: column names exposing the pandas .str accessor
        (e.g. df.columns)
    :param typ: type of dataset being updated
    :return: list of cleaned column names
    """
    print('Correcting column headers')

    if typ == 'presc':
        # '[+-]' is a regex character class. regex=True must be explicit:
        # pandas >= 2.0 defaults str.replace to literal matching, which would
        # leave '+' and '-' in the names untouched.
        lower_cols = cols.str.replace('[+-]', ' ', regex=True).str.lower()
        new_cols = ["_".join(col.split()) for col in lower_cols]
    else:
        # Plain single-character substitution, so match literally
        new_cols = (
            cols.str.lower().str.replace(' ', '_', regex=False).tolist()
        )

    return new_cols