File size: 4,419 Bytes

53a6def

"""
Script for preprocessing pharmacy data
--------
Process pharmacy data and track inhaler prescriptions and rescue meds
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import (add_hist_adm_presc, correct_column_names,
                          first_patient_appearance)
from utils.presc_common import initialize_presc_data, track_medication


def add_inhaler_mappings(df):
    """
    Load inhaler prescription mappings and track where they appear in the data
    --------
    :param df: dataframe
    :return: dataframe with column added for each inhaler type
    """
    print('Mapping inhaler prescriptions')

    # Load in inhaler mapping
    with open('mappings/inhaler_mapping.json') as json_file:
        inhaler_mapping = json.load(json_file)

    for k, v in inhaler_mapping.items():
        df[k + '_inhaler'] = df.PI_Approved_Name.str.contains(
            '|'.join(v)).astype(int)

    # Remove for now as empty
    df = df.drop(['LABA-LAMA-ICS_inhaler', 'Ignore_inhaler'], axis=1)

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: date column in dataframe
    :param eoy_date: EOY date from config
    :return: updated df with EOY column added
    """
    # Needed to stop error with creating a new column
    df = df.reset_index(drop=True)

    # Add column with user-specified end of year date
    end_date = pd.to_datetime(eoy_date)
    end_month = end_date.month
    end_day = end_date.day

    # Add for every year
    df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year]

    # Check that EOY date is after dt_col for each entry
    eoy_index = df.columns[df.columns == 'eoy']
    adm_vs_eoy = df[dt_col] > df.eoy
    row_index = df.index[adm_vs_eoy]
    df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1)
    df['eoy'] = pd.to_datetime(df.eoy)

    return df


def calc_presc_per_year(df):
    """
    Reduce data to 1 row per year
    --------
    :param df: dataframe to reduced
    :return: reduced dataframe
    """
    print('Reducing to 1 row per year')

    # Add end of year columns
    eoy_cols = ['presc_to_date', 'days_since_rescue', 'rescue_to_date',
                'anxiety_depression_presc_to_date', 'rescue_date']
    last = df.groupby(['SafeHavenID', 'eoy'])[eoy_cols].last()

    # Total columns
    sum_cols = ['SALBUTAMOL', 'SABA_inhaler', 'LABA_inhaler', 'LAMA_inhaler',
                'SAMA_inhaler', 'ICS_inhaler', 'LABA-ICS_inhaler',
                'LAMA +LABA-ICS_inhaler', 'SABA + SAMA_inhaler',
                'MCS_inhaler', 'rescue_meds', 'presc', 'anxiety_depression_presc']
    total_cols = [col + '_per_year' for col in sum_cols]
    total = df.groupby(['SafeHavenID', 'eoy'])[sum_cols].sum()
    total.columns = total_cols

    # Join together
    results = last.join(total)

    return results


def main():

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    presc_file = config['extract_data_path'] + 'Pharmacy_Cohort3R.csv'
    presc = initialize_presc_data(presc_file)

    # Save first date in dataset
    data_path = config['model_data_path']
    first_patient_appearance(presc, 'PRESC_DATE', 'presc', data_path)

    # Add inhaler mapping
    presc = add_inhaler_mappings(presc)

    # Track salbutamol and rescue meds
    presc = track_medication(presc)

    # Drop columns
    cols_2_drop = ['PI_Approved_Name', 'PI_BNF_Item_Code', 'code']
    presc = presc.drop(cols_2_drop, axis=1)

    # Add column relative to user-specified date
    presc = add_eoy_column(presc, 'PRESC_DATE', config['date'])

    # Track rows which are admissions
    presc['presc'] = 1

    # Add any historical count columns
    presc = presc.groupby('SafeHavenID').apply(
        add_hist_adm_presc, 'presc', 'PRESC_DATE')
    presc = presc.reset_index(drop=True)

    # Save per event dataset
    presc.to_pickle(data_path + 'validation_presc_proc.pkl')

    # Reduce data to 1 row per year
    presc_yearly = calc_presc_per_year(presc)

    # Correct column names
    presc_yearly.columns = correct_column_names(presc_yearly.columns, 'presc')

    # Save data
    presc_yearly.to_pickle(data_path + 'presc_proc.pkl')


main()