""" Script for preprocessing pharmacy data -------- Process pharmacy data and track inhaler prescriptions and rescue meds """ import json import pandas as pd from datetime import date from dateutil.relativedelta import relativedelta from utils.common import (add_hist_adm_presc, correct_column_names, first_patient_appearance) from utils.presc_common import initialize_presc_data, track_medication def add_inhaler_mappings(df): """ Load inhaler prescription mappings and track where they appear in the data -------- :param df: dataframe :return: dataframe with column added for each inhaler type """ print('Mapping inhaler prescriptions') # Load in inhaler mapping with open('mappings/inhaler_mapping.json') as json_file: inhaler_mapping = json.load(json_file) for k, v in inhaler_mapping.items(): df[k + '_inhaler'] = df.PI_Approved_Name.str.contains( '|'.join(v)).astype(int) # Remove for now as empty df = df.drop(['LABA-LAMA-ICS_inhaler', 'Ignore_inhaler'], axis=1) return df def add_eoy_column(df, dt_col, eoy_date): """ Add EOY relative to user-specified end date -------- :param df: dataframe :param dt_col: date column in dataframe :param eoy_date: EOY date from config :return: updated df with EOY column added """ # Needed to stop error with creating a new column df = df.reset_index(drop=True) # Add column with user-specified end of year date end_date = pd.to_datetime(eoy_date) end_month = end_date.month end_day = end_date.day # Add for every year df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year] # Check that EOY date is after dt_col for each entry eoy_index = df.columns[df.columns == 'eoy'] adm_vs_eoy = df[dt_col] > df.eoy row_index = df.index[adm_vs_eoy] df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1) df['eoy'] = pd.to_datetime(df.eoy) return df def calc_presc_per_year(df): """ Reduce data to 1 row per year -------- :param df: dataframe to reduced :return: reduced dataframe """ print('Reducing to 1 row per year') # Add end of year columns eoy_cols = ['presc_to_date', 'days_since_rescue', 'rescue_to_date', 'anxiety_depression_presc_to_date', 'rescue_date'] last = df.groupby(['SafeHavenID', 'eoy'])[eoy_cols].last() # Total columns sum_cols = ['SALBUTAMOL', 'SABA_inhaler', 'LABA_inhaler', 'LAMA_inhaler', 'SAMA_inhaler', 'ICS_inhaler', 'LABA-ICS_inhaler', 'LAMA +LABA-ICS_inhaler', 'SABA + SAMA_inhaler', 'MCS_inhaler', 'rescue_meds', 'presc', 'anxiety_depression_presc'] total_cols = [col + '_per_year' for col in sum_cols] total = df.groupby(['SafeHavenID', 'eoy'])[sum_cols].sum() total.columns = total_cols # Join together results = last.join(total) return results def main(): # Load in config items with open('../../../config.json') as json_config_file: config = json.load(json_config_file) # Load in data presc_file = config['extract_data_path'] + 'Pharmacy_Cohort3R.csv' presc = initialize_presc_data(presc_file) # Save first date in dataset data_path = config['model_data_path'] first_patient_appearance(presc, 'PRESC_DATE', 'presc', data_path) # Add inhaler mapping presc = add_inhaler_mappings(presc) # Track salbutamol and rescue meds presc = track_medication(presc) # Drop columns cols_2_drop = ['PI_Approved_Name', 'PI_BNF_Item_Code', 'code'] presc = presc.drop(cols_2_drop, axis=1) # Add column relative to user-specified date presc = add_eoy_column(presc, 'PRESC_DATE', config['date']) # Track rows which are admissions presc['presc'] = 1 # Add any historical count columns presc = presc.groupby('SafeHavenID').apply( add_hist_adm_presc, 'presc', 'PRESC_DATE') presc = presc.reset_index(drop=True) # Save per event dataset presc.to_pickle(data_path + 'validation_presc_proc.pkl') # Reduce data to 1 row per year presc_yearly = calc_presc_per_year(presc) # Correct column names presc_yearly.columns = correct_column_names(presc_yearly.columns, 'presc') # Save data presc_yearly.to_pickle(data_path + 'presc_proc.pkl') main()