"""
Script for preprocessing pharmacy data
--------
Process pharmacy data and track inhaler prescriptions and rescue meds
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import (add_hist_adm_presc, correct_column_names,
first_patient_appearance)
from utils.presc_common import initialize_presc_data, track_medication
def add_inhaler_mappings(df, mapping_path='mappings/inhaler_mapping.json'):
    """
    Load inhaler prescription mappings and track where they appear in the data
    --------
    :param df: dataframe with a string column PI_Approved_Name; the new
        columns are also written onto this dataframe
    :param mapping_path: path of the JSON file mapping each inhaler type to a
        list of patterns (defaults to the project mapping file)
    :return: dataframe with a 0/1 '<type>_inhaler' column added for each
        inhaler type, without the two columns dropped at the end
    """
    print('Mapping inhaler prescriptions')

    # Load in inhaler mapping
    with open(mapping_path) as json_file:
        inhaler_mapping = json.load(json_file)

    for inhaler_type, patterns in inhaler_mapping.items():
        # The patterns are OR-ed into one regular expression. na=False makes a
        # missing name count as "no match"; without it the result holds NaN
        # and the cast to int raises.
        df[inhaler_type + '_inhaler'] = df.PI_Approved_Name.str.contains(
            '|'.join(patterns), na=False).astype(int)

    # Remove for now as empty
    df = df.drop(['LABA-LAMA-ICS_inhaler', 'Ignore_inhaler'], axis=1)

    return df
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: name of a datetime64 column in dataframe
    :param eoy_date: EOY date from config; only its month and day are used
    :return: re-indexed copy of df with a datetime64 'eoy' column added. For
        each row it holds the EOY month/day in the year of dt_col, or in the
        following year when dt_col is later than that date
    """
    # Needed to stop error with creating a new column; this also means the
    # caller's dataframe is not modified
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    event_dates = df[dt_col]

    # Build the EOY in each row's own calendar year directly as datetime64.
    # Comparing a datetime64 column against datetime.date objects is not
    # supported by recent pandas, so plain date objects are avoided entirely.
    same_year_eoy = pd.to_datetime(pd.DataFrame({
        'year': event_dates.dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))
    next_year_eoy = same_year_eoy + pd.DateOffset(years=1)

    # Rows dated after this year's EOY belong to the period ending next year
    df['eoy'] = same_year_eoy.where(event_dates <= same_year_eoy,
                                    next_year_eoy)

    return df
def calc_presc_per_year(df):
    """
    Collapse the per-event data to one row per patient and EOY period
    --------
    :param df: dataframe containing SafeHavenID, eoy and the columns listed
        below
    :return: dataframe indexed by (SafeHavenID, eoy) holding the last value of
        the running columns followed by '<name>_per_year' totals
    """
    print('Reducing to 1 row per year')

    # Columns where the final value within the period is kept
    running_cols = ['presc_to_date', 'days_since_rescue', 'rescue_to_date',
                    'anxiety_depression_presc_to_date', 'rescue_date']

    # Columns that are summed over the period
    count_cols = ['SALBUTAMOL', 'SABA_inhaler', 'LABA_inhaler', 'LAMA_inhaler',
                  'SAMA_inhaler', 'ICS_inhaler', 'LABA-ICS_inhaler',
                  'LAMA +LABA-ICS_inhaler', 'SABA + SAMA_inhaler',
                  'MCS_inhaler', 'rescue_meds', 'presc',
                  'anxiety_depression_presc']

    per_period = df.groupby(['SafeHavenID', 'eoy'])
    final_values = per_period[running_cols].last()
    yearly_totals = per_period[count_cols].sum().add_suffix('_per_year')

    # Both frames share the same group index, so a plain join lines them up
    return final_values.join(yearly_totals)
def main():
    """
    Run the pharmacy preprocessing pipeline
    --------
    Reads config.json, processes the prescription extract and pickles both
    the per-event dataset and the one-row-per-year dataset under the
    configured model data path
    """
    # Read the run configuration
    with open('../../../config.json') as config_handle:
        config = json.load(config_handle)

    # Read the raw prescription extract
    extract_file = config['extract_data_path'] + 'Pharmacy_Cohort3R.csv'
    presc = initialize_presc_data(extract_file)

    # Save first date in dataset
    out_dir = config['model_data_path']
    first_patient_appearance(presc, 'PRESC_DATE', 'presc', out_dir)

    # Flag inhaler types, then track salbutamol and rescue meds
    presc = add_inhaler_mappings(presc)
    presc = track_medication(presc)

    # The name and code columns are not needed past this point
    presc = presc.drop(
        ['PI_Approved_Name', 'PI_BNF_Item_Code', 'code'], axis=1)

    # Assign each prescription to its EOY period from the configured date
    presc = add_eoy_column(presc, 'PRESC_DATE', config['date'])

    # Give every row a count of 1 so prescriptions can be totalled
    presc['presc'] = 1

    # Add any historical count columns, one patient at a time
    per_patient = presc.groupby('SafeHavenID')
    presc = per_patient.apply(add_hist_adm_presc, 'presc', 'PRESC_DATE')
    presc = presc.reset_index(drop=True)

    # Save per event dataset
    presc.to_pickle(out_dir + 'validation_presc_proc.pkl')

    # Reduce to 1 row per year, correct the column names and save
    yearly = calc_presc_per_year(presc)
    yearly.columns = correct_column_names(yearly.columns, 'presc')
    yearly.to_pickle(out_dir + 'presc_proc.pkl')
# Run the pipeline. NOTE: there is no __main__ guard, so this call also runs
# whenever the module is imported.
main()
|