# copd-model-e/training/src/processing/process_prescribing.py
# IamGrooooot's picture
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
"""
Script for preprocessing pharmacy data
--------
Process pharmacy data and track inhaler prescriptions and rescue meds
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import (add_hist_adm_presc, correct_column_names,
first_patient_appearance)
from utils.presc_common import initialize_presc_data, track_medication
def add_inhaler_mappings(df, mapping_path='mappings/inhaler_mapping.json'):
    """
    Load inhaler prescription mappings and track where they appear in the data
    --------
    :param df: dataframe with a PI_Approved_Name column; the new columns are
        added to it in place before the placeholder columns are dropped
    :param mapping_path: path to the JSON file mapping each inhaler type to a
        list of name patterns (defaults to the previously hard-coded path)
    :return: dataframe with a 0/1 '<type>_inhaler' column added for each
        inhaler type, without the 'LABA-LAMA-ICS_inhaler' and 'Ignore_inhaler'
        columns
    """
    print('Mapping inhaler prescriptions')

    # Load in inhaler mapping
    with open(mapping_path) as json_file:
        inhaler_mapping = json.load(json_file)

    for inhaler_type, patterns in inhaler_mapping.items():
        column = inhaler_type + '_inhaler'
        if not patterns:
            # An empty list would join to '' which is contained in every
            # string, flagging all rows; treat it as "no matches" instead
            df[column] = 0
            continue
        # Patterns are OR-ed into one regular expression. na=False makes
        # missing drug names count as "no match" rather than producing NaN,
        # which cannot be cast to int
        df[column] = df.PI_Approved_Name.str.contains(
            '|'.join(patterns), na=False).astype(int)

    # Remove for now as empty; ignore them if the mapping no longer has them
    df = df.drop(['LABA-LAMA-ICS_inhaler', 'Ignore_inhaler'], axis=1,
                 errors='ignore')

    return df
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: name of a datetime column in dataframe
    :param eoy_date: EOY date from config; only its month and day are used
    :return: copy of df with its index reset to 0..n-1 and a datetime 'eoy'
        column holding the first occurrence of that month/day which is not
        before the value in dt_col
    """
    # Needed to stop error with creating a new column
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    event_dates = df[dt_col]

    # Build the EOY falling in each event's own calendar year directly as
    # datetimes. Going through datetime.date objects produces an object column
    # which cannot be order-compared with a datetime column in recent pandas
    same_year_eoy = pd.to_datetime(pd.DataFrame({
        'year': event_dates.dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))

    # Events after that year's EOY belong to the period ending a year later
    after_eoy = event_dates > same_year_eoy
    df['eoy'] = same_year_eoy.mask(
        after_eoy, same_year_eoy + pd.DateOffset(years=1))

    return df
def calc_presc_per_year(df):
    """
    Collapse the per-event data to one row per patient and EOY period
    --------
    :param df: per-event dataframe containing SafeHavenID, eoy and the
        columns listed below
    :return: dataframe indexed by (SafeHavenID, eoy) holding the last value
        of each running column followed by the '<name>_per_year' totals
    """
    print('Reducing to 1 row per year')

    per_period = df.groupby(['SafeHavenID', 'eoy'])

    # Columns where the last row of each period is carried forward
    snapshot_cols = ['presc_to_date', 'days_since_rescue', 'rescue_to_date',
                     'anxiety_depression_presc_to_date', 'rescue_date']
    snapshot = per_period[snapshot_cols].last()

    # Columns that are summed within each period
    counted_cols = ['SALBUTAMOL', 'SABA_inhaler', 'LABA_inhaler',
                    'LAMA_inhaler', 'SAMA_inhaler', 'ICS_inhaler',
                    'LABA-ICS_inhaler', 'LAMA +LABA-ICS_inhaler',
                    'SABA + SAMA_inhaler', 'MCS_inhaler', 'rescue_meds',
                    'presc', 'anxiety_depression_presc']
    yearly_totals = per_period[counted_cols].sum()
    yearly_totals.columns = [name + '_per_year' for name in counted_cols]

    # Both frames share the (SafeHavenID, eoy) index, so join side by side
    return snapshot.join(yearly_totals)
def main():
    """
    Run the prescribing preprocessing pipeline end to end
    --------
    Reads paths and the EOY date from ../../../config.json, processes the
    pharmacy extract and writes two pickles to the model data path:
    'validation_presc_proc.pkl' (one row per prescription event) and
    'presc_proc.pkl' (one row per patient and EOY period)
    """
    # Configuration
    with open('../../../config.json') as config_handle:
        settings = json.load(config_handle)

    # Raw pharmacy extract
    raw_csv = settings['extract_data_path'] + 'Pharmacy_Cohort3R.csv'
    events = initialize_presc_data(raw_csv)

    # Save first date in dataset (behaviour lives in the imported helper)
    out_dir = settings['model_data_path']
    first_patient_appearance(events, 'PRESC_DATE', 'presc', out_dir)

    # Inhaler type flags, then salbutamol and rescue medication tracking
    events = track_medication(add_inhaler_mappings(events))

    # Raw name/code columns are not needed past this point
    events = events.drop(
        columns=['PI_Approved_Name', 'PI_BNF_Item_Code', 'code'])

    # EOY period for every event, relative to the configured date
    events = add_eoy_column(events, 'PRESC_DATE', settings['date'])

    # Every row is one prescription event; the flag is summed per year later
    events['presc'] = 1

    # Add any historical count columns, one patient at a time
    by_patient = events.groupby('SafeHavenID')
    events = by_patient.apply(add_hist_adm_presc, 'presc', 'PRESC_DATE')
    events = events.reset_index(drop=True)

    # Per-event dataset
    events.to_pickle(out_dir + 'validation_presc_proc.pkl')

    # Yearly dataset with corrected column names
    yearly = calc_presc_per_year(events)
    yearly.columns = correct_column_names(yearly.columns, 'presc')
    yearly.to_pickle(out_dir + 'presc_proc.pkl')
# Run the pipeline whenever this file is executed. There is no __main__ guard,
# so importing this module also triggers main()
main()