copd-model-e / training /src /processing /process_prescribing.py

Model E: Unsupervised PCA + clustering risk stratification

53a6def 3 days ago

4.42 kB

	"""
	Script for preprocessing pharmacy data
	--------
	Process pharmacy data and track inhaler prescriptions and rescue meds
	"""
	import json
	import pandas as pd
	from datetime import date
	from dateutil.relativedelta import relativedelta
	from utils.common import (add_hist_adm_presc, correct_column_names,
	first_patient_appearance)
	from utils.presc_common import initialize_presc_data, track_medication


	def add_inhaler_mappings(df):
	"""
	Load inhaler prescription mappings and track where they appear in the data
	--------
	:param df: dataframe
	:return: dataframe with column added for each inhaler type
	"""
	print('Mapping inhaler prescriptions')

	# Load in inhaler mapping
	with open('mappings/inhaler_mapping.json') as json_file:
	inhaler_mapping = json.load(json_file)

	for k, v in inhaler_mapping.items():
	df[k + '_inhaler'] = df.PI_Approved_Name.str.contains(
	'\|'.join(v)).astype(int)

	# Remove for now as empty
	df = df.drop(['LABA-LAMA-ICS_inhaler', 'Ignore_inhaler'], axis=1)

	return df


	def add_eoy_column(df, dt_col, eoy_date):
	"""
	Add EOY relative to user-specified end date
	--------
	:param df: dataframe
	:param dt_col: date column in dataframe
	:param eoy_date: EOY date from config
	:return: updated df with EOY column added
	"""
	# Needed to stop error with creating a new column
	df = df.reset_index(drop=True)

	# Add column with user-specified end of year date
	end_date = pd.to_datetime(eoy_date)
	end_month = end_date.month
	end_day = end_date.day

	# Add for every year
	df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year]

	# Check that EOY date is after dt_col for each entry
	eoy_index = df.columns[df.columns == 'eoy']
	adm_vs_eoy = df[dt_col] > df.eoy
	row_index = df.index[adm_vs_eoy]
	df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1)
	df['eoy'] = pd.to_datetime(df.eoy)

	return df


	def calc_presc_per_year(df):
	"""
	Reduce data to 1 row per year
	--------
	:param df: dataframe to reduced
	:return: reduced dataframe
	"""
	print('Reducing to 1 row per year')

	# Add end of year columns
	eoy_cols = ['presc_to_date', 'days_since_rescue', 'rescue_to_date',
	'anxiety_depression_presc_to_date', 'rescue_date']
	last = df.groupby(['SafeHavenID', 'eoy'])[eoy_cols].last()

	# Total columns
	sum_cols = ['SALBUTAMOL', 'SABA_inhaler', 'LABA_inhaler', 'LAMA_inhaler',
	'SAMA_inhaler', 'ICS_inhaler', 'LABA-ICS_inhaler',
	'LAMA +LABA-ICS_inhaler', 'SABA + SAMA_inhaler',
	'MCS_inhaler', 'rescue_meds', 'presc', 'anxiety_depression_presc']
	total_cols = [col + '_per_year' for col in sum_cols]
	total = df.groupby(['SafeHavenID', 'eoy'])[sum_cols].sum()
	total.columns = total_cols

	# Join together
	results = last.join(total)

	return results


	def main():

	# Load in config items
	with open('../../../config.json') as json_config_file:
	config = json.load(json_config_file)

	# Load in data
	presc_file = config['extract_data_path'] + 'Pharmacy_Cohort3R.csv'
	presc = initialize_presc_data(presc_file)

	# Save first date in dataset
	data_path = config['model_data_path']
	first_patient_appearance(presc, 'PRESC_DATE', 'presc', data_path)

	# Add inhaler mapping
	presc = add_inhaler_mappings(presc)

	# Track salbutamol and rescue meds
	presc = track_medication(presc)

	# Drop columns
	cols_2_drop = ['PI_Approved_Name', 'PI_BNF_Item_Code', 'code']
	presc = presc.drop(cols_2_drop, axis=1)

	# Add column relative to user-specified date
	presc = add_eoy_column(presc, 'PRESC_DATE', config['date'])

	# Track rows which are admissions
	presc['presc'] = 1

	# Add any historical count columns
	presc = presc.groupby('SafeHavenID').apply(
	add_hist_adm_presc, 'presc', 'PRESC_DATE')
	presc = presc.reset_index(drop=True)

	# Save per event dataset
	presc.to_pickle(data_path + 'validation_presc_proc.pkl')

	# Reduce data to 1 row per year
	presc_yearly = calc_presc_per_year(presc)

	# Correct column names
	presc_yearly.columns = correct_column_names(presc_yearly.columns, 'presc')

	# Save data
	presc_yearly.to_pickle(data_path + 'presc_proc.pkl')


	main()