copd-model-e / validation /event_tracking /model_e_exacerbation_calculations.py

Model E: Unsupervised PCA + clustering risk stratification

53a6def 3 days ago

5.12 kB

	# Import libraries
	import pandas as pd
	import numpy as np

	# Set file paths
	input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
	output_file_path = '<YOUR_DATA_PATH>/summary_files/'

	# BNF item codes of the steroid prescriptions counted as rescue medication.
	# Each code is listed once: repeated entries add nothing to the match
	# pattern built from this list.
	# NOTE(review): confirm that no other distinct code was intended where
	# entries used to be repeated.
	steroid_codes = [
	    '0603020T0AAACAC',
	    '0603020T0AABKBK',
	    '0603020T0AAAXAX',
	    '0603020T0AAAGAG',
	    '0603020T0AABHBH',
	    '0603020T0AABNBN',
	]

	# BNF item codes of the antibiotic prescriptions counted as rescue medication
	antib_codes = [
	    '0501013B0AAAAAA',
	    '0501013B0AAABAB',
	    '0501030I0AAABAB',
	    '0501030I0AAAAAA',
	    '0501050B0AAAAAA',
	    '0501050B0AAADAD',
	    '0501013K0AAAJAJ',
	]

	# All codes whose prescription is treated as an exacerbation event
	exac_meds = steroid_codes + antib_codes


	def read_data(file, cols, types):
	    """
	    Load selected columns of a CSV file with explicit column types
	    --------
	    :param file: string filename of the CSV file
	    :param cols: string list of the column names to load
	    :param types: string list of column types, matched to cols by position
	    :return: dataframe containing only the requested columns
	    """
	    # Pair every requested column with the dtype it should be read as
	    column_dtypes = {name: dtype for name, dtype in zip(cols, types)}
	    return pd.read_csv(
	        file,
	        usecols=cols,
	        dtype=column_dtypes,
	        encoding="cp1252",
	    )


	def initialize_presc_data(presc_file):
	    """
	    Read the prescribing dataset and tidy it for analysis
	    --------
	    :param presc_file: prescribing data file name
	    :return: prescribing dataframe without null or duplicated rows and
	        with 'PRESC_DATE' converted to datetimes
	    """
	    print('Loading prescribing data')

	    # Columns to load, each paired with the type it is read as
	    column_types = {
	        'SafeHavenID': 'int',
	        'PRESC_DATE': 'object',
	        'PI_Approved_Name': 'str',
	        'PI_BNF_Item_Code': 'str',
	    }
	    raw = read_data(presc_file, list(column_types.keys()),
	                    list(column_types.values()))

	    # Remove rows containing nulls first, then exact duplicate rows
	    cleaned = raw.dropna().drop_duplicates()

	    # Parse the prescription dates
	    cleaned['PRESC_DATE'] = pd.to_datetime(cleaned['PRESC_DATE'])

	    return cleaned


	def track_medication(df, codes=None):
	    """
	    Flag rescue medication prescriptions
	    --------
	    :param df: prescribing dataframe with a string 'PI_BNF_Item_Code'
	        column; the 'code' and 'rescue_meds' columns are added in place
	    :param codes: optional list of BNF item codes to flag; when omitted
	        the module-level exac_meds list is used
	    :return: the same dataframe with 'code' (first 9 characters of the BNF
	        item code) and 'rescue_meds' (1 if the item code contains any of
	        the tracked codes, otherwise 0)
	    """
	    import re

	    print('Tracking medication')

	    if codes is None:
	        codes = exac_meds
	    codes = list(codes)

	    # Extract BNF codes without brand info
	    df['code'] = df.PI_BNF_Item_Code.str[:9]

	    # Track rescue meds
	    if codes:
	        # The codes must be joined with a bare '|' to form a regex
	        # alternation. An escaped '\|' is a literal pipe character, which
	        # makes the pattern match nothing. re.escape keeps every code a
	        # literal substring.
	        pattern = '|'.join(re.escape(code) for code in codes)
	        is_rescue = df.PI_BNF_Item_Code.str.contains(
	            pattern, regex=True, na=False)
	        df['rescue_meds'] = is_rescue.astype(int)
	    else:
	        # An empty pattern would match every row, so flag nothing instead
	        df['rescue_meds'] = 0

	    return df


	def filter_data(data, date):
	    """
	    Filter data to only include rescue med prescriptions occurring
	    on or after the index date
	    --------
	    :param data: dataframe with 'PRESC_DATE' and 'rescue_meds' columns
	    :param date: Index date in 'DD-MM-YYYY' format
	    :return: filtered copy of the dataframe with 'PRESC_DATE' as datetimes;
	        the dataframe passed in is left unmodified
	    """
	    # Parse the index date in its documented day-first format so that an
	    # ambiguous value such as '05-03-2020' is not read month-first. Any
	    # other input falls back to pandas' default parsing.
	    try:
	        index_date = pd.to_datetime(date, format='%d-%m-%Y')
	    except (ValueError, TypeError):
	        index_date = pd.to_datetime(date)

	    presc_dates = pd.to_datetime(data['PRESC_DATE'])
	    keep = (presc_dates >= index_date) & (data['rescue_meds'] == 1)

	    # Work on a copy so the caller's dataframe is not modified
	    filtered = data[keep].copy()
	    filtered['PRESC_DATE'] = presc_dates[keep]
	    return filtered


	def calculate_time_to_first_exacerbation(data, date):
	    """
	    Calculate days to first exacerbation
	    --------
	    :param data: dataframe with 'SafeHavenID' and datetime 'PRESC_DATE'
	        columns, one row per exacerbation event
	    :param date: Index date in 'DD-MM-YYYY' format
	    :return: dataframe indexed by SafeHavenID with the columns 'first_exac'
	        (earliest PRESC_DATE), 'index_date' and 'days_to_first_exac'
	        (whole days from the index date to the first exacerbation)
	    """
	    # Parse the index date in its documented day-first format so that an
	    # ambiguous value such as '05-03-2020' is not read month-first. Any
	    # other input falls back to pandas' default parsing.
	    try:
	        index_date = pd.to_datetime(date, format='%d-%m-%Y')
	    except (ValueError, TypeError):
	        index_date = pd.to_datetime(date)

	    # Use the string alias 'min': newer pandas warns when a numpy
	    # callable such as np.min is passed to agg
	    first_exac = data.groupby('SafeHavenID').agg(
	        first_exac=('PRESC_DATE', 'min'))
	    first_exac['index_date'] = index_date
	    first_exac['days_to_first_exac'] = (
	        first_exac['first_exac'] - first_exac['index_date']).dt.days
	    return first_exac


	def calculate_exac_count_1_year(data, year_censor, first_exac_df):
	    """
	    Calculate the number of exacerbations in the year following the index date
	    and join this data to the time to first exacerbation data for each ID
	    --------
	    :param data: dataframe containing exacerbation dates (based on rescue meds)
	    :param year_censor: date 1 year following Index date 'DD-MM-YYYY' format
	    :param first_exac_df: dataframe showing days to first exacerbations for IDs
	    :return: outer join of first_exac_df with 'exac_count_year_post_index',
	        the number of distinct exacerbation dates before year_censor for
	        each ID (0 for IDs with none in that period)
	    """
	    # Parse the censor date in its documented day-first format so that an
	    # ambiguous value such as '05-03-2021' is not read month-first. Any
	    # other input falls back to pandas' default parsing.
	    try:
	        censor_date = pd.to_datetime(year_censor, format='%d-%m-%Y')
	    except (ValueError, TypeError):
	        censor_date = pd.to_datetime(year_censor)

	    presc_year = data[data['PRESC_DATE'] < censor_date]

	    # Several prescriptions on the same day count as one exacerbation
	    year_exac_count = presc_year.groupby('SafeHavenID').agg(
	        exac_count_year_post_index=('PRESC_DATE', 'nunique'))

	    # The outer join keeps IDs whose events all fall outside the window
	    all_exac_data = pd.merge(year_exac_count, first_exac_df,
	                             on="SafeHavenID", how="outer")
	    all_exac_data['exac_count_year_post_index'] = (
	        all_exac_data['exac_count_year_post_index'].fillna(0))
	    return all_exac_data


	def main():
	    """
	    Run the exacerbation pipeline: load the prescribing data, flag rescue
	    med prescriptions, summarise time to first exacerbation and the number
	    of exacerbations in the follow up year, then save both outputs
	    --------
	    :return: None
	    """
	    # Index date and the date one year later, both in 'DD-MM-YYYY' format
	    index_date = '01-01-2020'
	    year_censor = '01-01-2021'

	    # Initialise prescription data
	    presc = initialize_presc_data(input_file_path + 'Pharmacy_Cohort3R.csv')

	    # Track rescue med prescriptions
	    presc = track_medication(presc)

	    # Filter to only include exacerbation events (rescue med prescriptions)
	    # occurring on or after the index date
	    presc = filter_data(presc, index_date)

	    # Calculate time to first exacerbation
	    first_exac = calculate_time_to_first_exacerbation(presc, index_date)

	    # Calculate number of exacerbations in the follow up year and join this
	    # to the time to first exacerbation data
	    first_exac = calculate_exac_count_1_year(presc, year_censor, first_exac)

	    # Save data
	    presc.to_csv(output_file_path + 'all_exacerbations_from_index_date.csv')
	    first_exac.to_pickle(output_file_path + 'community_managed_exacerbations_cohort_summary.pkl')


	# Only run the pipeline when executed as a script, not when imported
	if __name__ == "__main__":
	    main()