copd-model-c / training /define_exacerbations_prologic.py

Initial release: 72-hour COPD exacerbation prediction model

e69d4e4 5 days ago

24.6 kB

	"""Collate all hospital, clincian verified and patient reported events and apply LOGIC."""
	import copd
	import numpy as np
	import os
	import pandas as pd

	# Directory holding the COPD service extract. Files addressed through data_dir are
	# read from here and every pickle this script saves is written here.
	data_dir = '<YOUR_DATA_PATH>/copd-dataset'

	############################################################################
	# Define model cohort and training data windows
	############################################################################

	# Read relevant info from the pipe-delimited patient details file. The delimiter is a
	# raw string because "\|" in a normal literal is an invalid escape sequence (a
	# SyntaxWarning on Python 3.12+); the value handed to pandas is unchanged.
	patient_details = pd.read_csv(
	    os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
	    usecols=['PatientId', 'FirstSubmissionDate', 'MostRecentSubmissionDate',
	             'DateOfBirth', 'Sex', 'StudyId'],
	    delimiter=r"\|")

	# Select patients for inclusion (those with up to date events in service)
	# Original RECEIVER cohort study ids RC01..RC84, minus RC34 which needs removing
	receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85)]
	receiver_patients.remove('RC34')
	# Scale up patients (subset): SU01..SU218 plus SU287
	scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)]
	scaleup_patients.append('SU287')

	# List of all valid patients for modelling
	valid_patients = receiver_patients + scaleup_patients

	# Match on StudyId with spaces stripped (the file contains e.g. "RC 26" and "RC 52").
	# NOTE(review): only the comparison is normalised; the StudyId column itself keeps any
	# spaces, so later joins on StudyId rely on the other sources using the same spelling
	# - confirm.
	in_cohort = patient_details.StudyId.str.replace(' ', '').isin(valid_patients)
	# Both ends of the patient's submission window must be known
	has_window = (patient_details.FirstSubmissionDate.notna() &
	              patient_details.MostRecentSubmissionDate.notna())
	# Take an explicit copy: columns are assigned onto this frame further down, and
	# assigning onto a filtered slice triggers pandas' chained-assignment warning.
	patient_details = patient_details[in_cohort & has_window].copy()

	# Latest date permitted for predictions, based on the events added to the service data
	patient_details['LatestPredictionDate'] = '2022-02-28'

	# Convert the three date columns to UTC midnight timestamps. Each column is parsed as
	# a whole (default axis=0): the previous row-wise apply called to_datetime once per
	# patient and made pandas infer a single format across columns that do not share one.
	date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
	patient_details[date_cols] = patient_details[date_cols].apply(
	    lambda col: pd.to_datetime(col, utc=True).dt.normalize())

	# Choose the earlier date out of the patient's last submission and the latest COPD data
	# events
	patient_details['LatestPredictionDate'] = patient_details[
	    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

	# Add N days to start of data window because predictions are made N days in advance
	# N=3 for the 72 hr exac model
	N = 3
	patient_details['EarliestPredictionDate'] = (
	    patient_details['FirstSubmissionDate'] + pd.DateOffset(days=N))

	# Remove any patients for whom the prediction start date overlaps the final submission
	# date, i.e. they have too short a window of data
	has_usable_window = (patient_details['EarliestPredictionDate'] <
	                     patient_details['LatestPredictionDate'])
	patient_details = patient_details[has_usable_window]

	# Identifiers of the remaining patients, used to filter every other data source
	model_patients = list(patient_details.PatientId.unique())
	model_study_ids = list(patient_details.StudyId.unique())

	# Report the cohort size, split by study id prefix (RC = RECEIVER, SU = scale up)
	is_receiver = patient_details['StudyId'].str.startswith('RC')
	is_scaleup = patient_details['StudyId'].str.startswith('SU')
	print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
	    len(model_patients),
	    len(patient_details[is_receiver]),
	    len(patient_details[is_scaleup])))

	# Per-patient details carried onto every daily record
	window_cols = ['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
	               'FirstSubmissionDate', 'LatestPredictionDate']
	df = patient_details[window_cols].copy()

	# Build one row per patient per day. Each patient's range starts N days before their
	# first submission and ends on their latest prediction date (inclusive).
	lead_in = pd.DateOffset(days=N)
	df["DateOfEvent"] = df.apply(
	    lambda row: pd.date_range(row.FirstSubmissionDate - lead_in,
	                              row.LatestPredictionDate, freq='D'),
	    axis=1)
	# Expand each date range into individual rows and renumber the index
	df = df.explode('DateOfEvent').reset_index(drop=True)

	############################################################################
	# Extract hospital exacerbations and admissions from COPD service data
	# Includes 1 year pre-onboarding plus time on Lenus COPD service
	############################################################################

	# Contains exacerbations among other event types. Delimiters are raw strings because
	# "\|" in a normal literal is an invalid escape sequence; the value is unchanged.
	patient_events = pd.read_csv(
	    os.path.join(data_dir, 'PatientEvents.txt'),
	    delimiter=r"\|",
	    usecols=['PatientId', 'DateOfEvent', 'EventTypeId'])

	# Filter for only patients in model cohort - will still contain events out of data windows
	patient_events = patient_events[patient_events.PatientId.isin(model_patients)]

	# Lookup table for patient event types, renamed so it joins on EventTypeId
	patient_event_types = pd.read_csv(
	    os.path.join(data_dir, 'PatientEventTypes.txt'),
	    delimiter=r"\|",
	    usecols=['Id', 'Name'])
	patient_event_types = patient_event_types.rename(
	    columns={'Id': 'EventTypeId', 'Name': 'EventName'})
	# Merge patient events with the lookup table (default inner join on EventTypeId)
	patient_events = patient_events.merge(patient_event_types, on='EventTypeId')

	# Identify hospital exacerbation events (community events excluded)
	patient_events['IsHospExac'] = copd.define_service_exac_event(
	    events=patient_events.EventName, include_community=False)
	# Identify hospital admissions (all causes)
	patient_events['IsHospAdmission'] = copd.define_hospital_admission(
	    patient_events.EventName)

	# Pull out the flagged rows. Explicit copies are taken because DateOfEvent is
	# reassigned below; assigning onto a filtered slice triggers pandas'
	# chained-assignment warning.
	admissions = patient_events.loc[
	    patient_events.IsHospAdmission == 1,
	    ['PatientId', 'DateOfEvent', 'IsHospAdmission']].copy()
	hosp_exacs = patient_events.loc[
	    patient_events.IsHospExac == 1,
	    ['PatientId', 'DateOfEvent', 'IsHospExac']].copy()

	# Normalise event timestamps to UTC midnight so they line up with the daily records
	admissions['DateOfEvent'] = pd.to_datetime(admissions.DateOfEvent,
	                                           utc=True).dt.normalize()
	hosp_exacs['DateOfEvent'] = pd.to_datetime(hosp_exacs.DateOfEvent,
	                                           utc=True).dt.normalize()

	# Keep at most one event of each kind per patient per day
	hosp_exacs = hosp_exacs.drop_duplicates()
	admissions = admissions.drop_duplicates()

	# Save hospital exacerbations and admissions data
	hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
	admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))

	##########################################################################################
	# Extract all rescue meds for model cohort in the year prior to onboarding. These will be
	# used as a proxy for community exacerbations pre-OB (not captured in service data)
	##########################################################################################

	# Read mapping between StudyId and SafeHavenID, and filter for model cohort.
	# NOTE: the path is relative to the working directory, and read_pickle can execute
	# arbitrary code while loading - only point this at a trusted file.
	id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
	id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

	# Read pharmacy data and filter for model cohort
	pharmacy = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA',
	                                    'Pharmacy_Cohort4.csv'))
	pharmacy = pharmacy[pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)]

	# Pull out rescue med prescriptions only.
	# NOTE(review): the original steroid list repeated four of these codes; the repeats
	# made no difference to isin() and are dropped here. Confirm none of them was a typo
	# for a different code.
	steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
	                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AABNBN']

	antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
	                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
	                    '0501013K0AAAJAJ']
	rescue_med_bnf_codes = steroid_codes + antibiotic_codes
	pharmacy = pharmacy[pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)]

	# Get latest and earliest dates for model cohort, keyed by SafeHavenID via StudyId
	cohort_dates = id_mapping.merge(
	    patient_details[['PatientId', 'StudyId', 'FirstSubmissionDate',
	                     'LatestPredictionDate']],
	    on='StudyId')

	# Attach the prescriptions to each patient and drop the columns not needed downstream
	pharmacy_exacs = cohort_dates.merge(pharmacy, on='SafeHavenID').drop(
	    columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description',
	             'DISP_DATE', 'SafeHavenID'])
	# The prescription date is used as the event date
	pharmacy_exacs = pharmacy_exacs.rename(columns={'PRESC_DATE': 'DateOfEvent'})
	pharmacy_exacs['DateOfEvent'] = pd.to_datetime(pharmacy_exacs['DateOfEvent'],
	                                               utc=True).dt.normalize()
	# Drop duplicates
	pharmacy_exacs = pharmacy_exacs.drop_duplicates()

	# Keep only rescue meds in the year before patient onboarding: on or after one year
	# before the first submission, and strictly before the first submission itself
	year_before_ob = pharmacy_exacs.FirstSubmissionDate - pd.DateOffset(years=1)
	in_pre_ob_year = ((pharmacy_exacs.DateOfEvent < pharmacy_exacs.FirstSubmissionDate) &
	                  (pharmacy_exacs.DateOfEvent >= year_before_ob))
	# Copy before adding a column: assigning onto a filtered slice triggers pandas'
	# chained-assignment warning.
	pharmacy_exacs = pharmacy_exacs[in_pre_ob_year].copy()

	# New column for rescue med exac type
	pharmacy_exacs['IsRescueMedExac'] = 1
	pharmacy_exacs = pharmacy_exacs.drop(
	    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

	# Save "pharmacy exacerbations" data
	pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))
	######################################################
	# Extract patient reported exacerbation events
	######################################################
	########################
	# Data post Q5 change
	########################

	# Read file containing patient reported events (not patient_events because it contains
	# the dates when patients answered PROs and not which date they reported as having taken
	# their rescue meds). The delimiter is a raw string because "\|" in a normal literal is
	# an invalid escape sequence; the value handed to pandas is unchanged.
	symptom_diary = pd.read_csv(
	    os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
	    usecols=['PatientId', 'StudyId', 'Score', 'SubmissionTime',
	             'SymptomDiaryQ5', 'SymptomDiaryQ11a', 'SymptomDiaryQ11b'],
	    delimiter=r"\|")

	# Cutoff date passed to the diary filter, restricted to the model cohort.
	# NOTE(review): presumably the filter keeps entries from the Q5 change onwards -
	# confirm in copd.filter_symptom_diary.
	Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)
	symptom_diary = copd.filter_symptom_diary(df=symptom_diary, date_cutoff=Q5ChangeDate,
	                                          patients=model_patients)

	# Derive the weekly rescue med responses and their event dates, then keep only the
	# columns merged onto the daily records later
	weekly_pros = copd.get_rescue_med_pro_responses(symptom_diary)
	weekly_pros = copd.set_pro_exac_dates(weekly_pros)
	weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
	                           'DateOfEvent', 'ExacDateUnknown']]

	#########################
	# Pre Q5 change events
	#########################

	# Spreadsheet headers mapped onto the column names used in this script
	receiver_columns = {'Study number': 'StudyId',
	                    'Exacerbation recorded': 'DateRecorded'}
	scaleup_columns = {'Study Number': 'StudyId',
	                   'Date Exacerbation recorded': 'DateRecorded'}

	# RECEIVER cohort - community events verified up to 16/03/21
	receiver = pd.read_excel(
	    './LenusEvents/breakdown_of_com_exac_160321.xlsx').rename(columns=receiver_columns)
	receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

	# Scale up cohort - community events verified up to 17/05/2021
	scaleup = pd.read_excel(
	    './LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx').rename(
	        columns=scaleup_columns)
	# Carry each study id down over the blank cells that follow it
	scaleup['StudyId'] = scaleup['StudyId'].ffill()
	scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

	# Stack the RECEIVER and scale up events into one df
	verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])

	####################################################################################
	# Merge hospital, patient reported and rescue med exacs with daily patient records
	#
	# Exacerbations occurring in Lenus service period include verified clinician events
	# pre-April 2021 (after onboarding) and community exacerbations recorded in weekly
	# PROs post-April 2021. Hospital exacs occur in year prior to OB and on Lenus service.
	# Rescue med exacs are only used for the year prior to OB.
	# Need to ensure each record has both StudyId and PatientId to prevent losing events
	######################################################################################

	# Patient reported, clinician verified. Left join: only events that fall on an existing
	# daily record (i.e. during COPD service time) are attached.
	df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')

	# Patient reported, new rescue med PRO (April 2021 onwards). Also a left join. Columns
	# shared with the previous merge come out with pandas' _x (clinician verified) and
	# _y (weekly PRO) suffixes, which are combined below.
	df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')

	# Hospital exacs (one year prior to OB plus time on service). Outer join so events
	# dated outside the daily records are kept; those new rows have no StudyId, so it is
	# filled in from the patient's other rows.
	df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
	df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

	# Pharmacy exacs (one year prior to OB up to OB only, outer join). These are keyed on
	# StudyId, so PatientId is the column filled in afterwards.
	df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
	df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')

	# Hospital admissions (one year prior to OB plus time on service, outer join).
	# NOTE(review): the original comment here called these respiratory admissions, but
	# they are flagged as all-cause where they are extracted - confirm which is intended.
	df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
	df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

	# Combine cols from individual datasets into one. The date is unknown if either patient
	# reported source says so; a community exac is any clinician verified, weekly PRO or
	# rescue med event. Comparisons against NaN are False, so unmatched rows become 0.
	df['ExacDateUnknown'] = np.where(
	    (df.ExacDateUnknown_x == 1) | (df.ExacDateUnknown_y == 1), 1, 0)
	df['IsCommExac'] = np.where(
	    (df.IsCommExac_x == 1) | (df.IsCommExac_y == 1) | (df.IsRescueMedExac == 1), 1, 0)

	# Column for whether an exacerbation of any kind occurred on each date. To be filtered
	# using (PRO) LOGIC
	df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)

	# Re-index to one row per patient per day, starting from each patient's earliest record
	# (which may be a pre-onboarding exac). Complete daily records are required for
	# calculating DaysSinceLastExac.
	daily = df.set_index('DateOfEvent').groupby('StudyId').resample('D').asfreq()
	# StudyId is restored from the group index, so the duplicate column is dropped first
	df = daily.drop('StudyId', axis=1).reset_index()

	# Binary flags: days inserted by the resample carry no event, so missing means zero
	binary_cols = ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac',
	               'ExacDateUnknown', 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']
	df[binary_cols] = df[binary_cols].fillna(0)

	# Per-patient columns that are filled in by StudyId to populate the entire df
	for patient_col in ('FirstSubmissionDate', 'LatestPredictionDate', 'PatientId'):
		df = copd.fill_column_by_patient(df=df, id_col='StudyId', col=patient_col)

	# Retain only dates up to and including the end of each patient's data window
	within_window = df.DateOfEvent <= df.LatestPredictionDate
	df = df[within_window]

	# Summary of the collated events before any PRO LOGIC filtering is applied.
	# Row masks reused by the counts below; pre/post onboarding is relative to each
	# patient's FirstSubmissionDate.
	is_exac = df.IsExac == 1
	is_hosp_exac = df.IsHospExac == 1
	is_rescue_med = df.IsRescueMedExac == 1
	pre_ob = df.DateOfEvent < df.FirstSubmissionDate
	post_ob = df.DateOfEvent >= df.FirstSubmissionDate

	print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
	print('Exacerbations pre-onboarding to COPD service: {}'.format(
	    len(df[is_exac & pre_ob])))
	print('Exacerbations post-onboarding to COPD service: {}'.format(
	    len(df[is_exac & post_ob])))
	print('Number of unique exacerbation patients: {}'.format(
	    len(df[is_exac].PatientId.unique())))

	# Multi-line messages use adjacent string literals. A backslash continuation inside a
	# literal embeds the next source line's indentation in the printed text.
	print('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, '
	      '{} prescription dates overlapping with hospital events)'
	      .format(len(df[is_rescue_med]),
	              len(df[is_rescue_med].StudyId.unique()),
	              len(df[is_rescue_med & is_hosp_exac])))
	print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
	      .format(len(df[is_hosp_exac & pre_ob]),
	              len(df[is_hosp_exac & pre_ob].StudyId.unique())))
	print('Hospital exacerbations post-OB: {} ({} unique patients)'
	      .format(len(df[is_hosp_exac & post_ob]),
	              len(df[is_hosp_exac & post_ob].StudyId.unique())))
	# _x columns come from the clinician verified merge, _y from the weekly PRO merge
	print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
	      .format(len(df[df.IsCommExac_x == 1]),
	              len(df[df.IsCommExac_x == 1].StudyId.unique())))
	print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
	      .format(len(df[df.IsCommExac_y == 1]),
	              len(df[df.IsCommExac_y == 1].StudyId.unique())))

	print('Number of patient reported exacerbations with unknown dates: {} ({} overlapping '
	      'with hospital events)'
	      .format(df.ExacDateUnknown.sum(),
	              len(df[is_hosp_exac & (df.ExacDateUnknown == 1)])))

	# Check for any patient reported events with unknown dates that occurred on the same day
	# as a hospital event. Hospital events are trusted so set the date to known
	df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), 'ExacDateUnknown'] = 0
	print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))

	# The per-source columns have been combined and reported, so they are no longer needed
	df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
	                      'ExacDateUnknown_y'])

	############################################################################
	# Implement PRO LOGIC on hospital and patient reported exacerbation events
	#
	# Steps in this section, in order (each depends on the previous one):
	#   1. Discard rescue med prescriptions that closely follow a previous prescription.
	#   2. Discard pre-onboarding exacerbations that closely follow a previous exacerbation.
	#   3. Flag the rows in the exclusion period after each remaining exacerbation.
	############################################################################

	# Define min and max days for PRO LOGIC. No predictions made or data used within
	# logic_min_days after an exacerbation. Events falling between logic_min_days and
	# logic_max_days after an event are subject to the weekly rescue med LOGIC criterion
	logic_min_days = 14
	logic_max_days = 35

	# Calculate the days since last rescue med prescription, separately for each StudyId.
	# reset_index(drop=True) discards the group keys added by groupby().apply().
	df = df.groupby('StudyId').apply(
	lambda x: copd.calculate_days_since_last_event(
	df=x, event_col='IsRescueMedExac',
	output_col='DaysSinceLastRescueMeds')).reset_index(drop=True)

	# Report how many prescriptions fall within rescue_med_min_days of a previous one.
	# NOTE(review): the "> -1" test presumably skips a -1 sentinel meaning "no previous
	# event" - confirm in copd.calculate_days_since_last_event.
	rescue_med_min_days = 7
	print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
	.format(rescue_med_min_days,
	len(df[(df.DaysSinceLastRescueMeds > -1) &
	(df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
	(df.IsRescueMedExac == 1)])))

	# Reset IsExac to 0 for rescue med prescriptions within 7 days of a previous prescription.
	# NOTE(review): the mask does not look at IsHospExac, so a hospital exacerbation on the
	# same date as such a prescription is also reset - confirm this is intended.
	df.loc[(df.DaysSinceLastRescueMeds > -1) &
	(df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
	(df.IsRescueMedExac == 1), 'IsExac'] = 0

	# Calculate the days since the previous exacerbation for all patient days. Now includes
	# events before patient onboarding
	df = df.groupby('StudyId').apply(
	lambda x: copd.calculate_days_since_last_event(
	df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)

	# Report pre-onboarding exacerbations that fall within pre_onboarding_min_days of a
	# previous exacerbation
	pre_onboarding_min_days = 14
	print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
	.format(pre_onboarding_min_days,
	len(df[(df.IsExac == 1) &
	(df.DaysSinceLastExac > -1) &
	(df.DaysSinceLastExac <= pre_onboarding_min_days) &
	(df.DateOfEvent < df.FirstSubmissionDate)])))

	# Set IsExac to 0 for any pre-OB exacs within 14 days of a previous exac. The mask has no
	# IsExac == 1 term, so it also covers non-event rows, where writing 0 changes nothing.
	df.loc[(df.DaysSinceLastExac > -1) & (df.DaysSinceLastExac <= pre_onboarding_min_days) &
	(df.DateOfEvent < df.FirstSubmissionDate), 'IsExac'] = 0

	# Recalculate DaysSinceLastExac to avoid counting exacs removed above
	df = df.groupby('StudyId').apply(
	lambda x: copd.calculate_days_since_last_event(
	df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)

	# Apply exclusion period following all exacerbations; RemoveRow marks the rows that are
	# filtered out later with "RemoveRow != 1"
	df['RemoveRow'] = copd.minimum_period_between_exacerbations(
	df, minimum_days=logic_min_days)
	# Don't apply this criterion to pre-OB events (already accounted for above)
	df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0

	# Report the post-onboarding exacerbations that fall inside an exclusion period
	print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
	logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1) &
	(df.DateOfEvent >= df.FirstSubmissionDate)])))

	# Apply criterion for negative weekly Q5 responses - doesn't capture anything post Q5
	# change. The RemoveExac column used below is expected to come back from this call.
	consecutive_replies = 2
	df = copd.apply_logic_response_criterion(df,
	                                         minimum_period=logic_min_days,
	                                         maximum_period=logic_max_days,
	                                         N=consecutive_replies)

	# The message is built from adjacent string literals. A backslash continuation inside
	# a literal embeds the next source line's indentation in the printed text.
	print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} '
	      'days after a previous event. {} consecutive negative replies required for the '
	      'event to count as a new event'
	      .format(logic_min_days, logic_max_days, consecutive_replies))
	# Don't apply this criterion to pre-OB events (already accounted for above)
	df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0

	print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
	    df.RemoveExac.sum()))
	excluded_dates = df[df.RemoveExac == 1].DateOfEvent
	print('Earliest and latest exacerbations excluded: {}, {}'.format(
	    excluded_dates.min(), excluded_dates.max()))

	# Rows that survive both PRO LOGIC criteria (exclusion period and Q5 responses)
	kept = (df.RemoveRow != 1) & (df.RemoveExac != 1)
	print('Remaining post-OB exacerbations: {}'.format(
	    len(df[(df.IsExac == 1) & kept &
	           (df.DateOfEvent >= df.FirstSubmissionDate)])))

	print('Remaining exacerbations with unknown dates: {}'.format(
	    len(df[(df.ExacDateUnknown == 1) & kept])))

	# Remove data between segments of prolonged events, count only first occurrence
	df = copd.remove_data_between_exacerbations(df)

	# Remove 7 days before each reported exacerbation with unknown date (meds in last week)
	df = copd.remove_unknown_date_exacerbations(df)

	# Separate frame, used only for the event breakdown, with unwanted rows removed. The
	# main df keeps every daily row for now: the prediction window is set on the df index
	# rather than on dates, so it needs the full daily df in case an event occurs
	# immediately after the exclusion period.
	rows_to_count = (df.RemoveRow != 1) & (df.DateOfEvent >= df.FirstSubmissionDate)
	df_counts = df[rows_to_count].copy()

	print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))

	# Distinct study ids with at least one counted exacerbation, split by cohort prefix
	exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
	n_receiver = exac_patients.str.startswith('RC').sum()
	n_scaleup = exac_patients.str.startswith('SU').sum()
	print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
	    len(exac_patients), n_receiver, n_scaleup))

	# Days flagged as both a community and a hospital exacerbation
	both_sources = (df_counts.IsCommExac == 1) & (df_counts.IsHospExac == 1)
	print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
	      .format(df_counts.IsHospExac.sum(), df_counts.IsCommExac.sum(),
	              len(df_counts.loc[both_sources])))

	#################################################################
	# Set the prediction window to N days and remove unwanted rows
	# Calculate rolling exac counts before removing pre-OB events
	#################################################################
	# Create column of exacerbations to use for rolling counts: IsExac where the row
	# survives both PRO LOGIC criteria, otherwise 0
	df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)

	# Calculate rolling 365 day sums of exacerbations and hospital admissions
	df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent', col='ExacsToKeep',
	                                      id_col='StudyId', window=365,
	                                      output_col='ExacsPrevYear')
	df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent',
	                                      col='IsHospAdmission', id_col='StudyId',
	                                      window=365, output_col='AdmissionsPrevYear')

	# Filter for data in the training data window (first submission date onwards) and drop
	# the rows excluded by PRO LOGIC
	df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]

	print('Setting {} day prediction window'.format(N))
	df = copd.set_prediction_window(df=df, prediction_window=N)

	# Count positive days directly. The previous value_counts()[1] lookup raised KeyError
	# when no exacerbation days remained. The percentage keeps the value_counts
	# semantics: positives over rows with a non-null IsExac.
	n_exac_days = int((df.IsExac == 1).sum())
	n_labelled_days = int(df.IsExac.notna().sum())
	pct_exac_days = 100 * n_exac_days / n_labelled_days if n_labelled_days else 0.0
	print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
	    n_exac_days, len(df), pct_exac_days))

	################
	# Save data
	################
	# Columns written to the modelling dataset, in output order
	output_cols = ['PatientId', 'StudyId', 'DateOfBirth', 'Sex', 'DateOfEvent', 'IsExac',
	               'DaysSinceLastExac', 'FirstSubmissionDate', 'LatestPredictionDate',
	               'ExacsPrevYear', 'AdmissionsPrevYear']
	df = df[output_cols]

	# Write the daily exacerbation records and the cohort details next to the input data
	exac_data_path = os.path.join(data_dir, 'exac_data.pkl')
	patient_details_path = os.path.join(data_dir, 'patient_details.pkl')
	df.to_pickle(exac_data_path)
	patient_details.to_pickle(patient_details_path)