"""Collate all hospital, clinician verified and patient reported events and apply LOGIC.

The script runs top to bottom and:

1. Defines the model cohort and each patient's prediction window from
   ``CopdDatasetPatientDetails.txt``.
2. Extracts hospital exacerbations and admissions from ``PatientEvents.txt``.
3. Extracts rescue medication prescriptions in the year before onboarding from the
   pharmacy extract (used as a proxy for pre-onboarding community exacerbations).
4. Extracts patient reported exacerbations (weekly PROs after the Q5 change, clinician
   verified spreadsheets before it).
5. Merges everything onto one row per patient per day and applies the PRO LOGIC
   exclusion criteria implemented in the ``copd`` module.
6. Sets the N day prediction window and saves the result.

Files written to ``data_dir``: ``hospital_exacerbations.pkl``, ``admissions.pkl``,
``pharmacy_exacerbations.pkl``, ``exac_data.pkl`` and ``patient_details.pkl``.
"""
import os

import numpy as np
import pandas as pd

import copd

data_dir = '/copd-dataset'

############################################################################
# Define model cohort and training data windows
############################################################################

# Read relevant info from patient details
patient_details = pd.read_csv(
    os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
    usecols=['PatientId', 'FirstSubmissionDate', 'MostRecentSubmissionDate',
             'DateOfBirth', 'Sex', 'StudyId'],
    delimiter="|")

# Select patients for inclusion (those with up to date events in service)
# Original RECEIVER cohort study id list
receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85)]
# This patient needs removing
receiver_patients.remove('RC34')

# Scale up patients (subset)
scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)]
scaleup_patients.append('SU287')

# List of all valid patients for modelling
valid_patients = receiver_patients + scaleup_patients

# Filter for valid patients accounting for white spaces in StudyId (e.g. RC 26 and
# RC 52).
# NOTE(review): only the comparison strips spaces; the StudyId column itself keeps
# them, so later `isin`/merge operations keyed on StudyId see e.g. 'RC 26'. Confirm
# that the other StudyId sources use the same spelling, otherwise those patients'
# events will not match.
patient_details = patient_details[
    patient_details.StudyId.str.replace(' ', '').isin(valid_patients)]

# Select only non null entries in patient data start/end dates. Copy so that the
# column assignments below act on an independent frame rather than a slice.
patient_details = patient_details[
    (patient_details.FirstSubmissionDate.notna())
    & (patient_details.MostRecentSubmissionDate.notna())].copy()

# Create a column stating the latest date permitted based on events added to service
# data
patient_details['LatestPredictionDate'] = '2022-02-28'

# Parse each date column as a whole (UTC, truncated to midnight) rather than row by
# row, so every column is parsed with a single consistent format.
date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda col: pd.to_datetime(col, utc=True).dt.normalize())

# Choose the earlier date out of the patient's last submission and the latest COPD data
# events
patient_details['LatestPredictionDate'] = patient_details[
    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

# Add N days to start of data window because predictions are made N days in advance
# N=3 for the 72 hr exac model
N = 3
patient_details['EarliestPredictionDate'] = (
    patient_details['FirstSubmissionDate'] + pd.DateOffset(days=N))

# Remove any patients for whom the prediction start date overlaps the final submission
# date, i.e. they have too short a window of data
patient_details = patient_details[
    patient_details['EarliestPredictionDate'] < patient_details['LatestPredictionDate']]

# List of remaining patients
model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
    len(model_patients),
    len(patient_details[patient_details['StudyId'].str.startswith('RC')]),
    len(patient_details[patient_details['StudyId'].str.startswith('SU')])))

df = patient_details[['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
                      'FirstSubmissionDate', 'LatestPredictionDate']].copy()

# Create a dataframe with daily entries for each patient for their data window. The
# range starts N days before FirstSubmissionDate and ends at LatestPredictionDate.
df["DateOfEvent"] = df.apply(
    lambda x: pd.date_range(x.FirstSubmissionDate - pd.DateOffset(days=N),
                            x.LatestPredictionDate, freq='D'),
    axis=1)
df = df.explode('DateOfEvent').reset_index(drop=True)

############################################################################
# Extract hospital exacerbations and admissions from COPD service data
# Includes 1 year pre-onboarding plus time on Lenus COPD service
############################################################################

# Contains exacerbations among other event types
patient_events = pd.read_csv(
    os.path.join(data_dir, 'PatientEvents.txt'),
    delimiter="|",
    usecols=['PatientId', 'DateOfEvent', 'EventTypeId'])

# Filter for only patients in model cohort - will still contain events out of data
# windows
patient_events = patient_events[patient_events.PatientId.isin(model_patients)]

# Lookup table for patient event types
patient_event_types = pd.read_csv(
    os.path.join(data_dir, 'PatientEventTypes.txt'),
    delimiter="|",
    usecols=['Id', 'Name'])
patient_event_types = patient_event_types.rename(
    columns={'Id': 'EventTypeId', 'Name': 'EventName'})

# Merge patient events with lookup table
patient_events = patient_events.merge(patient_event_types, on='EventTypeId')

# Identify hospital exacerbation events
patient_events['IsHospExac'] = copd.define_service_exac_event(
    events=patient_events.EventName, include_community=False)

# Identify hospital admissions (all causes)
patient_events['IsHospAdmission'] = copd.define_hospital_admission(
    patient_events.EventName)

# Copy the filtered selections so DateOfEvent can be reassigned safely
admissions = patient_events[patient_events.IsHospAdmission == 1][
    ['PatientId', 'DateOfEvent', 'IsHospAdmission']].copy()
hosp_exacs = patient_events[patient_events.IsHospExac == 1][
    ['PatientId', 'DateOfEvent', 'IsHospExac']].copy()

admissions['DateOfEvent'] = pd.to_datetime(
    admissions.DateOfEvent, utc=True).dt.normalize()
hosp_exacs['DateOfEvent'] = pd.to_datetime(
    hosp_exacs.DateOfEvent, utc=True).dt.normalize()

# After normalising to dates, keep at most one event of each type per patient per day
hosp_exacs = hosp_exacs.drop_duplicates()
admissions = admissions.drop_duplicates()

# Save hospital exacerbations and admissions data
hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))

##########################################################################################
# Extract all rescue meds for model cohort in the year prior to onboarding. These will be
# used as a proxy for community exacerbations pre-OB (not captured in service data)
##########################################################################################

# Read mapping between StudyId and SafeHavenID, and filter for model cohort
# NOTE(review): read_pickle executes arbitrary code on load; only use with trusted files.
id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

# Read pharmacy data and filter for model cohort
pharmacy = pd.read_csv(os.path.join('/EXAMPLE_STUDY_DATA', 'Pharmacy_Cohort4.csv'))
pharmacy = pharmacy[pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)]

# Pull out rescue med prescriptions only (BNF item codes)
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AABNBN']
antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
                    '0501013K0AAAJAJ']
rescue_med_bnf_codes = steroid_codes + antibiotic_codes
pharmacy = pharmacy[pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)]

# Get latest and earliest dates for model cohort
cohort_dates = id_mapping.merge(
    patient_details[['PatientId', 'StudyId', 'FirstSubmissionDate',
                     'LatestPredictionDate']],
    on='StudyId')

# Merge and keep only rescue meds in the year before patient onboarding
pharmacy_exacs = cohort_dates.merge(pharmacy, on='SafeHavenID').drop(
    columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description', 'DISP_DATE',
             'SafeHavenID'])
pharmacy_exacs = pharmacy_exacs.rename(columns={'PRESC_DATE': 'DateOfEvent'})
pharmacy_exacs['DateOfEvent'] = pd.to_datetime(
    pharmacy_exacs['DateOfEvent'], utc=True).dt.normalize()

# Drop duplicates
pharmacy_exacs = pharmacy_exacs.drop_duplicates()

# Filter on dates: [FirstSubmissionDate - 1 year, FirstSubmissionDate). Copy so the
# new column below is added to an independent frame.
pharmacy_exacs = pharmacy_exacs[
    (pharmacy_exacs.DateOfEvent < pharmacy_exacs.FirstSubmissionDate)
    & (pharmacy_exacs.DateOfEvent
       >= pharmacy_exacs.FirstSubmissionDate - pd.DateOffset(years=1))].copy()

# New column for rescue med exac type
pharmacy_exacs['IsRescueMedExac'] = 1
pharmacy_exacs = pharmacy_exacs.drop(
    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

# Save "pharmacy exacerbations" data
pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))

######################################################
# Extract patient reported exacerbation events
######################################################

########################
# Data post Q5 change
########################

# Read file containing patient reported events (not patient_events because it contains
# the dates when patients answered PROs and not which date they reported as having taken
# their rescue meds)
symptom_diary = pd.read_csv(
    os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
    usecols=['PatientId', 'StudyId', 'Score', 'SubmissionTime', 'SymptomDiaryQ5',
             'SymptomDiaryQ11a', 'SymptomDiaryQ11b'],
    delimiter="|")

Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)
symptom_diary = copd.filter_symptom_diary(
    df=symptom_diary, date_cutoff=Q5ChangeDate, patients=model_patients)

weekly_pros = copd.get_rescue_med_pro_responses(symptom_diary)
weekly_pros = copd.set_pro_exac_dates(weekly_pros)
weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
                           'DateOfEvent', 'ExacDateUnknown']]

#########################
# Pre Q5 change events
#########################

# RECEIVER cohort - community events verified up to 16/03/21
receiver = pd.read_excel('./LenusEvents/breakdown_of_com_exac_160321.xlsx')
receiver = receiver.rename(columns={'Study number': 'StudyId',
                                    'Exacerbation recorded': 'DateRecorded'})
receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

# Scale up cohort - community events verified up to 17/05/2021
scaleup = pd.read_excel('./LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx')
scaleup = scaleup.rename(columns={'Study Number': 'StudyId',
                                  'Date Exacerbation recorded': 'DateRecorded'})
# Forward fill StudyId so rows with a blank id take the id from the row above
scaleup['StudyId'] = scaleup['StudyId'].ffill()
scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

# Combine RECEIVER and scale up events into one df
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])

######################################################################################
# Merge hospital, patient reported and rescue med exacs with daily patient records
#
# Exacerbations occurring in Lenus service period include verified clinician events
# pre-April 2021 (after onboarding) and community exacerbations recorded in weekly
# PROs post-April 2021. Hospital exacs occur in year prior to OB and on Lenus service.
# Rescue med exacs are only used for the year prior to OB.
# Need to ensure each record has both StudyId and PatientId to prevent losing events
######################################################################################

# Patient reported, clinician verified (during COPD service time only, left join so
# only dates already in the daily frame are kept)
df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')

# Patient reported, new rescue med PRO (April 2021 onwards, left join). Columns shared
# with the verified events get the pandas default suffixes: _x = clinician verified,
# _y = weekly PRO.
df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')

# Hospital exacs (one year prior to OB plus time on service, outer join)
df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

# Pharmacy exacs, (one year prior to OB up to OB only, outer join)
df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')

# Hospital admissions (one year prior to OB plus time on service, outer join)
df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

# Combine cols from individual datasets into one
df['ExacDateUnknown'] = np.where(
    (df.ExacDateUnknown_x == 1) | (df.ExacDateUnknown_y == 1), 1, 0)
df['IsCommExac'] = np.where(
    (df.IsCommExac_x == 1) | (df.IsCommExac_y == 1) | (df.IsRescueMedExac == 1), 1, 0)

# Column for whether an exacerbation of any kind occurred on each date. To be filtered
# using (PRO) LOGIC
df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)

# Resample the df to one day per patient starting from the earliest record (may be a
# pre-onboarding exac. Complete daily records required for calculating
# DaysSinceLastExac)
df = df.set_index('DateOfEvent').groupby('StudyId').resample('D').asfreq().drop(
    'StudyId', axis=1).reset_index()

# Infill binary cols with zero where applicable (rows created by the resample are NaN)
binary_cols = ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac',
               'ExacDateUnknown', 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']
df[binary_cols] = df[binary_cols].fillna(0)

# Infill some columns by StudyId to populate entire df
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='FirstSubmissionDate')
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='LatestPredictionDate')
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')

# Retain only dates before the end of each patient's data window. Copy so that the
# in-place .loc assignment further down acts on an independent frame.
df = df[df.DateOfEvent <= df.LatestPredictionDate].copy()

print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
print('Exacerbations pre-onboarding to COPD service: {}'.format(
    len(df[(df.IsExac == 1) & (df.DateOfEvent < df.FirstSubmissionDate)])))
print('Exacerbations post-onboarding to COPD service: {}'.format(
    len(df[(df.IsExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate)])))
print('Number of unique exacerbation patients: {}'.format(
    len(df[df.IsExac == 1].PatientId.unique())))
print('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, '
      '{} prescription dates overlapping with hospital events)'
      .format(len(df[df.IsRescueMedExac == 1]),
              len(df[df.IsRescueMedExac == 1].StudyId.unique()),
              len(df[(df.IsRescueMedExac == 1) & (df.IsHospExac == 1)])))
print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
      .format(len(df[(df.IsHospExac == 1)
                     & (df.DateOfEvent < df.FirstSubmissionDate)]),
              len(df[(df.IsHospExac == 1)
                     & (df.DateOfEvent < df.FirstSubmissionDate)].StudyId.unique())))
print('Hospital exacerbations post-OB: {} ({} unique patients)'
      .format(len(df[(df.IsHospExac == 1)
                     & (df.DateOfEvent >= df.FirstSubmissionDate)]),
              len(df[(df.IsHospExac == 1)
                     & (df.DateOfEvent >= df.FirstSubmissionDate)].StudyId.unique())))
print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
      .format(len(df[df.IsCommExac_x == 1]),
              len(df[df.IsCommExac_x == 1].StudyId.unique())))
print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
      .format(len(df[df.IsCommExac_y == 1]),
              len(df[df.IsCommExac_y == 1].StudyId.unique())))
print('Number of patient reported exacerbations with unknown dates: {} ({} overlapping'
      ' with hospital events)'.format(
          df.ExacDateUnknown.sum(),
          len(df[(df.IsHospExac == 1) & (df.ExacDateUnknown == 1)])))

# Check for any patient reported events with unknown dates that occurred on the same day
# as a hospital event. Hospital events are trusted so set the date to known
df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), 'ExacDateUnknown'] = 0
print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))

# The per-source columns have been combined above and are no longer needed
df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
                      'ExacDateUnknown_y'])

############################################################################
# Implement PRO LOGIC on hospital and patient reported exacerbation events
############################################################################

# Define min and max days for PRO LOGIC. No predictions made or data used within
# logic_min_days after an exacerbation. Events falling between logic_min_days and
# logic_max_days after an event are subject to the weekly rescue med LOGIC criterion
logic_min_days = 14
logic_max_days = 35

# Calculate the days since last rescue med prescription
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsRescueMedExac',
        output_col='DaysSinceLastRescueMeds')).reset_index(drop=True)

rescue_med_min_days = 7
# The `> -1` test skips rows with a negative value.
# NOTE(review): presumably -1 marks "no previous event" - confirm in the copd helper.
print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
      .format(rescue_med_min_days,
              len(df[(df.DaysSinceLastRescueMeds > -1)
                     & (df.DaysSinceLastRescueMeds <= rescue_med_min_days)
                     & (df.IsRescueMedExac == 1)])))

# Reset IsExac to 0 for rescue med prescriptions within 7 days of a previous
# prescription
df.loc[(df.DaysSinceLastRescueMeds > -1)
       & (df.DaysSinceLastRescueMeds <= rescue_med_min_days)
       & (df.IsRescueMedExac == 1), 'IsExac'] = 0

# Calculate the days since the previous exacerbation for all patient days. Now includes
# events before patient onboarding
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsExac',
        output_col='DaysSinceLastExac')).reset_index(drop=True)

pre_onboarding_min_days = 14
print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
      .format(pre_onboarding_min_days,
              len(df[(df.IsExac == 1)
                     & (df.DaysSinceLastExac > -1)
                     & (df.DaysSinceLastExac <= pre_onboarding_min_days)
                     & (df.DateOfEvent < df.FirstSubmissionDate)])))

# Set IsExac to 0 for any pre-OB exacs within 14 days of a previous exac
df.loc[(df.DaysSinceLastExac > -1)
       & (df.DaysSinceLastExac <= pre_onboarding_min_days)
       & (df.DateOfEvent < df.FirstSubmissionDate), 'IsExac'] = 0

# Recalculate DaysSinceLastExac to avoid counting exacs removed above
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsExac',
        output_col='DaysSinceLastExac')).reset_index(drop=True)

# Apply exclusion period following all exacerbations
df['RemoveRow'] = copd.minimum_period_between_exacerbations(
    df, minimum_days=logic_min_days)

# Don't apply this criterion to pre-OB events (already accounted for above)
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0

print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
    logic_min_days,
    len(df[(df.IsExac == 1) & (df.RemoveRow == 1)
           & (df.DateOfEvent >= df.FirstSubmissionDate)])))

# Apply criterion for negative weekly Q5 responses - doesn't capture anything post Q5
# change. The RemoveExac column used below is expected to be added by this call.
consecutive_replies = 2
df = copd.apply_logic_response_criterion(
    df, minimum_period=logic_min_days, maximum_period=logic_max_days,
    N=consecutive_replies)
print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} '
      'days after a previous event. {} consecutive negative replies required for the '
      'event to count as a new event'.format(
          logic_min_days, logic_max_days, consecutive_replies))

# Don't apply this criterion to pre-OB events (already accounted for above)
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0

print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
    df.RemoveExac.sum()))
print('Earliest and latest exacerbations excluded: {}, {}'.format(
    df[df.RemoveExac == 1].DateOfEvent.min(),
    df[df.RemoveExac == 1].DateOfEvent.max()))
print('Remaining post-OB exacerbations: {}'.format(
    len(df[(df.IsExac == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)
           & (df.DateOfEvent >= df.FirstSubmissionDate)])))
print('Remaining exacerbations with unknown dates: {}'.format(
    len(df[(df.ExacDateUnknown == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)])))

# Remove data between segments of prolonged events, count only first occurrence
df = copd.remove_data_between_exacerbations(df)

# Remove 7 days before each reported exacerbation with unknown date (meds in last week)
df = copd.remove_unknown_date_exacerbations(df)

# New df with unwanted rows removed for events breakdown. Don't drop rows before setting
# the prediction window in case of events that occur immediately after the exclusion
# period (prediction window is set on df index rather than dates so want full daily df)
df_counts = df[(df.RemoveRow != 1)
               & (df.DateOfEvent >= df.FirstSubmissionDate)].copy()

print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))
exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
    len(exac_patients),
    exac_patients.str.startswith('RC').sum(),
    exac_patients.str.startswith('SU').sum()))
print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
      .format(df_counts.IsHospExac.sum(),
              df_counts.IsCommExac.sum(),
              len(df_counts.loc[(df_counts.IsCommExac == 1)
                                & (df_counts.IsHospExac == 1)])))

#################################################################
# Set the prediction window to N days and remove unwanted rows
# Calculate rolling exac counts before removing pre-OB events
#################################################################

# Create column of exacerbations to use for rolling counts
df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)

# Calculate rolling 365 day sums of exacerbations and hospital admissions
df = copd.rolling_sum_previous_period(
    df=df, date_col='DateOfEvent', col='ExacsToKeep', id_col='StudyId', window=365,
    output_col='ExacsPrevYear')
df = copd.rolling_sum_previous_period(
    df=df, date_col='DateOfEvent', col='IsHospAdmission', id_col='StudyId', window=365,
    output_col='AdmissionsPrevYear')

# Filter for data in the training data window (first submission date onwards)
df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]

print('Setting {} day prediction window'.format(N))
df = copd.set_prediction_window(df=df, prediction_window=N)

# Use .get so the summary still prints (as zero) when no rows have IsExac == 1,
# instead of raising KeyError
exac_day_count = df.IsExac.value_counts().get(1, 0)
exac_day_fraction = df.IsExac.value_counts(normalize=True).get(1, 0.0)
print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
    exac_day_count, len(df), 100 * exac_day_fraction))

################
# Save data
################
df = df[['PatientId', 'StudyId', 'DateOfBirth', 'Sex', 'DateOfEvent', 'IsExac',
         'DaysSinceLastExac', 'FirstSubmissionDate', 'LatestPredictionDate',
         'ExacsPrevYear', 'AdmissionsPrevYear']]
df.to_pickle(os.path.join(data_dir, 'exac_data.pkl'))
patient_details.to_pickle(os.path.join(data_dir, 'patient_details.pkl'))