File size: 24,553 Bytes

e69d4e4

"""Collate all hospital, clinician verified and patient reported events and apply LOGIC."""
import copd
import numpy as np
import os
import pandas as pd

data_dir = '<YOUR_DATA_PATH>/copd-dataset'

############################################################################
# Define model cohort and training data windows
############################################################################

# Load only the patient detail fields that are needed to build the cohort
patient_details = pd.read_csv(
    os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
    delimiter="|",
    usecols=['PatientId', 'FirstSubmissionDate', 'MostRecentSubmissionDate',
             'DateOfBirth', 'Sex', 'StudyId'])

# Study ids eligible for modelling (patients with up to date events in service).
# Original RECEIVER cohort: RC01 to RC84, leaving out RC34 which must be excluded
receiver_patients = ["RC{:02d}".format(num) for num in range(1, 85) if num != 34]
# Scale up cohort (subset): SU01 to SU218 plus SU287
scaleup_patients = ["SU{:02d}".format(num) for num in range(1, 219)] + ['SU287']

# List of all valid patients for modelling
valid_patients = receiver_patients + scaleup_patients

# Keep a row only if its StudyId is in the cohort once spaces are ignored (e.g. RC 26
# and RC 52) and both the data start and end dates are present
_in_cohort = patient_details.StudyId.str.replace(' ', '').isin(valid_patients)
_has_window = (patient_details.FirstSubmissionDate.notna() &
               patient_details.MostRecentSubmissionDate.notna())
patient_details = patient_details[_in_cohort & _has_window]

# Work on an explicit copy so the column assignments below never target a filtered view
patient_details = patient_details.copy()

# Create a column stating the latest date permitted based on events added to service data
patient_details['LatestPredictionDate'] = '2022-02-28'

# Parse every date column to midnight UTC. The conversion is done one column at a time
# (not row by row): each column is parsed in a single vectorised call, and values from
# different columns are never mixed in one to_datetime call, in case the submission
# timestamps are formatted differently from the date-only cut-off above.
date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda col: pd.to_datetime(col, utc=True).dt.normalize())

# Choose the earlier date out of the patient's last submission and the latest COPD data
# events
patient_details['LatestPredictionDate'] = patient_details[
    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

# Add N days to start of data window because predictions are made N days in advance
# N=3 for the 72 hr exac model
N = 3
patient_details['EarliestPredictionDate'] = (
    patient_details['FirstSubmissionDate'] + pd.DateOffset(days=N))

# Remove any patients for whom the prediction start date overlaps the final submission
# date, i.e. they have too short a window of data
patient_details = patient_details[patient_details['EarliestPredictionDate'] <
                                  patient_details['LatestPredictionDate']]
# Ids of the patients that remain in the model cohort
model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

# Report the cohort size, split by study id prefix
_n_receiver = len(patient_details[patient_details['StudyId'].str.startswith('RC')])
_n_scaleup = len(patient_details[patient_details['StudyId'].str.startswith('SU')])
print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
    len(model_patients), _n_receiver, _n_scaleup))

# Per-patient columns that are carried onto every daily row
df = patient_details[['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
                      'FirstSubmissionDate', 'LatestPredictionDate']].copy()

# Build one row per patient per day. Each patient's range starts N days before their
# first submission and runs to their latest prediction date (not to their most recent
# submission date).
_lead_in = pd.DateOffset(days=N)
df["DateOfEvent"] = df.apply(
    lambda row: pd.date_range(row.FirstSubmissionDate - _lead_in,
                              row.LatestPredictionDate, freq='D'),
    axis=1)
df = df.explode('DateOfEvent').reset_index(drop=True)

############################################################################
# Extract hospital exacerbations and admissions from COPD service data
# Includes 1 year pre-onboarding plus time on Lenus COPD service
############################################################################

# Service events (exacerbations among other event types), restricted to the model
# cohort. Events outside the patients' data windows are still present at this point.
patient_events = pd.read_csv(
    os.path.join(data_dir, 'PatientEvents.txt'),
    usecols=['PatientId', 'DateOfEvent', 'EventTypeId'], delimiter="|")
patient_events = patient_events[patient_events.PatientId.isin(model_patients)]

# Event type lookup table, with its columns renamed to line up with the events file
patient_event_types = pd.read_csv(
    os.path.join(data_dir, 'PatientEventTypes.txt'),
    usecols=['Id', 'Name'], delimiter="|").rename(
        columns={'Id': 'EventTypeId', 'Name': 'EventName'})

# Attach the event name to every event
patient_events = patient_events.merge(patient_event_types, on='EventTypeId')

# Flag hospital exacerbations (community events excluded) and hospital admissions
# (all causes) from the event names
patient_events['IsHospExac'] = copd.define_service_exac_event(
    events=patient_events.EventName, include_community=False)
patient_events['IsHospAdmission'] = copd.define_hospital_admission(
    patient_events.EventName)

# Pull each flagged event type out into its own frame
_event_keys = ['PatientId', 'DateOfEvent']
admissions = patient_events.loc[patient_events.IsHospAdmission == 1,
                                _event_keys + ['IsHospAdmission']]
hosp_exacs = patient_events.loc[patient_events.IsHospExac == 1,
                                _event_keys + ['IsHospExac']]

# Normalise event timestamps to midnight UTC, then collapse repeated rows
admissions = admissions.assign(
    DateOfEvent=pd.to_datetime(admissions.DateOfEvent, utc=True).dt.normalize()
).drop_duplicates()
hosp_exacs = hosp_exacs.assign(
    DateOfEvent=pd.to_datetime(hosp_exacs.DateOfEvent, utc=True).dt.normalize()
).drop_duplicates()

# Save hospital exacerbations and admissions data
hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))

##########################################################################################
# Extract all rescue meds for model cohort in the year prior to onboarding. These will be
# used as a proxy for community exacerbations pre-OB (not captured in service data)
##########################################################################################

# StudyId <-> SafeHavenID mapping, restricted to the model cohort
id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

# BNF item codes that count as rescue medication
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AAACAC',
                 '0603020T0AABKBK', '0603020T0AABNBN', '0603020T0AAAGAG',
                 '0603020T0AABHBH']

antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
                    '0501013K0AAAJAJ']
rescue_med_bnf_codes = steroid_codes + antibiotic_codes

# Pharmacy records: keep model cohort patients and rescue med prescriptions only
pharmacy = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA',
                                    'Pharmacy_Cohort4.csv'))
_cohort_rows = pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)
_rescue_rows = pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)
pharmacy = pharmacy[_cohort_rows & _rescue_rows]

# Data window dates for every cohort patient, keyed by SafeHavenID through the mapping
cohort_dates = id_mapping.merge(
    patient_details[['PatientId', 'StudyId', 'FirstSubmissionDate',
                     'LatestPredictionDate']],
    on='StudyId')

# Join prescriptions to the window dates, discard the columns that are not needed and
# use the prescription date as the event date
pharmacy_exacs = (
    cohort_dates.merge(pharmacy, on='SafeHavenID')
    .drop(columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description',
                   'DISP_DATE', 'SafeHavenID'])
    .rename(columns={'PRESC_DATE': 'DateOfEvent'})
)
pharmacy_exacs['DateOfEvent'] = pd.to_datetime(pharmacy_exacs['DateOfEvent'],
                                               utc=True).dt.normalize()
# Drop duplicates
pharmacy_exacs = pharmacy_exacs.drop_duplicates()

# Keep prescriptions dated within the year leading up to (but not including) onboarding
_onboarded = pharmacy_exacs.FirstSubmissionDate
_year_before = _onboarded - pd.DateOffset(years=1)
pharmacy_exacs = pharmacy_exacs[(pharmacy_exacs.DateOfEvent >= _year_before) &
                                (pharmacy_exacs.DateOfEvent < _onboarded)]

# Flag the rows as rescue med exacerbations and drop the window dates
pharmacy_exacs = pharmacy_exacs.assign(IsRescueMedExac=1).drop(
    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

# Save "pharmacy exacerbations" data
pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))
######################################################
# Extract patient reported exacerbation events
######################################################
########################
# Data post Q5 change
#######################

# Date of the Q5 change, passed as the cut-off when filtering the symptom diary
Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)

# Patient reported events are read from the symptom diary file rather than taken from
# patient_events: patient_events holds the dates on which patients answered PROs, not
# the dates they reported as having taken their rescue meds
_diary_cols = ['PatientId', 'StudyId', 'Score', 'SubmissionTime',
               'SymptomDiaryQ5', 'SymptomDiaryQ11a', 'SymptomDiaryQ11b']
symptom_diary = pd.read_csv(os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
                            delimiter="|", usecols=_diary_cols)
symptom_diary = copd.filter_symptom_diary(df=symptom_diary, date_cutoff=Q5ChangeDate,
                                          patients=model_patients)

# Weekly rescue med responses with their exacerbation dates set, trimmed to the columns
# used when merging onto the daily records
weekly_pros = copd.set_pro_exac_dates(copd.get_rescue_med_pro_responses(symptom_diary))
weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
                           'DateOfEvent', 'ExacDateUnknown']]

#########################
# Pre Q5 change events
#########################

# RECEIVER cohort - community events verified up to 16/03/21
receiver = pd.read_excel('./LenusEvents/breakdown_of_com_exac_160321.xlsx').rename(
    columns={'Study number': 'StudyId', 'Exacerbation recorded': 'DateRecorded'})
receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

# Scale up cohort - community events verified up to 17/05/2021
scaleup = pd.read_excel('./LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx').rename(
    columns={'Study Number': 'StudyId', 'Date Exacerbation recorded': 'DateRecorded'})
# Carry each study id forward over the rows beneath it that have no id of their own
scaleup['StudyId'] = scaleup['StudyId'].ffill()
scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

# Combine RECEIVER and scale up events into one df
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])

####################################################################################
# Merge hospital, patient reported and rescue med exacs with daily patient records
#
# Exacerbations occurring in Lenus service period include verified clinician events
# pre-April 2021 (after onboarding) and community exacerbations recorded in weekly
# PROs post-April 2021. Hospital exacs occur in year prior to OB and on Lenus service.
# Rescue med exacs are only used for the year prior to OB.
# Need to ensure each record has both StudyId and PatientId to prevent losing events
#
# The two patient reported sources are LEFT joined, so they only annotate daily rows
# that already exist. Hospital exacs, pharmacy exacs and admissions are OUTER joined, so
# events dated outside the daily range add new rows. Those new rows carry only the
# merge keys, which is why the missing id column is filled after each outer join.
######################################################################################

# Patient reported, clinician verified (during COPD service time only, left join)
# NOTE(review): this joins on StudyId, and earlier cohort filtering strips spaces from
# StudyId only for the comparison - confirm ids such as "RC 26" match verified_exacs
df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')

# Patient reported, new rescue med PRO (April 2021 onwards, left join)
# Columns present in both frames get the default _x (existing df, i.e. clinician
# verified) and _y (weekly PRO) suffixes; they are combined further down
df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')

# Hospital exacs (one year prior to OB plus time on service, outer join)
df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
# presumably fills StudyId on the newly added rows from the patient's other rows -
# confirm against copd.fill_column_by_patient
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

# Pharmacy exacs, (one year prior to OB up to OB only, outer join)
# pharmacy_exacs is keyed by StudyId, so PatientId is the column filled afterwards
df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')

# Hospital admissions (one year prior to OB plus time on service, outer join)
# NOTE(review): the original comment here called these "respiratory" admissions while
# the comment where `admissions` is built says "all causes" - confirm which is correct
df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

# Combine cols from individual datasets into one
# A date is flagged unknown if either patient reported source flags it
df['ExacDateUnknown'] = np.where((df.ExacDateUnknown_x == 1) |
                                 (df.ExacDateUnknown_y == 1), 1, 0)
# Community exac = clinician verified, weekly PRO or rescue med prescription
df['IsCommExac'] = np.where((df.IsCommExac_x == 1) |
                            (df.IsCommExac_y == 1) | (df.IsRescueMedExac == 1), 1, 0)

# Column for whether an exacerbation of any kind occurred on each date. To be filtered
# using (PRO) LOGIC
df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)

# Re-index every patient to one row per calendar day, starting from their earliest
# record (which may be a pre-onboarding exac). Complete daily records are required for
# calculating DaysSinceLastExac.
df = (
    df.set_index('DateOfEvent')
    .groupby('StudyId')
    .resample('D')
    .asfreq()
    .drop(columns='StudyId')
    .reset_index()
)

# Binary flags are zero on any day that has no value
_binary_cols = ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac',
                'ExacDateUnknown', 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']
df[_binary_cols] = df[_binary_cols].fillna(0)

# Infill the per-patient columns by StudyId so that they are populated on every row
for _col in ['FirstSubmissionDate', 'LatestPredictionDate', 'PatientId']:
    df = copd.fill_column_by_patient(df=df, id_col='StudyId', col=_col)

# Retain only dates before the end of each patient's data window
df = df[df.DateOfEvent <= df.LatestPredictionDate]

# Boolean masks shared by the summary counts below
_is_exac = df.IsExac == 1
_is_hosp = df.IsHospExac == 1
_is_rescue = df.IsRescueMedExac == 1
_is_verified = df.IsCommExac_x == 1
_is_weekly_pro = df.IsCommExac_y == 1
_pre_ob = df.DateOfEvent < df.FirstSubmissionDate
_post_ob = df.DateOfEvent >= df.FirstSubmissionDate

# Overall exacerbation counts
print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
print('Exacerbations pre-onboarding to COPD service: {}'.format(
    len(df[_is_exac & _pre_ob])))
print('Exacerbations post-onboarding to COPD service: {}'.format(
    len(df[_is_exac & _post_ob])))
print('Number of unique exacerbation patients: {}'.format(
    len(df[_is_exac].PatientId.unique())))

# Counts per event source
print(('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, '
       '{} prescription dates overlapping with hospital events)').format(
    len(df[_is_rescue]),
    len(df[_is_rescue].StudyId.unique()),
    len(df[_is_rescue & _is_hosp])))
print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
      .format(len(df[_is_hosp & _pre_ob]),
              len(df[_is_hosp & _pre_ob].StudyId.unique())))
print('Hospital exacerbations post-OB: {} ({} unique patients)'
      .format(len(df[_is_hosp & _post_ob]),
              len(df[_is_hosp & _post_ob].StudyId.unique())))
print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
      .format(len(df[_is_verified]),
              len(df[_is_verified].StudyId.unique())))
print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
      .format(len(df[_is_weekly_pro]),
              len(df[_is_weekly_pro].StudyId.unique())))

print(('Number of patient reported exacerbations with unknown dates: {} ({} overlapping'
       ' with hospital events)').format(
    df.ExacDateUnknown.sum(),
    len(df[_is_hosp & (df.ExacDateUnknown == 1)])))

# A community event that shares its day with a hospital event takes the hospital date,
# which is trusted, so its date is no longer treated as unknown
df.loc[(df.IsCommExac == 1) & _is_hosp, 'ExacDateUnknown'] = 0
print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))

# The per-source columns have been combined and reported, so they can be dropped
df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
                      'ExacDateUnknown_y'])

############################################################################
# Implement PRO LOGIC on hospital and patient reported exacerbation events
#
# Steps, in order (each step depends on the columns written by the previous ones):
#   1. De-duplicate rescue med prescriptions issued within 7 days of a previous one
#   2. De-duplicate pre-onboarding exacs within 14 days of a previous exac
#   3. Flag rows in the exclusion period after each exac (RemoveRow, post-OB only)
#   4. Apply the weekly Q5 response criterion (RemoveExac, post-OB only)
#   5. Remove data within prolonged events and before unknown-date events
#   6. Print the breakdown of the events that remain
############################################################################

# Define min and max days for PRO LOGIC. No predictions made or data used within
# logic_min_days after an exacerbation. Events falling between logic_min_days and
# logic_max_days after an event are subject to the weekly rescue med LOGIC criterion
logic_min_days = 14
logic_max_days = 35

# Calculate the days since last rescue med prescription
# (computed separately for every StudyId; the index is rebuilt afterwards)
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsRescueMedExac',
        output_col='DaysSinceLastRescueMeds')).reset_index(drop=True)

rescue_med_min_days = 7
# The "> -1" test presumably skips rows with no earlier prescription (-1 looks like the
# helper's "no previous event" marker) - confirm against copd
print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
      .format(rescue_med_min_days,
              len(df[(df.DaysSinceLastRescueMeds > -1) &
                     (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
                     (df.IsRescueMedExac == 1)])))

# Reset IsExac to 0 for rescue med prescriptions within 7 days of a previous prescription
# NOTE(review): IsExac is cleared even if a hospital exac falls on the same day -
# confirm this is intended
df.loc[(df.DaysSinceLastRescueMeds > -1) &
       (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
       (df.IsRescueMedExac == 1), 'IsExac'] = 0

# Calculate the days since the previous exacerbation for all patient days. Now includes
# events before patient onboarding
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)

pre_onboarding_min_days = 14
print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
      .format(pre_onboarding_min_days,
              len(df[(df.IsExac == 1) &
                     (df.DaysSinceLastExac > -1) &
                     (df.DaysSinceLastExac <= pre_onboarding_min_days) &
                     (df.DateOfEvent < df.FirstSubmissionDate)])))

# Set IsExac to 0 for any pre-OB exacs within 14 days of a previous exac
# (no IsExac == 1 test is needed here: other matching rows are already 0)
df.loc[(df.DaysSinceLastExac > -1) & (df.DaysSinceLastExac <= pre_onboarding_min_days) &
       (df.DateOfEvent < df.FirstSubmissionDate), 'IsExac'] = 0

# Recalculate DaysSinceLastExac to avoid counting exacs removed above
df = df.groupby('StudyId').apply(
    lambda x: copd.calculate_days_since_last_event(
        df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)

# Apply exclusion period following all exacerbations
df['RemoveRow'] = copd.minimum_period_between_exacerbations(
    df, minimum_days=logic_min_days)
# Don't apply this criterion to pre-OB events (already accounted for above)
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0

print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
    logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1) &
                        (df.DateOfEvent >= df.FirstSubmissionDate)])))

# Apply criterion for negative weekly Q5 responses - doesn't capture anything post Q5
# change
# The RemoveExac column used below is presumably added by this helper - confirm
consecutive_replies = 2
df = copd.apply_logic_response_criterion(df,
                                         minimum_period=logic_min_days,
                                         maximum_period=logic_max_days,
                                         N=consecutive_replies)

print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} \
days after a previous event. {} consecutive negative replies required for the event to \
count as a new event'.format(logic_min_days, logic_max_days, consecutive_replies))
# Don't apply this criterion to pre-OB events (already accounted for above)
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0

print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
    df.RemoveExac.sum()))
print('Earliest and latest exacerbations excluded: {}, {}'.format(
    df[df.RemoveExac == 1].DateOfEvent.min(), df[df.RemoveExac == 1].DateOfEvent.max()))

# Post-OB exacs that survive both the exclusion period and the Q5 criterion
print('Remaining post-OB exacerbations: {}'.format(
    len(df[(df.IsExac == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1) &
           (df.DateOfEvent >= df.FirstSubmissionDate)])))

print('Remaining exacerbations with unknown dates: {}'.format(
    len(df[(df.ExacDateUnknown == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)])))

# Remove data between segments of prolonged events, count only first occurrence
df = copd.remove_data_between_exacerbations(df)

# Remove 7 days before each reported exacerbation within unknown date (meds in last week)
df = copd.remove_unknown_date_exacerbations(df)

# New df with unwanted rows removed for events breakdown. Don't drop rows before setting
# the prediction window in case of events that occur immediately after the exclusion
# period (prediction window is set on df index rather than dates so want full daily df)
df_counts = df[(df.RemoveRow != 1) & (df.DateOfEvent >= df.FirstSubmissionDate)].copy()

# NOTE(review): rows with RemoveExac == 1 are not filtered out of df_counts; the counts
# below are only final if the copd helpers above already cleared IsExac on them - confirm
print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))
exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
    len(exac_patients), exac_patients.str.startswith('RC').sum(),
    exac_patients.str.startswith('SU').sum()))
print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
      .format(df_counts.IsHospExac.sum(), df_counts.IsCommExac.sum(),
              len(df_counts.loc[
                  (df_counts.IsCommExac == 1) & (df_counts.IsHospExac == 1)])))

#################################################################
# Set the prediction window to N days and remove unwanted rows
# Calculate rolling exac counts before removing pre-OB events
#################################################################
# Create column of exacerbations to use for rolling counts: IsExac on rows that were
# not flagged by either PRO LOGIC criterion, zero elsewhere
df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)

# Calculate rolling 365 day sums of exacerbations and hospital admissions. This is done
# while the pre-onboarding rows are still present so that they contribute to the counts.
df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent', col='ExacsToKeep',
                                      id_col='StudyId', window=365,
                                      output_col='ExacsPrevYear')
df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent',
                                      col='IsHospAdmission', id_col='StudyId', window=365,
                                      output_col='AdmissionsPrevYear')

# Filter for data in the training data window (first submission date onwards)
# NOTE(review): an earlier comment in this script says rows should not be dropped before
# the prediction window is set, yet RemoveRow rows are dropped here first - confirm
# against copd.set_prediction_window
df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]

print('Setting {} day prediction window'.format(N))
df = copd.set_prediction_window(df=df, prediction_window=N)

# Summarise the class balance. Counting explicitly (instead of indexing value_counts()
# with the label 1) keeps the script running when no exacerbation days remain. As with
# value_counts(normalize=True), the percentage is taken over rows with a non-null IsExac.
n_exac_days = int((df.IsExac == 1).sum())
n_labelled_days = int(df.IsExac.notna().sum())
exac_day_fraction = n_exac_days / n_labelled_days if n_labelled_days else 0.0
print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
    n_exac_days, len(df), 100 * exac_day_fraction))

################
# Save data
################
# Keep only the columns written to the modelling data set
df = df[['PatientId', 'StudyId', 'DateOfBirth', 'Sex',
         'DateOfEvent', 'IsExac', 'DaysSinceLastExac', 'FirstSubmissionDate',
         'LatestPredictionDate', 'ExacsPrevYear', 'AdmissionsPrevYear']]

df.to_pickle(os.path.join(data_dir, 'exac_data.pkl'))
patient_details.to_pickle(os.path.join(data_dir, 'patient_details.pkl'))