"""Collate all hospital, clinician verified and patient reported events and apply LOGIC."""
import os

import numpy as np
import pandas as pd

import copd

# Root folder containing the pipe-delimited COPD dataset extracts.
data_dir = '<YOUR_DATA_PATH>/copd-dataset'

# Patient details: read only the columns needed to define the model cohort.
patient_details_path = os.path.join(data_dir, 'CopdDatasetPatientDetails.txt')
patient_details = pd.read_csv(
    patient_details_path,
    delimiter="|",
    usecols=['PatientId', 'FirstSubmissionDate', 'MostRecentSubmissionDate',
             'DateOfBirth', 'Sex', 'StudyId'])

# RECEIVER study IDs RC01..RC84, with RC34 left out of the cohort.
# NOTE(review): the reason for excluding RC34 is not recorded here - confirm.
receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85) if i != 34]

# Scale-up study IDs SU01..SU218 plus the out-of-sequence ID SU287.
scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)] + ['SU287']

# Every study ID that is eligible for the model cohort.
valid_patients = receiver_patients + scaleup_patients

# Keep patients whose study ID (ignoring spaces) is in the valid cohort and who
# have both a first and a most recent submission date.
# NOTE(review): the space-stripped ID is only used for this filter; the stored
# StudyId keeps any embedded spaces although it is used as a merge key later -
# confirm that is intended.
in_cohort = patient_details.StudyId.str.replace(' ', '').isin(valid_patients)
has_submissions = (patient_details.FirstSubmissionDate.notna() &
                   patient_details.MostRecentSubmissionDate.notna())
# .copy() so the column assignments below write to an independent frame instead
# of a slice of the raw table (avoids SettingWithCopyWarning).
patient_details = patient_details[in_cohort & has_submissions].copy()

# Global cut-off after which no predictions are made.
patient_details['LatestPredictionDate'] = '2022-02-28'

# Parse each date column as a whole and truncate to midnight UTC. Converting
# column by column (rather than row by row) is faster and means each
# to_datetime call only sees one column's string format.
date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda col: pd.to_datetime(col, utc=True).dt.normalize())

# The latest prediction date per patient is the earlier of their most recent
# submission and the global cut-off.
patient_details['LatestPredictionDate'] = patient_details[
    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

# N (days) offsets the earliest prediction date from the first submission. It
# is reused further down for the start of the daily grid and as the prediction
# window length.
N = 3
patient_details['EarliestPredictionDate'] = (
    patient_details['FirstSubmissionDate'] + pd.DateOffset(days=N))

# Drop patients without a usable prediction period (earliest must precede latest).
patient_details = patient_details[
    patient_details['EarliestPredictionDate'] < patient_details['LatestPredictionDate']]

# Identifiers of the final model cohort.
model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
    len(model_patients),
    patient_details['StudyId'].str.startswith('RC').sum(),
    patient_details['StudyId'].str.startswith('SU').sum()))

# Working frame that is expanded to one row per patient per day below.
df = patient_details[['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
                      'FirstSubmissionDate', 'LatestPredictionDate']].copy()

def _daily_event_dates(row):
    """Return daily dates from N days before first submission to the latest prediction date."""
    start = row.FirstSubmissionDate - pd.DateOffset(days=N)
    return pd.date_range(start, row.LatestPredictionDate, freq='D')


# Expand to one row per patient per calendar day.
df["DateOfEvent"] = df.apply(_daily_event_dates, axis=1)
df = df.explode('DateOfEvent').reset_index(drop=True)

# Service-recorded patient events, restricted to the model cohort.
patient_events = pd.read_csv(os.path.join(data_dir, 'PatientEvents.txt'),
                             delimiter="|",
                             usecols=['PatientId', 'DateOfEvent', 'EventTypeId'])
patient_events = patient_events[patient_events.PatientId.isin(model_patients)]

# Attach the event type name to every event.
patient_event_types = pd.read_csv(os.path.join(data_dir, 'PatientEventTypes.txt'),
                                  delimiter="|", usecols=['Id', 'Name'])
patient_event_types = patient_event_types.rename(columns={'Id': 'EventTypeId',
                                                          'Name': 'EventName'})
patient_events = patient_events.merge(patient_event_types, on='EventTypeId')

# Flag events from their names using the project helpers.
# NOTE(review): include_community=False presumably leaves community-managed
# events out of the exacerbation flag - confirm against the copd module.
patient_events['IsHospExac'] = copd.define_service_exac_event(
    events=patient_events.EventName, include_community=False)
patient_events['IsHospAdmission'] = copd.define_hospital_admission(
    patient_events.EventName)

# .copy() gives independent frames, so the date conversion below is not a
# chained assignment on a slice of patient_events.
admissions = patient_events.loc[
    patient_events.IsHospAdmission == 1,
    ['PatientId', 'DateOfEvent', 'IsHospAdmission']].copy()
hosp_exacs = patient_events.loc[
    patient_events.IsHospExac == 1,
    ['PatientId', 'DateOfEvent', 'IsHospExac']].copy()

# Truncate event timestamps to midnight UTC.
admissions['DateOfEvent'] = pd.to_datetime(admissions.DateOfEvent,
                                           utc=True).dt.normalize()
hosp_exacs['DateOfEvent'] = pd.to_datetime(hosp_exacs.DateOfEvent,
                                           utc=True).dt.normalize()

# At most one row per patient per day.
hosp_exacs = hosp_exacs.drop_duplicates()
admissions = admissions.drop_duplicates()

# Persist the intermediate tables.
hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))

# SafeHavenID <-> StudyId mapping, limited to the model cohort.
# NOTE(review): read_pickle can execute arbitrary code from the file - only
# load pickles from a trusted source. The path is relative to the working
# directory.
id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

# Pharmacy records for the cohort.
pharmacy = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA',
                                    'Pharmacy_Cohort4.csv'))
pharmacy = pharmacy[pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)]

# BNF item codes treated as rescue medication.
# NOTE(review): the original steroid list repeated four of these codes; the
# duplicates are removed here (membership is unchanged) - confirm no distinct
# codes were intended in their place.
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AABNBN']
antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
                    '0501013K0AAAJAJ']
rescue_med_bnf_codes = steroid_codes + antibiotic_codes
pharmacy = pharmacy[pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)]

# Per-patient study dates keyed by SafeHavenID.
cohort_dates = id_mapping.merge(
    patient_details[['PatientId', 'StudyId', 'FirstSubmissionDate',
                     'LatestPredictionDate']],
    on='StudyId')

# Prescriptions joined to the study dates; the prescription date becomes the
# event date, truncated to midnight UTC.
pharmacy_exacs = cohort_dates.merge(pharmacy, on='SafeHavenID').drop(
    columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description',
             'DISP_DATE', 'SafeHavenID'])
pharmacy_exacs = pharmacy_exacs.rename(columns={'PRESC_DATE': 'DateOfEvent'})
pharmacy_exacs['DateOfEvent'] = pd.to_datetime(pharmacy_exacs['DateOfEvent'],
                                               utc=True).dt.normalize()
pharmacy_exacs = pharmacy_exacs.drop_duplicates()

# Keep prescriptions from the year before the first submission (first
# submission day itself excluded). .copy() so the flag assignment below does
# not write to a slice.
in_prior_year = (
    (pharmacy_exacs.DateOfEvent < pharmacy_exacs.FirstSubmissionDate) &
    (pharmacy_exacs.DateOfEvent >= pharmacy_exacs.FirstSubmissionDate -
     pd.DateOffset(years=1)))
pharmacy_exacs = pharmacy_exacs[in_prior_year].copy()

pharmacy_exacs['IsRescueMedExac'] = 1
pharmacy_exacs = pharmacy_exacs.drop(
    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

# Persist the intermediate table.
pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))

# Symptom diary responses (patient reported outcomes).
symptom_diary_cols = ['PatientId', 'StudyId', 'Score', 'SubmissionTime',
                      'SymptomDiaryQ5', 'SymptomDiaryQ11a', 'SymptomDiaryQ11b']
symptom_diary = pd.read_csv(
    os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
    delimiter="|",
    usecols=symptom_diary_cols)

# Filter the diary with the project helper, passing the Q5 change date as the
# cut-off and the model cohort as the patient list.
Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)
symptom_diary = copd.filter_symptom_diary(
    df=symptom_diary, date_cutoff=Q5ChangeDate, patients=model_patients)

# Derive rescue-med responses and their exacerbation dates, then keep only the
# columns merged into the daily frame.
weekly_pros = copd.set_pro_exac_dates(
    copd.get_rescue_med_pro_responses(symptom_diary))
weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
                           'DateOfEvent', 'ExacDateUnknown']]

# RECEIVER cohort: clinician-reviewed community exacerbations.
receiver = pd.read_excel(
    './LenusEvents/breakdown_of_com_exac_160321.xlsx'
).rename(columns={'Study number': 'StudyId',
                  'Exacerbation recorded': 'DateRecorded'})
receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

# Scale-up cohort: same extraction after forward-filling blank StudyId cells.
scaleup = pd.read_excel(
    './LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx'
).rename(columns={'Study Number': 'StudyId',
                  'Date Exacerbation recorded': 'DateRecorded'})
scaleup['StudyId'] = scaleup['StudyId'].ffill()
scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

# Both cohorts stacked into one table.
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])

# Clinician verified events, then weekly PROs, joined onto the daily grid. Both
# tables carry IsCommExac / ExacDateUnknown, so after the second merge the
# clinician columns get the "_x" suffix and the PRO columns the "_y" suffix.
df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')
df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')

# Hospital exacerbations, rescue-med prescriptions and admissions are merged
# with how='outer' so events dated outside the daily grid are kept. After each
# merge the identifier missing on the newly added rows is filled in.
df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')

df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')

# Combine the clinician (_x) and PRO (_y) flags; rescue-med prescriptions also
# count as community exacerbations.
df['ExacDateUnknown'] = np.where((df.ExacDateUnknown_x == 1) |
                                 (df.ExacDateUnknown_y == 1), 1, 0)
df['IsCommExac'] = np.where((df.IsCommExac_x == 1) |
                            (df.IsCommExac_y == 1) |
                            (df.IsRescueMedExac == 1), 1, 0)

# An exacerbation is any community or hospital exacerbation.
df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)

# Re-index every patient onto a continuous daily calendar. errors='ignore'
# covers pandas versions where the grouping column is already excluded from
# the resampled result, so there is nothing left to drop.
df = (df.set_index('DateOfEvent')
        .groupby('StudyId')
        .resample('D')
        .asfreq()
        .drop(columns='StudyId', errors='ignore')
        .reset_index())

# Days inserted by the resample carry no events: set every flag to 0.
exac_flag_cols = ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac',
                  'ExacDateUnknown', 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']
df[exac_flag_cols] = df[exac_flag_cols].fillna(0)

# Fill the per-patient columns on the rows added by the merges and the resample.
for fill_col in ('FirstSubmissionDate', 'LatestPredictionDate', 'PatientId'):
    df = copd.fill_column_by_patient(df=df, id_col='StudyId', col=fill_col)

# Discard anything after the patient's latest prediction date.
df = df[df.DateOfEvent <= df.LatestPredictionDate]

# Summary of the collated events before the de-duplication rules are applied.
is_exac = df.IsExac == 1
is_hosp_exac = df.IsHospExac == 1
is_rescue_med = df.IsRescueMedExac == 1
pre_onboarding = df.DateOfEvent < df.FirstSubmissionDate
post_onboarding = df.DateOfEvent >= df.FirstSubmissionDate


def _rows_and_patients(mask):
    """Return (number of rows, number of distinct StudyId values) of df selected by mask."""
    selected = df[mask]
    return len(selected), len(selected.StudyId.unique())


print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
print('Exacerbations pre-onboarding to COPD service: {}'.format(
    len(df[is_exac & pre_onboarding])))
print('Exacerbations post-onboarding to COPD service: {}'.format(
    len(df[is_exac & post_onboarding])))
print('Number of unique exacerbation patients: {}'.format(
    len(df[is_exac].PatientId.unique())))

# Multi-line messages use adjacent string literals rather than a backslash
# inside the literal, so no continuation-line whitespace leaks into the output
# and no separating space is lost.
n_rescue, n_rescue_patients = _rows_and_patients(is_rescue_med)
print('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, '
      '{} prescription dates overlapping with hospital events)'
      .format(n_rescue, n_rescue_patients, len(df[is_rescue_med & is_hosp_exac])))
print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
      .format(*_rows_and_patients(is_hosp_exac & pre_onboarding)))
print('Hospital exacerbations post-OB: {} ({} unique patients)'
      .format(*_rows_and_patients(is_hosp_exac & post_onboarding)))
# IsCommExac_x holds the clinician verified flag, IsCommExac_y the weekly PRO flag.
print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
      .format(*_rows_and_patients(df.IsCommExac_x == 1)))
print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
      .format(*_rows_and_patients(df.IsCommExac_y == 1)))

print('Number of patient reported exacerbations with unknown dates: {} ({} overlapping '
      'with hospital events)'.format(
          df.ExacDateUnknown.sum(),
          len(df[is_hosp_exac & (df.ExacDateUnknown == 1)])))

# When a community exacerbation coincides with a hospital exacerbation, clear
# the unknown-date flag for that day.
df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), 'ExacDateUnknown'] = 0
print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))

# The per-source columns are no longer needed once the combined flags exist.
df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
                      'ExacDateUnknown_y'])

# Window (days after a previous event) used by the LOGIC criteria: events up
# to logic_min_days after a previous exacerbation go through the minimum-period
# rule; the weekly response criterion uses both bounds.
logic_min_days = 14
logic_max_days = 35


def _days_since_last_event(frame, event_col, output_col):
    """Run copd.calculate_days_since_last_event on each StudyId group of frame.

    Returns the per-patient results stacked back into one frame with a fresh
    0..n-1 index.
    """
    return frame.groupby('StudyId').apply(
        lambda group: copd.calculate_days_since_last_event(
            df=group, event_col=event_col, output_col=output_col)
    ).reset_index(drop=True)


# Rescue-med prescriptions issued shortly after a previous prescription are not
# counted as separate exacerbations.
# NOTE(review): values <= -1 are excluded from the masks below; presumably the
# helper uses -1 for "no previous event" - confirm against the copd module.
df = _days_since_last_event(df, 'IsRescueMedExac', 'DaysSinceLastRescueMeds')

rescue_med_min_days = 7
repeat_prescription = ((df.DaysSinceLastRescueMeds > -1) &
                       (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
                       (df.IsRescueMedExac == 1))
print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
      .format(rescue_med_min_days, len(df[repeat_prescription])))
# NOTE(review): this clears IsExac even when the same day also has a hospital
# exacerbation - confirm that is intended.
df.loc[repeat_prescription, 'IsExac'] = 0

# Pre-onboarding exacerbations that follow a previous exacerbation too closely
# are removed as well.
df = _days_since_last_event(df, 'IsExac', 'DaysSinceLastExac')

pre_onboarding_min_days = 14
too_soon_pre_onboarding = ((df.DaysSinceLastExac > -1) &
                           (df.DaysSinceLastExac <= pre_onboarding_min_days) &
                           (df.DateOfEvent < df.FirstSubmissionDate))
print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
      .format(pre_onboarding_min_days,
              len(df[(df.IsExac == 1) & too_soon_pre_onboarding])))
df.loc[too_soon_pre_onboarding, 'IsExac'] = 0

# Recompute the gaps now that some exacerbations have been removed.
df = _days_since_last_event(df, 'IsExac', 'DaysSinceLastExac')

# Minimum-period criterion from the project helper; it is only applied from the
# first submission date onwards, so earlier rows are never marked for removal.
df['RemoveRow'] = copd.minimum_period_between_exacerbations(
    df, minimum_days=logic_min_days)
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0

print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
    logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1) &
                           (df.DateOfEvent >= df.FirstSubmissionDate)])))

# Weekly response (Q5) criterion, applied by the project helper to events in
# the logic_min_days..logic_max_days window after a previous event.
# NOTE(review): the helper is expected to add the RemoveExac column used below
# - confirm against the copd module.
consecutive_replies = 2
df = copd.apply_logic_response_criterion(df,
                                         minimum_period=logic_min_days,
                                         maximum_period=logic_max_days,
                                         N=consecutive_replies)

# Adjacent string literals (not a backslash inside the literal) keep
# continuation-line whitespace out of the printed message.
print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} '
      'days after a previous event. {} consecutive negative replies required for the '
      'event to count as a new event'.format(
          logic_min_days, logic_max_days, consecutive_replies))

# The criterion only applies from the first submission date onwards.
df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0

excluded_by_q5 = df[df.RemoveExac == 1]
print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
    df.RemoveExac.sum()))
print('Earliest and latest exacerbations excluded: {}, {}'.format(
    excluded_by_q5.DateOfEvent.min(), excluded_by_q5.DateOfEvent.max()))

# Rows that survive both exclusion criteria.
not_excluded = (df.RemoveRow != 1) & (df.RemoveExac != 1)
print('Remaining post-OB exacerbations: {}'.format(
    len(df[(df.IsExac == 1) & not_excluded &
           (df.DateOfEvent >= df.FirstSubmissionDate)])))
print('Remaining exacerbations with unknown dates: {}'.format(
    len(df[(df.ExacDateUnknown == 1) & not_excluded])))

# Apply the two remaining project helpers: one for data between exacerbations,
# one for exacerbations whose date is unknown.
df = copd.remove_data_between_exacerbations(df)
df = copd.remove_unknown_date_exacerbations(df)

# Counting view: rows not marked for removal, from the first submission date on.
countable = (df.RemoveRow != 1) & (df.DateOfEvent >= df.FirstSubmissionDate)
df_counts = df[countable].copy()

print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))

exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
n_receiver = exac_patients.str.startswith('RC').sum()
n_scaleup = exac_patients.str.startswith('SU').sum()
print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
    len(exac_patients), n_receiver, n_scaleup))

both_sources = (df_counts.IsCommExac == 1) & (df_counts.IsHospExac == 1)
print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
      .format(df_counts.IsHospExac.sum(), df_counts.IsCommExac.sum(),
              len(df_counts.loc[both_sources])))

# Exacerbations that survived both exclusion criteria; everything else is 0.
df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)

# History features over a 365-row/day window per StudyId, computed by the
# project helper for kept exacerbations and for hospital admissions.
df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent', col='ExacsToKeep',
                                      id_col='StudyId', window=365,
                                      output_col='ExacsPrevYear')
df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent',
                                      col='IsHospAdmission', id_col='StudyId',
                                      window=365, output_col='AdmissionsPrevYear')

# Keep only rows from the first submission date on that are not marked for removal.
df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]

print('Setting {} day prediction window'.format(N))
df = copd.set_prediction_window(df=df, prediction_window=N)

# Count positives explicitly: value_counts()[1] raises KeyError when no row has
# IsExac == 1. The percentage is taken over non-missing labels, as
# value_counts(normalize=True) does.
exac_days = int((df.IsExac == 1).sum())
labelled_days = int(df.IsExac.notna().sum())
exac_pct = 100 * exac_days / labelled_days if labelled_days else 0.0
print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
    exac_days, len(df), exac_pct))

# Final column selection for the saved data set.
df = df[['PatientId', 'StudyId', 'DateOfBirth', 'Sex',
         'DateOfEvent', 'IsExac', 'DaysSinceLastExac', 'FirstSubmissionDate',
         'LatestPredictionDate', 'ExacsPrevYear', 'AdmissionsPrevYear']]

df.to_pickle(os.path.join(data_dir, 'exac_data.pkl'))
patient_details.to_pickle(os.path.join(data_dir, 'patient_details.pkl'))