"""
Find information on COPD, respiratory, rescue med and death event tracking
for patients within a timeframe
"""
import json

import numpy as np
import pandas as pd

# Per-year count columns read from merged.pkl
merged_cols = ['adm_per_year', 'copd_resp_per_year',
               'anxiety_depression_per_year', 'rescue_meds_per_year',
               'anxiety_depression_presc_per_year']
# Output names for the event columns (same order as merged_cols)
base_cols = ['admission_any', 'admission_copd_resp',
             'admission_anxiety_depression', 'presc_rescue_med',
             'presc_anxiety_depression']
# Output names for the count columns (same order as merged_cols)
n_cols = ["n_" + col for col in base_cols]
adm_cols = ['SafeHavenID', 'ADMDATE', 'admission_any', 'admission_copd_resp']
presc_cols = ['SafeHavenID', 'PRESC_DATE', 'rescue_meds']

# Fixed month length used to express durations in months: the mean Gregorian
# month (365.2425 days / 12 = 2,629,746 seconds). A calendar month is not a
# fixed duration, so pandas no longer accepts np.timedelta64(1, 'M') as a
# divisor; this explicit value is intended to match what older pandas
# releases assumed for that unit.
_ONE_MONTH = pd.Timedelta(seconds=2629746)


def read_deaths(extract_data_path):
    """
    Read in deaths dataset
    --------
    :param extract_data_path: path to data extracts (used as a string prefix,
        so it must end with a path separator)
    :return: dataframe with de-duplicated SafeHavenID and DOD (datetime)
    """
    filename = extract_data_path + 'Deaths_Cohort3R.csv'
    cols = ['SafeHavenID', 'DOD']
    df = pd.read_csv(filename, usecols=cols).drop_duplicates()
    df['DOD'] = pd.to_datetime(df.DOD)

    return df


def filter_data(df, date_col, eoy_date, start_date, end_date, typ):
    """
    Filter data to only include events occurring within given date range
    --------
    :param df: dataframe
    :param date_col: str name of date column
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    :param typ: type of data: 'adm', 'presc', 'merged', 'deaths'
    :return: filtered dataframe
    """
    if typ == 'merged':
        # Merged data is selected by its 'eoy' column only; date_col and the
        # start/end dates are not used for this type
        df = df[df.eoy == eoy_date]
    else:
        # Half-open window: start_date is included, end_date is excluded
        # NOTE(review): events dated exactly end_date are dropped - confirm
        # this is the intended window
        df = df[(df[date_col] >= start_date) & (df[date_col] < end_date)]

    return df


def calc_time_to_event(df, date_col, start_date, new_col):
    """
    Calculate time to next event
    --------
    :param df: dataframe
    :param date_col: str name of date column
    :param start_date: validation data start date
    :param new_col: new column name
    :return: dataframe indexed by SafeHavenID with a single column
        'time_to_<new_col>' holding the time from start_date to the earliest
        event, in months (fractional)
    """
    # Use the string 'min' rather than the builtin: passing the builtin to
    # agg is deprecated in recent pandas
    df_next = df.groupby('SafeHavenID').agg(next_event=(date_col, 'min'))
    df_next = (df_next - start_date) / _ONE_MONTH
    df_next.columns = ['time_to_' + new_col]

    return df_next


def bucket_time_to_event(df):
    """
    Calculate time in months to next event and bucket into
    1, 3, 6, 12, 12+ months.
    --------
    :param df: dataframe of times to event in months
    :return: dataframe with event times in categories
    """
    # Right-closed bins: (-1, 1], (1, 3], (3, 6], (6, 12], (12, 13]
    month = [-1, 1, 3, 6, 12, 13]
    label = ['1', '3', '6', '12', '12+']
    df = df.apply(lambda x: pd.cut(x, month, labels=label))
    # Missing values (no event) and values outside the bins fall into '12+'
    df = df.fillna('12+')

    return df


def calculate_event_metrics(data_path, eoy_date, start_date, end_date):
    """
    Generate tables with number of events in 12 months and boolean for events
    --------
    :param data_path: path to generated data (used as a string prefix, so it
        must end with a path separator)
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    :return: None; writes metric_table_counts.pkl and metric_table_events.pkl
        to data_path
    """
    # Load in data
    merged = pd.read_pickle(data_path + 'merged.pkl')

    # Select relevant dates and columns
    merged = filter_data(
        merged, 'eoy', eoy_date, start_date, end_date, 'merged')
    # Explicit copy so the assignments below never write into a view of
    # 'merged'
    df_event = merged[['SafeHavenID'] + merged_cols].copy()

    # Create frame with total events within 12mo period
    df_count = df_event.copy()
    df_count.columns = ['SafeHavenID'] + n_cols
    df_count.to_pickle(data_path + 'metric_table_counts.pkl')

    # Create frame with boolean events within 12mo period
    df_event[merged_cols] = (df_event[merged_cols] > 0).astype(int)
    df_event.columns = ['SafeHavenID'] + base_cols
    df_event.to_pickle(data_path + 'metric_table_events.pkl')


def calculate_next_event(data_path, extract_data_path, eoy_date,
                         start_date, end_date):
    """
    Generate table with the time in 1, 3, 6, 12, 12+ months
    --------
    :param data_path: path to generated data (used as a string prefix, so it
        must end with a path separator)
    :param extract_data_path: path to data extracts
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    :return: None; writes metric_table_next.pkl to data_path
    """
    # Find next adm events
    adm = pd.read_pickle(data_path + 'validation_adm_proc.pkl')
    adm = filter_data(
        adm, 'ADMDATE', eoy_date, start_date, end_date, 'adm')
    # assign() returns a new frame, so the filtered slice is never mutated
    adm = adm.assign(
        admission_any=1,
        admission_copd_resp=adm.copd_event | adm.resp_event,
    )
    adm = adm[adm_cols]
    time_to_adm_any = calc_time_to_event(
        adm, 'ADMDATE', start_date, 'admission_any')
    time_to_adm_copd = calc_time_to_event(
        adm[adm.admission_copd_resp == 1], 'ADMDATE', start_date,
        'admission_copd_resp')

    # Find next presc events
    presc = pd.read_pickle(data_path + 'validation_presc_proc.pkl')
    presc = filter_data(
        presc, 'PRESC_DATE', eoy_date, start_date, end_date, 'presc')
    presc = presc[presc_cols]
    presc = presc[presc.rescue_meds == 1]
    time_to_rescue = calc_time_to_event(
        presc, 'PRESC_DATE', start_date, 'presc_rescue_med')

    # Find next deaths
    deaths = read_deaths(extract_data_path)
    deaths = filter_data(
        deaths, 'DOD', eoy_date, start_date, end_date, 'deaths')
    deaths = deaths.assign(death=1)
    time_to_death = calc_time_to_event(
        deaths, 'DOD', start_date, 'death')

    # Merge results; the outer join keeps every SafeHavenID that has at least
    # one event type, leaving NaN where a patient has no event of a type
    frames = [time_to_adm_any, time_to_adm_copd, time_to_rescue,
              time_to_death]
    results = pd.concat(frames, join='outer', axis=1)
    results = bucket_time_to_event(results)
    results.to_pickle(data_path + 'metric_table_next.pkl')


def main():
    """
    Read the config file and generate the event metric tables
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']
    extract_data_path = config['extract_data_path']
    eoy_date = pd.to_datetime(config['date'])
    # Validation window starts the day after eoy_date and ends one year
    # after eoy_date
    start_date = eoy_date + pd.Timedelta(days=1)
    end_date = eoy_date + pd.offsets.DateOffset(years=1)

    calculate_event_metrics(data_path, eoy_date, start_date, end_date)
    calculate_next_event(data_path, extract_data_path, eoy_date,
                         start_date, end_date)


# Only run the pipeline when executed as a script, so the functions above can
# be imported without reading the config or touching any data files
if __name__ == "__main__":
    main()