| | """ |
| | Find information on COPD, respiratory, rescue med and death event tracking |
| | for patients within a timeframe |
| | """ |
| | import json |
| | import pandas as pd |
| | import numpy as np |
| |
|
| |
|
# Column vocabularies shared by the metric-generation functions below.

# Per-year event-rate columns as they appear in the merged dataset.
merged_cols = ['adm_per_year', 'copd_resp_per_year',
               'anxiety_depression_per_year',
               'rescue_meds_per_year', 'anxiety_depression_presc_per_year']
# Boolean event-flag names used in the output event table.
base_cols = ['admission_any', 'admission_copd_resp',
             'admission_anxiety_depression',
             'presc_rescue_med', 'presc_anxiety_depression']
# Count-table column names derived from the event-flag names.
n_cols = [f"n_{col}" for col in base_cols]
# Columns retained from the admissions / prescriptions extracts.
adm_cols = ['SafeHavenID', 'ADMDATE', 'admission_any', 'admission_copd_resp']
presc_cols = ['SafeHavenID', 'PRESC_DATE', 'rescue_meds']
| |
|
| |
|
def read_deaths(extract_data_path):
    """
    Load the deaths extract, one row per patient/date, with DOD parsed.
    --------
    :param extract_data_path: path to data extracts
    :return: dataframe with SafeHavenID and datetime DOD
    """
    deaths = pd.read_csv(
        extract_data_path + 'Deaths_Cohort3R.csv',
        usecols=['SafeHavenID', 'DOD'],
    ).drop_duplicates()
    deaths['DOD'] = pd.to_datetime(deaths['DOD'])
    return deaths
| |
|
| |
|
def filter_data(df, date_col, eoy_date, start_date, end_date, typ):
    """
    Restrict a dataframe to events inside the evaluation window.
    --------
    :param df: dataframe
    :param date_col: str name of date column
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    :param typ: type of data: 'adm', 'presc', 'merged', 'deaths'
    :return: filtered dataframe
    """
    if typ == 'merged':
        # Merged data is keyed by end-of-year snapshot, not by event date.
        return df[df.eoy == eoy_date]
    # Event-level data: keep rows with date in [start_date, end_date).
    in_window = (df[date_col] >= start_date) & (df[date_col] < end_date)
    return df[in_window]
| |
|
| |
|
| | def calc_time_to_event(df, date_col, start_date, new_col): |
| | """ |
| | Calculate time to next event |
| | -------- |
| | :param df: dataframe |
| | :param date_col: str name of date column |
| | :param start_date: validation data start date |
| | :param new_col: new column name |
| | :return: dataframe with SafeHavenID days to event |
| | """ |
| | df_next = df.groupby('SafeHavenID').agg(next_event=(date_col, min)) |
| | df_next = (df_next - start_date) / np.timedelta64(1, 'M') |
| | df_next.columns = ['time_to_' + new_col] |
| |
|
| | return df_next |
| |
|
| |
|
def bucket_time_to_event(df):
    """
    Bucket months-to-event values into 1, 3, 6, 12 and 12+ categories.
    --------
    :param df: dataframe of months-to-event columns
    :return: dataframe with event times as categorical labels
    """
    edges = [-1, 1, 3, 6, 12, 13]
    labels = ['1', '3', '6', '12', '12+']
    binned = df.apply(lambda col: pd.cut(col, edges, labels=labels))
    # NaN (no event, or time beyond 13 months) counts as '12+'.
    return binned.fillna('12+')
| |
|
| |
|
def calculate_event_metrics(data_path, eoy_date, start_date, end_date):
    """
    Generate tables with number of events in 12 months and
    boolean for events.

    Writes 'metric_table_counts.pkl' (per-year event counts, n_* columns)
    and 'metric_table_events.pkl' (0/1 flags per event type) to data_path.
    --------
    :param data_path: path to generated data
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    """
    merged = pd.read_pickle(data_path + 'merged.pkl')

    # Keep only rows for the requested end-of-year snapshot.
    merged = filter_data(
        merged, 'eoy', eoy_date, start_date, end_date, 'merged')
    # .copy() so the in-place assignment below writes to an independent
    # frame instead of a slice of `merged` (avoids SettingWithCopyWarning
    # and potential silent no-op writes).
    df_event = merged[['SafeHavenID'] + merged_cols].copy()

    # Raw per-year counts, renamed to n_<event>.
    df_count = df_event.copy()
    df_count.columns = ['SafeHavenID'] + n_cols
    df_count.to_pickle(data_path + 'metric_table_counts.pkl')

    # Boolean 0/1 flags: did any such event occur during the year.
    df_event[merged_cols] = (df_event[merged_cols] > 0).astype(int)
    df_event.columns = ['SafeHavenID'] + base_cols
    df_event.to_pickle(data_path + 'metric_table_events.pkl')
| |
|
| |
|
def calculate_next_event(data_path, extract_data_path, eoy_date,
                         start_date, end_date):
    """
    Generate table with the time to next event bucketed into
    1, 3, 6, 12, 12+ months, written to 'metric_table_next.pkl'.
    --------
    :param data_path: path to generated data
    :param extract_data_path: path to data extracts
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    """
    # Admissions: time to first admission of any kind and to first
    # COPD/respiratory admission.  .copy() after filtering so the column
    # assignments below do not write into a slice of the loaded frame
    # (avoids SettingWithCopyWarning).
    adm = pd.read_pickle(data_path + 'validation_adm_proc.pkl')
    adm = filter_data(
        adm, 'ADMDATE', eoy_date, start_date, end_date, 'adm').copy()
    adm['admission_any'] = 1
    adm['admission_copd_resp'] = adm.copd_event | adm.resp_event
    adm = adm[adm_cols]
    time_to_adm_any = calc_time_to_event(
        adm, 'ADMDATE', start_date, 'admission_any')
    time_to_adm_copd = calc_time_to_event(
        adm[adm.admission_copd_resp == 1], 'ADMDATE', start_date,
        'admission_copd_resp')

    # Prescriptions: time to first rescue-medication prescription.
    presc = pd.read_pickle(data_path + 'validation_presc_proc.pkl')
    presc = filter_data(
        presc, 'PRESC_DATE', eoy_date, start_date, end_date, 'presc')
    presc = presc[presc_cols]
    presc = presc[presc.rescue_meds == 1]
    time_to_rescue = calc_time_to_event(
        presc, 'PRESC_DATE', start_date, 'presc_rescue_med')

    # Deaths: time to date of death within the window.
    deaths = read_deaths(extract_data_path)
    deaths = filter_data(
        deaths, 'DOD', eoy_date, start_date, end_date, 'deaths').copy()
    deaths['death'] = 1
    time_to_death = calc_time_to_event(
        deaths, 'DOD', start_date, 'death')

    # Outer join on SafeHavenID so patients missing some event types keep
    # NaN there, which bucket_time_to_event maps to '12+'.
    frames = [time_to_adm_any, time_to_adm_copd, time_to_rescue, time_to_death]
    results = pd.concat(frames, join='outer', axis=1)
    results = bucket_time_to_event(results)
    results.to_pickle(data_path + 'metric_table_next.pkl')
| |
|
| |
|
def main():
    """Read the config, derive the validation window, build metric tables."""
    with open('../../../config.json') as cfg_file:
        cfg = json.load(cfg_file)

    data_path = cfg['model_data_path']
    extract_data_path = cfg['extract_data_path']
    eoy_date = pd.to_datetime(cfg['date'])
    # Validation window: the full year starting the day after eoy_date.
    start_date = eoy_date + pd.Timedelta(days=1)
    end_date = eoy_date + pd.offsets.DateOffset(years=1)

    calculate_event_metrics(data_path, eoy_date, start_date, end_date)
    calculate_next_event(data_path, extract_data_path, eoy_date,
                         start_date, end_date)
| |
|
| |
|
if __name__ == '__main__':
    # Guard so importing this module does not trigger the full pipeline.
    main()
| |
|