# copd-model-e / training / src / modelling / event_calculations.py
# IamGrooooot's picture
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
"""
Find information on COPD, respiratory, rescue med and death event tracking
for patients within a timeframe
"""
import json
import pandas as pd
import numpy as np
# Columns selected from the merged dataset
merged_cols = [
    'adm_per_year',
    'copd_resp_per_year',
    'anxiety_depression_per_year',
    'rescue_meds_per_year',
    'anxiety_depression_presc_per_year',
]

# Output names for the boolean event table, in the same order as merged_cols
base_cols = [
    'admission_any',
    'admission_copd_resp',
    'admission_anxiety_depression',
    'presc_rescue_med',
    'presc_anxiety_depression',
]

# Output names for the count table: base_cols with an 'n_' prefix
n_cols = [f'n_{name}' for name in base_cols]

# Columns kept from the admissions and prescriptions data
adm_cols = [
    'SafeHavenID',
    'ADMDATE',
    'admission_any',
    'admission_copd_resp',
]
presc_cols = [
    'SafeHavenID',
    'PRESC_DATE',
    'rescue_meds',
]
def read_deaths(extract_data_path):
    """
    Load the deaths extract, de-duplicated, with DOD parsed as datetime
    --------
    :param extract_data_path: path to data extracts; the file name is
        appended directly, so the path must end with a separator
    :return: dataframe with columns SafeHavenID and DOD
    """
    deaths = pd.read_csv(
        extract_data_path + 'Deaths_Cohort3R.csv',
        usecols=['SafeHavenID', 'DOD'])
    # Duplicates are judged on the two loaded columns only
    deaths = deaths.drop_duplicates()
    deaths['DOD'] = pd.to_datetime(deaths['DOD'])
    return deaths
def filter_data(df, date_col, eoy_date, start_date, end_date, typ):
    """
    Keep only the rows of a dataframe that belong to the requested period
    --------
    :param df: dataframe
    :param date_col: str name of date column (not used when typ is 'merged')
    :param eoy_date: end of year date, matched exactly against the 'eoy'
        column when typ is 'merged'
    :param start_date: validation data start date (inclusive)
    :param end_date: validation data end date (exclusive)
    :param typ: type of data: 'adm', 'presc', 'merged', 'deaths'
    :return: filtered dataframe
    """
    # Merged data is selected by its 'eoy' value rather than a date window
    if typ == 'merged':
        return df[df['eoy'] == eoy_date]

    event_dates = df[date_col]
    in_window = (event_dates >= start_date) & (event_dates < end_date)
    return df[in_window]
def calc_time_to_event(df, date_col, start_date, new_col):
    """
    Calculate time in months from start_date to each patient's first event
    --------
    :param df: dataframe with a SafeHavenID column and a datetime date_col
    :param date_col: str name of date column
    :param start_date: validation data start date
    :param new_col: new column name (prefixed with 'time_to_' in the output)
    :return: dataframe indexed by SafeHavenID with a single column
        'time_to_<new_col>' holding months to the earliest event
    """
    # The string 'min' selects the groupby min directly; passing the Python
    # builtin as the aggregator is deprecated in recent pandas
    first_event = df.groupby('SafeHavenID').agg(next_event=(date_col, 'min'))

    # pandas >= 2.0 rejects the ambiguous np.timedelta64(1, 'M') unit.
    # Use the average month length that unit stood for
    # (365.2425 days / 12 = 2,629,746 seconds) so values are unchanged.
    one_month = pd.Timedelta(seconds=2629746)

    months_to_event = (first_event - start_date) / one_month
    months_to_event.columns = ['time_to_' + new_col]
    return months_to_event
def bucket_time_to_event(df):
    """
    Convert months-to-event values into the categories
    1, 3, 6, 12, 12+ months.
    --------
    :param df: dataframe of times to event in months
    :return: dataframe with event times in categories
    """
    bin_edges = [-1, 1, 3, 6, 12, 13]
    bin_names = ['1', '3', '6', '12', '12+']

    def to_bucket(months):
        # Right-closed bins: (-1, 1] -> '1', (1, 3] -> '3', and so on
        return pd.cut(months, bin_edges, labels=bin_names)

    # Values pd.cut leaves unbinned (missing, or outside the edges)
    # are placed in the '12+' category
    return df.apply(to_bucket).fillna('12+')
def calculate_event_metrics(data_path, eoy_date, start_date, end_date):
    """
    Generate tables with number of events in 12 months and
    boolean for events

    Reads 'merged.pkl' from data_path, keeps the rows whose 'eoy' equals
    eoy_date, and writes two pickles back to data_path:
    'metric_table_counts.pkl' (merged_cols renamed to n_cols) and
    'metric_table_events.pkl' (merged_cols turned into 0/1 flags and
    renamed to base_cols).
    --------
    :param data_path: path to generated data; file names are appended
        directly, so the path must end with a separator
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    """
    # Load in data
    merged = pd.read_pickle(data_path + 'merged.pkl')

    # Select relevant dates and columns
    merged = filter_data(
        merged, 'eoy', eoy_date, start_date, end_date, 'merged')
    # Explicit copy: this frame is assigned into below, and writing to a
    # slice of the filtered data raises SettingWithCopyWarning
    df_event = merged[['SafeHavenID'] + merged_cols].copy()

    # Create frame with total events within 12mo period
    df_count = df_event.copy()
    df_count.columns = ['SafeHavenID'] + n_cols
    df_count.to_pickle(data_path + 'metric_table_counts.pkl')

    # Create frame with boolean events within 12mo period
    df_event[merged_cols] = (df_event[merged_cols] > 0).astype(int)
    df_event.columns = ['SafeHavenID'] + base_cols
    df_event.to_pickle(data_path + 'metric_table_events.pkl')
def calculate_next_event(data_path, extract_data_path, eoy_date,
                         start_date, end_date):
    """
    Generate table with the time in 1, 3, 6, 12, 12+ months

    For admissions (any, and COPD/respiratory), rescue medication
    prescriptions and deaths, finds each patient's first event inside
    the validation window, buckets the months to that event, and writes
    the result to 'metric_table_next.pkl' in data_path. Only patients
    with at least one event of some kind appear in the table; event
    types a patient did not have are bucketed as '12+'.
    --------
    :param data_path: path to generated data; file names are appended
        directly, so the path must end with a separator
    :param extract_data_path: path to data extracts
    :param eoy_date: end of year date
    :param start_date: validation data start date
    :param end_date: validation data end date
    """
    # Find next adm events
    adm = pd.read_pickle(data_path + 'validation_adm_proc.pkl')
    adm = filter_data(
        adm, 'ADMDATE', eoy_date, start_date, end_date, 'adm')
    # assign() returns a new frame, so the filtered slice is never written
    # to in place (which raises SettingWithCopyWarning)
    adm = adm.assign(
        admission_any=1,
        admission_copd_resp=adm.copd_event | adm.resp_event)
    adm = adm[adm_cols]
    time_to_adm_any = calc_time_to_event(
        adm, 'ADMDATE', start_date, 'admission_any')
    time_to_adm_copd = calc_time_to_event(
        adm[adm.admission_copd_resp == 1], 'ADMDATE', start_date,
        'admission_copd_resp')

    # Find next presc events
    presc = pd.read_pickle(data_path + 'validation_presc_proc.pkl')
    presc = filter_data(
        presc, 'PRESC_DATE', eoy_date, start_date, end_date, 'presc')
    presc = presc[presc_cols]
    presc = presc[presc.rescue_meds == 1]
    time_to_rescue = calc_time_to_event(
        presc, 'PRESC_DATE', start_date, 'presc_rescue_med')

    # Find next deaths. No marker column is added: calc_time_to_event only
    # reads SafeHavenID and the date column.
    deaths = read_deaths(extract_data_path)
    deaths = filter_data(
        deaths, 'DOD', eoy_date, start_date, end_date, 'deaths')
    time_to_death = calc_time_to_event(
        deaths, 'DOD', start_date, 'death')

    # Merge results on SafeHavenID, keeping patients from every table
    frames = [time_to_adm_any, time_to_adm_copd, time_to_rescue,
              time_to_death]
    results = pd.concat(frames, join='outer', axis=1)
    results = bucket_time_to_event(results)
    results.to_pickle(data_path + 'metric_table_next.pkl')
def main():
    """
    Read the config file and generate the event metric tables for the
    period following the configured date
    """
    # Load in config items
    with open('../../../config.json') as config_file:
        settings = json.load(config_file)

    data_path = settings['model_data_path']
    extract_data_path = settings['extract_data_path']

    # Window runs from the day after eoy_date to one calendar year after it
    # NOTE(review): filter_data compares with `< end_date`, so events dated
    # exactly eoy_date + 1 year are left out - confirm this is intended
    eoy_date = pd.to_datetime(settings['date'])
    start_date = eoy_date + pd.Timedelta(days=1)
    end_date = eoy_date + pd.offsets.DateOffset(years=1)

    calculate_event_metrics(data_path, eoy_date, start_date, end_date)
    calculate_next_event(
        data_path, extract_data_path, eoy_date, start_date, end_date)


# NOTE(review): this call is unguarded, so the pipeline also runs whenever
# the module is imported - consider an `if __name__ == "__main__":` guard
main()