# File size: 3,655 Bytes | 53a6def
# Import libraries
import functools as ft
import numpy as np
import pandas as pd
# Locations used by the script: a base folder, and the sub-folder inside it
# from which the input CSV files are read.
file_path = '<YOUR_DATA_PATH>/'
input_file_path = f"{file_path}data_for_model_e_columns/"
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename (anything accepted by pandas.read_csv)
    :return: dataframe holding the parsed contents
    """
    return pd.read_csv(file)
def format_data(exacerbations_data, admissions_data, onboard, IDs,
                study_end_date='2021-08-31'):
    """
    Remove unnecessary columns from dataframes,
    merge onboarding, admissions, and exacerbations dataframes,
    convert datetime columns to datetime format,
    filter to include only the study IDs listed in IDs,
    and create new column showing date of death for those who died during the study
    --------
    :param exacerbations_data: dataframe containing exacerbations data
        (columns Study_ID and first_exacerbation are used)
    :param admissions_data: dataframe containing admissions data
        (columns Study_ID and admitted_1 are used)
    :param onboard: dataframe containing onboarding dates; the merged result
        must contain OB_date and censor columns (presumably supplied here)
    :param IDs: dataframe containing RECEIVER and scale up 1 study IDs
    :param study_end_date: censor date that marks "still in the study at the
        end"; any other censor date is copied into DOD. Defaults to the
        original hard-coded value '2021-08-31'
    :return: formatted dataframe with one row per row of IDs
    """
    # Keep only the columns needed downstream
    admissions_data = admissions_data[['Study_ID', 'admitted_1']]
    exacerbations_data = exacerbations_data[['Study_ID', 'first_exacerbation']]

    # Outer-merge so that IDs missing from one source are not lost before
    # the final left merge restricts the rows to the requested study IDs
    df_combined = onboard
    for frame in (exacerbations_data, admissions_data):
        df_combined = pd.merge(df_combined, frame, on='Study_ID', how='outer')
    data = pd.merge(IDs, df_combined, on='Study_ID', how='left')

    for column in ('first_exacerbation', 'admitted_1', 'OB_date', 'censor'):
        data[column] = pd.to_datetime(data[column])

    # A censor date other than the study end date is taken as the date of
    # death; rows censored at study end (or with no censor date) get NaT.
    # normalize() drops any time-of-day, as the original .dt.date did.
    censored_early = data['censor'] != pd.Timestamp(study_end_date)
    data['DOD'] = data['censor'].dt.normalize().where(censored_early)
    return data
def time_to_events(data, output_path=None):
    """
    Calculate time to first event (exacerbation, admission, or death) and first admission or death
    for each study ID and save the summary dataframe
    --------
    :param data: dataframe containing the datetime columns admitted_1,
        first_exacerbation, DOD and OB_date; it is modified in place
    :param output_path: where to write the CSV summary; when omitted the file
        'Time_to_first_event.csv' is written under the module-level file_path
    :return: the same dataframe with additional columns showing number of days until first event
        and number of days to first admission/ death
    """
    # Row-wise earliest date; missing dates are skipped, and a row with no
    # dates at all yields NaT
    data['first_event'] = pd.to_datetime(
        data[["admitted_1", "first_exacerbation", "DOD"]].min(axis=1))
    data['first_admission_or_death'] = pd.to_datetime(
        data[["admitted_1", "DOD"]].min(axis=1))

    # Elapsed whole days counted from the onboarding date
    data['days_to_first_event'] = (data['first_event'] - data['OB_date']).dt.days
    data['days_to_first_admission_death'] = (
        data['first_admission_or_death'] - data['OB_date']).dt.days

    if output_path is None:
        output_path = file_path + 'Time_to_first_event.csv'
    data.to_csv(output_path)
    return data
def main():
    """
    Run the whole pipeline: read the four input CSV files from
    input_file_path, combine and format them, then calculate the
    time-to-event columns and save the summary dataframe
    """
    # Read data
    exacerbations_file = input_file_path + "First_exacerbation_data.csv"
    admissions_data_file = input_file_path + "admissions_data_up_to_31082021.csv"
    RC_SU1_IDs_data_file = input_file_path + "RC_SU1_IDs.csv"
    onboard_file = input_file_path + "onboarding_dates.csv"

    PRO_LOGIC_data = read_data(exacerbations_file)
    admissions_data = read_data(admissions_data_file)
    RC_SU1_IDs = read_data(RC_SU1_IDs_data_file)
    Onboard = read_data(onboard_file)

    # Format data
    RC_combined_data = format_data(PRO_LOGIC_data, admissions_data, Onboard, RC_SU1_IDs)

    # Calculate time to first event for each study ID and save the summary dataframe
    time_to_events(RC_combined_data)


# Only run the pipeline when executed as a script, not when imported
if __name__ == "__main__":
    main()