# Import libraries
import functools as ft

import numpy as np
import pandas as pd

# Set file paths
file_path = '/'
input_file_path = file_path + 'data_for_model_e_columns/'


def read_data(file):
    """
    Read in data source
    --------
    :param file: string filename (anything accepted by pandas.read_csv)
    :return: dataframe
    """
    return pd.read_csv(file)


def format_data(exacerbations_data, admissions_data, onboard, IDs,
                study_end_date='2021-08-31'):
    """
    Remove unnecessary columns from dataframes, merge onboarding, admissions,
    and exacerbations dataframes, convert datetime columns to datetime format,
    filter to include only the study IDs listed in ``IDs``, and create a new
    column ``DOD`` (date of death).

    ``DOD`` is the (date part of the) ``censor`` value whenever ``censor``
    differs from ``study_end_date``; rows censored exactly on
    ``study_end_date`` get NaT.
    --------
    :param exacerbations_data: dataframe containing exacerbations data
        (needs columns 'Study_ID' and 'first_exacerbation')
    :param admissions_data: dataframe containing admissions data
        (needs columns 'Study_ID' and 'admitted_1')
    :param onboard: dataframe containing onboarding dates
        (needs columns 'Study_ID', 'OB_date' and 'censor')
    :param IDs: dataframe containing RECEIVER and scale up 1 study IDs
        (needs column 'Study_ID')
    :param study_end_date: censor date that marks the end of the study rather
        than a death; defaults to the original hard-coded '2021-08-31'
    :return: formatted dataframe
    """
    # Keep only the columns needed downstream
    admissions_data = admissions_data[['Study_ID', 'admitted_1']]
    exacerbations_data = exacerbations_data[['Study_ID', 'first_exacerbation']]

    # Outer-merge all sources on Study_ID so no record is lost at this stage
    dfs = [onboard, exacerbations_data, admissions_data]
    df_combined = ft.reduce(
        lambda left, right: pd.merge(left, right, on='Study_ID', how="outer"),
        dfs,
    )

    # Left merge onto the ID list restricts the result to the listed IDs
    data = pd.merge(IDs, df_combined, on="Study_ID", how="left")

    for column in ('first_exacerbation', 'admitted_1', 'OB_date', 'censor'):
        data[column] = pd.to_datetime(data[column])

    # Any censor date other than the study end date is taken as a date of death
    study_end = pd.Timestamp(study_end_date)
    conditions_DOD = [data['censor'] != study_end]
    values_DOD = [data['censor'].dt.date]
    data['DOD'] = np.select(conditions_DOD, values_DOD, default=None)
    data['DOD'] = pd.to_datetime(data['DOD'])
    return data


def time_to_events(data, output_file=None):
    """
    Calculate time to first event (exacerbation, admission, or death) and
    first admission or death for each study ID and save the summary dataframe.

    The new columns are added to ``data`` in place.
    --------
    :param data: dataframe containing admissions data, exacerbations data, and
        onboarding dates (columns 'admitted_1', 'first_exacerbation', 'DOD'
        and 'OB_date', all datetime)
    :param output_file: path or file-like object passed to DataFrame.to_csv;
        defaults to ``file_path + 'Time_to_first_event.csv'``
    :return: dataframe with additional columns showing number of days until
        first event and number of days to first admission/ death
    """
    if output_file is None:
        output_file = file_path + 'Time_to_first_event.csv'

    # Row-wise minimum skips missing dates, giving the earliest known event
    data['first_event'] = data[
        ["admitted_1", "first_exacerbation", "DOD"]].min(axis=1)
    data['first_event'] = pd.to_datetime(data['first_event'])
    data['first_admission_or_death'] = data[["admitted_1", "DOD"]].min(axis=1)
    data['first_admission_or_death'] = pd.to_datetime(
        data['first_admission_or_death'])

    # Express both events as whole days since onboarding
    data['days_to_first_event'] = (
        data['first_event'] - data['OB_date']).dt.days
    data['days_to_first_admission_death'] = (
        data['first_admission_or_death'] - data['OB_date']).dt.days

    data.to_csv(output_file)
    return data


def main():
    """
    Read the input CSV files, build the combined dataframe and write the
    time-to-first-event summary.
    """
    # Read data
    exacerbations_file = input_file_path + "First_exacerbation_data.csv"
    admissions_data_file = input_file_path + "admissions_data_up_to_31082021.csv"
    RC_SU1_IDs_data_file = input_file_path + "RC_SU1_IDs.csv"
    onboard_file = input_file_path + "onboarding_dates.csv"

    exacerbations_data = read_data(exacerbations_file)
    admissions_data = read_data(admissions_data_file)
    RC_SU1_IDs = read_data(RC_SU1_IDs_data_file)
    onboard = read_data(onboard_file)

    # Format data
    RC_combined_data = format_data(
        exacerbations_data, admissions_data, onboard, RC_SU1_IDs)

    # Calculate time to first event for each study ID and save the summary
    # dataframe
    time_to_events(RC_combined_data)


# Only run the pipeline when executed as a script, not when imported
if __name__ == "__main__":
    main()