# Import libraries
import pandas as pd

# Set file paths
file_path = '/'
input_file_path = file_path + 'data_for_model_e_columns/'

# Default study censor date applied by filter_study_censor
STUDY_CENSOR_DATE = '2021-09-01'


def read_data(file):
    """
    Read in data source
    --------
    :param file: string filename
    :return: dataframe
    """
    return pd.read_csv(file)


def format_data(data, IDs, onboard):
    """
    Convert datetime columns to datetime format, filter to only include
    RECEIVER and scale up 1 IDs, and join onboarding dates to exacerbations
    data for each study ID

    NOTE: ``data`` and ``onboard`` are modified in place. ``SubmissionTime``
    and ``OB_date`` are converted to UTC datetimes and a ``yearcensor`` column
    (``OB_date`` + 365 days) is added to ``onboard``. ``main`` relies on this,
    because it passes the same ``onboard`` object on to ``get_exac_data``.
    --------
    :param data: exacerbations dataframe
    :param IDs: dataframe containing RC and SU1 study IDs
    :param onboard: dataframe containing onboarding dates
    :return: formatted dataframe with one row per exacerbation for each ID in
        ``IDs`` (IDs without exacerbations keep a single row with a missing
        ``SubmissionTime``)
    """
    data['SubmissionTime'] = pd.to_datetime(data['SubmissionTime'], utc=True)
    onboard['OB_date'] = pd.to_datetime(onboard['OB_date'], utc=True)
    onboard['yearcensor'] = onboard['OB_date'] + pd.offsets.DateOffset(days=365)

    # Left joins starting from IDs restrict the result to the listed study IDs
    data = pd.merge(IDs, data, on="Study_ID", how="left")
    return pd.merge(data, onboard, on="Study_ID", how="left")


def filter_study_censor(data, censor_date=STUDY_CENSOR_DATE):
    """
    Filter the dataframe to only contain data obtained before the study
    censor date
    --------
    :param data: dataframe with a datetime ``SubmissionTime`` column
    :param censor_date: cut-off date; rows with ``SubmissionTime`` strictly
        before it are kept. Defaults to the study censor date, 2021-09-01
    :return: dataframe containing data obtained before the study censor date
    """
    return data[data['SubmissionTime'] < censor_date]


def filter_first_year(data):
    """
    Filter a dataframe to only contain data obtained in the first year
    post-onboarding
    --------
    :param data: dataframe with ``SubmissionTime`` and ``yearcensor`` columns
    :return: dataframe containing only rows whose ``SubmissionTime`` is on or
        before ``yearcensor``
    """
    return data[data['yearcensor'] >= data['SubmissionTime']]


def get_exac_data(data, onboard, IDs, output_path=None):
    """
    Calculate the number of exacerbations to year censor and study censor and
    the length of time to first exacerbation for each study ID and save the
    resulting dataframe

    Study IDs with no exacerbations in a given window are absent from the
    grouped counts, so their count columns are missing (NaN) rather than 0.
    --------
    :param data: formatted PRO LOGIC exacerbations data (output of
        ``format_data``)
    :param onboard: Dataframe showing onboarding dates for the study
        participants; ``OB_date`` must already be a datetime column
    :param IDs: Dataframe containing all RC and SU1 study IDs
    :param output_path: where to write the CSV. Defaults to
        ``file_path + 'PRO_LOGIC_exacerbation_data.csv'``
    :return: dataframe showing exacerbation counts and the length of time to
        first exacerbation for each study ID
    """
    if output_path is None:
        output_path = file_path + 'PRO_LOGIC_exacerbation_data.csv'

    censor_data = filter_study_censor(data)
    year_censor_data = filter_first_year(data)

    # Per-ID first exacerbation date and count up to the study censor date
    censor_sum = censor_data.groupby("Study_ID")["SubmissionTime"].agg(
        first_exacerbation='min',
        exacerbation_count_to_censor='count',
    ).reset_index()
    censor_sum = pd.merge(censor_sum, onboard, on="Study_ID", how="outer")
    censor_sum["days_to_first_exacerbation"] = (
        censor_sum["first_exacerbation"] - censor_sum["OB_date"]
    ).dt.days

    # Per-ID count within the first year post-onboarding
    year_censor_sum = year_censor_data.groupby("Study_ID")["SubmissionTime"].agg(
        exacerbation_count_to_year='count',
    ).reset_index()

    summary = pd.merge(censor_sum, year_censor_sum, on="Study_ID", how="outer")
    # Left join from IDs so every study ID appears once, in the order of IDs
    summary = pd.merge(IDs, summary, on="Study_ID", how="left")

    summary.to_csv(output_path)
    return summary


def main():
    # Read data
    exacerbations_file = input_file_path + "PRO_LOGIC_exacerbations_and_dates.csv"
    RC_SU1_IDs_data_file = input_file_path + "RC_SU1_IDs.csv"
    onboard_file = input_file_path + "onboarding_dates.csv"

    PRO_LOGIC_data = read_data(exacerbations_file)
    RC_SU1_IDs = read_data(RC_SU1_IDs_data_file)
    Onboard = read_data(onboard_file)

    # Format data (also converts Onboard's dates in place)
    PRO_LOGIC_data = format_data(PRO_LOGIC_data, RC_SU1_IDs, Onboard)

    # Calculate and save summary exacerbation data to year and study censor
    # dates for each ID
    get_exac_data(PRO_LOGIC_data, Onboard, RC_SU1_IDs)


if __name__ == "__main__":
    main()