| | |
| | import functools as ft |
| | import numpy as np |
| | import pandas as pd |
| |
|
| | |
# Root directory holding the study data; replace the placeholder before running.
file_path = '<YOUR_DATA_PATH>/'
# Sub-directory (under file_path) that the input CSV files are read from.
input_file_path = f"{file_path}data_for_model_e_columns/"
| |
|
| |
|
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename (passed straight to pandas.read_csv)
    :return: dataframe holding the parsed CSV contents
    """
    return pd.read_csv(file)
| |
|
| |
|
def format_data(exacerbations_data, admissions_data, onboard, IDs, study_end='2021-08-31'):
    """
    Remove unnecessary columns from dataframes,
    merge onboarding, admissions, and exacerbations dataframes,
    convert datetime columns to datetime format,
    filter to include only RECEIVER and scale up 1 IDs,
    and create new column showing date of death for those who died during the study
    --------
    :param exacerbations_data: dataframe containing exacerbations data
        (only 'Study_ID' and 'first_exacerbation' are kept)
    :param admissions_data: dataframe containing admissions data
        (only 'Study_ID' and 'admitted_1' are kept)
    :param onboard: dataframe containing onboarding dates
    :param IDs: dataframe containing RECEIVER and scale up 1 study IDs
    :param study_end: date on which surviving participants are censored; any
        'censor' value different from it is treated as a date of death.
        Defaults to the original hard-coded '2021-08-31'.
    :return: formatted dataframe with one row per row of IDs and an added
        'DOD' datetime column (NaT where censor equals study_end)

    NOTE: the merged frame must contain 'OB_date' and 'censor' columns;
    presumably these come from the onboarding dataframe - confirm against the data.
    """
    admissions_data = admissions_data[['Study_ID', 'admitted_1']]
    exacerbations_data = exacerbations_data[['Study_ID', 'first_exacerbation']]

    # Outer-merge the three sources so no Study_ID is dropped at this stage...
    dfs = [onboard, exacerbations_data, admissions_data]
    df_combined = ft.reduce(lambda left, right: pd.merge(left, right, on='Study_ID', how="outer"), dfs)
    # ...then the left merge on IDs restricts the result to the requested study IDs.
    data = pd.merge(IDs, df_combined, on="Study_ID", how="left")

    for column in ('first_exacerbation', 'admitted_1', 'OB_date', 'censor'):
        data[column] = pd.to_datetime(data[column])

    # Keep the (date-only) censor value where it differs from the study end,
    # leaving NaT elsewhere; this stays in datetime dtype throughout.
    censored_early = data['censor'] != pd.Timestamp(study_end)
    data['DOD'] = data['censor'].dt.normalize().where(censored_early)
    return data
| |
|
| |
|
def time_to_events(data, output_path=None):
    """
    Calculate time to first event (exacerbation, admission, or death) and first admission or death
    for each study ID and save the summary dataframe
    --------
    :param data: dataframe containing admissions data, exacerbations data, and onboarding dates
        (reads the 'admitted_1', 'first_exacerbation', 'DOD' and 'OB_date' columns);
        the new columns are added to this dataframe in place
    :param output_path: where to write the CSV (anything DataFrame.to_csv accepts);
        defaults to file_path + 'Time_to_first_event.csv'
    :return: the same dataframe with additional columns showing number of days until first event
        and number of days to first admission/ death
    """
    # Row-wise earliest date; missing dates are skipped by min().
    data['first_event'] = pd.to_datetime(
        data[["admitted_1", "first_exacerbation", "DOD"]].min(axis=1)
    )
    data['first_admission_or_death'] = pd.to_datetime(
        data[["admitted_1", "DOD"]].min(axis=1)
    )

    data['days_to_first_event'] = (data['first_event'] - data['OB_date']).dt.days
    data['days_to_first_admission_death'] = (data['first_admission_or_death'] - data['OB_date']).dt.days

    # Resolve the default lazily so the module-level path is only needed when
    # the caller does not supply a destination.
    if output_path is None:
        output_path = file_path + 'Time_to_first_event.csv'
    data.to_csv(output_path)
    return data
| |
|
| |
|
def main():
    """
    Load the four input CSV files, combine them into one formatted dataframe
    and compute/save the time-to-event summary
    """
    # Load each source from the input directory
    exacerbations = read_data(input_file_path + "First_exacerbation_data.csv")
    admissions = read_data(input_file_path + "admissions_data_up_to_31082021.csv")
    study_ids = read_data(input_file_path + "RC_SU1_IDs.csv")
    onboarding = read_data(input_file_path + "onboarding_dates.csv")

    # Merge and format the sources, restricted to the listed study IDs
    combined = format_data(exacerbations, admissions, onboarding, study_ids)

    # Derive the time-to-event columns and write the summary CSV
    time_to_events(combined)
| |
|
| |
|
# Run the pipeline only when this file is executed as a script, so importing
# the module (e.g. to reuse its functions) does not trigger file I/O.
if __name__ == "__main__":
    main()