| | |
| | import pandas as pd |
| |
|
| | |
# Base folder for the project data; the output CSV is written directly here.
# Replace the placeholder with the real location before running.
file_path = '<YOUR_DATA_PATH>/'
# Sub-folder (under file_path) that the input CSV files are read from.
input_file_path = f"{file_path}data_for_model_e_columns/"
| |
|
| |
|
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename, handed unchanged to pandas.read_csv
    :return: dataframe holding the parsed file contents
    """
    return pd.read_csv(file)
| | |
| |
|
def format_data(data, IDs, onboard):
    """
    Parse the date columns as UTC datetimes, restrict the exacerbations data to the
    supplied study IDs and attach each study ID's onboarding dates
    --------
    :param data: exacerbations dataframe with 'Study_ID' and 'SubmissionTime' columns
    :param IDs: dataframe containing RC and SU1 study IDs (column 'Study_ID')
    :param onboard: dataframe containing onboarding dates ('Study_ID', 'OB_date')
    :return: formatted dataframe with one row per ID/exacerbation plus onboarding columns
    """
    # The conversions are written back onto the caller's dataframes on purpose:
    # later steps reuse `onboard` and expect 'OB_date' to already be a datetime.
    for frame, column in ((data, 'SubmissionTime'), (onboard, 'OB_date')):
        frame[column] = pd.to_datetime(frame[column], utc=True)

    # End of the first year of follow-up: 365 days after onboarding.
    onboard['yearcensor'] = onboard['OB_date'] + pd.offsets.DateOffset(days=365)

    # Left joins starting from IDs keep every listed study ID, including those
    # without any exacerbation rows.
    with_ids = IDs.merge(data, on="Study_ID", how="left")
    return with_ids.merge(onboard, on="Study_ID", how="left")
| |
|
| |
|
def filter_study_censor(data, censor_date='2021-09-01'):
    """
    Filter the dataframe to only contain data obtained before the study censor date
    --------
    :param data: dataframe with a datetime 'SubmissionTime' column
    :param censor_date: exclusive cut-off that 'SubmissionTime' is compared against;
        defaults to the study censor date '2021-09-01'
    :return: dataframe containing only rows submitted strictly before censor_date
    """
    # Rows with a missing SubmissionTime compare as False and are therefore dropped.
    return data[data['SubmissionTime'] < censor_date]
| |
|
| |
|
def filter_first_year(data):
    """
    Keep only the rows recorded during the first year after onboarding
    --------
    :param data: dataframe with 'SubmissionTime' and 'yearcensor' columns
    :return: dataframe containing only rows whose 'SubmissionTime' is on or before 'yearcensor'
    """
    within_first_year = data['SubmissionTime'] <= data['yearcensor']
    return data[within_first_year]
| |
|
| |
|
def get_exac_data(data, onboard, IDs):
    """
    Calculate the number of exacerbations to year censor and study censor
    and the length of time to first exacerbation for each study ID, save the
    resulting dataframe as 'PRO_LOGIC_exacerbation_data.csv' under file_path
    and return it
    --------
    :param data: formatted PRO LOGIC exacerbations dataframe; needs 'Study_ID',
        'SubmissionTime' and 'yearcensor' columns
    :param onboard: dataframe showing onboarding dates for the study participants;
        'OB_date' must already be in datetime format
    :param IDs: dataframe containing all RC and SU1 study IDs
    :return: dataframe showing exacerbation counts and the length of time to first
        exacerbation for each study ID
    """
    censor_data = filter_study_censor(data)
    # NOTE(review): the first-year subset is taken from the unfiltered data, so it is
    # not additionally limited by the study censor date - confirm this is intended.
    year_censor_data = filter_first_year(data)

    # Per study ID: earliest exacerbation and number of exacerbations before study censor.
    censor_sum = censor_data.groupby("Study_ID").SubmissionTime.agg(
        first_exacerbation='min',
        exacerbation_count_to_censor='count')
    # Outer join so study IDs that only appear in the onboarding data are kept.
    censor_sum = pd.merge(censor_sum, onboard, on="Study_ID", how="outer")
    censor_sum["days_to_first_exacerbation"] = (
        censor_sum["first_exacerbation"] - censor_sum["OB_date"]).dt.days

    # Per study ID: number of exacerbations within the first year post-onboarding.
    year_censor_sum = year_censor_data.groupby("Study_ID").SubmissionTime.agg(
        exacerbation_count_to_year='count')

    PRO_LOGIC_exacerbation_data = pd.merge(censor_sum, year_censor_sum, on="Study_ID", how="outer")
    # Left join from IDs restricts the result to (and keeps all of) the RC and SU1 IDs.
    PRO_LOGIC_exacerbation_data = pd.merge(IDs, PRO_LOGIC_exacerbation_data, on="Study_ID", how="left")

    PRO_LOGIC_exacerbation_data.to_csv(file_path + 'PRO_LOGIC_exacerbation_data.csv')
    # Return the summary as the docstring promises; callers that ignore it are unaffected.
    return PRO_LOGIC_exacerbation_data
| |
|
| |
|
def main():
    """
    Run the pipeline: read the three input CSV files, format the exacerbations
    data and build and save the per-study-ID exacerbation summary
    --------
    :return: None
    """
    # Input locations, all inside input_file_path.
    exacerbations_file = input_file_path + "PRO_LOGIC_exacerbations_and_dates.csv"
    ids_file = input_file_path + "RC_SU1_IDs.csv"
    onboarding_file = input_file_path + "onboarding_dates.csv"

    exacerbations = read_data(exacerbations_file)
    study_ids = read_data(ids_file)
    onboarding = read_data(onboarding_file)

    # format_data also converts the date columns of `onboarding` in place,
    # which get_exac_data relies on below.
    formatted = format_data(exacerbations, study_ids, onboarding)

    get_exac_data(formatted, onboarding, study_ids)
| |
|
| |
|
# Run the pipeline only when this file is executed as a script, so that importing
# the module (e.g. to reuse its helpers) does not read and write files.
if __name__ == "__main__":
    main()