copd-model-e / validation /parameter_calculation /PRO_LOGIC_exacerbation_calculations.py
IamGrooooot's picture
Model E: Unsupervised PCA + clustering risk stratification
53a6def
# Import libraries
import pandas as pd
# Root data directory; replace the placeholder with a real location.
# The trailing slash matters because paths are built by string concatenation.
file_path = '<YOUR_DATA_PATH>/'
# Sub-directory of the root that the input files are read from
input_file_path = f'{file_path}data_for_model_e_columns/'
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename (passed straight to pandas.read_csv)
    :return: dataframe holding the contents of the file
    """
    return pd.read_csv(file)
def format_data(data, IDs, onboard):
    """
    Parse the date columns as UTC datetimes, restrict the exacerbations to the
    study IDs listed in IDs, and attach each ID's onboarding date and
    one-year censor date.

    Both data and onboard are modified in place: data['SubmissionTime'] and
    onboard['OB_date'] are replaced by their parsed values, and a
    'yearcensor' column (OB_date + 365 days) is added to onboard.
    --------
    :param data: exacerbations dataframe (needs 'Study_ID' and 'SubmissionTime')
    :param IDs: dataframe containing RC and SU1 study IDs (needs 'Study_ID')
    :param onboard: dataframe containing onboarding dates (needs 'Study_ID' and 'OB_date')
    :return: formatted dataframe with one row per ID/exacerbation pair; IDs
        without a match in data are kept, with missing values
    """
    # Parse the exacerbation submission timestamps
    data['SubmissionTime'] = pd.to_datetime(data['SubmissionTime'], utc=True)

    # Parse the onboarding dates and derive the censor date 365 days later
    onboarding_dates = pd.to_datetime(onboard['OB_date'], utc=True)
    onboard['OB_date'] = onboarding_dates
    onboard['yearcensor'] = onboarding_dates + pd.offsets.DateOffset(days=365)

    # Left joins starting from the ID list: only listed IDs survive, and each
    # of them appears at least once even when it has no exacerbations
    restricted = IDs.merge(data, on="Study_ID", how="left")
    return restricted.merge(onboard, on="Study_ID", how="left")
def filter_study_censor(data, censor_date='2021-09-01'):
    """
    Filter the dataframe to only contain data obtained before the study censor date
    --------
    :param data: dataframe with a datetime 'SubmissionTime' column
    :param censor_date: exclusive upper bound for 'SubmissionTime'. Defaults to
        the study censor date, '2021-09-01'. The value is handed to pandas for
        the comparison, so a date string or a timestamp compatible with the
        column can be used
    :return: dataframe containing only rows with 'SubmissionTime' strictly
        before censor_date; rows with a missing 'SubmissionTime' are dropped
    """
    # Strict inequality: a submission exactly at the censor date is excluded.
    # Comparisons against missing timestamps are False, so those rows drop out.
    before_censor = data['SubmissionTime'] < censor_date
    return data[before_censor]
def filter_first_year(data):
    """
    Keep only the rows submitted on or before that row's year censor date
    (the first year post-onboarding)
    --------
    :param data: dataframe with 'SubmissionTime' and 'yearcensor' columns
    :return: dataframe containing only data obtained in the first year post-onboarding
    """
    # Inclusive bound: a submission exactly at the year censor date is kept.
    # Rows where either timestamp is missing compare False and are dropped.
    within_first_year = data['SubmissionTime'] <= data['yearcensor']
    return data[within_first_year]
def get_exac_data(data, onboard, IDs):
    """
    Calculate the number of exacerbations to year censor and study censor
    and the length of time to first exacerbation for each study ID, save the
    resulting dataframe as 'PRO_LOGIC_exacerbation_data.csv' under file_path,
    and return it
    --------
    :param data: PRO LOGIC exacerbations dataframe with 'Study_ID',
        'SubmissionTime' and 'yearcensor' columns; it is censored here at the
        study censor date and at a year post onboarding
    :param onboard: Dataframe showing onboarding dates for the study participants
        ('Study_ID' and a datetime 'OB_date')
    :param IDs: Dataframe containing all RC and SU1 study IDs
    :return: dataframe showing exacerbation counts and the length of time to first exacerbation for each study ID
    """
    censor_data = filter_study_censor(data)
    year_censor_data = filter_first_year(data)

    # Per-ID summary up to the study censor date: first event and event count
    censor_sum = censor_data.groupby("Study_ID").SubmissionTime.agg(
        first_exacerbation='min',
        exacerbation_count_to_censor='count')
    # Outer join so IDs with an onboarding date but no events are retained
    censor_sum = pd.merge(censor_sum, onboard, on="Study_ID", how="outer")
    censor_sum["days_to_first_exacerbation"] = (
        censor_sum["first_exacerbation"] - censor_sum["OB_date"]).dt.days

    # Per-ID event count within the first year post-onboarding
    year_censor_sum = year_censor_data.groupby("Study_ID").SubmissionTime.agg(
        exacerbation_count_to_year='count')

    PRO_LOGIC_exacerbation_data = pd.merge(censor_sum, year_censor_sum, on="Study_ID", how="outer")
    # Left join from the ID list so the output has a row for every study ID.
    # NOTE(review): IDs with no rows surviving a filter end up with missing
    # (NaN) counts rather than 0 - confirm this is what downstream code expects.
    PRO_LOGIC_exacerbation_data = pd.merge(IDs, PRO_LOGIC_exacerbation_data, on="Study_ID", how="left")

    PRO_LOGIC_exacerbation_data.to_csv(file_path + 'PRO_LOGIC_exacerbation_data.csv')
    return PRO_LOGIC_exacerbation_data
def main():
    """
    Read the PRO LOGIC exacerbations, the study ID list and the onboarding
    dates from input_file_path, format them, then calculate and save the
    per-ID exacerbation summary
    """
    # Read data (exacerbations, study IDs, onboarding dates - in that order)
    exacerbations, study_ids, onboarding = (
        read_data(input_file_path + filename)
        for filename in (
            "PRO_LOGIC_exacerbations_and_dates.csv",
            "RC_SU1_IDs.csv",
            "onboarding_dates.csv",
        )
    )

    # Format data. format_data also parses the dates in `onboarding` in place,
    # so the same object is passed on to the summary step below.
    exacerbations = format_data(exacerbations, study_ids, onboarding)

    # Calculate and save summary exacerbation data to year and study censor dates for each ID
    get_exac_data(exacerbations, onboarding, study_ids)
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not read the inputs or write the output file
if __name__ == "__main__":
    main()