# copd-model-e / validation / parameter_calculation / NIV_parameters_calculation.py
# Uploaded by: IamGrooooot
# Commit message: Model E: Unsupervised PCA + clustering risk stratification
# Commit: 53a6def
# Import libraries
import pandas as pd
# Base data directory; replace the placeholder with a real path before running
file_path = '<YOUR_DATA_PATH>/'
# Folder (inside the base directory) that holds the input csv files
input_file_path = f'{file_path}data_for_model_e_columns/'
def read_data(file):
    """
    Load a csv data source into a dataframe
    --------
    :param file: string filename (passed straight to pandas.read_csv)
    :return: dataframe holding the contents of the file
    """
    return pd.read_csv(file)
def format_data(data, IDs, onboard):
    """
    Keep the NIV columns of interest, convert Stop_time and the onboarding
    date to datetime format, restrict the NIV data to the supplied study IDs
    and join the onboarding dates
    --------
    :param data: NIV dataframe
    :param IDs: dataframe containing Study IDs
    :param onboard: dataframe containing onboarding dates (needs the columns
        Study_ID and OB_date); it is not modified by this function
    :return: formatted dataframe with the rows of IDs left-joined to the NIV
        columns and to the onboarding columns, including yearcensor
        (OB_date + 365 days). IDs without NIV rows are kept with missing values
    """
    niv_columns = ['Study_ID', 'ie_ratio_value_50', 'ie_ratio_value_95',
                   'ie_ratio_maximum_value', 'resp_events_AHI',
                   'resp_events_HI', 'Stop_time', 'Start_time']

    # assign() returns new dataframes, so neither the caller's NIV data nor
    # the caller's onboarding table is written to (the previous in-place
    # column assignments changed `onboard` for the caller and assigned into
    # a slice of `data`).
    # Only Stop_time is converted; Start_time is passed through as read.
    data = data[niv_columns].assign(
        Stop_time=pd.to_datetime(data['Stop_time']))

    ob_date = pd.to_datetime(onboard['OB_date'])
    onboard = onboard.assign(
        OB_date=ob_date,
        yearcensor=ob_date + pd.offsets.DateOffset(days=365))

    # Left joins: every row of IDs is kept, then onboarding info is attached
    data = pd.merge(IDs, data, on="Study_ID", how="left")
    data = pd.merge(data, onboard, on="Study_ID", how="left")
    return data
def filter_study_censor(data, censor_date='2021-09-01'):
    """
    Filter the dataframe to only contain data obtained before the study censor date
    --------
    :param data: dataframe with a Stop_time column
    :param censor_date: study censor date; rows are kept when their Stop_time
        is strictly before this value (defaults to '2021-09-01')
    :return: dataframe containing data obtained before the study censor date
    """
    # Strict comparison: rows stopping exactly on the censor date are excluded,
    # and rows with a missing Stop_time compare False and are dropped too
    before_censor = data['Stop_time'] < censor_date
    return data[before_censor]
def filter_first_year(data):
    """
    Keep only the rows recorded up to the end of the first year after onboarding
    --------
    :param data: dataframe with Stop_time and yearcensor columns
    :return: dataframe containing only the rows whose Stop_time is on or
        before yearcensor
    """
    within_first_year = data['Stop_time'] <= data['yearcensor']
    return data.loc[within_first_year]
def mean_max_summary(data, col):
    """
    Summarise the NIV parameters per group
    --------
    :param data: dataframe
    :param col: parameter to group on
    :return: summary dataframe holding the mean, max and count of every NIV
        parameter for each group
    """
    niv_parameters = ('ie_ratio_value_50', 'ie_ratio_value_95',
                      'ie_ratio_maximum_value', 'resp_events_AHI',
                      'resp_events_HI')
    # Same three metrics for every parameter, in the parameter order above
    aggregations = {parameter: ['mean', 'max', 'count']
                    for parameter in niv_parameters}
    grouped = data.groupby(col)
    return grouped.agg(aggregations)
def calculate_summary_data(data):
    """
    Summarise the NIV parameters per study ID twice - once for the data up to
    the study censor date and once for the data up to a year after
    onboarding - and save each summary dataframe as a csv file
    --------
    :param data: dataframe
    """
    # Both summaries are computed before anything is written to disk
    summaries = {
        'censor.csv': mean_max_summary(filter_study_censor(data), 'Study_ID'),
        'year.csv': mean_max_summary(filter_first_year(data), 'Study_ID'),
    }

    output_prefix = file_path + 'NIV_ Average_parameters_to_'
    for suffix, summary in summaries.items():
        summary.to_csv(output_prefix + suffix)
def main():
    """
    Run the whole script: read the three input csv files, format the NIV
    data and save the summary csv files
    """
    # Read data (NIV records, onboarding dates, study IDs - in that order)
    source_names = ("NIV_data_wrangled.csv",
                    "onboarding_dates.csv",
                    "RC_SU1_IDs.csv")
    niv_records, onboarding_dates, study_ids = [
        read_data(input_file_path + name) for name in source_names
    ]

    # Format data
    formatted_niv = format_data(niv_records, study_ids, onboarding_dates)

    # Calculate and save summary NIV data to year and study censor dates for each ID
    calculate_summary_data(formatted_niv)
# Script entry point: the pipeline runs as soon as this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also runs main() - confirm that this is intended.
main()