# copd-model-e / validation / event_tracking / model_e_survival_calculations.py
# IamGrooooot's picture
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
# Import libraries
import pandas as pd
# Set file paths
# Both values are placeholders: replace <YOUR_DATA_PATH> before running.
# File names are appended with plain string concatenation in main(), so the
# trailing '/' on each directory must be kept.
# Directory the deaths CSV is read from.
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
# Directory the resulting pickle file is written to.
output_file_path = '<YOUR_DATA_PATH>/summary_csv_files/'
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename (anything accepted by pandas.read_csv)
    :return: dataframe holding the parsed CSV contents
    """
    return pd.read_csv(file)
def format_data_for_output(survival_data):
    """
    Remove columns not needed for output
    --------
    :param survival_data: dataframe containing at least the 'SafeHavenID'
        and 'DOD' (date of death) columns
    :return: new dataframe containing only the 'SafeHavenID' and 'DOD'
        columns, in that order
    :raises KeyError: if either required column is missing
    """
    # Return an explicit copy: callers assign new/converted columns onto the
    # result, and doing that on a bare column subset triggers pandas'
    # chained-assignment (SettingWithCopyWarning) path.
    return survival_data[['SafeHavenID', 'DOD']].copy()
def filter_data(survival_data, date):
    """
    Filter data to only include those alive on the index date for analysis
    --------
    :param survival_data: dataframe containing a 'DOD' (date of death) column
    :param date: Index date in 'DD-MM-YYYY' format
    :return: new dataframe including only rows whose date of death is on or
        after the index date, with 'DOD' converted to datetime. Rows with a
        missing 'DOD' are dropped because the comparison is False for them.
        The input dataframe is left unmodified.
    """
    # Parse the index date day-first, as documented. Comparing against the
    # raw string lets pandas read it month-first, so e.g. '02-01-2020' would
    # be treated as 1 February instead of 2 January.
    index_date = pd.to_datetime(date, dayfirst=True)
    # assign() builds a new frame, so the caller's 'DOD' column is untouched.
    converted = survival_data.assign(DOD=pd.to_datetime(survival_data['DOD']))
    # Copy so callers can add columns to the result without pandas'
    # chained-assignment warning.
    return converted[converted['DOD'] >= index_date].copy()
def calulate_days_survived(survival_data, date):
    """
    Calculate days survived following the index date
    --------
    :param survival_data: dataframe containing date of death field 'DOD',
        already converted to datetime
    :param date: Index date in 'DD-MM-YYYY' format
    :return: series of whole days between the index date and 'DOD'

    Side effect: an 'index_date' datetime column is added to (or overwritten
    on) the dataframe passed in.
    """
    # Parse day-first, as documented; the pandas default is month-first, which
    # would read '02-01-2020' as 1 February instead of 2 January.
    index_date = pd.to_datetime(date, dayfirst=True)
    survival_data['index_date'] = index_date
    return (survival_data['DOD'] - index_date).dt.days
def main():
    """
    Build the survival-from-index dataset and save it as a pickle file
    --------
    Reads the deaths CSV from input_file_path, drops duplicate rows, keeps
    the SafeHaven ID and date of death fields, keeps deaths on or after the
    index date, adds a 'days_survived' column and writes the result to
    output_file_path.
    """
    # Index date used for both the filtering and the survival calculation
    index_date = '01-01-2020'
    # Read in data and drop duplicates
    deaths = read_data(input_file_path + "Deaths_Cohort3R.csv")
    deaths = deaths.drop_duplicates()
    # Keep only the Safehaven ID and date of death fields
    deaths = format_data_for_output(deaths)
    # Keep only deaths after the index date for analysis
    deaths = filter_data(deaths, index_date)
    # Calculate days survived
    deaths['days_survived'] = calulate_days_survived(deaths, index_date)
    # Save data
    deaths.to_pickle(output_file_path + 'Survival_from_index.pkl')
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse its helper functions) does not read or write any files.
if __name__ == "__main__":
    main()