# Import libraries
import pandas as pd

# Set file paths
input_file_path = '/EXAMPLE_STUDY_DATA/'
output_file_path = '/summary_csv_files/'

# Documented format of the index date strings passed around this module
INDEX_DATE_FORMAT = '%d-%m-%Y'


def _parse_index_date(date):
    """
    Convert an index date to a pandas Timestamp, reading strings day-first
    --------
    :param date: Index date in 'DD-MM-YYYY' format (or an already parsed
                 date/datetime/Timestamp)
    :return: pandas Timestamp for the index date
    """
    if isinstance(date, str):
        try:
            # Without an explicit format pandas reads ambiguous strings
            # month-first, so '02-01-2020' would become 1 February.
            return pd.to_datetime(date, format=INDEX_DATE_FORMAT)
        except ValueError:
            # Not in the documented format (e.g. ISO 'YYYY-MM-DD'): keep the
            # previous behaviour and let pandas infer the format.
            return pd.to_datetime(date)
    return pd.to_datetime(date)


def read_data(file):
    """
    Read in data source
    --------
    :param file: string filename
    :return: dataframe
    """
    return pd.read_csv(file)


def format_data_for_output(survival_data):
    """
    Remove columns not needed for output
    --------
    :param survival_data: dataframe containing date of death field
    :return: new dataframe holding only the 'SafeHavenID' and 'DOD' columns
    """
    # Return a copy so later column assignments do not act on a slice of the
    # caller's frame (which triggers SettingWithCopyWarning).
    return survival_data[['SafeHavenID', 'DOD']].copy()


def filter_data(survival_data, date):
    """
    Filter data to only include those alive on the index date for analysis
    --------
    :param survival_data: dataframe containing date of death field ('DOD')
    :param date: Index date in 'DD-MM-YYYY' format
    :return: new dataframe with 'DOD' converted to datetime, including only
             rows whose date of death is on or after the index date
    """
    index_date = _parse_index_date(date)
    # assign() builds a new frame, so the caller's 'DOD' column is untouched.
    # NOTE(review): 'DOD' is parsed with pandas' default inference - confirm
    # the date format used in the source file.
    converted = survival_data.assign(DOD=pd.to_datetime(survival_data['DOD']))
    # Rows with a missing DOD compare as False and are dropped, as before.
    return converted.loc[converted['DOD'] >= index_date].copy()


def calulate_days_survived(survival_data, date):
    """
    Calculate days survived following the index date
    --------
    :param survival_data: dataframe containing date of death field ('DOD')
                          already converted to datetime
    :param date: Index date in 'DD-MM-YYYY' format
    :return: series of whole days survived from the index date
    """
    index_date = _parse_index_date(date)
    return (survival_data['DOD'] - index_date).dt.days


def main(index_date='01-01-2020'):
    """
    Build the survival-from-index dataset and save it as a pickle file
    --------
    :param index_date: Index date in 'DD-MM-YYYY' format
    :return: None
    """
    # Parse once so filtering and the day count use the same date
    index_ts = _parse_index_date(index_date)

    # Read in data
    survival_file = input_file_path + "Deaths_Cohort3R.csv"
    survival_data = read_data(survival_file)

    # Drop duplicates
    survival_data = survival_data.drop_duplicates()

    # Filter to only include Safehaven ID and date of death fields
    survival_data = format_data_for_output(survival_data)

    # Filter data to only include deaths after the index date for analysis
    survival_data = filter_data(survival_data, index_ts)

    # Keep the index date column in the saved output; it used to be added as
    # a side effect of calulate_days_survived.
    survival_data['index_date'] = index_ts

    # Calculate days survived
    survival_data['days_survived'] = calulate_days_survived(survival_data,
                                                            index_ts)

    # Save data
    survival_data.to_pickle(output_file_path + 'Survival_from_index.pkl')


if __name__ == "__main__":
    main()