File size: 2,376 Bytes
# Import libraries
import pandas as pd
# Set file paths
# Folder prefix that main() puts in front of the input CSV filename. The
# filename is appended by plain string concatenation, so the trailing
# slash is required. '<YOUR_DATA_PATH>' looks like a placeholder to be
# replaced with a real location before running - TODO confirm.
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
# Folder prefix that main() puts in front of the output pickle filename
# (also joined by plain string concatenation).
output_file_path = '<YOUR_DATA_PATH>/summary_csv_files/'
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename of the CSV to load
    :return: dataframe holding the parsed contents of the file
    """
    return pd.read_csv(file)
def format_data_for_output(survival_data):
    """
    Keep only the columns that are carried through to the output
    --------
    :param survival_data: dataframe containing date of death field
    :return: dataframe restricted to the 'SafeHavenID' and 'DOD' columns,
    in that order
    """
    output_columns = ['SafeHavenID', 'DOD']
    return survival_data[output_columns]
def filter_data(survival_data, date):
    """
    Filter data to only include those alive on the index date for analysis
    --------
    :param survival_data: dataframe containing date of death field ('DOD')
    :param date: Index date in 'DD-MM-YYYY' format
    :return: new dataframe, with 'DOD' converted to datetime, including
    only those whose date of death is on or after the index date; the
    dataframe passed in is left unchanged
    """
    # Parse the index date day-first so it matches the documented
    # 'DD-MM-YYYY' format (the default parse reads it month-first, so
    # '02-01-2020' would become 1 February instead of 2 January)
    index_date = pd.to_datetime(date, dayfirst=True)
    # assign() builds a new dataframe, so the caller's data is not mutated
    parsed = survival_data.assign(DOD=pd.to_datetime(survival_data['DOD']))
    # Rows with a missing date of death compare as False and are dropped
    alive_on_index = parsed['DOD'] >= index_date
    # Return an independent copy so later column assignments on the result
    # do not raise SettingWithCopyWarning
    return parsed[alive_on_index].copy()
def calulate_days_survived(survival_data, date):
    """
    Calculate days survived following the index date
    --------
    :param survival_data: dataframe containing date of death field ('DOD');
    an 'index_date' datetime column is added to this dataframe in place
    :param date: Index date in 'DD-MM-YYYY' format
    :return: days survived from index date
    """
    # Parse once, day-first, to honour the documented 'DD-MM-YYYY' format
    # (the default parse would read '02-01-2020' as 1 February)
    index_date = pd.to_datetime(date, dayfirst=True)
    # The index date column is kept on the dataframe because callers save
    # it alongside the result
    survival_data['index_date'] = index_date
    # No-op when 'DOD' is already datetime; otherwise converts it locally
    # so the subtraction works without altering the 'DOD' column
    date_of_death = pd.to_datetime(survival_data['DOD'])
    return (date_of_death - survival_data['index_date']).dt.days
def main():
    """
    Build the survival-from-index dataset and save it as a pickle
    --------
    Reads the deaths cohort CSV, drops duplicate rows, keeps the Safehaven
    ID and date of death fields, filters on the index date, adds the days
    survived and writes the result to the output folder.
    :return: None
    """
    # Single definition of the index date so the filter and the
    # days-survived calculation cannot drift apart
    index_date = '01-01-2020'
    # Read in data
    survival_file = input_file_path + "Deaths_Cohort3R.csv"
    survival_data = read_data(survival_file)
    # Drop duplicates
    # NOTE(review): duplicates are judged on every column of the raw file,
    # so rows differing only in columns removed below remain - confirm
    # this is intended
    survival_data = survival_data.drop_duplicates()
    # Filter to only include Safehaven ID and date of death fields
    survival_data = format_data_for_output(survival_data)
    # Filter data to only include deaths after the index date for analysis
    survival_data = filter_data(survival_data, index_date)
    # Calculate days survived
    survival_data['days_survived'] = calulate_days_survived(survival_data,
                                                            index_date)
    # Save data
    survival_data.to_pickle(output_file_path + 'Survival_from_index.pkl')


# Only run the pipeline when executed as a script, so importing this
# module (e.g. to reuse the helper functions) does not trigger file I/O
if __name__ == "__main__":
    main()
|