| | |
import pandas as pd
| |
|
| | |
# Directory locations used by main(). Replace the <YOUR_DATA_PATH> placeholder
# with a real directory before running. Both values must keep their trailing
# '/' because file names are appended with plain string concatenation.
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
output_file_path = '<YOUR_DATA_PATH>/summary_csv_files/'
| |
|
| |
|
def read_data(file):
    """
    Read a CSV data source into a dataframe
    --------
    :param file: path of the CSV file to load (passed straight to
        ``pandas.read_csv``, so a file-like object also works)
    :return: dataframe holding the parsed CSV contents
    """
    return pd.read_csv(file)
| |
|
| |
|
def format_data_for_output(survival_data):
    """
    Remove columns not needed for output
    --------
    :param survival_data: dataframe containing at least the 'SafeHavenID'
        and 'DOD' (date of death) columns
    :return: new dataframe holding only the 'SafeHavenID' and 'DOD' columns
    :raises KeyError: if either required column is missing
    """
    required_columns = ['SafeHavenID', 'DOD']
    # Return an explicit copy: later steps assign columns on the result, and
    # doing that on a bare column selection triggers SettingWithCopyWarning
    # on pandas versions without copy-on-write.
    return survival_data.loc[:, required_columns].copy()
| |
|
| |
|
def filter_data(survival_data, date):
    """
    Filter data to only include those alive on the index date for analysis
    --------
    :param survival_data: dataframe containing a 'DOD' (date of death) column
        that ``pandas.to_datetime`` can parse
    :param date: Index date in 'DD-MM-YYYY' format (a datetime-like object
        is also accepted)
    :return: new dataframe including only the rows whose date of death is on
        or after the index date, with 'DOD' converted to datetime; the
        dataframe passed in is left unmodified
    :raises ValueError: if ``date`` is a string not in 'DD-MM-YYYY' format
    """
    # Parse the index date with the documented day-first format. Comparing
    # against the raw string lets pandas guess month-first, so '02-01-2020'
    # would be read as 1 February instead of 2 January.
    if isinstance(date, str):
        index_date = pd.to_datetime(date, format='%d-%m-%Y')
    else:
        index_date = pd.Timestamp(date)

    date_of_death = pd.to_datetime(survival_data['DOD'])
    # Rows with a missing date of death (NaT) compare False and are dropped.
    alive_on_index_date = date_of_death >= index_date

    # .assign() builds a fresh dataframe, so neither the caller's frame nor a
    # slice of it is written to.
    return survival_data.loc[alive_on_index_date].assign(
        DOD=date_of_death.loc[alive_on_index_date]
    )
| |
|
| |
|
def calulate_days_survived(survival_data, date):
    """
    Calculate days survived following the index date
    --------
    Side effect: an 'index_date' datetime column holding the parsed index
    date is added to (or overwritten on) ``survival_data``.

    :param survival_data: dataframe containing a 'DOD' (date of death) column
        that is datetime or parseable by ``pandas.to_datetime``
    :param date: Index date in 'DD-MM-YYYY' format (a datetime-like object
        is also accepted)
    :return: series of whole days between the index date and date of death
    :raises ValueError: if ``date`` is a string not in 'DD-MM-YYYY' format
    """
    # Parse once, with the documented day-first format. Letting pandas infer
    # the format reads '02-01-2020' month-first (1 February).
    if isinstance(date, str):
        index_date = pd.to_datetime(date, format='%d-%m-%Y')
    else:
        index_date = pd.Timestamp(date)

    # Kept for backward compatibility: callers persist this column.
    survival_data['index_date'] = index_date

    # No-op when 'DOD' is already datetime; otherwise parse without touching
    # the caller's column.
    date_of_death = pd.to_datetime(survival_data['DOD'])
    return (date_of_death - index_date).dt.days
| |
|
| |
|
def main(index_date='01-01-2020'):
    """
    Build the survival-from-index dataset and save it as a pickle
    --------
    Reads "Deaths_Cohort3R.csv" from ``input_file_path``, keeps the ID and
    date-of-death columns for rows passing the index-date filter, adds a
    'days_survived' column and writes the result to
    "Survival_from_index.pkl" under ``output_file_path``.

    :param index_date: Index date in 'DD-MM-YYYY' format, used both for the
        filter and for the days-survived calculation
    :return: None
    """
    survival_file = input_file_path + "Deaths_Cohort3R.csv"
    survival_data = read_data(survival_file)

    # Drop fully duplicated rows.
    # NOTE(review): this runs before the column selection below, so rows that
    # differ only in a dropped column remain as duplicates - confirm intended.
    survival_data = survival_data.drop_duplicates()

    # Keep only the columns required for output.
    survival_data = format_data_for_output(survival_data)

    # Restrict to rows whose date of death is on or after the index date.
    survival_data = filter_data(survival_data, index_date)

    # The same index date is used here so the filter and the calculation
    # cannot drift apart.
    survival_data['days_survived'] = calulate_days_survived(
        survival_data, index_date
    )

    survival_data.to_pickle(output_file_path + 'Survival_from_index.pkl')
| |
|
| |
|
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse the helpers) does not read or write any files.
if __name__ == "__main__":
    main()
| |
|