| | |
| | import pandas as pd |
| | import numpy as np |
| |
|
| | |
# Root folders for the raw study extract and for the derived summary files.
# File names are appended by plain string concatenation, so the trailing
# slash must be kept.
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
output_file_path = '<YOUR_DATA_PATH>/summary_files/'

# Item codes (matched against PI_BNF_Item_Code) of the steroid prescriptions
# that count as rescue medication. Each code is listed exactly once.
steroid_codes = [
    '0603020T0AAACAC',
    '0603020T0AABKBK',
    '0603020T0AAAXAX',
    '0603020T0AAAGAG',
    '0603020T0AABHBH',
    '0603020T0AABNBN',
]

# Item codes of the prescriptions in the second rescue-medication group
# (presumably antibiotics, going by the variable name -- confirm).
antib_codes = [
    '0501013B0AAAAAA',
    '0501013B0AAABAB',
    '0501030I0AAABAB',
    '0501030I0AAAAAA',
    '0501050B0AAAAAA',
    '0501050B0AAADAD',
    '0501013K0AAAJAJ',
]

# Every code whose prescription is treated as a rescue-medication event.
exac_meds = steroid_codes + antib_codes
| |
|
| |
|
def read_data(file, cols, types):
    """
    Load selected columns of a cp1252-encoded CSV file
    --------
    :param file: string filename
    :param cols: list of column names to load
    :param types: list of column types, paired positionally with cols
    :return: dataframe containing only the requested columns
    """
    # Pair each requested column with the dtype the parser should use for it
    column_types = {name: dtype for name, dtype in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=column_types,
        encoding="cp1252",
    )
| |
|
| |
|
def initialize_presc_data(presc_file):
    """
    Read the prescribing extract and tidy it for analysis
    --------
    :param presc_file: prescribing data file name
    :return: prescribing dataframe without missing or duplicated rows and
        with PRESC_DATE converted to datetime
    """
    print('Loading prescribing data')

    # (column name, dtype) pairs requested from the CSV reader
    column_spec = [
        ('SafeHavenID', 'int'),
        ('PRESC_DATE', 'object'),
        ('PI_Approved_Name', 'str'),
        ('PI_BNF_Item_Code', 'str'),
    ]
    presc_cols = [name for name, _ in column_spec]
    presc_types = [dtype for _, dtype in column_spec]
    presc = read_data(presc_file, presc_cols, presc_types)

    # Discard rows with any missing value, then exact duplicate rows
    presc = presc.dropna().drop_duplicates()

    # Dates are read as plain objects; convert them once here
    presc['PRESC_DATE'] = pd.to_datetime(presc['PRESC_DATE'])

    return presc
| |
|
| |
|
def track_medication(df):
    """
    Flag rescue med prescriptions
    --------
    Adds two columns to df in place:
    'code' holds the first nine characters of PI_BNF_Item_Code and
    'rescue_meds' is 1 when PI_BNF_Item_Code contains any of the codes in
    exac_meds, otherwise 0. Rows with a missing item code get a missing
    'code' and a 'rescue_meds' of 0 instead of raising.
    --------
    :param df: dataframe with a string PI_BNF_Item_Code column
    :return: the same dataframe with the 'code' and 'rescue_meds' columns
    """
    print('Tracking medication')

    item_codes = df['PI_BNF_Item_Code']

    # Vectorised slice; unlike a per-row lambda it tolerates missing values
    df['code'] = item_codes.str[:9]

    # One alternation pattern covering every rescue-medication code.
    # na=False keeps missing codes from breaking the integer cast.
    rescue_pattern = '|'.join(exac_meds)
    df['rescue_meds'] = item_codes.str.contains(
        rescue_pattern, regex=True, na=False).astype(int)

    return df
| |
|
| |
|
def filter_data(data, date):
    """
    Filter data to only include rescue med prescriptions occurring
    on or after the index date
    --------
    :param data: dataframe with PRESC_DATE and rescue_meds columns
    :param date: Index date in 'DD-MM-YYYY' format
    :return: filtered dataframe (a new object; the input is not modified)
        with PRESC_DATE as datetime
    """
    # Parse the index date day-first, as documented. Comparing against the
    # raw string would let pandas read '02-01-2020' as 1 February.
    index_date = pd.to_datetime(date, dayfirst=True)

    # assign() works on a copy, so the caller's dataframe is left untouched
    data = data.assign(PRESC_DATE=pd.to_datetime(data['PRESC_DATE']))

    on_or_after_index = data['PRESC_DATE'] >= index_date
    is_rescue_med = data['rescue_meds'] == 1
    return data[on_or_after_index & is_rescue_med]
| |
|
| |
|
def calculate_time_to_first_exacerbation(data, date):
    """
    Calculate days to first exacerbation
    --------
    :param data: dataframe with SafeHavenID and datetime PRESC_DATE columns
    :param date: Index date in 'DD-MM-YYYY' format
    :return: dataframe indexed by SafeHavenID with the columns first_exac
        (earliest PRESC_DATE), index_date and days_to_first_exac (whole
        days between the index date and first_exac)
    """
    # Parse day-first to honour the documented format; the default parser
    # would treat an ambiguous date such as '02-01-2020' as month-first.
    index_date = pd.to_datetime(date, dayfirst=True)

    # 'min' as a string avoids the deprecated np.min callable path in agg
    first_exac = data.groupby('SafeHavenID').agg(
        first_exac=('PRESC_DATE', 'min'))
    first_exac['index_date'] = index_date
    first_exac['days_to_first_exac'] = (
        first_exac['first_exac'] - index_date).dt.days
    return first_exac
| |
|
| |
|
def calculate_exac_count_1_year(data, year_censor, first_exac_df):
    """
    Calculate the number of exacerbations in the year following the index date
    and join this data to the time to first exacerbation data for each ID
    --------
    :param data: dataframe containing exacerbation dates (based on rescue meds)
        in a datetime PRESC_DATE column
    :param year_censor: date 1 year following Index date 'DD-MM-YYYY' format
    :param first_exac_df: dataframe showing days to first exacerbations for IDs
    :return: first_exac_df outer-joined on SafeHavenID with the integer column
        exac_count_year_post_index, the number of distinct prescription
        dates before year_censor (0 for IDs with none)
    """
    # Parse day-first to honour the documented format; a raw string
    # comparison would read '02-01-2021' as 1 February.
    censor_date = pd.to_datetime(year_censor, dayfirst=True)

    presc_year = data[data['PRESC_DATE'] < censor_date]

    # Several prescriptions on the same day count as one exacerbation
    year_exac_count = presc_year.groupby('SafeHavenID').agg(
        exac_count_year_post_index=('PRESC_DATE', 'nunique'))

    all_exac_data = pd.merge(
        year_exac_count, first_exac_df, on="SafeHavenID", how="outer")

    # IDs whose events all fall after the censor date come out of the outer
    # join as NaN; report them as 0 and keep the count an integer either way.
    all_exac_data['exac_count_year_post_index'] = (
        all_exac_data['exac_count_year_post_index'].fillna(0).astype(int))
    return all_exac_data
| |
|
| |
|
def main():
    """
    Run the pipeline: load the prescribing data, flag and filter rescue med
    prescriptions, summarise them per ID and write both outputs to disk
    """
    index_date = '01-01-2020'
    one_year_censor = '01-01-2021'

    # Load and tidy the raw prescribing extract
    prescriptions = initialize_presc_data(
        input_file_path + 'Pharmacy_Cohort3R.csv')

    # Flag rescue medication, then keep only flagged rows from the index date
    prescriptions = track_medication(prescriptions)
    prescriptions = filter_data(prescriptions, index_date)

    # Per-ID summary: days to first event, then count within the first year
    time_to_first = calculate_time_to_first_exacerbation(
        prescriptions, index_date)
    cohort_summary = calculate_exac_count_1_year(
        prescriptions, one_year_censor, time_to_first)

    # Save the event-level rows and the per-ID summary
    prescriptions.to_csv(
        output_file_path + 'all_exacerbations_from_index_date.csv')
    cohort_summary.to_pickle(
        output_file_path + 'community_managed_exacerbations_cohort_summary.pkl')
| |
|
| |
|
# Run the pipeline only when this file is executed as a script, so that
# importing it to reuse the helper functions does not trigger file I/O.
if __name__ == '__main__':
    main()
| |
|