# File size: 5,117 Bytes (revision 53a6def)
# Import libraries
import pandas as pd
import numpy as np
# Set file paths
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
output_file_path = '<YOUR_DATA_PATH>/summary_files/'

# BNF item codes used to flag rescue medication prescriptions.
# Each code is listed exactly once: in the code visible here the lists are
# only joined into a "match any of these" pattern, so repeats add nothing.
# NOTE(review): the steroid list previously repeated four codes - confirm
# that no other codes were intended in their place.
steroid_codes = [
    '0603020T0AAACAC',
    '0603020T0AABKBK',
    '0603020T0AAAXAX',
    '0603020T0AAAGAG',
    '0603020T0AABHBH',
    '0603020T0AABNBN',
]
antib_codes = [
    '0501013B0AAAAAA',
    '0501013B0AAABAB',
    '0501030I0AAABAB',
    '0501030I0AAAAAA',
    '0501050B0AAAAAA',
    '0501050B0AAADAD',
    '0501013K0AAAJAJ',
]

# Steroid and antibiotic codes combined
exac_meds = steroid_codes + antib_codes
def read_data(file, cols, types):
    """
    Load selected columns of a CSV file with explicit column types
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, in the same order as cols
    :return: dataframe holding only the requested columns
    """
    # Pair every requested column with the type it should be read as
    dtype_by_column = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(
        file,
        encoding="cp1252",
        dtype=dtype_by_column,
        usecols=cols,
    )
def initialize_presc_data(presc_file):
    """
    Read the prescribing dataset and tidy it for analysis
    --------
    :param presc_file: prescribing data file name
    :return: prescribing dataframe with null rows and duplicate rows removed
        and PRESC_DATE converted to datetime
    """
    print('Loading prescribing data')

    # Columns to load, mapped to the type each one is read as
    column_types = {
        'SafeHavenID': 'int',
        'PRESC_DATE': 'object',
        'PI_Approved_Name': 'str',
        'PI_BNF_Item_Code': 'str',
    }
    raw = read_data(presc_file, list(column_types),
                    list(column_types.values()))

    # Remove rows with any missing value, then exact duplicate rows
    cleaned = raw.dropna().drop_duplicates()

    # Parse the prescription dates
    cleaned['PRESC_DATE'] = pd.to_datetime(cleaned['PRESC_DATE'])
    return cleaned
def track_medication(df):
    """
    Flag rescue med prescriptions
    --------
    Adds two columns to df (the dataframe passed in is modified):
    code - the first 9 characters of PI_BNF_Item_Code
    rescue_meds - 1 if PI_BNF_Item_Code contains any code in exac_meds,
    otherwise 0 (rows with a missing item code are flagged 0)
    :param df: dataframe with a string PI_BNF_Item_Code column
    :return: the same dataframe with the code and rescue_meds columns added
    """
    print('Tracking medication')

    item_codes = df['PI_BNF_Item_Code']

    # Extract BNF codes without brand info (vectorised slice; a missing
    # value stays missing instead of raising as a per-row lambda would)
    df['code'] = item_codes.str[:9]

    # Track rescue meds: one regex alternation over every rescue med code.
    # na=False makes a missing item code count as "no match" so the cast
    # to int below cannot fail on NaN.
    rescue_pattern = '|'.join(exac_meds)
    df['rescue_meds'] = item_codes.str.contains(
        rescue_pattern, na=False).astype(int)
    return df
def filter_data(data, date):
    """
    Filter data to only include rescue med prescriptions occurring
    on or after the index date
    --------
    :param data: dataframe with PRESC_DATE and rescue_meds columns
    :param date: Index date in 'DD-MM-YYYY' format
    :return: filtered dataframe (a new object; data itself is not modified)
    """
    # Parse day-first so the value follows the documented DD-MM-YYYY format.
    # Comparing the column against the raw string lets pandas read it
    # month-first, e.g. '05-03-2020' would be taken as 3 May, not 5 March.
    index_date = pd.to_datetime(date, dayfirst=True)

    # Convert on a copy so the caller's dataframe is left untouched
    converted = data.assign(PRESC_DATE=pd.to_datetime(data['PRESC_DATE']))

    on_or_after_index = converted['PRESC_DATE'] >= index_date
    is_rescue_med = converted['rescue_meds'] == 1
    return converted[on_or_after_index & is_rescue_med]
def calculate_time_to_first_exacerbation(data, date):
    """
    Calculate days to first exacerbation
    --------
    :param data: dataframe with SafeHavenID and datetime PRESC_DATE columns
    :param date: Index date in 'DD-MM-YYYY' format
    :return: dataframe indexed by SafeHavenID with columns first_exac
        (earliest PRESC_DATE), index_date and days_to_first_exac (whole days
        from the index date to first_exac)
    """
    # Earliest prescription date per ID. The string 'min' is used rather
    # than np.min, which pandas deprecates as a groupby aggregation callable.
    first_exac = data.groupby('SafeHavenID').agg(
        first_exac=('PRESC_DATE', 'min'))

    # Parse day-first to honour the documented DD-MM-YYYY format; the default
    # parse is month-first and would misread e.g. '05-03-2020' as 3 May.
    first_exac['index_date'] = pd.to_datetime(date, dayfirst=True)

    elapsed = first_exac['first_exac'] - first_exac['index_date']
    first_exac['days_to_first_exac'] = elapsed.dt.days
    return first_exac
def calculate_exac_count_1_year(data, year_censor, first_exac_df):
    """
    Calculate the number of exacerbations in the year following the index date
    and join this data to the time to first exacerbation data for each ID
    --------
    :param data: dataframe containing exacerbation dates (based on rescue meds)
    :param year_censor: date 1 year following Index date 'DD-MM-YYYY' format
    :param first_exac_df: dataframe showing days to first exacerbations for IDs
    :return: outer join of the per-ID exacerbation counts with first_exac_df;
        exac_count_year_post_index is the number of distinct PRESC_DATE
        values before year_censor, with 0 for IDs that have none
    """
    # Parse day-first to honour the documented DD-MM-YYYY format; comparing
    # against the raw string lets pandas read it month-first.
    censor_date = pd.to_datetime(year_censor, dayfirst=True)
    presc_year = data[data['PRESC_DATE'] < censor_date]

    # Several prescriptions on the same day count as one exacerbation
    year_exac_count = presc_year.groupby('SafeHavenID').agg(
        exac_count_year_post_index=('PRESC_DATE', 'nunique'))

    # Outer join keeps IDs that appear in only one of the two frames
    all_exac_data = pd.merge(year_exac_count, first_exac_df,
                             on="SafeHavenID", how="outer")

    # IDs without any prescription before the censor date get a count of 0
    all_exac_data['exac_count_year_post_index'] = (
        all_exac_data['exac_count_year_post_index'].fillna(0))
    return all_exac_data
def main():
    """
    Run the full pipeline: load prescriptions, flag rescue meds, keep those
    on or after the index date, summarise per ID and save both outputs
    """
    index_date = '01-01-2020'
    censor_date = '01-01-2021'

    # Load and clean the prescribing data
    prescriptions = initialize_presc_data(
        input_file_path + 'Pharmacy_Cohort3R.csv')

    # Flag rescue med prescriptions, then keep only those exacerbation
    # events that occur on or after the index date
    exacerbations = filter_data(track_medication(prescriptions), index_date)

    # Days from the index date to each ID's first exacerbation
    days_to_first = calculate_time_to_first_exacerbation(
        exacerbations, index_date)

    # Add each ID's exacerbation count for the follow up year
    summary = calculate_exac_count_1_year(
        exacerbations, censor_date, days_to_first)

    # Save the event-level data and the per-ID summary
    exacerbations.to_csv(
        output_file_path + 'all_exacerbations_from_index_date.csv')
    summary.to_pickle(
        output_file_path
        + 'community_managed_exacerbations_cohort_summary.pkl')
# Run the pipeline only when this file is executed as a script, so that
# importing it (e.g. to reuse its functions) does not read or write files.
if __name__ == '__main__':
    main()