File size: 5,117 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Import libraries
import pandas as pd
import numpy as np

# Set file paths
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
output_file_path = '<YOUR_DATA_PATH>/summary_files/'

# BNF item codes of the steroid prescriptions counted as rescue medication.
# NOTE(review): four of these codes used to be listed twice; the repeats have
# been removed. Confirm that no additional, distinct code was intended.
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AABNBN']

# BNF item codes of the antibiotic prescriptions counted as rescue medication
antib_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
               '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
               '0501013K0AAAJAJ']

# Every item code that marks a prescription as a rescue medication
exac_meds = steroid_codes + antib_codes


def read_data(file, cols, types):
    """
    Load selected columns of a CSV file into a dataframe
    --------
    :param file: string filename
    :param cols: list of column names to read
    :param types: list of column dtypes, given in the same order as cols
    :return: dataframe holding only the requested columns
    """
    # Pair every requested column with the dtype it should be read as
    column_types = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(file, usecols=cols, dtype=column_types,
                       encoding="cp1252")


def initialize_presc_data(presc_file):
    """
    Read the prescribing extract and tidy it for analysis
    --------
    :param presc_file: prescribing data file name
    :return: prescribing dataframe without null or duplicated rows and with
    PRESC_DATE converted to datetime
    """
    print('Loading prescribing data')

    # Column name -> dtype used when reading the file
    presc_schema = {
        'SafeHavenID': 'int',
        'PRESC_DATE': 'object',
        'PI_Approved_Name': 'str',
        'PI_BNF_Item_Code': 'str',
    }
    raw = read_data(presc_file, list(presc_schema),
                    list(presc_schema.values()))

    # Remove rows with any missing value, then exact duplicate rows
    cleaned = raw.dropna().drop_duplicates()

    # Parse the prescription date
    cleaned['PRESC_DATE'] = pd.to_datetime(cleaned['PRESC_DATE'])

    return cleaned


def track_medication(df):
    """
    Flag rescue med prescriptions
    --------
    :param df: prescribing dataframe with a string column PI_BNF_Item_Code
    :return: the same dataframe (modified in place) with two added columns:
    code - the first 9 characters of PI_BNF_Item_Code
    rescue_meds - 1 if PI_BNF_Item_Code contains any code in exac_meds, else 0
    """
    print('Tracking medication')

    item_codes = df['PI_BNF_Item_Code']

    # Extract BNF codes without brand info. The vectorised .str slice keeps a
    # missing code as missing instead of raising on a non-string value.
    df['code'] = item_codes.str[:9]

    # Track rescue meds. na=False treats a missing code as "not a rescue med"
    # so the integer cast cannot fail on NaN.
    rescue_pattern = '|'.join(exac_meds)
    df['rescue_meds'] = item_codes.str.contains(
        rescue_pattern, na=False).astype(int)

    return df


def filter_data(data, date):
    """
    Filter data to only include rescue med prescriptions occurring
    on or after the index date
    --------
    :param data: dataframe with PRESC_DATE and rescue_meds columns
    :param date: Index date in 'DD-MM-YYYY' format (any other value pandas
    can parse, e.g. a Timestamp or an ISO 'YYYY-MM-DD' string, also works)
    :return: filtered dataframe with PRESC_DATE as datetime; the dataframe
    passed in is left unmodified
    """
    # Parse the index date day-first, as documented. Comparing the column
    # with the raw string lets pandas read it month-first, so '02-01-2020'
    # would be taken as 1 February instead of 2 January.
    try:
        index_date = pd.to_datetime(date, format='%d-%m-%Y')
    except (ValueError, TypeError):
        # Not a DD-MM-YYYY string: fall back to pandas' general parsing
        index_date = pd.to_datetime(date)

    # assign() builds a new frame, so the caller's dataframe is not altered
    data = data.assign(PRESC_DATE=pd.to_datetime(data['PRESC_DATE']))

    on_or_after_index = data['PRESC_DATE'] >= index_date
    is_rescue_med = data['rescue_meds'] == 1
    return data[on_or_after_index & is_rescue_med]


def calculate_time_to_first_exacerbation(data, date):
    """
    Calculate days to first exacerbation
    --------
    :param data: dataframe with SafeHavenID and datetime PRESC_DATE columns
    :param date: Index date in 'DD-MM-YYYY' format (any other value pandas
    can parse, e.g. a Timestamp or an ISO 'YYYY-MM-DD' string, also works)
    :return: dataframe indexed by SafeHavenID with the columns first_exac
    (earliest PRESC_DATE), index_date and days_to_first_exac (whole days
    from the index date to first_exac)
    """
    # Parse the index date day-first, as documented. pandas' default parsing
    # reads 'DD-MM-YYYY' month-first, so '02-01-2020' would become 1 February.
    try:
        index_date = pd.to_datetime(date, format='%d-%m-%Y')
    except (ValueError, TypeError):
        # Not a DD-MM-YYYY string: fall back to pandas' general parsing
        index_date = pd.to_datetime(date)

    # Earliest prescription date per ID. The string alias 'min' selects the
    # built-in groupby reduction directly rather than passing np.min.
    first_exac = data.groupby('SafeHavenID').agg(
        first_exac=('PRESC_DATE', 'min'))

    first_exac['index_date'] = index_date
    # Keeps the column datetime-typed even when there are no rows
    first_exac['index_date'] = pd.to_datetime(first_exac['index_date'])
    first_exac['days_to_first_exac'] = (
        first_exac['first_exac'] - first_exac['index_date']).dt.days
    return first_exac


def calculate_exac_count_1_year(data, year_censor, first_exac_df):
    """
    Calculate the number of exacerbations in the year following the index date
    and join this data to the time to first exacerbation data for each ID
    --------
    :param data: dataframe containing exacerbation dates (based on rescue meds)
    :param year_censor: date 1 year following Index date 'DD-MM-YYYY' format
    (any other value pandas can parse also works); prescriptions dated on or
    after it are not counted
    :param first_exac_df: dataframe showing days to first exacerbations for IDs
    :return: first_exac_df joined with an integer column
    exac_count_year_post_index holding, for each ID, the number of distinct
    prescription dates before year_censor (0 when there are none)
    """
    # Parse the censor date day-first, as documented. Comparing the column
    # with the raw string lets pandas read it month-first, so '02-01-2021'
    # would be taken as 1 February instead of 2 January.
    try:
        censor_date = pd.to_datetime(year_censor, format='%d-%m-%Y')
    except (ValueError, TypeError):
        # Not a DD-MM-YYYY string: fall back to pandas' general parsing
        censor_date = pd.to_datetime(year_censor)

    presc_year = data[data['PRESC_DATE'] < censor_date]
    year_exac_count = presc_year.groupby('SafeHavenID').agg(
        exac_count_year_post_index=('PRESC_DATE', 'nunique'))
    all_exac_data = pd.merge(year_exac_count, first_exac_df,
                             on="SafeHavenID", how="outer")

    # IDs with no prescription before the censor date come out of the outer
    # merge as NaN; report them as 0 and keep the count an integer
    all_exac_data['exac_count_year_post_index'] = (
        all_exac_data['exac_count_year_post_index'].fillna(0).astype(int))
    return all_exac_data


def main():
    """
    Run the whole pipeline: load the prescribing data, flag rescue meds,
    keep the exacerbation events from the index date onwards, summarise
    them per ID and write both results to the output folder
    """
    index_date = '01-01-2020'
    one_year_after_index = '01-01-2021'

    # Load the prescribing extract
    prescriptions = initialize_presc_data(
        input_file_path + 'Pharmacy_Cohort3R.csv')

    # Flag rescue med prescriptions
    prescriptions = track_medication(prescriptions)

    # Keep exacerbation events (rescue med prescriptions) from the index date
    exacerbations = filter_data(prescriptions, index_date)

    # Days from the index date to the first exacerbation of each ID
    time_to_first = calculate_time_to_first_exacerbation(
        exacerbations, index_date)

    # Join each ID's exacerbation count for the follow up year to that data
    cohort_summary = calculate_exac_count_1_year(
        exacerbations, one_year_after_index, time_to_first)

    # Save data
    exacerbations.to_csv(
        output_file_path + 'all_exacerbations_from_index_date.csv')
    cohort_summary.to_pickle(
        output_file_path
        + 'community_managed_exacerbations_cohort_summary.pkl')


# Run the pipeline.
# NOTE(review): this call is unguarded, so importing this module also runs
# main(); consider an `if __name__ == '__main__':` guard if the module is
# ever imported rather than executed as a script.
main()