| | |
| | import pandas as pd |
| | import numpy as np |
| |
|
| | |
# Directory holding the raw study extracts. File names are appended to this
# string directly, so the trailing slash must be kept.
input_file_path = "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/"

# Directory the summary pickles are written to (trailing slash required for
# the same reason).
output_file_path = "<YOUR_DATA_PATH>/summary_files/"

# Diagnosis text identifying a COPD admission. It is matched as a substring
# of each diagnosis description.
copd = "CHRONIC OBSTRUCTIVE PULMONARY DISEASE"
| |
|
| |
|
# Diagnosis descriptions treated as respiratory admissions. A diagnosis
# counts as respiratory only when it equals one of these entries exactly.
resp = [
    'PNEUMONITIS DUE TO FOOD AND VOMIT',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE UNSPECIFIED',
    'CHRONIC RESPIRATORY FAILURE; TYPE II [HYPERCAPNIC]',
    'BRONCHOPNEUMONIA, UNSPECIFIED',
    'DYSPNOEA',
    'PLEURAL EFFUSION IN CONDITIONS CLASSIFIED ELSEWHERE',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE [HYPERCAPNIC]',
    'PLEURAL EFFUSION, NOT ELSEWHERE CLASSIFIED',
    'CHRONIC RESPIRATORY FAILURE',
    'OTHER BACTERIAL PNEUMONIA',
    'ABN MICROBIOLOGICAL FINDINGS IN SPECS FROM RESPIRATORY ORGANS AND THORAX',
    'RESPIRATORY FAILURE, UNSPECIFIED',
    'PNEUMONIA, UNSPECIFIED',
    'LOBAR PNEUMONIA, UNSPECIFIED',
    'COUGH',
    'PLEURAL PLAQUE WITH PRESENCE OF ASBESTOS',
    'PLEURAL PLAQUE WITHOUT ASBESTOS',
    'OTHER DISORDERS OF LUNG',
    'OTHER SPECIFIED PLEURAL CONDITIONS',
    'PULMONARY COLLAPSE',
    'ACQUIRED ABSENCE OF LUNG [PART OF]',
    'ASPHYXIATION',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE [HYPOXIC]',
    'TRACHEOSTOMY STATUS',
    'ACUTE RESPIRATORY FAILURE',
    'UNSPECIFIED ACUTE LOWER RESPIRATORY INFECTION',
    'OTHER SPECIFIED SYMPTOMS AND SIGNS INVOLVING THE CIRC AND RESP SYSTEMS',
    'BACTERIAL PNEUMONIA, UNSPECIFIED',
    'PYOTHORAX WITHOUT FISTULA',
    'DISEASES OF BRONCHUS, NOT ELSEWHERE CLASSIFIED',
    'PNEUMONIA DUE TO HAEMOPHILUS INFLUENZAE',
    'ABNORMAL SPUTUM',
    'OTHER POSTPROCEDURAL RESPIRATORY DISORDERS',
    'OTHER AND UNSPECIFIED ABNORMALITIES OF BREATHING',
    'INFLUENZA WITH OTHER RESP MANIFESTATIONS, SEASONAL INFLUENZA VIRUS IDENTIF',
    'PERSONAL HISTORY OF DISEASES OF THE RESPIRATORY SYSTEM',
    'PNEUMONIA DUE TO STREPTOCOCCUS PNEUMONIAE',
    'WHEEZING',
    'CHEST PAIN ON BREATHING',
    'HAEMOPTYSIS',
    'INFLUENZA WITH OTHER MANIFESTATIONS, VIRUS NOT IDENTIFIED',
    'OTHER SPECIFIED RESPIRATORY DISORDERS',
    'ACUTE UPPER RESPIRATORY INFECTION, UNSPECIFIED',
    'T.B. OF LUNG, W/O MENTION OF BACTERIOLOGICAL OR HISTOLOGICAL CONFIRMATION',
    'DEPENDENCE ON RESPIRATOR',
    'PLEURISY',
    'BRONCHITIS, NOT SPECIFIED AS ACUTE OR CHRONIC',
]
| |
|
| |
|
def read_data(file, cols, types):
    """
    Load selected columns of a cp1252-encoded CSV file into a dataframe.
    --------
    :param file: path (or file-like object) handed to pandas.read_csv
    :param cols: list of column names to load
    :param types: list of dtypes, paired position by position with cols
    :return: dataframe holding only the requested columns
    """
    column_dtypes = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=column_dtypes,
        encoding="cp1252",
    )
| |
|
| |
|
def update_null_stay(df):
    """
    Fill missing 'STAY' values with the number of whole days between the
    admission and discharge dates.
    --------
    :param df: dataframe with 'STAY', 'ADMDATE' and 'DISDATE' columns; the
               two date columns must already be datetime-like. The frame is
               updated in place.
    :return: the same dataframe, with null 'STAY' entries replaced by
             (DISDATE - ADMDATE) in days as floats. Rows whose dates are
             themselves missing stay null.
    """
    is_null = df['STAY'].isnull()
    if is_null.any():
        # A boolean mask selects rows by label, so this works for any number
        # of null rows and for any index (e.g. after drop_duplicates has
        # left gaps in the row labels).
        stay = df.loc[is_null, 'DISDATE'] - df.loc[is_null, 'ADMDATE']
        df.loc[is_null, 'STAY'] = stay.dt.days.astype(float)

    return df
| |
|
| |
|
def calculate_total_stay(df):
    """
    Model A:
    Collapse chains of transfers into a single record per hospital stay.

    The input must hold the records of one patient, sorted by admission date
    then discharge date. A record is treated as a transfer when its admission
    date equals the discharge date of the record directly before it. For each
    transfer the admission date is replaced by the admission date of the
    previous record, the previous record's (already accumulated) STAY is added
    to it, and the previous record is dropped. Only the final record of each
    stay survives, carrying the first admission date and the total length of
    stay. Works for any number of consecutive transfers.
    --------
    :param df: dataframe with 'ADMDATE', 'DISDATE' and 'STAY' columns. It is
               modified in place: its index is reset to 0..n-1 and merged rows
               are removed.
    :return: the same dataframe, one row per stay. No 'transfer' column is
             left on the frame.
    """
    df.reset_index(inplace=True, drop=True)

    # True where a record starts on the day the previous record ended.
    transfer = df['ADMDATE'].eq(df['DISDATE'].shift())
    # The first record has no predecessor, so it can never be a transfer.
    transfer.iloc[:1] = False

    rows_to_drop = []
    # Walk the flagged rows in order so that a chain A -> B -> C accumulates:
    # by the time C is processed, B already carries A's admission date and
    # the combined stay of A and B.
    for index in transfer.index[transfer.to_numpy()]:
        previous = index - 1
        df.loc[index, 'ADMDATE'] = df.loc[previous, 'ADMDATE']
        df.loc[index, 'STAY'] = df.loc[index, 'STAY'] + df.loc[previous, 'STAY']
        rows_to_drop.append(previous)

    df.drop(rows_to_drop, inplace=True)

    return df
| |
|
| |
|
def track_copd_resp(df, track_type='both'):
    """
    Flag COPD and/or respiratory admissions.

    Each of the six diagnosis description columns is checked with
    track_feature; an admission is flagged when any of them matches.
    --------
    :param df: dataframe with columns 'DIAG1Desc' ... 'DIAG6Desc'; updated in
               place
    :param track_type: 'copd', 'resp' or 'both'. Any other value leaves the
                       dataframe unchanged.
    :return: the same dataframe with an integer (0/1) 'copd_event' and/or
             'resp_event' column added
    """
    diag_columns = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
                    'DIAG5Desc', 'DIAG6Desc']
    df_diag = df[diag_columns]

    def _flag_events(desc, single):
        # track_feature returns a plain list. Wrapping it in a Series that
        # carries the column's own index keeps the result aligned with df
        # even when df does not have a default 0..n-1 index.
        matches = df_diag.apply(
            lambda col: pd.Series(track_feature(col, desc, single),
                                  index=col.index)
        )
        return matches.any(axis=1).astype(int)

    if track_type in ('copd', 'both'):
        df['copd_event'] = _flag_events(copd, True)

    if track_type in ('resp', 'both'):
        df['resp_event'] = _flag_events(resp, False)

    return df
| |
|
| |
|
def track_feature(x, desc, single):
    """
    Compare every entry of x against a description or list of descriptions.
    Missing entries are treated as empty strings.
    -------
    :param x: pandas Series of strings (may contain nulls)
    :param desc: a single description string when single is True, otherwise
                 a collection of description strings
    :param single: if True an entry matches when desc occurs inside it
                   (substring test); if False an entry matches when it is a
                   member of desc
    :return: list of booleans, one per entry of x
    """
    values = x.fillna('')
    if single:
        return [desc in value for value in values]
    return [value in desc for value in values]
| |
|
| |
|
def filter_data(data, date):
    """
    Keep only COPD or respiratory admission events that occur on or after the
    index date.
    --------
    :param data: dataframe with 'ADMDATE', 'copd_event' and 'resp_event'
                 columns. Its 'ADMDATE' column is converted to datetime in
                 place.
    :param date: index date. Strings are read day-first, matching the
                 'DD-MM-YYYY' format documented for the other index-date
                 helpers in this file; datetime-like values are used as is.
    :return: new dataframe (an independent copy) holding the rows with
             ADMDATE >= date that have copd_event == 1 or resp_event == 1
    """
    data['ADMDATE'] = pd.to_datetime(data['ADMDATE'])
    # Parse explicitly: comparing against the raw string would let pandas
    # read an ambiguous date such as '01-02-2020' month-first.
    index_date = pd.to_datetime(date, dayfirst=True)

    on_or_after_index = data['ADMDATE'] >= index_date
    is_event = (data['copd_event'] == 1) | (data['resp_event'] == 1)

    # Return a real copy so that callers can add columns to the result
    # without pandas warning about writing to a slice.
    return data[on_or_after_index & is_event].copy()
| |
|
| |
|
def calculate_time_to_first_copd_admission(data, date):
    """
    Calculate days to first COPD admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE' (datetime) and
                 'copd_event' columns
    :param date: Index date in 'DD-MM-YYYY' format (read day-first);
                 datetime-like values are also accepted
    :return: dataframe indexed by SafeHavenID with columns
             'first_copd_admission' (earliest ADMDATE among rows with
             copd_event == 1), 'index_date' and
             'days_to_first_copd_admission'. IDs without a COPD event are
             absent.
    """
    copd_data = data[data['copd_event'] == 1]
    # The string alias 'min' is used because passing np.min to groupby.agg
    # is deprecated in recent pandas.
    first_copd_admission = copd_data.groupby('SafeHavenID').agg(
        first_copd_admission=('ADMDATE', 'min')
    )
    # dayfirst=True honours the documented 'DD-MM-YYYY' format; the default
    # would read an ambiguous date such as '02-01-2020' as 1 February.
    first_copd_admission['index_date'] = pd.to_datetime(date, dayfirst=True)
    # Normalise the column dtype (keeps it datetime even for an empty frame).
    first_copd_admission['index_date'] = pd.to_datetime(
        first_copd_admission['index_date']
    )
    first_copd_admission['days_to_first_copd_admission'] = (
        first_copd_admission['first_copd_admission']
        - first_copd_admission['index_date']
    ).dt.days
    return first_copd_admission
| |
|
| |
|
def calculate_time_to_first_resp_admission(data, date):
    """
    Calculate days to first resp admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE' (datetime) and
                 'resp_event' columns
    :param date: Index date in 'DD-MM-YYYY' format (read day-first);
                 datetime-like values are also accepted
    :return: dataframe indexed by SafeHavenID with columns
             'first_resp_admission' (earliest ADMDATE among rows with
             resp_event == 1), 'index_date' and
             'days_to_first_resp_admission'. IDs without a resp event are
             absent.
    """
    resp_data = data[data['resp_event'] == 1]
    # The string alias 'min' is used because passing np.min to groupby.agg
    # is deprecated in recent pandas.
    first_resp_admission = resp_data.groupby('SafeHavenID').agg(
        first_resp_admission=('ADMDATE', 'min')
    )
    # dayfirst=True honours the documented 'DD-MM-YYYY' format; the default
    # would read an ambiguous date such as '02-01-2020' as 1 February.
    first_resp_admission['index_date'] = pd.to_datetime(date, dayfirst=True)
    # Normalise the column dtype (keeps it datetime even for an empty frame).
    first_resp_admission['index_date'] = pd.to_datetime(
        first_resp_admission['index_date']
    )
    first_resp_admission['days_to_first_resp_admission'] = (
        first_resp_admission['first_resp_admission']
        - first_resp_admission['index_date']
    ).dt.days
    return first_resp_admission
| |
|
| |
|
def calculate_time_to_first_copd_or_resp_admission(data, date):
    """
    Calculate days to first copd or resp admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE', 'copd_event' and
                 'resp_event' columns. A 'copd_or_resp_event' column
                 (resp_event | copd_event) is added to it in place.
    :param date: Index date in 'DD-MM-YYYY' format (read day-first);
                 datetime-like values are also accepted
    :return: dataframe indexed by SafeHavenID with columns
             'first_copd_or_resp_admission' (earliest ADMDATE among rows
             with either event), 'index_date' and
             'days_to_first_copd_or_resp_admission'. IDs without any event
             are absent.
    """
    # Written onto the caller's frame on purpose: the combined flag is part
    # of this function's visible effect.
    data['copd_or_resp_event'] = (data['resp_event'] | data['copd_event'])
    resp_copd_data = data[data['copd_or_resp_event'] == 1]
    # The string alias 'min' is used because passing np.min to groupby.agg
    # is deprecated in recent pandas.
    first_resp_or_copd_admission = resp_copd_data.groupby('SafeHavenID').agg(
        first_copd_or_resp_admission=('ADMDATE', 'min')
    )
    # dayfirst=True honours the documented 'DD-MM-YYYY' format; the default
    # would read an ambiguous date such as '02-01-2020' as 1 February.
    first_resp_or_copd_admission['index_date'] = pd.to_datetime(
        date, dayfirst=True
    )
    # Normalise both date columns to datetime before subtracting.
    first_resp_or_copd_admission['index_date'] = pd.to_datetime(
        first_resp_or_copd_admission['index_date']
    )
    first_resp_or_copd_admission['first_copd_or_resp_admission'] = pd.to_datetime(
        first_resp_or_copd_admission['first_copd_or_resp_admission']
    )
    first_resp_or_copd_admission['days_to_first_copd_or_resp_admission'] = (
        first_resp_or_copd_admission['first_copd_or_resp_admission']
        - first_resp_or_copd_admission['index_date']
    ).dt.days
    return first_resp_or_copd_admission
| |
|
| |
|
def calculate_ad_count_1_year(data, year_censor, first_admission_df, adm_col):
    """
    Count the admissions of one kind that occur before the censor date and
    join the counts to the time-to-first-admission data for each ID.
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE' (datetime) and the
                 adm_col column
    :param year_censor: date 1 year following the index date, in 'DD-MM-YYYY'
                        format (read day-first); datetime-like values are
                        also accepted
    :param first_admission_df: dataframe showing days to first admission,
                               keyed by SafeHavenID
    :param adm_col: name of the binary column showing whether an admission
                    was copd or respiratory related
    :return: outer merge of first_admission_df with a new
             'admission_count_year_post_index' column: the sum of adm_col
             over the rows with ADMDATE strictly before year_censor. IDs with
             no such rows get a count of 0.
    """
    # Parse explicitly: comparing against the raw string would let pandas
    # read an ambiguous date such as '10-01-2021' month-first.
    censor_date = pd.to_datetime(year_censor, dayfirst=True)
    admission_year = data[data['ADMDATE'] < censor_date]
    year_admission_count = admission_year.groupby('SafeHavenID').agg(
        admission_count_year_post_index=(adm_col, 'sum')
    )
    all_admissions_data = pd.merge(
        year_admission_count, first_admission_df,
        on="SafeHavenID", how="outer"
    )
    # IDs that appear only in first_admission_df had no admissions before
    # the censor date.
    all_admissions_data['admission_count_year_post_index'] = (
        all_admissions_data['admission_count_year_post_index'].fillna(0)
    )
    return all_admissions_data
| |
|
| |
|
def main():
    """
    Build the COPD / respiratory admission summaries for the cohort.

    Reads the SMR01 admissions extract, merges transfers into single stays,
    flags COPD and respiratory admissions, keeps the events from the index
    date onwards, derives time-to-first-admission and one-year admission
    counts per ID, and writes the results as pickle files under
    output_file_path.
    """
    # Index date and the censor date one year later, 'DD-MM-YYYY'.
    index_date = '01-01-2020'
    year_censor = '01-01-2021'

    # Load the admissions data
    adm_file = input_file_path + "SMR01_Cohort3R.csv"
    adm_cols = ['SafeHavenID', 'ETHGRP', 'ADMDATE', 'DISDATE', 'DIAG1Desc',
                'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc', 'DIAG5Desc',
                'DIAG6Desc', 'STAY']
    # STAY is read as float: an 'int' column cannot hold missing values, so
    # read_csv would fail on exactly the rows update_null_stay is meant to
    # repair.
    adm_types = ['int', 'object', 'object', 'object', 'str', 'str', 'str',
                 'str', 'str', 'str', 'float']
    adm = read_data(adm_file, adm_cols, adm_types)

    # Remove duplicated records
    adm = adm.drop_duplicates()

    # Convert the date columns to datetime
    adm['ADMDATE'] = pd.to_datetime(adm['ADMDATE'])
    adm['DISDATE'] = pd.to_datetime(adm['DISDATE'])

    # Fill in any missing length of stay from the dates
    adm = update_null_stay(adm)

    # Merge transfers into single stays, one patient at a time. The groups
    # are iterated explicitly (rather than via groupby.apply) so that each
    # group still contains the SafeHavenID column on every pandas version,
    # and each group is copied because calculate_total_stay edits its input
    # in place.
    adm = adm.sort_values(['SafeHavenID', 'ADMDATE', 'DISDATE'])
    stays = [calculate_total_stay(group.copy())
             for _, group in adm.groupby('SafeHavenID')]
    if stays:
        adm = pd.concat(stays, ignore_index=True)

    # Strip surrounding whitespace from the text columns
    adm = adm.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    # Flag COPD and respiratory admissions
    adm = track_copd_resp(adm)

    # Keep only COPD / respiratory events from the index date onwards
    adm = filter_data(adm, index_date)

    # Days from the index date to the first admission of each kind
    first_copd_admission = calculate_time_to_first_copd_admission(
        adm, index_date)
    first_resp_admission = calculate_time_to_first_resp_admission(
        adm, index_date)
    # This call also adds the 'copd_or_resp_event' column to adm, which the
    # third count below relies on.
    first_resp_or_copd_admission = calculate_time_to_first_copd_or_resp_admission(
        adm, index_date)

    # Number of admissions of each kind before the censor date
    first_copd_admission = calculate_ad_count_1_year(
        adm, year_censor, first_copd_admission, 'copd_event')
    first_resp_admission = calculate_ad_count_1_year(
        adm, year_censor, first_resp_admission, 'resp_event')
    first_resp_or_copd_admission = calculate_ad_count_1_year(
        adm, year_censor, first_resp_or_copd_admission, 'copd_or_resp_event')

    # Save the outputs
    adm.to_pickle(output_file_path + 'all_COPD_and_resp_admissions_from_index_date.pkl')
    first_copd_admission.to_pickle(output_file_path + 'copd_admissions_cohort_summary.pkl')
    first_resp_admission.to_pickle(output_file_path + 'resp_admissions_cohort_summary.pkl')
    first_resp_or_copd_admission.to_pickle(output_file_path + 'copd_or_resp_admissions_cohort_summary.pkl')
|
| |
|
# Run the whole pipeline. NOTE(review): this call is unguarded, so it also
# runs when the file is imported; consider `if __name__ == "__main__":`.
main()
| |
|