| | """ |
| | Process SMR01 comorbidities data |
| | -------- |
| | Clean and process comorbidities, tracking specific comorbidities and returning |
| | the total number of comorbidities per patient per year |
| | """ |
| | import json |
| | import pandas as pd |
| | from datetime import date |
| | from dateutil.relativedelta import relativedelta |
| | from utils.common import track_event |
| | from utils.adm_common import initialize_adm_data, correct_stays |
| | from utils.comorb_processing import diagnosis_mapping_lists |
| |
|
| |
|
def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Flag every comorbidity on each admission row.
    One new column is added to df per entry of the diagnosis mapping.
    --------
    :param df: pandas dataframe containing the DIAG1Desc..DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the same dataframe, updated in place with one 0/1 integer
        column per mapping key
    """
    print('Tracking comorbidities')

    # Mapping built by the project helper; iterated as name -> lookup values
    # (presumably a dict keyed by comorbidity name - confirm against
    # utils.comorb_processing).
    comorb_lookup = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # Only the six diagnosis description columns are searched
    desc_cols = [
        'DIAG1Desc',
        'DIAG2Desc',
        'DIAG3Desc',
        'DIAG4Desc',
        'DIAG5Desc',
        'DIAG6Desc',
    ]
    descriptions = df[desc_cols]

    for comorb_name, lookup_values in comorb_lookup.items():
        # track_event is applied to each description column in turn; a row is
        # flagged when any of its six columns comes back truthy.
        matches = descriptions.apply(
            lambda col, values=lookup_values: track_event(col, values, False))
        df[comorb_name] = matches.any(axis=1).astype(int)

    return df
| |
|
| |
|
def fill_comorbidities(df, diag_names):
    """
    Fill comorbidities forward in row order: once a flag is non-zero, every
    later zero in that column takes the most recent non-zero value.
    Zeros that come before the first non-zero value stay zero.
    --------
    :param df: dataframe of groupby values (rows already in the order the
        fill should follow)
    :param diag_names: list of diagnoses (column names to fill)
    :return: updated dataframe (the diag_names columns are overwritten on
        the dataframe that was passed in)
    """
    flags = df[diag_names]

    # Equivalent to replace(to_replace=0, method='ffill'), written without the
    # `method` keyword, which pandas has deprecated since 2.1.
    # Assumes the flag columns contain no missing values.
    is_zero = flags == 0
    carried = flags.mask(is_zero).ffill().fillna(0)

    # Masking goes through NaN and therefore float; restore original dtypes.
    df[diag_names] = carried.astype(flags.dtypes.to_dict())

    return df
| |
|
| |
|
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date.
    Each row gets the first end-of-year date that falls on or after its
    dt_col value; only the month and day of eoy_date are used.
    --------
    :param df: dataframe
    :param dt_col: date column in dataframe (must support the .dt accessor)
    :param eoy_date: EOY date from config (anything pd.to_datetime can parse)
    :return: copy of df with a fresh 0..n-1 index and a datetime 'eoy'
        column added
    """
    # Fresh positional index so the derived series align row by row
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)

    # EOY in the same calendar year as each row, assembled directly as
    # datetimes. Keeping everything datetime64 avoids comparing Timestamps
    # with datetime.date objects, which pandas 2.x rejects with a TypeError.
    eoy = pd.to_datetime(pd.DataFrame({
        'year': df[dt_col].dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))

    # Rows dated after that year's EOY belong to the following year's EOY
    past_eoy = df[dt_col] > eoy
    df['eoy'] = eoy.where(~past_eoy, eoy + pd.DateOffset(years=1))

    return df
| |
|
| |
|
def add_yearly_stats(df):
    """
    Count comorbidities per patient per year.
    --------
    :param df: dataframe with SafeHavenID, eoy, the admission/diagnosis
        columns listed below and the comorbidity flag columns
    :return: dataframe indexed by (SafeHavenID, eoy) with a single
        'comorb_per_year' column
    """
    print('Adding comorbidity count per year')

    # Admission detail columns that must not take part in the row sum
    # (each name listed once; dropping a label twice has no extra effect)
    non_flag_cols = [
        'ADMDATE',
        'DISDATE',
        'STAY',
        'ETHGRP',
        'DIAG1Desc',
        'DIAG2Desc',
        'DIAG3Desc',
        'DIAG4Desc',
        'DIAG5Desc',
        'DIAG6Desc',
    ]
    remaining = df.drop(columns=non_flag_cols)

    # Keep the last value of every remaining column within each
    # patient/year group, then add those values up across the row
    latest_per_year = remaining.groupby(['SafeHavenID', 'eoy']).last()
    yearly_total = latest_per_year.sum(axis=1)

    return yearly_total.rename('comorb_per_year').to_frame()
| |
|
| |
|
def main():
    """
    Run the comorbidity pipeline: load the SMR01 admissions extract, flag
    comorbidities, carry them forward per patient, attach the end-of-year
    date, total the flags per patient per year and pickle the result.
    """
    # Load in config items
    with open('../../../config.json') as config_handle:
        settings = json.load(config_handle)

    # Read the admissions extract and correct its stays
    source_csv = settings['extract_data_path'] + 'SMR01_Cohort3R.csv'
    admissions = correct_stays(initialize_adm_data(source_csv))

    # Strip surrounding whitespace from every object-dtype column
    def _strip_if_text(column):
        if column.dtype == 'object':
            return column.str.strip()
        return column

    admissions = admissions.apply(_strip_if_text)

    # Diagnosis mapping workbook and the comorbidities to track
    excel_file = (
        "mappings/Comorbidity feature review for models & clin "
        "summary update v2 May 2021.xlsx"
    )
    sheet_name = 'Diagnosis category mapping3'
    diag_names = [
        'Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc',
        'cog_imp', 'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec',
        'hypertension', 'Cerebrovascula_dis', 'Diabetes_mel',
        'Osteoporosis', 'frailty', 'liver_dis', 'metastat_canc',
        'headneck_canc', 'breast_canc', 'gi_canc', 'other_canc',
        'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
        'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum',
        'DVT_PTE', 'Lung_cancer', 'Bronchiectasis', 'Resp_fail',
    ]
    flagged = track_comorbidity(admissions, excel_file, sheet_name, diag_names)

    # Chronological order is required before filling forward
    flagged = flagged.sort_values('ADMDATE')
    flagged = flagged.reset_index(drop=True)

    # Carry each comorbidity forward within a patient's admissions
    print('Filling comorbidities')
    per_patient = flagged.groupby('SafeHavenID')
    filled = per_patient.apply(fill_comorbidities, diag_names)

    # Attach the end-of-year date relative to the configured date
    with_eoy = add_eoy_column(filled, 'ADMDATE', settings['date'])

    # Total comorbidities per patient per year
    yearly = add_yearly_stats(with_eoy)

    # Save the result
    yearly.to_pickle(settings['model_data_path'] + 'comorb_proc.pkl')
| |
|
| |
|
# Run the pipeline. NOTE: there is no __main__ guard, so this also executes
# whenever the module is imported.
main()
| |
|