"""
Process SMR01 comorbidities data
--------
Clean and process comorbidities, tracking specific comorbidities and returning
the total number of comorbidities per patient per year
"""
import json
from datetime import date

import pandas as pd
from dateutil.relativedelta import relativedelta

from utils.common import track_event
from utils.adm_common import initialize_adm_data, correct_stays
from utils.comorb_processing import diagnosis_mapping_lists


def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Map from admission descriptions to comorbidities using provided sheet.
    Add new column for each comorbidity.
    --------
    :param df: pandas dataframe containing the DIAG1Desc-DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the same dataframe (modified in place) with one 0/1 integer
        column added per key of the diagnosis mapping
    """
    print('Tracking comorbidities')

    # Load in mappings
    mapping = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # Select relevant columns
    diag_columns = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
                    'DIAG5Desc', 'DIAG6Desc']
    df_diag = df[diag_columns]

    # Create column for each comorbidity
    for key in mapping:
        com = mapping[key]
        # track_event is applied to each diagnosis column in turn
        # (presumably yielding a boolean per row - confirm in utils.common)
        com_bool = df_diag.apply(lambda x: track_event(x, com, False))
        # A row is flagged when any of the six diagnosis columns is truthy
        df[key] = com_bool.any(axis=1).astype(int)

    return df


def fill_comorbidities(df, diag_names):
    """
    Fill comorbidities: replace every 0 in the diagnosis columns with the
    most recent non-zero value from an earlier row, so that a comorbidity,
    once recorded, is carried forward to all later rows. Zeros that precede
    the first non-zero value are left as 0.
    --------
    :param df: dataframe of groupby values (rows for a single patient,
        already in chronological order)
    :param diag_names: list of diagnoses
    :return: updated dataframe (the input is modified in place)
    """
    flags = df[diag_names]

    # DataFrame.replace(to_replace=0, method='ffill') is deprecated in recent
    # pandas, so the same result is spelled out: blank the zeros, forward
    # fill, then put zeros back where nothing preceded them.
    # NOTE(review): assumes the diagnosis columns contain no missing values
    # (they are 0/1 integers when produced by track_comorbidity).
    carried = flags.mask(flags == 0).ffill().fillna(0)

    # Masking upcasts integers to float; restore the original column dtypes
    df[diag_names] = carried.astype(flags.dtypes.to_dict())

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date. Each row receives the first
    occurrence of the configured month/day that is on or after the row's
    date in dt_col.
    --------
    :param df: dataframe
    :param dt_col: date column in dataframe (datetime64 dtype)
    :param eoy_date: EOY date from config; only its month and day are used
    :return: updated df with EOY column added (datetime64) and index reset
    """
    # Needed to stop error with creating a new column
    df = df.reset_index(drop=True)

    # Month and day of the user-specified end of year date
    end_date = pd.to_datetime(eoy_date)

    # Assemble the EOY date in the same calendar year as each row's date.
    # Building it as datetime64 keeps the comparison below between like
    # types, rather than between datetime64 and datetime.date objects.
    parts = pd.DataFrame({'year': df[dt_col].dt.year,
                          'month': end_date.month,
                          'day': end_date.day})
    eoy = pd.to_datetime(parts)

    # Rows dated after that year's EOY belong to the following year's EOY
    after_eoy = df[dt_col] > eoy
    df['eoy'] = eoy.where(~after_eoy, eoy + pd.DateOffset(years=1))

    return df


def add_yearly_stats(df):
    """
    Sum comorbidities per patient per year
    --------
    :param df: dataframe to update
    :return: dataframe indexed by (SafeHavenID, eoy) with a single column
        'comorb_per_year' holding the row-wise sum of all remaining columns
    """
    print('Adding comorbidity count per year')

    # Drop cols not required anymore
    cols_2_drop = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP', 'DIAG1Desc',
                   'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc', 'DIAG5Desc',
                   'DIAG6Desc']
    df = df.drop(cols_2_drop, axis=1)

    # Sum comorbidities: take the last entry per patient per year, then add
    # up every remaining column of that row
    yearly = df.groupby(['SafeHavenID', 'eoy']).last().sum(axis=1)

    return yearly.to_frame('comorb_per_year')


def main():
    """
    Run the comorbidity pipeline: load the SMR01 admissions extract, flag
    comorbidities, carry them forward per patient, count them per patient
    per year and pickle the result to the configured model data path.
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = initialize_adm_data(adm_file)

    # Fill null STAY data and combine transfer admissions
    adm = correct_stays(adm)

    # Prepare text data - strip string columns
    adm = adm.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    # Track comorbidities
    excel_file = "mappings/Comorbidity feature review for models & clin " \
                 "summary update v2 May 2021.xlsx"
    sheet_name = 'Diagnosis category mapping3'
    diag_names = ['Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc',
                  'cog_imp', 'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec',
                  'hypertension', 'Cerebrovascula_dis', 'Diabetes_mel',
                  'Osteoporosis', 'frailty', 'liver_dis', 'metastat_canc',
                  'headneck_canc', 'breast_canc', 'gi_canc', 'other_canc',
                  'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
                  'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum',
                  'DVT_PTE', 'Lung_cancer', 'Bronchiectasis', 'Resp_fail']
    adm_comorb = track_comorbidity(adm, excel_file, sheet_name, diag_names)

    # Order admissions chronologically so the forward fill below carries
    # earlier diagnoses onto later admissions
    adm_comorb = adm_comorb.sort_values('ADMDATE').reset_index(drop=True)

    # Forward fill comorb cols within each patient
    print('Filling comorbidities')
    adm_filled = adm_comorb.groupby('SafeHavenID').apply(
        fill_comorbidities, diag_names)

    # Add column relative to user-specified date
    adm_filled = add_eoy_column(adm_filled, 'ADMDATE', config['date'])

    # Add yearly stats
    adm_yearly = add_yearly_stats(adm_filled)

    # Save data
    adm_yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')


# NOTE(review): this call also runs when the module is imported; consider
# guarding it with `if __name__ == '__main__':` once no caller relies on that.
main()