# File size: 5,274 Bytes

# 53a6def

"""
Process SMR01 comorbidities data
--------
Clean and process comorbidities, tracking specific comorbidities and returning
the total number of comorbidities per patient per year
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import track_event
from utils.adm_common import initialize_adm_data, correct_stays
from utils.comorb_processing import diagnosis_mapping_lists


def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Flag each comorbidity found in the admission diagnosis descriptions.
    One 0/1 column is added to df per entry of the diagnosis mapping.
    --------
    :param df: pandas dataframe containing the DIAG1Desc..DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the input dataframe, modified in place, with the new columns
    """
    print('Tracking comorbidities')

    # Diagnosis lookup, one entry per comorbidity
    lookup = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # The six diagnosis description columns of an admission record
    desc_cols = ['DIAG{}Desc'.format(pos) for pos in range(1, 7)]
    descriptions = df[desc_cols]

    for comorbidity in lookup:
        # track_event is applied to each description column in turn; a row
        # is flagged when any of its columns comes back truthy
        hits = descriptions.apply(
            track_event, args=(lookup[comorbidity], False))
        df[comorbidity] = hits.any(axis=1).astype(int)

    return df


def fill_comorbidities(df, diag_names):
    """
    Carry comorbidity flags forward through the rows of a dataframe
    --------
    Once a comorbidity has been flagged (1) on a row, every later row is
    flagged too; rows before the first flag keep their 0. Rows are processed
    in their existing order, so they must already be sorted chronologically.

    :param df: dataframe of groupby values
    :param diag_names: list of diagnoses (columns holding 0/1 flags)
    :return: the same dataframe with the flag columns updated in place
    """
    # A running maximum over 0/1 flags is exactly "replace each 0 with the
    # last preceding value", without relying on the `method` argument of
    # DataFrame.replace, which pandas has deprecated
    df[diag_names] = df[diag_names].cummax()

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    Each row gets the configured end-of-year month/day in the calendar year
    of its dt_col value, or in the following year when the dt_col value is
    later than that date.

    :param df: dataframe
    :param dt_col: date column in dataframe (datetime dtype)
    :param eoy_date: EOY date from config; only its month and day are used
    :return: copy of df with a fresh 0..n-1 index and a datetime 'eoy' column
    """
    # Work on a re-indexed copy so the new column lines up row for row
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)

    # EOY in the same calendar year as each row's date, assembled in one
    # vectorised call instead of building Python date objects row by row
    eoy_parts = pd.DataFrame({'year': df[dt_col].dt.year,
                              'month': end_date.month,
                              'day': end_date.day})
    eoy = pd.to_datetime(eoy_parts)

    # Rows dated after that EOY belong to the period ending a year later
    after_eoy = df[dt_col] > eoy
    df['eoy'] = eoy.where(~after_eoy, eoy + pd.DateOffset(years=1))

    return df


def add_yearly_stats(df):
    """
    Count comorbidities per patient per year
    --------
    :param df: dataframe with SafeHavenID, eoy, admission detail columns and
        the columns to be counted
    :return: dataframe indexed by (SafeHavenID, eoy) with a single
        'comorb_per_year' column
    """
    print('Adding comorbidity count per year')

    # Admission detail columns play no part in the count
    unwanted = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP']
    unwanted += ['DIAG{}Desc'.format(pos) for pos in range(1, 7)]
    remaining = df.drop(columns=unwanted)

    # Last non-null value of every remaining column per patient and EOY
    latest = remaining.groupby(['SafeHavenID', 'eoy']).last()

    # Row-wise total across all remaining columns
    totals = latest.sum(axis=1)

    return totals.to_frame(name='comorb_per_year')


def main():
    """
    Run the SMR01 comorbidity processing from extract to saved output
    --------
    Reads config.json, loads the admissions extract, flags comorbidities,
    carries them forward per patient, counts them per patient per EOY period
    and pickles the result to the model data path.
    """
    # Read configuration
    with open('../../../config.json') as config_handle:
        config = json.load(config_handle)

    # Load the admissions extract
    admissions = initialize_adm_data(
        config['extract_data_path'] + 'SMR01_Cohort3R.csv')

    # Fill null STAY data and combine transfer admissions
    admissions = correct_stays(admissions)

    # Prepare text data - strip whitespace from string columns
    def strip_text(column):
        return column.str.strip() if column.dtype == 'object' else column

    admissions = admissions.apply(strip_text)

    # Flag comorbidities using the diagnosis mapping spreadsheet
    excel_file = "mappings/Comorbidity feature review for models & clin " \
                 "summary update v2 May 2021.xlsx"
    sheet_name = 'Diagnosis category mapping3'
    diag_names = [
        'Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc', 'cog_imp',
        'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec', 'hypertension',
        'Cerebrovascula_dis', 'Diabetes_mel', 'Osteoporosis', 'frailty',
        'liver_dis', 'metastat_canc', 'headneck_canc', 'breast_canc',
        'gi_canc', 'other_canc', 'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
        'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum', 'DVT_PTE',
        'Lung_cancer', 'Bronchiectasis', 'Resp_fail',
    ]
    flagged = track_comorbidity(admissions, excel_file, sheet_name,
                                diag_names)

    # Order admissions by admission date so flags are carried forward in
    # chronological order
    flagged = flagged.sort_values('ADMDATE')
    flagged = flagged.reset_index(drop=True)

    # Carry each patient's comorbidity flags forward through later admissions
    # NOTE(review): recent pandas versions warn when groupby.apply operates
    # on the grouping column - confirm SafeHavenID is still present in the
    # result on the pandas version in use
    print('Filling comorbidities')
    per_patient = flagged.groupby('SafeHavenID')
    filled = per_patient.apply(fill_comorbidities, diag_names)

    # Add column relative to user-specified date
    with_eoy = add_eoy_column(filled, 'ADMDATE', config['date'])

    # Count comorbidities per patient per EOY period
    yearly = add_yearly_stats(with_eoy)

    # Save data
    yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')


# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse its functions) does not read the config or write any output
if __name__ == '__main__':
    main()