# copd-model-e / training / src / processing / process_comorbidities.py
# IamGrooooot
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
"""
Process SMR01 comorbidities data
--------
Clean and process comorbidities, tracking specific comorbidities and returning
the total number of comorbidities per patient per year
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import track_event
from utils.adm_common import initialize_adm_data, correct_stays
from utils.comorb_processing import diagnosis_mapping_lists
def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Flag comorbidities found in the admission diagnosis descriptions.
    One integer (0/1) column is written onto ``df`` for every key of the
    diagnosis mapping.
    --------
    :param df: pandas dataframe containing the DIAG1Desc..DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the input dataframe with one added column per mapping key
    """
    print('Tracking comorbidities')

    # Mapping from comorbidity name to the terms that identify it
    lookup = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # Only the diagnosis description columns are searched
    descriptions = df[['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc',
                       'DIAG4Desc', 'DIAG5Desc', 'DIAG6Desc']]

    for name in lookup:
        terms = lookup[name]

        def matches(column, terms=terms):
            # presumably a boolean per admission for this column -- the
            # exact semantics live in utils.common.track_event
            return track_event(column, terms, False)

        hits = descriptions.apply(matches)
        # An admission carries the comorbidity if any description matched
        df[name] = hits.any(axis=1).astype(int)

    return df
def fill_comorbidities(df, diag_names):
    """
    Fill comorbidities forward
    --------
    Every 0 in the ``diag_names`` columns is replaced by the closest
    preceding non-zero value of the same column, so a flag stays set on all
    later rows once it has been raised. Zeros with no non-zero value above
    them stay 0. Missing values are treated like 0. Column dtypes are kept.

    :param df: dataframe of groupby values (rows in chronological order)
    :param diag_names: list of diagnoses
    :return: updated dataframe (``df`` itself, modified in place)
    """
    flags = df[diag_names]
    # ``DataFrame.replace(to_replace=0, method='ffill')`` relies on the
    # ``method`` keyword, deprecated in pandas 2.1 and dropped afterwards.
    # Blank out the zeros, forward fill, then restore the leading zeros.
    filled = flags.mask(flags.eq(0)).ffill().fillna(0)
    # Masking upcasts integer flags to float; cast back to the input dtypes
    df[diag_names] = filled.astype(flags.dtypes.to_dict())
    return df
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    Each row gets the first occurrence of the end-of-year month/day that
    falls on or after the row's ``dt_col`` date: the EOY of the same
    calendar year, or of the following year when the date is already past
    it.

    :param df: dataframe
    :param dt_col: date column in dataframe
    :param eoy_date: EOY date from config (only month and day are used)
    :return: copy of df with a fresh 0..n-1 index and a datetime 'eoy' column
    """
    # A fresh index avoids alignment problems when adding the new column
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    event_dates = pd.to_datetime(df[dt_col])

    # EOY in the calendar year of each row, built directly as datetime64 so
    # the comparison below never mixes Timestamps with datetime.date objects
    # (pandas 2 raises a TypeError on that ordering comparison)
    eoy = pd.to_datetime(pd.DataFrame({
        'year': event_dates.dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))

    # Rows dated after that year's EOY belong to the following EOY
    after_eoy = event_dates > eoy
    df['eoy'] = eoy.where(~after_eoy, eoy + pd.DateOffset(years=1))
    return df
def add_yearly_stats(df):
    """
    Count comorbidities per patient per year
    --------
    Removes the admission-level columns, keeps the last entry of every
    remaining column per (SafeHavenID, eoy) group and sums those entries
    across columns.

    :param df: dataframe to update
    :return: single-column dataframe 'comorb_per_year' indexed by
        SafeHavenID and eoy
    """
    print('Adding comorbidity count per year')

    # Admission-level columns that must not contribute to the total
    admission_cols = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP',
                      'DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc',
                      'DIAG4Desc', 'DIAG5Desc', 'DIAG6Desc']
    flags_only = df.drop(columns=admission_cols)

    # Last entry per patient and year, then add up across the columns
    latest = flags_only.groupby(['SafeHavenID', 'eoy']).last()
    totals = latest.sum(axis=1)

    return totals.rename('comorb_per_year').to_frame()
def main():
    """
    Run the comorbidity processing pipeline
    --------
    Reads the config, loads and cleans the admissions extract, flags
    comorbidities per admission, carries them forward within each patient,
    counts them per patient per year and pickles the result to
    ``config['model_data_path'] + 'comorb_proc.pkl'``.
    """
    # Load in config items (JSON is UTF-8; do not rely on the platform
    # default encoding)
    with open('../../../config.json', encoding='utf-8') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = initialize_adm_data(adm_file)

    # Fill null STAY data and combine transfer admissions
    adm = correct_stays(adm)

    # Prepare text data - strip string columns
    adm = adm.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    # Track comorbidities
    excel_file = "mappings/Comorbidity feature review for models & clin " \
                 "summary update v2 May 2021.xlsx"
    sheet_name = 'Diagnosis category mapping3'
    diag_names = ['Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc',
                  'cog_imp', 'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec',
                  'hypertension', 'Cerebrovascula_dis', 'Diabetes_mel',
                  'Osteoporosis', 'frailty', 'liver_dis', 'metastat_canc',
                  'headneck_canc', 'breast_canc', 'gi_canc', 'other_canc',
                  'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
                  'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum',
                  'DVT_PTE', 'Lung_cancer', 'Bronchiectasis', 'Resp_fail']
    adm_comorb = track_comorbidity(adm, excel_file, sheet_name, diag_names)

    # Order admissions chronologically so the forward fill below runs from
    # each patient's earliest admission to the latest
    adm_comorb = adm_comorb.sort_values('ADMDATE').reset_index(drop=True)

    # Carry each comorbidity flag forward within every patient
    print('Filling comorbidities')
    adm_filled = adm_comorb.groupby('SafeHavenID').apply(
        fill_comorbidities, diag_names)

    # Add column relative to user-specified date
    adm_filled = add_eoy_column(adm_filled, 'ADMDATE', config['date'])

    # Add yearly stats
    adm_yearly = add_yearly_stats(adm_filled)

    # Save data
    adm_yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')
# Script entry point.
# NOTE(review): this call is unguarded, so importing the module also runs the
# whole pipeline -- consider an ``if __name__ == '__main__':`` guard if the
# module ever needs to be imported.
main()