copd-model-e / training /src /processing /process_comorbidities.py

Model E: Unsupervised PCA + clustering risk stratification

53a6def 3 days ago

5.27 kB

	"""
	Process SMR01 comorbidities data
	--------
	Clean and process comorbidities, tracking specific comorbidities and returning
	the total number of comorbidities per patient per year
	"""
	import json
	import pandas as pd
	from datetime import date
	from dateutil.relativedelta import relativedelta
	from utils.common import track_event
	from utils.adm_common import initialize_adm_data, correct_stays
	from utils.comorb_processing import diagnosis_mapping_lists


	def track_comorbidity(df, excel_file, sheet_name, diag_names):
	    """
	    Flag every comorbidity found in the admission diagnosis descriptions.
	    One 0/1 integer column is added to df for each key of the loaded mapping.
	    --------
	    :param df: pandas dataframe containing columns DIAG1Desc to DIAG6Desc
	    :param excel_file: str filename for diagnosis mapping
	    :param sheet_name: str sheet name for diagnosis mapping
	    :param diag_names: list of diagnoses
	    :return: the same dataframe object, with the comorbidity columns added
	    """
	    print('Tracking comorbidities')

	    # Mapping is indexed by comorbidity name; each entry is handed to
	    # track_event as the terms to look for (presumably description
	    # strings - confirm against utils.comorb_processing)
	    lookup = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

	    # Only the six diagnosis description columns are searched
	    desc_cols = ['DIAG{}Desc'.format(pos) for pos in range(1, 7)]
	    descriptions = df[desc_cols]

	    for name in lookup:
	        terms = lookup[name]

	        def matches(column, terms=terms):
	            # Applied column by column to the description columns
	            return track_event(column, terms, False)

	        hits = descriptions.apply(matches)

	        # A row is flagged when any of its description columns matched
	        df[name] = hits.any(axis=1).astype(int)

	    return df


	def fill_comorbidities(df, diag_names):
	    """
	    Carry comorbidity values forward down the rows of one group.
	    Each 0 in the diag_names columns is replaced by the closest non-zero,
	    non-missing value above it in the same column. Zeros with no such value
	    above them stay 0, and all other entries are left untouched.
	    --------
	    :param df: dataframe of groupby values, rows in the order to fill along
	    :param diag_names: list of diagnoses
	    :return: updated dataframe (df is modified in place and returned)
	    """
	    flags = df[diag_names]
	    is_zero = flags == 0

	    # Equivalent of replace(to_replace=0, method='ffill') without the
	    # `method` argument, which recent pandas versions deprecate: blank
	    # out the zeros, forward fill, and put 0 back where nothing preceded
	    carried = flags.mask(is_zero).ffill().fillna(0)
	    filled = flags.where(~is_zero, carried)

	    # Masking upcasts integer columns to float, so restore the input dtypes
	    df[diag_names] = filled.astype(flags.dtypes.to_dict())

	    return df


	def add_eoy_column(df, dt_col, eoy_date):
	    """
	    Add EOY relative to user-specified end date.
	    Each row gets the first occurrence of the EOY month/day that is on or
	    after the row's dt_col value.
	    --------
	    :param df: dataframe
	    :param dt_col: name of a datetime column in dataframe
	    :param eoy_date: EOY date from config (only its month and day are used)
	    :return: copy of df with a fresh 0..n-1 index and a datetime 'eoy' column
	    """
	    # A clean positional index keeps the new column aligned row by row
	    df = df.reset_index(drop=True)

	    # Only the month and day of the configured date matter
	    end_date = pd.to_datetime(eoy_date)

	    # EOY in the same calendar year as each entry. Kept as datetime64
	    # throughout so the comparison below never mixes datetime64 values
	    # with datetime.date objects
	    parts = pd.DataFrame({'year': df[dt_col].dt.year,
	                          'month': end_date.month,
	                          'day': end_date.day})
	    eoy = pd.to_datetime(parts)

	    # Entries falling after that year's EOY belong to the following EOY
	    after_eoy = df[dt_col] > eoy
	    df['eoy'] = eoy.where(~after_eoy, eoy + pd.DateOffset(years=1))

	    return df


	def add_yearly_stats(df):
	    """
	    Count comorbidities per patient per year.
	    After removing the admission-level columns, the last available value of
	    every remaining column is taken for each SafeHavenID/eoy pair and the
	    values are summed across columns.
	    --------
	    :param df: dataframe to update
	    :return: dataframe indexed by SafeHavenID and eoy with the single
	    column 'comorb_per_year'
	    """
	    print('Adding comorbidity count per year')

	    # Admission-level fields that must not take part in the row sum
	    unwanted = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP']
	    unwanted += ['DIAG{}Desc'.format(pos) for pos in range(1, 7)]
	    trimmed = df.drop(columns=unwanted)

	    # Last entry per patient-year, then total across the remaining columns
	    latest = trimmed.groupby(['SafeHavenID', 'eoy']).last()
	    totals = latest.sum(axis=1)

	    return totals.to_frame(name='comorb_per_year')


	def main():
	    """
	    Run the comorbidity processing pipeline.
	    Reads the SMR01 admissions extract named in the config, flags the
	    comorbidities on every admission, carries each patient's flags forward
	    in admission-date order, counts them per patient per year and pickles
	    the result as 'comorb_proc.pkl' under the configured model data path.
	    """
	    # Load in config items
	    with open('../../../config.json') as json_config_file:
	        config = json.load(json_config_file)

	    # Load in data
	    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
	    adm = initialize_adm_data(adm_file)

	    # Fill null STAY data and combine transfer admissions
	    adm = correct_stays(adm)

	    # Prepare text data - strip string columns
	    adm = adm.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

	    # Track comorbidities
	    excel_file = "mappings/Comorbidity feature review for models & clin " \
	        "summary update v2 May 2021.xlsx"
	    sheet_name = 'Diagnosis category mapping3'
	    diag_names = ['Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc',
	                  'cog_imp', 'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec',
	                  'hypertension', 'Cerebrovascula_dis', 'Diabetes_mel',
	                  'Osteoporosis', 'frailty', 'liver_dis', 'metastat_canc',
	                  'headneck_canc', 'breast_canc', 'gi_canc', 'other_canc',
	                  'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
	                  'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum',
	                  'DVT_PTE', 'Lung_cancer', 'Bronchiectasis', 'Resp_fail']
	    adm_comorb = track_comorbidity(adm, excel_file, sheet_name, diag_names)

	    # Order admissions chronologically so the fill below runs forward in
	    # time; the fresh 0..n-1 index is what the filled flags are aligned on
	    adm_comorb = adm_comorb.sort_values('ADMDATE').reset_index(drop=True)

	    # Carry each patient's comorbidity flags forward to later admissions.
	    # Only the flag columns are handed to the per-group function, so the
	    # result does not depend on whether groupby.apply passes the grouping
	    # column along (behaviour that differs between pandas versions)
	    print('Filling comorbidities')
	    filled_flags = adm_comorb.groupby(
	        'SafeHavenID', group_keys=False)[diag_names].apply(
	        fill_comorbidities, diag_names)
	    adm_filled = adm_comorb.copy()
	    adm_filled[diag_names] = filled_flags

	    # Add column relative to user-specified date
	    adm_filled = add_eoy_column(adm_filled, 'ADMDATE', config['date'])

	    # Add yearly stats
	    adm_yearly = add_yearly_stats(adm_filled)

	    # Save data
	    adm_yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')


	# Run the pipeline. The call is unguarded, so it also executes when this
	# module is imported, not only when it is run as a script.
	main()