Model E: Unsupervised PCA + clustering risk stratification

53a6def 4 days ago

4.25 kB

	"""
	Admission processing utilities
	"""
	import json
	import numpy as np
	from utils.common import track_event


	def update_null_stay(df):
	    """
	    Fill null STAY values with the whole days between ADMDATE and DISDATE
	    --------
	    :param df: pandas dataframe with STAY, ADMDATE and DISDATE columns; it is
	        updated in place
	    :return: the same dataframe with every null STAY replaced by
	        float((DISDATE - ADMDATE).days) for that row
	    """
	    # Boolean mask, aligned to df's own index, of rows that need a value
	    missing = df['STAY'].isnull()

	    if missing.any():
	        # Selecting through the mask is label-safe and copes with any number
	        # of null rows, whatever the dataframe's index looks like
	        elapsed = df.loc[missing, 'DISDATE'] - df.loc[missing, 'ADMDATE']

	        # Element-wise .days keeps the scalar semantics of a date subtraction
	        days = elapsed.map(lambda delta: float(delta.days))

	        # Assign a plain array so the write is positional within the mask
	        # and does not depend on index alignment
	        df.loc[missing, 'STAY'] = days.to_numpy()

	    return df


	def calculate_total_stay(df):
	    """
	    Convert admissions with same ADMDATE as previous DISDATE to single
	    admission where patient has been transferred between departments
	    --------
	    :param df: pandas dataframe with ADMDATE, DISDATE and STAY columns; its
	        index is reset and it is updated in place
	    :return: the same dataframe, where each run of transfer rows is collapsed
	        into its last row (earliest ADMDATE of the run, summed STAY)
	    """
	    df.reset_index(inplace=True, drop=True)

	    # A row is a transfer when it starts on the previous row's discharge date.
	    # The mask is kept outside the dataframe so no helper column is added.
	    is_transfer = df['ADMDATE'].eq(df['DISDATE'].shift())

	    superseded = []
	    for pos in range(1, len(df)):
	        if not is_transfer.iloc[pos]:
	            continue
	        prev = pos - 1
	        # The previous row may itself have absorbed earlier transfers, so
	        # reading it from df (not from a snapshot) accumulates whole chains
	        df.loc[pos, 'ADMDATE'] = df.loc[prev, 'ADMDATE']
	        df.loc[pos, 'STAY'] = df.loc[pos, 'STAY'] + df.loc[prev, 'STAY']
	        superseded.append(prev)

	    # Drop the rows whose stay has been folded into the following row
	    df.drop(superseded, inplace=True)

	    return df


	def convert_ethgrp_desc(eth):
	    """
	    Find ethnic group based on given ETHGRP string
	    --------
	    :param eth: str ethnic group description in the style of SMR01 data
	    :return: string ethnicity; one of "White", "Mixed", "Asian", "Black",
	        "Other", "Refused" or "Unknown"
	    """
	    # A missing value (None / NaN) cannot be substring-searched; report it
	    # as "Unknown" instead of raising TypeError
	    if not isinstance(eth, str):
	        return "Unknown"

	    def mentions(*keywords):
	        # True when any keyword occurs in the description (case-sensitive)
	        return any(keyword in eth for keyword in keywords)

	    # The order of the checks is significant: the first match wins
	    if mentions("White", "Irish", "Welsh", "English"):
	        return "White"

	    if eth.startswith("British"):
	        return "White"

	    if mentions("mixed"):
	        return "Mixed"

	    if mentions("Asian", "Pakistani", "Indian", "Bangladeshi", "Chinese"):
	        return "Asian"

	    if mentions("Black", "Caribbean", "African"):
	        return "Black"

	    if mentions("Arab", "other ethnic"):
	        return "Other"

	    if mentions("Refused"):
	        return "Refused"

	    return "Unknown"


	def mode_ethnicity(v, eth_col):
	    """
	    Select the most commonly occurring ethnicity for each patient in groupby
	    --------
	    :param v: pandas patient dataframe to be updated
	    :param eth_col: str ethnicity column
	    :return: updated subset of data with common ethnicity per ID; returned
	        unchanged when the column holds no non-null ethnicity
	    """
	    eth = v[eth_col]
	    n = eth.nunique()

	    # na=False keeps both masks strictly boolean, so a missing value is never
	    # mistaken for a match
	    is_unk = eth.str.contains('Unknown', na=False)
	    is_ref = eth.str.contains('Refused', na=False)
	    any_unk = is_unk.any()
	    any_ref = is_ref.any()

	    # Select ethnicities excluding 'Unknown' or 'Refused' where possible,
	    # i.e. only while at least one other distinct value would remain
	    if any_unk and any_ref and n > 2:
	        eth = eth[~is_unk & ~is_ref]
	    elif any_unk and n > 1:
	        eth = eth[~is_unk]
	    elif any_ref and n > 1:
	        eth = eth[~is_ref]

	    # Select the most commonly appearing ethnicity; mode() ignores nulls and
	    # is empty when nothing is left to count
	    modes = eth.mode()
	    if modes.empty:
	        return v

	    v[eth_col] = modes.iloc[0]

	    return v


	def search_diag(df, typ):
	    """
	    Search diagnosis columns for descriptions indicative of copd or resp events
	    --------
	    :param df: dataframe to search; must contain DIAG1Desc ... DIAG6Desc
	    :param typ: 'copd', 'resp' or 'anxiety_depression'; used as the key into
	        the mapping file and as the prefix of the new column
	    :return: dataframe with column added tracking specific type of admission
	        (named typ + '_event', 1 if any diagnosis column matched, else 0)
	    """
	    # Columns to search
	    diag_cols = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
	                 'DIAG5Desc', 'DIAG6Desc']

	    # Load mappings; the context manager closes the file once it is parsed
	    with open('mappings/diag_copd_resp_desc.json') as mapping_file:
	        copd_resp_desc = json.load(mapping_file)

	    # Select mappings relevant to desired type of admission
	    desc = copd_resp_desc[typ]

	    # copd descriptions will only require searching a single specific phrase
	    single = typ == 'copd'

	    # track_event is called once per diagnosis column; a row is flagged when
	    # any column's result is truthy for it.
	    # NOTE(review): assumes track_event returns one boolean per row - confirm
	    matches = df[diag_cols].apply(lambda x: track_event(x, desc, single))
	    df[typ + '_event'] = matches.any(axis=1).astype(int)

	    return df