Model E: Unsupervised PCA + clustering risk stratification

53a6def 4 days ago

3.96 kB

	"""
	Utilities required across all processing scripts
	"""
	import pandas as pd
	import numpy as np


	def read_data(file, cols, types):
	    """
	    Load selected columns of a CSV extract with explicit column types
	    --------
	    :param file: string filename
	    :param cols: string list of column names to load
	    :param types: string list of column types, paired with cols by position
	    :return: dataframe holding only the requested columns
	    """
	    # Pair every requested column with its dtype by position
	    dtype_by_col = {name: kind for name, kind in zip(cols, types)}

	    return pd.read_csv(
	        file,
	        usecols=cols,
	        dtype=dtype_by_col,
	        encoding="cp1252",
	    )


	def first_patient_appearance(df, dt_col, typ, data_path):
	    """
	    Pickle the earliest dt_col value found for each SafeHavenID
	    --------
	    :param df: dataframe to check, must contain 'SafeHavenID' and dt_col
	    :param dt_col: date column to sort by
	    :param typ: type of dataset being used, also the output file prefix
	    :param data_path: path to data extracts; it is concatenated directly
	        with the file name, so it must already end with a path separator
	    :return: None, first dates are saved to
	        <data_path><typ>_first_dates.pkl
	    """
	    # Once rows are in date order, the first non-null value per patient
	    # is that patient's earliest date
	    ordered = df.sort_values(dt_col)
	    earliest = ordered.groupby('SafeHavenID')[dt_col].first()
	    earliest = earliest.to_frame().reset_index()
	    earliest.columns = ['SafeHavenID', 'first_adm']

	    out_file = data_path + typ + '_first_dates.pkl'
	    earliest.to_pickle(out_file)


	def add_days_since_event(df, typ, dt_col):
	    """
	    Historical features: add days since features e.g. adm/copd/resp/rescue
	    --------
	    :param df: dataframe to be updated (columns are added in place); must
	        contain dt_col, the event flag column for typ and an 'eoy' column
	    :param typ: 'rescue' (flag column 'rescue_meds'), 'adm' (flag column
	        'adm') or another prefix such as 'copd'/'resp' (flag column
	        '<typ>_event')
	    :param dt_col: str date column name
	    :return: updated dataframe with '<typ>_date' and 'days_since_<typ>'
	        columns added
	    """
	    if typ == 'rescue':
	        event_col = 'rescue_meds'
	    elif typ == 'adm':
	        event_col = 'adm'
	    else:
	        event_col = typ + '_event'
	    date_col = typ + '_date'
	    days_col = 'days_since_' + typ

	    # Keep the row date where the event flag is truthy, blank it elsewhere,
	    # then carry the most recent event date forward in row order. This is
	    # vectorised instead of a row-wise apply, which is slow and returns a
	    # DataFrame (not a Series) when df is empty. The mask is passed as a
	    # plain array so no index alignment happens on duplicate indexes.
	    has_event = df[event_col].astype(bool).values
	    df[date_col] = df[dt_col].where(has_event).ffill()

	    if df[date_col].isna().all():
	        # No event seen at all: there is nothing to measure from
	        df[days_col] = np.nan
	    else:
	        df[days_col] = (df['eoy'] - df[date_col]).dt.days

	    return df


	def track_event(x, desc, single):
	    """
	    Flag which entries of x match a description, treating nulls as ''
	    --------
	    :param x: str series of features to track (must support fillna)
	    :param desc: description to look for: one string when single is True,
	        otherwise a collection of descriptions
	    :param single: boolean, True checks whether desc occurs inside each
	        entry (e.g. "COPD"), False checks whether each entry is contained
	        in desc (e.g. the respiratory description list)
	    :return: list of booleans, one per entry of x
	    """
	    filled = x.fillna('')

	    if not single:
	        # Several descriptions: the entry must be a member of desc
	        return [entry in desc for entry in filled]

	    # One description: it must appear within the entry
	    return [desc in entry for entry in filled]


	def add_hist_adm_presc(df, typ, dt_col):
	    """
	    Historical features: add days since and running-total features
	    --------
	    :param df: dataframe to be updated; must contain a column named typ
	        plus the event flag columns read for that type
	    :param typ: type of data - 'presc' selects the prescribing features,
	        any other value (e.g. 'adm') selects the admission features
	    :param dt_col: string name of date column
	    :return: updated dataframe with historical columns added
	    """
	    if typ != 'presc':
	        # Admissions: days since the last adm/copd/resp event, then
	        # running totals of the event flags
	        for prefix in ('adm', 'copd', 'resp'):
	            df = add_days_since_event(df, prefix, dt_col)
	        for prefix in ('copd', 'resp', 'anxiety_depression'):
	            df[prefix + '_to_date'] = df[prefix + '_event'].cumsum()
	    else:
	        # Prescribing: put rows in date order before accumulating
	        df = df.sort_values(dt_col).reset_index(drop=True)
	        df = add_days_since_event(df, 'rescue', dt_col)
	        df['rescue_to_date'] = df['rescue_meds'].cumsum()
	        df['anxiety_depression_presc_to_date'] = (
	            df['anxiety_depression_presc'].cumsum())

	    # Running count of the dataset's own events to date
	    df[typ + '_to_date'] = df[typ].cumsum()

	    return df


	def correct_column_names(cols, typ):
	    """
	    Convert column names to lower case and fill any spaces with underscores
	    --------
	    :param cols: string list of column names exposing the pandas .str
	        accessor (e.g. df.columns)
	    :param typ: type of dataset being updated; for 'presc' any '+' or '-'
	        is also treated as a word separator and runs of whitespace
	        collapse to a single underscore
	    :return: list of cleaned column names
	    """
	    print('Correcting column headers')

	    if typ == 'presc':
	        # '[+-]' is a character class, so regex matching is requested
	        # explicitly: pandas 2.0 changed the default to regex=False, which
	        # would look for the literal text '[+-]' and replace nothing.
	        lower_cols = cols.str.replace('[+-]', ' ', regex=True).str.lower()
	        new_cols = ["_".join(col.split()) for col in lower_cols]
	    else:
	        # Plain literal replacement, identical under either default
	        new_cols = cols.str.lower().str.replace(
	            ' ', '_', regex=False).tolist()

	    return new_cols