IamGrooooot's picture
Model E: Unsupervised PCA + clustering risk stratification
53a6def
"""
Utility functions common across admission processing
(admissions/comorbidities/gples)
"""
import pandas as pd
from utils.common import read_data
from utils.adm_processing import (update_null_stay, calculate_total_stay,
search_diag)
def initialize_adm_data(adm_file):
    """
    Read the admission dataset and prepare it for downstream processing
    --------
    :param adm_file: admission data file name
    :return: de-duplicated admission dataframe with the admission and
        discharge date columns converted to datetimes
    """
    print('Loading admission data')

    # Each column to load, paired with the type requested from read_data
    schema = [
        ('SafeHavenID', 'int'),
        ('ETHGRP', 'object'),
        ('ADMDATE', 'object'),
        ('DISDATE', 'object'),
        ('STAY', 'int'),
        ('DIAG1Desc', 'str'),
        ('DIAG2Desc', 'str'),
        ('DIAG3Desc', 'str'),
        ('DIAG4Desc', 'str'),
        ('DIAG5Desc', 'str'),
        ('DIAG6Desc', 'str'),
    ]
    col_names = [name for name, _ in schema]
    col_types = [kind for _, kind in schema]
    adm = read_data(adm_file, col_names, col_types)

    # Only exact duplicate rows are removed; rows with null DIAGDesc
    # entries must be kept
    adm = adm.drop_duplicates()

    # Dates are loaded as plain objects, so parse them into datetimes here
    for date_col in ('ADMDATE', 'DISDATE'):
        adm[date_col] = pd.to_datetime(adm[date_col])

    return adm
def correct_stays(df):
    """
    Repair missing STAY values and merge transfer admissions so that each
    admission occurrence appears once
    --------
    :param df: admission dataframe to be corrected
    :return: admission dataframe with null stays filled and transfers
        combined, carrying a fresh default index
    """
    print('Correcting stays')

    # Null STAY entries are filled in from the ADM and DIS dates
    filled = update_null_stay(df)

    # Order each patient's admissions chronologically before consolidating
    # stays for patients passed across departments
    ordered = filled.sort_values(by=['SafeHavenID', 'ADMDATE', 'DISDATE'])
    combined = ordered.groupby('SafeHavenID').apply(calculate_total_stay)

    # Discard whatever index the group-wise apply produced
    return combined.reset_index(drop=True)
def track_copd_resp(df):
    """
    Flag COPD and/or respiratory admissions in the admission data
    --------
    :param df: admission dataframe to be updated
    :return: updated dataframe with events tracked
    """
    print('Tracking events')

    def _strip_text(column):
        # Only object-typed columns are stripped; all others pass through
        if column.dtype == 'object':
            return column.str.strip()
        return column

    # Remove surrounding whitespace from every object column (this covers
    # the DIAGDesc columns) before searching
    cleaned = df.apply(_strip_text)

    # COPD admissions are tracked first, then respiratory admissions
    with_copd = search_diag(cleaned, 'copd')
    return search_diag(with_copd, 'resp')