# copd-model-e/training/src/processing/process_admissions.py
# IamGrooooot's picture
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
"""
Process SMR01 admission data
--------
Clean and process admission data while adding tracking for COPD and respiratory
admissions per year for each SafeHavenID
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import add_hist_adm_presc, first_patient_appearance
from utils.adm_common import (initialize_adm_data, correct_stays,
track_copd_resp)
from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity,
search_diag)
from utils.adm_reduction import fill_missing_years, calc_adm_per_year
def process_ethnicity(df):
    """
    Find relevant ethnic group for each patient, accounting for null data
    --------
    :param df: admission dataframe to be updated; must contain the columns
        'SafeHavenID' and 'ETHGRP'
    :return: result of applying mode_ethnicity per SafeHavenID once 'ETHGRP'
        has been renamed to 'eth_grp', stripped, gap-filled within each
        patient and passed through convert_ethgrp_desc
    """
    print('Processing ethnicity')

    # Fill in missing ethnicities. rename() returns a new frame, so the
    # caller's dataframe is not modified by the assignments below.
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df.eth_grp.str.strip()

    # Within each patient: carry known values forward, then backward, and
    # label patients with no recorded ethnicity at all as 'Unknown'.
    # transform() is used instead of apply() because its result is always
    # indexed like the input rows; apply() prepends the group key to the
    # index on pandas >= 2.0, which breaks the column assignment.
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda x: x.ffill().bfill().fillna('Unknown'))

    # Convert to 1 of 7 ethnic groups
    df['eth_grp'] = [convert_ethgrp_desc(eth) for eth in df.eth_grp]

    # Find most commonly occurring ethnicity per SafeHavenID
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: name of a datetime column in dataframe
    :param eoy_date: EOY date from config; only its month and day are used
    :return: copy of df with a fresh 0..n-1 index and a datetime 'eoy' column
        holding, for each row, the nearest occurrence of that month/day that
        is not earlier than the value in dt_col
    :raises ValueError: if the month/day does not exist in the year of some
        row (e.g. 29 February in a non-leap year)
    """
    # reset_index returns a new frame, so the caller's dataframe is left
    # untouched and the new column aligns on a simple positional index
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    event_dates = df[dt_col]

    # EOY in the same calendar year as each event, built directly as
    # datetime64 so it can be compared with dt_col without mixing
    # datetime.date objects and Timestamps
    eoy = pd.to_datetime(pd.DataFrame({
        'year': event_dates.dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))

    # Events falling after that year's EOY belong to the following EOY
    rolls_over = event_dates > eoy
    eoy = eoy.mask(rolls_over, eoy + pd.DateOffset(years=1))

    df['eoy'] = eoy
    return df
def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be updated; an 'adm' column set to 1 is
        written onto it in place before any further processing
    :return: dataframe with feature values per year, restricted to the fixed
        column order listed at the end of this function
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Flag every incoming row as an admission before extra rows are added
    df['adm'] = 1

    # Per patient, add rows for years without admissions, then flatten the
    # index produced by the grouped apply
    padded = (df.groupby('SafeHavenID')
              .apply(fill_missing_years)
              .reset_index(drop=True))

    # Per patient, add any historical count columns based on 'adm'/'ADMDATE'
    with_history = (padded.groupby('SafeHavenID')
                    .apply(add_hist_adm_presc, 'adm', 'ADMDATE')
                    .reset_index(drop=True))

    # Reduce data to 1 row per year
    yearly = calc_adm_per_year(with_history)

    # Final column order, assembled from logically related groups
    per_year = ['eth_grp', 'adm_per_year', 'total_hosp_days', 'mean_los',
                'copd_per_year', 'resp_per_year',
                'anxiety_depression_per_year']
    days_since = ['days_since_copd', 'days_since_resp', 'days_since_adm']
    to_date = ['adm_to_date', 'copd_to_date', 'resp_to_date',
               'anxiety_depression_to_date']
    last_dates = ['copd_date', 'resp_date', 'adm_date']

    return yearly[per_year + days_since + to_date + last_dates]
def main():
    """
    Run the admission processing pipeline end to end
    --------
    Reads '../../../config.json' (resolved against the current working
    directory), loads the SMR01 extract found under
    config['extract_data_path'], and writes two pickles under
    config['model_data_path']: 'validation_adm_proc.pkl' (one row per event)
    and 'adm_proc.pkl' (the output of extract_yearly_data). Both config paths
    are joined by plain string concatenation, so they need to end with a path
    separator.
    """
    # Load in config items
    with open('../../../config.json') as cfg_handle:
        config = json.load(cfg_handle)

    # Load in data, then fill null STAY data and combine transfer admissions
    source_csv = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = correct_stays(initialize_adm_data(source_csv))

    # Save first date in dataset
    out_dir = config['model_data_path']
    first_patient_appearance(adm, 'ADMDATE', 'adm', out_dir)

    # Process ethnicity, then track COPD/respiratory and anxiety events
    adm = process_ethnicity(adm)
    adm = track_copd_resp(adm)
    adm = search_diag(adm, 'anxiety_depression')

    # Keep only the columns needed downstream and save the per event dataset
    events = adm[['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
                  'resp_event', 'anxiety_depression_event']]
    events.to_pickle(out_dir + 'validation_adm_proc.pkl')

    # Add EOY column relative to the user-specified date, reduce to yearly
    # features and save the result
    events = add_eoy_column(events, 'ADMDATE', config['date'])
    yearly = extract_yearly_data(events)
    yearly.to_pickle(out_dir + 'adm_proc.pkl')
# Run the pipeline only when this file is executed as a script, so that
# importing the module (e.g. to reuse its helpers) does not trigger file I/O
if __name__ == '__main__':
    main()