"""
Process SMR01 admission data
--------
Clean and process admission data while adding tracking for COPD and
respiratory admissions per year for each SafeHavenID
"""
import json
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from utils.common import add_hist_adm_presc, first_patient_appearance
from utils.adm_common import (initialize_adm_data, correct_stays,
                              track_copd_resp)
from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity,
                                  search_diag)
from utils.adm_reduction import fill_missing_years, calc_adm_per_year


def process_ethnicity(df):
    """
    Find relevant ethnic group for each patient, accounting for null data
    --------
    :param df: admission dataframe to be updated; must contain the columns
        'SafeHavenID' and 'ETHGRP'
    :return: admission dataframe with 'ETHGRP' renamed to 'eth_grp' and the
        ethnicity cleaned and updated
    """
    print('Processing ethnicity')

    # rename() returns a new frame, so the caller's dataframe is not mutated
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df.eth_grp.str.strip()

    # Fill in missing ethnicities from the patient's other records.
    # transform() (rather than apply()) guarantees the result keeps the
    # original row index, so the assignment aligns on every pandas version.
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda x: x.ffill().bfill().fillna('Unknown'))

    # Convert each description with convert_ethgrp_desc
    # (per the original note: to 1 of 7 ethnic groups)
    df['eth_grp'] = df.eth_grp.map(convert_ethgrp_desc)

    # Find most commonly occurring ethnicity per SafeHavenID
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df


def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    :param df: dataframe
    :param dt_col: date column in dataframe (must be of datetime dtype)
    :param eoy_date: EOY date from config; only its month and day are used
    :return: copy of df (index reset) with a datetime 'eoy' column holding
        the first occurrence of the EOY month/day that is on or after dt_col
    """
    # Needed to stop error with creating a new column
    df = df.reset_index(drop=True)

    # Month and day of the user-specified end of year date
    end_date = pd.to_datetime(eoy_date)
    end_month = end_date.month
    end_day = end_date.day

    # EOY within the same calendar year as each entry. Converted to datetime
    # immediately so the comparison below is datetime vs datetime rather than
    # datetime vs datetime.date objects.
    years = df[dt_col].dt.year
    df['eoy'] = pd.to_datetime(
        [date(year, end_month, end_day) for year in years])

    # Where the entry falls after that year's EOY, roll the EOY forward a year
    after_eoy = df[dt_col] > df['eoy']
    df['eoy'] = df['eoy'].mask(after_eoy, df['eoy'] + pd.DateOffset(years=1))

    return df


def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be reduced; must contain the columns
        'SafeHavenID' and 'ADMDATE'
    :return: dataframe with feature values per year
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Track rows which are admissions (assign() avoids mutating the input)
    df = df.assign(adm=1)

    # Add rows from years where patient did not have admissions
    df = df.groupby('SafeHavenID').apply(fill_missing_years)
    df = df.reset_index(drop=True)

    # Add any historical count columns
    df = df.groupby('SafeHavenID').apply(add_hist_adm_presc, 'adm', 'ADMDATE')
    df = df.reset_index(drop=True)

    # Reduce data to 1 row per year
    df = calc_adm_per_year(df)

    # Select columns in final order
    final_cols = ['eth_grp', 'adm_per_year', 'total_hosp_days', 'mean_los',
                  'copd_per_year', 'resp_per_year',
                  'anxiety_depression_per_year', 'days_since_copd',
                  'days_since_resp', 'days_since_adm', 'adm_to_date',
                  'copd_to_date', 'resp_to_date',
                  'anxiety_depression_to_date', 'copd_date', 'resp_date',
                  'adm_date']
    df = df[final_cols]

    return df


def main():
    """
    Run the admission processing pipeline
    --------
    Reads paths and the EOY date from '../../../config.json', loads and
    cleans the SMR01 extract, then writes the per-event dataset
    ('validation_adm_proc.pkl') and the per-year dataset ('adm_proc.pkl')
    to the configured model data path
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = initialize_adm_data(adm_file)

    # Fill null STAY data and combine transfer admissions
    adm = correct_stays(adm)

    # Save first date in dataset
    data_path = config['model_data_path']
    first_patient_appearance(adm, 'ADMDATE', 'adm', data_path)

    # Process ethnicity data
    adm = process_ethnicity(adm)

    # Track COPD and respiratory events
    adm = track_copd_resp(adm)

    # Track anxiety event
    adm = search_diag(adm, 'anxiety_depression')

    # Select relevant columns
    reduced_cols = ['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
                    'resp_event', 'anxiety_depression_event']
    adm_reduced = adm[reduced_cols]

    # Save per event dataset
    adm_reduced.to_pickle(data_path + 'validation_adm_proc.pkl')

    # Add column relative to user-specified date
    adm_reduced = add_eoy_column(adm_reduced, 'ADMDATE', config['date'])

    # Extract yearly data
    adm_yearly = extract_yearly_data(adm_reduced)

    # Save data
    adm_yearly.to_pickle(data_path + 'adm_proc.pkl')


# Only run the pipeline when executed as a script, not on import
if __name__ == '__main__':
    main()