| | """ |
| | Process SMR01 admission data |
| | -------- |
| | Clean and process admission data while adding tracking for COPD and respiratory |
| | admissions per year for each SafeHavenID |
| | """ |
| | import json |
| | import pandas as pd |
| | from datetime import date |
| | from dateutil.relativedelta import relativedelta |
| | from utils.common import add_hist_adm_presc, first_patient_appearance |
| | from utils.adm_common import (initialize_adm_data, correct_stays, |
| | track_copd_resp) |
| | from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity, |
| | search_diag) |
| | from utils.adm_reduction import fill_missing_years, calc_adm_per_year |
| |
|
| |
|
def process_ethnicity(df):
    """
    Find relevant ethnic group for each patient, accounting for null data
    --------
    :param df: admission dataframe to be updated; needs 'SafeHavenID' and
        'ETHGRP' columns
    :return: admission dataframe with ethnicity cleaned and updated
    """
    print('Processing ethnicity')

    # rename() returns a new frame, so the caller's 'ETHGRP' column is untouched
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df.eth_grp.str.strip()

    # Fill gaps from the same patient's other rows (forward, then backward);
    # patients with no recorded value at all become 'Unknown'.
    # group_keys=False keeps the original row index on the result: since
    # pandas 2.0 the default would prepend SafeHavenID as an index level and
    # the assignment back onto df would fail.
    df['eth_grp'] = df.groupby('SafeHavenID', group_keys=False)['eth_grp'].apply(
        lambda x: x.ffill().bfill().fillna('Unknown'))

    # Translate every raw description with the project helper
    df['eth_grp'] = [convert_ethgrp_desc(eth) for eth in df.eth_grp]

    # Per-patient reconciliation; presumably settles on the most common group
    # for each SafeHavenID - see utils.adm_processing.mode_ethnicity
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df
| |
|
| |
|
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    Each row's 'eoy' is the first occurrence of the configured month/day that
    falls on or after the row's date: the cutoff in the same calendar year, or
    the cutoff one year later when the row's date is already past it.

    :param df: dataframe
    :param dt_col: date column in dataframe (datetime64; the year of every
        row is passed to datetime.date, so missing dates are not supported)
    :param eoy_date: EOY date from config; only its month and day are used
    :return: updated df (index reset to 0..n-1) with EOY column added
    """
    # Work on a re-indexed copy so the caller's frame is not modified
    df = df.reset_index(drop=True)

    # Only month/day of the configured date matter; the year comes from each row
    end_date = pd.to_datetime(eoy_date)
    end_month = end_date.month
    end_day = end_date.day

    # Build the same-year cutoff directly as datetime64. Comparing a
    # datetime64 column against datetime.date objects is rejected by
    # pandas >= 2.0, so the conversion must happen before the comparison.
    eoy = pd.Series(
        pd.to_datetime([date(y, end_month, end_day)
                        for y in df[dt_col].dt.year]),
        index=df.index)

    # Rows dated strictly after this year's cutoff belong to the next period
    past_cutoff = df[dt_col] > eoy
    df['eoy'] = eoy.mask(past_cutoff, eoy + pd.DateOffset(years=1))

    return df
| |
|
| |
|
def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be updated (an 'adm' column is added
        to it in place)
    :return: dataframe with feature values per year
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Flag every row as one admission
    df['adm'] = 1

    # Per-patient helper passes; the index is flattened after each one so the
    # next groupby starts from a plain row index
    filled = (df.groupby('SafeHavenID')
                .apply(fill_missing_years)
                .reset_index(drop=True))
    with_history = (filled.groupby('SafeHavenID')
                          .apply(add_hist_adm_presc, 'adm', 'ADMDATE')
                          .reset_index(drop=True))

    # Project helper that produces the per-year figures
    yearly = calc_adm_per_year(with_history)

    # Keep only the modelling features, in this fixed order
    return yearly[['eth_grp', 'adm_per_year', 'total_hosp_days',
                   'mean_los', 'copd_per_year', 'resp_per_year',
                   'anxiety_depression_per_year', 'days_since_copd',
                   'days_since_resp', 'days_since_adm', 'adm_to_date',
                   'copd_to_date', 'resp_to_date',
                   'anxiety_depression_to_date',
                   'copd_date', 'resp_date', 'adm_date']]
| |
|
| |
|
def main():
    """
    Run the SMR01 admission processing pipeline
    --------
    Reads paths and the EOY date from config.json, processes the admission
    extract, and writes two pickles to the model data path:
    'validation_adm_proc.pkl' (row-level data) and 'adm_proc.pkl' (output of
    extract_yearly_data).
    """
    # Settings are looked up relative to the current working directory
    with open('../../../config.json') as config_handle:
        settings = json.load(config_handle)

    # Load the raw extract
    source_csv = settings['extract_data_path'] + 'SMR01_Cohort3R.csv'
    admissions = initialize_adm_data(source_csv)

    admissions = correct_stays(admissions)

    # Called for its side effect only; it is handed the output path
    out_dir = settings['model_data_path']
    first_patient_appearance(admissions, 'ADMDATE', 'adm', out_dir)

    # Ethnicity clean-up, then event flags from the project helpers
    admissions = process_ethnicity(admissions)
    admissions = track_copd_resp(admissions)
    admissions = search_diag(admissions, 'anxiety_depression')

    # Cut down to the columns used from here on
    keep = ['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
            'resp_event', 'anxiety_depression_event']
    slim = admissions[keep]

    # Row-level snapshot, saved before the yearly reduction
    slim.to_pickle(out_dir + 'validation_adm_proc.pkl')

    # Attach the EOY date, then reduce and save
    slim = add_eoy_column(slim, 'ADMDATE', settings['date'])
    per_year = extract_yearly_data(slim)
    per_year.to_pickle(out_dir + 'adm_proc.pkl')
| |
|
| |
|
# Run the pipeline only when this file is executed as a script, so importing
# the module (e.g. to reuse add_eoy_column) does not trigger file I/O.
if __name__ == '__main__':
    main()
| |
|