""" To Do: - Refactor script to be more readable/smaller main function """ import json import pandas as pd import numpy as np from datetime import timedelta def read_pkl_data(dataset, data_path, path_type): """ Read in pickled dataset -------- :param dataset: type of dataset to read in :param data_path: path to generated data :param path_type: type of path to read from :return: dataframe """ print('Reading in ' + dataset) file_path = data_path + dataset if path_type == 'data': file_path += '_proc.pkl' else: file_path += '_first_dates.pkl' return pd.read_pickle(file_path) def fill_eth_grp_data(df): """ Fill nulls in eth_grp column introduced in joining :param df: dataframe to update :return: Filled dataframe """ df['eth_grp'] = df.groupby('SafeHavenID').eth_grp.apply( lambda x: x.ffill().bfill()) df['eth_grp'] = df['eth_grp'].fillna('Unknown') return df def fill_to_date_columns(df): """ Fill nulls in to_date columns introduced in joining :param df: dataframe to update :return: Filled dataframe """ to_date_cols = ['adm_to_date', 'copd_to_date', 'resp_to_date', 'presc_to_date', 'rescue_to_date', 'labs_to_date', 'anxiety_depression_to_date', 'anxiety_depression_presc_to_date'] df[to_date_cols] = df.groupby('SafeHavenID')[to_date_cols].apply( lambda x: x.ffill().fillna(0)) return df def fill_yearly_columns(df): """ Fill nulls in yearly columns introduced in joining :param df: dataframe to update :return: Filled dataframe """ zero_cols = ['adm_per_year', 'total_hosp_days', 'mean_los', 'copd_per_year', 'resp_per_year', 'comorb_per_year', 'salbutamol_per_year', 'saba_inhaler_per_year', 'laba_inhaler_per_year', 'lama_inhaler_per_year', 'sama_inhaler_per_year', 'ics_inhaler_per_year', 'laba_ics_inhaler_per_year', 'lama_laba_ics_inhaler_per_year', 'saba_sama_inhaler_per_year', 'mcs_inhaler_per_year', 'rescue_meds_per_year', 'presc_per_year', 'labs_per_year', 'anxiety_depression_per_year', 'anxiety_depression_presc_per_year'] df[zero_cols] = df[zero_cols].fillna(0) return df 
def fill_days_since(df, typ):
    """
    Fill days_since_copd/resp/rescue/adm for one patient's rows

    The <typ>_date column is forward-filled so each year-end row measures
    the time elapsed since the patient's most recent event of that type.
    :param df: single-patient dataframe (called via groupby.apply)
    :param typ: type of feature to fill ('copd', 'resp', 'rescue', 'adm')
    :return: Filled dataframe
    """
    df['days_since_' + typ] = df.eoy - df[typ + '_date'].ffill()
    return df


def process_first_dates(df):
    """
    Process dataframe containing patient's first date in the health board
    region
    --------
    :param df: dataframe to process; one row per SafeHavenID with one
        first-date column per source dataset (e.g. 'first_adm_date' —
        TODO confirm column naming against the *_first_dates pickles)
    :return: processed dataframe with SafeHavenID, entry_dataset and
        first_entry columns
    """
    df = df.set_index('SafeHavenID')
    # the name of the earliest column encodes which dataset the patient
    # appeared in first; its second '_'-token is the dataset name
    entry_dataset = df.idxmin(axis=1).apply(lambda x: x.split('_')[1])
    first_entry = df.min(axis=1)
    df['entry_dataset'] = entry_dataset
    df['first_entry'] = first_entry
    return df[['entry_dataset', 'first_entry']].reset_index()


def find_closest_simd(v):
    """
    Find closest SIMD vigintile for each row 'v'
    --------
    :param v: row of data from apply statement
    :return: row with simd_quintile/decile/vigintile copied from the most
        recent SIMD release whose year is <= the row's end-of-year, or NaN
        if the row predates all releases
    """
    simd_years = [2009, 2012, 2016]
    bools = [v.eoy.year >= year for year in simd_years]
    if any(bools):
        # last True entry = most recent release not after v.eoy
        simd_year = str(simd_years[np.where(bools)[0][-1]])
        v['simd_quintile'] = v['simd_' + simd_year + '_quintile']
        v['simd_decile'] = v['simd_' + simd_year + '_decile']
        v['simd_vigintile'] = v['simd_' + simd_year + '_vigintile']
    else:
        v['simd_quintile'] = np.nan
        v['simd_decile'] = np.nan
        v['simd_vigintile'] = np.nan
    return v


def _load_data_path():
    """Load config.json and return the model data path."""
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    return config['model_data_path']


def _build_joined_data(data_path):
    """Read the processed datasets, join them, and fill nulls the joins
    introduce."""
    adm = read_pkl_data('adm', data_path, 'data')
    comorb = read_pkl_data('comorb', data_path, 'data')
    presc = read_pkl_data('presc', data_path, 'data')
    labs = read_pkl_data('labs', data_path, 'data')

    # Join datasets
    df = adm.join(
        comorb, how='left').join(
        presc, how='outer').join(
        labs, how='outer')
    df = df.reset_index()

    # Fill nulls introduced in joining
    print('Filling data')
    df = fill_eth_grp_data(df)
    df = fill_to_date_columns(df)
    df = fill_yearly_columns(df)

    # Fill days_since columns per patient
    for typ in ['copd', 'resp', 'rescue', 'adm']:
        df = df.groupby('SafeHavenID').apply(fill_days_since, typ)

    # Reduce copd/resp recency to a single column
    ds_cols = ['days_since_copd', 'days_since_resp']
    df['days_since_copd_resp'] = df[ds_cols].min(axis=1)
    return df


def _merge_first_dates(df, data_path):
    """Attach each patient's first appearance date and years in the
    health board region."""
    print('Adding first dates')
    adm_dates = read_pkl_data('adm', data_path, 'date')
    presc_dates = read_pkl_data('presc', data_path, 'date')
    labs_dates = read_pkl_data('labs', data_path, 'date')

    # Merge first date data
    first_dates = pd.merge(
        pd.merge(adm_dates, presc_dates, how="outer", on='SafeHavenID'),
        labs_dates, how="outer", on='SafeHavenID')

    # Save first dates if needed
    first_dates.to_pickle(data_path + 'overall_first_dates.pkl')

    date_data = process_first_dates(first_dates)

    print('Merging data')
    df_merged = pd.merge(df, date_data, on='SafeHavenID', how='inner')

    # Years in health board region. 365.2425 days is exactly the mean
    # year numpy used for np.timedelta64(1, 'Y'); newer pandas rejects
    # 'Y'-unit timedeltas in division, so spell it out explicitly.
    ggc_years = (df_merged.eoy - df_merged.first_entry) / pd.Timedelta(
        days=365.2425)
    df_merged['ggc_years'] = round(ggc_years)
    return df_merged


def _add_demographics(df, data_path):
    """Merge demographics, derive age, and attach the closest SIMD
    release for each row."""
    demo = read_pkl_data('demo', data_path, 'data')
    df_merged = pd.merge(df, demo, on='SafeHavenID')

    # Age relative to end of year (mean Gregorian year length)
    dt_diff = df_merged.eoy - pd.to_datetime(df_merged.obf_dob)
    df_merged['age'] = dt_diff // timedelta(days=365.2425)

    # Find closest SIMD
    return df_merged.apply(find_closest_simd, axis=1)


def main():
    """Build the merged modelling dataset and save it as merged_full.pkl."""
    data_path = _load_data_path()
    df = _build_joined_data(data_path)
    df = _merge_first_dates(df, data_path)
    df = _add_demographics(df, data_path)

    # Drop intermediate columns no longer needed downstream
    cols2drop = ['copd_date', 'resp_date', 'adm_date', 'rescue_date',
                 'simd_2009_quintile', 'simd_2009_decile',
                 'simd_2009_vigintile', 'simd_2012_quintile',
                 'simd_2012_decile', 'simd_2012_vigintile',
                 'simd_2016_quintile', 'simd_2016_decile',
                 'simd_2016_vigintile', 'days_since_copd',
                 'days_since_resp']
    df = df.drop(cols2drop, axis=1)

    # Save dataset
    df.to_pickle(data_path + 'merged_full.pkl')


if __name__ == '__main__':
    main()