""" Script to remove all receiver IDs from relevant data sources. """ import json import pandas as pd from sklearn.model_selection import train_test_split def get_ids(path): """ Read in IDs -------- :return: list of SafeHavenIDs """ print('Loading IDs from ' + path) df = pd.read_csv(path, encoding="cp1252") ids = df['SafeHavenID'].tolist() return ids def save_rec_sup(df, data_path, rec_ids, sup_ids): """ Remove receiver IDs from dataframe and pickle the dataset -------- :param df: pandas dataframe to remove ids from :param data_path: path to generated data :param rec_ids: list of SafeHavenIDs in receiver cohort to remove :param sup_ids: list of SafeHavenIDs in scale-up cohort to remove :return: None """ print('Saving REC and SUP data') # Remove receiver IDs df_rec = df[df['SafeHavenID'].isin(rec_ids)] df_sup = df[df['SafeHavenID'].isin(sup_ids)] df = df[~df['SafeHavenID'].isin(rec_ids + sup_ids)] # Save data df_rec.to_pickle(data_path + 'merged_rec.pkl') df_sup.to_pickle(data_path + 'merged_sup.pkl') return df def save_df_ids(df, data_path, ids, typ): """ Save train, test or validation ids and corresponding data -------- :param df: dataframe :param data_path: path to generated data :param ids: list of SafeHavenIDs :param typ: type of dataset to create, 'train', 'test', 'val' """ print('Saving ' + typ + ' data') df_ids = pd.DataFrame(ids, columns=['SafeHavenID']) df_ids.to_pickle(data_path + typ + '_ids.pkl') df_ids_data = df[df['SafeHavenID'].isin(ids)] df_ids_data.to_pickle(data_path + 'merged_' + typ + '.pkl') def df_tts(df, data_path): """ Split data into training and testing sets and save dataframes -------- :param df: pandas dataframe to split :param data_path: path to generated data :return: None """ # Split IDs into training, testing and validation sets ids = df['SafeHavenID'].tolist() train_ids, test_ids = train_test_split( ids, test_size=0.2, random_state=42) train_ids, val_ids = train_test_split( train_ids, test_size=0.25, random_state=42) # Save IDs and datasets save_df_ids(df, data_path, train_ids, 'train') save_df_ids(df, data_path, test_ids, 'test') save_df_ids(df, data_path, val_ids, 'val') def main(): # Load in config items with open('../../../config.json') as json_config_file: config = json.load(json_config_file) # Set paths data_path = config['model_data_path'] rec_path = config['rec_data_path'] + 'Cohort3Rand.csv' sup_path = config['sup_data_path'] + 'Scale_Up_lookup.csv' # Get IDs to exclude rec_ids = get_ids(rec_path) sup_ids = get_ids(sup_path) # Remove IDs from datasets df = pd.read_pickle(data_path + 'merged.pkl') df = save_rec_sup(df, data_path, rec_ids, sup_ids) # Split and save the data df_tts(df, data_path) main()