| | """ |
| | Script to remove all receiver IDs from relevant data sources. |
| | """ |
| | import json |
| | import pandas as pd |
| | from sklearn.model_selection import train_test_split |
| |
|
| |
|
def get_ids(path):
    """
    Load the cohort ID list from a CSV file.
    --------
    :param path: path to a CSV file with a 'SafeHavenID' column
    :return: list of SafeHavenIDs
    """
    print('Loading IDs from ' + path)

    # Source files are Windows-exported, hence the cp1252 encoding.
    frame = pd.read_csv(path, encoding="cp1252")
    return list(frame['SafeHavenID'])
| |
|
| |
|
def save_rec_sup(df, data_path, rec_ids, sup_ids):
    """
    Split out the receiver and scale-up cohorts, pickle each, and
    return the remaining rows.
    --------
    :param df: pandas dataframe to remove ids from
    :param data_path: path to generated data
    :param rec_ids: list of SafeHavenIDs in receiver cohort to remove
    :param sup_ids: list of SafeHavenIDs in scale-up cohort to remove
    :return: dataframe with receiver and scale-up rows removed
    """
    print('Saving REC and SUP data')

    # Boolean masks over the id column, one per cohort.
    in_rec = df['SafeHavenID'].isin(rec_ids)
    in_sup = df['SafeHavenID'].isin(sup_ids)

    df[in_rec].to_pickle(data_path + 'merged_rec.pkl')
    df[in_sup].to_pickle(data_path + 'merged_sup.pkl')

    # Keep only rows that belong to neither cohort.
    return df[~(in_rec | in_sup)]
| |
|
| |
|
def save_df_ids(df, data_path, ids, typ):
    """
    Pickle the given id list and the matching rows of the dataframe.
    --------
    :param df: dataframe
    :param data_path: path to generated data
    :param ids: list of SafeHavenIDs
    :param typ: type of dataset to create, 'train', 'test', 'val'
    """
    print('Saving ' + typ + ' data')

    # Persist the id list itself as a one-column frame.
    id_frame = pd.DataFrame(ids, columns=['SafeHavenID'])
    id_frame.to_pickle(data_path + typ + '_ids.pkl')

    # Persist the rows of df belonging to those ids.
    subset = df[df['SafeHavenID'].isin(ids)]
    subset.to_pickle(data_path + 'merged_' + typ + '.pkl')
| |
|
| |
|
def df_tts(df, data_path):
    """
    Split ids 60/20/20 into train/val/test and save each subset.
    --------
    :param df: pandas dataframe to split
    :param data_path: path to generated data
    :return: None
    """
    all_ids = list(df['SafeHavenID'])

    # 80/20 split first, then 25% of the 80% becomes validation,
    # giving a 60/20/20 train/val/test split overall.
    train_ids, test_ids = train_test_split(
        all_ids, test_size=0.2, random_state=42)
    train_ids, val_ids = train_test_split(
        train_ids, test_size=0.25, random_state=42)

    for name, subset_ids in (('train', train_ids),
                             ('test', test_ids),
                             ('val', val_ids)):
        save_df_ids(df, data_path, subset_ids, name)
| |
|
| |
|
def main():
    """
    Load paths from config, strip receiver/scale-up cohorts from the
    merged dataset, then split the remainder into train/val/test.
    """
    # All paths come from the project-level config file.
    with open('../../../config.json') as fh:
        cfg = json.load(fh)

    data_path = cfg['model_data_path']
    receiver_csv = cfg['rec_data_path'] + 'Cohort3Rand.csv'
    scale_up_csv = cfg['sup_data_path'] + 'Scale_Up_lookup.csv'

    rec_ids = get_ids(receiver_csv)
    sup_ids = get_ids(scale_up_csv)

    # Remove both cohorts from the merged data, pickling them separately.
    df = pd.read_pickle(data_path + 'merged.pkl')
    df = save_rec_sup(df, data_path, rec_ids, sup_ids)

    df_tts(df, data_path)
| |
|
| |
|
# Guard the entry point so importing this module does not run the pipeline.
if __name__ == '__main__':
    main()
| |
|