import sys
import json
import pickle

import numpy as np
import pandas as pd


def extract_year(df, eoy_date):
    """
    Extract 1 year of data.
    --------
    :param df: dataframe to extract from (must have an 'eoy' column)
    :param eoy_date: user-specified end of year date
    :return: rows of df whose 'eoy' value equals eoy_date
    """
    return df[df.eoy == eoy_date]


def read_yearly_data(data_path, data_type, eoy_date):
    """
    Read in data for the year required.
    --------
    :param data_path: path to generated data
    :param data_type: type of data to read in
    :param eoy_date: user-specified end of year date
    :return: tuple of (feature dataframe for the chosen year, list of SafeHavenIDs)
    """
    df = pd.read_pickle(data_path + 'min_max_' + data_type + '.pkl')
    df_year = extract_year(df, eoy_date)
    # pop removes the ID column from df_year in place and returns it
    ids = df_year.pop('SafeHavenID').to_list()
    df_year = df_year.drop('eoy', axis=1)
    return df_year, ids


def main():
    """
    Predict cluster labels for one year of data using a previously
    trained model, then persist the labelled (unscaled) dataframe.

    Command line: <script> <data_type> <run_name>
    """
    # Guard against missing CLI arguments instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit('Usage: {} <data_type> <run_name>'.format(sys.argv[0]))
    data_type = sys.argv[1]
    run_name = sys.argv[2]

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Set model parameters
    eoy_date = config['date']
    data_path = config['model_data_path']

    # Read data
    print('Loading data')
    # Column subset chosen during training (saved as an object array).
    columns = np.load(data_path + run_name + '_cols.npy', allow_pickle=True)
    df_scaled, ids = read_yearly_data(data_path, data_type, eoy_date)
    df_scaled_reduced = df_scaled[columns]
    df_unscaled_full = pd.read_pickle(data_path + 'filled_' + data_type + '.pkl')
    df_unscaled = extract_year(df_unscaled_full, eoy_date)

    # Load model
    print('Loading model')
    clf_model_file = data_path + run_name + '_dtc_model.pkl'
    # NOTE: pickle.load executes arbitrary code — only load trusted model files.
    # Use a context manager so the file handle is always closed.
    with open(clf_model_file, 'rb') as model_file:
        clf = pickle.load(model_file)

    # Predict on new data
    print('Predicting clusters')
    labels = clf.predict(df_scaled_reduced.to_numpy())
    # Attach predicted cluster labels to the unscaled (human-readable) data.
    df_unscaled['cluster'] = labels
    df_unscaled.to_pickle(data_path + '_'.join((run_name, data_type, 'clusters.pkl')))


if __name__ == '__main__':
    main()