File size: 1,976 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sys
import json
import pandas as pd
import numpy as np
import pickle


def extract_year(df, eoy_date):
    """
    Select the rows belonging to a single year
    --------
    :param df: dataframe to filter; must contain an 'eoy' column
    :param eoy_date: user-specified end of year date to match against 'eoy'
    :return: subset of rows whose 'eoy' equals eoy_date
    """
    year_mask = df['eoy'] == eoy_date
    return df.loc[year_mask]


def read_yearly_data(data_path, data_type, eoy_date):
    """
    Read in data for year required
    --------
    :param data_path: path to generated data (assumed to end with a separator
        — TODO confirm; paths are built by plain string concatenation)
    :param data_type: type of data to read in (selects the pickle file name)
    :param eoy_date: user-specified end of year date
    :return: tuple (df_year, ids) — the chosen year's frame without the
        'SafeHavenID' and 'eoy' columns, and the list of SafeHavenID values
    """
    df = pd.read_pickle(data_path + 'min_max_' + data_type + '.pkl')
    # Copy the year slice: pop() below mutates in place, and mutating a
    # slice of the loaded frame risks a SettingWithCopyWarning / silent
    # no-op on a view.
    df_year = extract_year(df, eoy_date).copy()
    ids = df_year.pop('SafeHavenID').to_list()
    df_year = df_year.drop('eoy', axis=1)

    return df_year, ids


def main():
    """
    Predict cluster labels for one year of data and persist the result.
    --------
    Reads config from '../../../config.json'; expects sys.argv[1] to be the
    data type and sys.argv[2] the run name. Loads the scaled features and a
    pickled classifier, predicts cluster labels, attaches them to the
    unscaled frame for the chosen year, and writes it back as a pickle.
    """

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Set model parameters
    eoy_date = config['date']
    data_path = config['model_data_path']

    # Get datatype and run name from cmd line
    data_type = sys.argv[1]
    run_name = sys.argv[2]

    # Read data (ids are returned by read_yearly_data but not needed here)
    print('Loading data')
    columns = np.load(data_path + run_name + '_cols.npy', allow_pickle=True)
    df_scaled, _ = read_yearly_data(data_path, data_type, eoy_date)
    df_scaled_reduced = df_scaled[columns]
    df_unscaled_full = pd.read_pickle(data_path + 'filled_' + data_type + '.pkl')
    # Copy the year slice so the 'cluster' assignment below writes into an
    # independent frame, not a view of df_unscaled_full
    # (SettingWithCopyWarning).
    df_unscaled = extract_year(df_unscaled_full, eoy_date).copy()

    # Load model; the context manager closes the handle instead of leaking
    # it as the bare pickle.load(open(...)) did.
    # NOTE(review): pickle.load is unsafe on untrusted files — assumed the
    # model file is locally generated; confirm.
    print('Loading model')
    clf_model_file = data_path + run_name + '_dtc_model.pkl'
    with open(clf_model_file, 'rb') as model_file:
        clf = pickle.load(model_file)

    # Predict on new data
    print('Predicting clusters')
    labels = clf.predict(df_scaled_reduced.to_numpy())
    df_unscaled['cluster'] = labels
    df_unscaled.to_pickle(data_path + '_'.join((run_name, data_type, 'clusters.pkl')))


# Guard the entry point so importing this module does not trigger the full
# prediction run (the original called main() unconditionally at import time).
if __name__ == '__main__':
    main()