# copd-model-e/training/src/modelling/predict_clusters.py
# Model E: Unsupervised PCA + clustering risk stratification
# Page metadata: IamGrooooot, revision 53a6def
import sys
import json
import pandas as pd
import numpy as np
import pickle
def extract_year(df, eoy_date):
    """
    Extract 1 year of data
    --------
    :param df: dataframe to extract from, must contain an 'eoy' column
    :param eoy_date: user-specified end of year date
    :return: independent copy of the rows whose 'eoy' equals eoy_date
    """
    # Bracket access cannot clash with a DataFrame attribute, and the explicit
    # copy lets callers add or drop columns on the result without pandas
    # raising SettingWithCopyWarning or the change leaking back into df
    return df.loc[df['eoy'] == eoy_date].copy()
def read_yearly_data(data_path, data_type, eoy_date):
    """
    Load the min-max scaled data for one year and split off the ids
    --------
    :param data_path: path prefix of the generated data files
    :param data_type: type of data to read in, used in the file name
    :param eoy_date: user-specified end of year date
    :return: features for the chosen year (without 'SafeHavenID' and 'eoy')
        and the list of 'SafeHavenID' values in the same row order
    """
    scaled_file = f'{data_path}min_max_{data_type}.pkl'
    year_rows = extract_year(pd.read_pickle(scaled_file), eoy_date)

    # Keep the identifiers separately, then strip both bookkeeping columns
    ids = year_rows['SafeHavenID'].to_list()
    features = year_rows.drop(columns=['SafeHavenID', 'eoy'])
    return features, ids
def main():
    """
    Assign cluster labels to one year of data using a saved model
    --------
    Reads the end of year date and data path from config.json and the data
    type and run name from the command line, predicts a label for every row
    of the chosen year with the pickled model, and pickles the unscaled data
    for that year with an added 'cluster' column

    :raises SystemExit: if data_type or run_name is missing from the cmd line
    """
    # Fail early with a usage hint instead of a bare IndexError
    if len(sys.argv) < 3:
        raise SystemExit(f'Usage: {sys.argv[0]} <data_type> <run_name>')

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Set model parameters
    eoy_date = config['date']
    data_path = config['model_data_path']

    # Get datatype from cmd line
    data_type = sys.argv[1]
    run_name = sys.argv[2]

    # Read data
    print('Loading data')
    # NOTE(review): allow_pickle=True and pickle.load below execute arbitrary
    # code from the files they read - only point data_path at trusted output
    columns = np.load(data_path + run_name + '_cols.npy', allow_pickle=True)
    df_scaled, _ = read_yearly_data(data_path, data_type, eoy_date)
    df_scaled_reduced = df_scaled[columns]
    df_unscaled_full = pd.read_pickle(data_path + 'filled_' + data_type + '.pkl')
    # Copy so that adding the cluster column below modifies an independent
    # frame rather than a selection of df_unscaled_full
    df_unscaled = extract_year(df_unscaled_full, eoy_date).copy()

    # Load model
    print('Loading model')
    clf_model_file = data_path + run_name + '_dtc_model.pkl'
    # Context manager closes the file handle even if unpickling fails
    with open(clf_model_file, 'rb') as model_file:
        clf = pickle.load(model_file)

    # Predict on new data
    print('Predicting clusters')
    labels = clf.predict(df_scaled_reduced.to_numpy())
    # NOTE(review): labels are attached by position, which assumes the scaled
    # and unscaled files hold the same rows in the same order - confirm
    df_unscaled['cluster'] = labels
    df_unscaled.to_pickle(data_path + '_'.join((run_name, data_type, 'clusters.pkl')))
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse extract_year) does not read config.json or the command line
if __name__ == '__main__':
    main()