| | """ |
| | Modelling process |
| | """ |
| | import json |
| | import pandas as pd |
| | import numpy as np |
| | import pickle |
| | import matplotlib.pyplot as plt |
| | import mlflow |
| | from matplotlib import rcParams |
| | from sklearn.cluster import AgglomerativeClustering, KMeans |
| | from sklearn.tree import DecisionTreeClassifier as DTC |
| | from sklearn.decomposition import PCA |
| | from sklearn.metrics import (davies_bouldin_score, silhouette_score, |
| | accuracy_score, confusion_matrix, |
| | ConfusionMatrixDisplay) |
| |
|
| |
|
| | |
# Global matplotlib defaults: wide figures without top/right axis spines.
rcParams.update({
    'figure.figsize': (20, 5),
    'axes.spines.top': False,
    'axes.spines.right': False,
})
| |
|
| |
|
def extract_year(df, eoy_date):
    """
    Select the rows belonging to a single end-of-year date
    --------
    :param df: dataframe containing an 'eoy' column
    :param eoy_date: end-of-year value to keep
    :return: rows of df whose 'eoy' value equals eoy_date
    """
    year_mask = df['eoy'] == eoy_date
    return df.loc[year_mask]
| |
|
| |
|
def read_yearly_data(data_path, typ, eoy_date):
    """
    Load one dataset from disk and keep a single year of it
    --------
    :param data_path: path prefix of the generated data; it is joined to the
        file name by plain string concatenation
    :param typ: dataset name used in the file name 'min_max_<typ>.pkl'
    :param eoy_date: end of year date to select data from
    :return: tuple of (selected rows without the 'SafeHavenID' and 'eoy'
        columns, list of the 'SafeHavenID' values of those rows)
    """
    pickle_file = data_path + 'min_max_' + typ + '.pkl'
    yearly = extract_year(pd.read_pickle(pickle_file), eoy_date)
    # pop() removes the id column in place and hands back its values
    ids = yearly.pop('SafeHavenID').to_list()
    features = yearly.drop('eoy', axis=1)

    return features, ids
| |
|
| |
|
def plot_variance(df, typ):
    """
    Plot PCA variance
    ---------
    Fit a PCA on df, plot the cumulative explained variance ratio against
    the number of principal components and log the figure to ML Flow.

    :param df: dataframe to process with PCA
    :param typ: type of plot - for 'full' data or 'reduced'
    :return: fitted pca object
    """
    pca = PCA().fit(df)
    evr = pca.explained_variance_ratio_.cumsum()
    # PCA keeps at most min(n_samples, n_features) components, so the x axis
    # is sized from the fitted result rather than from the column count.
    n = list(range(1, len(evr) + 1))
    fig, ax = plt.subplots()
    ax.plot(n, evr)
    title = 'PCA Variance - ' + typ
    ax.set_title(title, size=20)
    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Cumulative explained variance')
    ax.grid()
    fig.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Release the figure once it has been logged so figures do not pile up.
    plt.close(fig)

    return pca
| |
|
| |
|
def extract_pca_loadings(df, pca_object):
    """
    Extract PCA loadings
    --------
    Loadings are the component vectors scaled by the square root of the
    explained variance of each component.

    :param df: dataframe the pca was fitted on (its columns label the rows)
    :param pca_object: fitted pca object exposing `components_` and
        `explained_variance_`
    :return: loadings table with one row per feature and one 'PC<i>' column
        per fitted component
    """
    cols = df.columns
    # Label columns from the fitted components: there can be fewer
    # components than features, in which case len(cols) would not match.
    n_components = pca_object.components_.shape[0]
    loadings = pd.DataFrame(
        data=pca_object.components_.T * np.sqrt(pca_object.explained_variance_),
        columns=[f'PC{i}' for i in range(1, n_components + 1)],
        index=cols)

    return loadings
| |
|
| |
|
def plot_loadings(loadings):
    """
    Plot loadings for PC1 returned from PCA
    --------
    Draw a bar chart of the absolute PC1 loading of every feature, largest
    first, and log the figure to ML Flow.

    :param loadings: loadings table with features as index and a 'PC1' column
    :return: table with columns 'Attribute' and 'AbsCorrWithPC1', sorted by
        descending absolute PC1 loading
    """
    loadings_abs = loadings.abs().sort_values(by='PC1', ascending=False)
    # Name the index before resetting it so the feature column is always
    # called 'Attribute', even when the incoming index already has a name.
    pc1_abs = (loadings_abs[['PC1']]
               .rename_axis('Attribute')
               .reset_index()
               .rename({'PC1': 'AbsCorrWithPC1'}, axis=1))
    fig, ax = plt.subplots()
    pc1_abs.plot(ax=ax, kind='bar')
    title = 'PCA loading scores (PC1)'
    ax.set_title(title, size=20)
    ax.set_xticks(ticks=pc1_abs.index, labels=pc1_abs.Attribute, rotation='vertical')
    ax.set_xlabel('Attribute')
    ax.set_ylabel('AbsCorrWithPC1')
    fig.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Release the figure once it has been logged so figures do not pile up.
    plt.close(fig)

    return pc1_abs
| |
|
| |
|
def extract_array(df, data_path, run_name, pca_object, typ):
    """
    Extract data to pass to clustering algos
    --------
    For typ == 'train' the PCA is fitted on df and then pickled to
    '<data_path><run_name>_pca.pkl'; for any other typ the PCA passed in is
    only applied and nothing is written.

    :param df: dataframe to convert
    :param data_path: path to generated data
    :param run_name: name of run in ML Flow
    :param pca_object: initialised PCA object
    :param typ: type of data being converted, 'train' fits the PCA
    :return: PCA-transformed data as a numpy array
    """
    is_train = typ == 'train'
    pca_func = pca_object.fit_transform if is_train else pca_object.transform
    pca_data = np.asarray(pca_func(df))

    if is_train:
        pca_file = data_path + run_name + '_pca.pkl'
        # Context manager guarantees the pickle file is flushed and closed.
        with open(pca_file, 'wb') as pca_handle:
            pickle.dump(pca_object, pca_handle)

    return pca_data
| |
|
| |
|
def get_kmeans_score(data, k):
    """
    Fit K-Means with k clusters and score the resulting partition
    --------
    :param data: dataset to fit K-Means to
    :param k: number of centers/clusters
    :return: tuple of (Davies Bouldin score, Silhouette score)
    """
    cluster_labels = KMeans(n_clusters=k).fit_predict(data)
    scores = (davies_bouldin_score(data, cluster_labels),
              silhouette_score(data, cluster_labels))

    return scores
| |
|
| |
|
def _plot_score_vs_k(centers, scores, score_name):
    """
    Plot one clustering score against the number of clusters and log the
    figure to ML Flow.
    --------
    :param centers: cluster counts used on the x axis
    :param scores: score obtained for each entry of centers
    :param score_name: score label, e.g. 'Davies Bouldin' or 'Silhouette'
    """
    fig, ax = plt.subplots()
    ax.plot(centers, scores, linestyle='--', marker='o', color='b')
    ax.set_xlabel('K')
    ax.set_ylabel(score_name + ' score')
    title = score_name + ' score vs. K'
    ax.set_title(title, size=20)
    fig.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Release the figure once it has been logged so figures do not pile up.
    plt.close(fig)


def plot_DB(df):
    """
    Extract Davies Bouldin and Silhouette scores and plot them for a range
    of cluster numbers (K = 2..9), applied using K-Means clustering.

    "Davies Bouldin index represents the average 'similarity' of clusters,
    where similarity is a measure that relates cluster distance to cluster
    size" - the lowest score indicates best cluster set.
    --------
    :param df: data to cluster and plot scores from
    """
    centers = list(range(2, 10))
    db_scores = []
    sil_scores = []
    for center in centers:
        db_score, sil_score = get_kmeans_score(df, center)
        db_scores.append(db_score)
        sil_scores.append(sil_score)

    # One figure per score; both are logged under 'fig/<title>.png'.
    _plot_score_vs_k(centers, db_scores, 'Davies Bouldin')
    _plot_score_vs_k(centers, sil_scores, 'Silhouette')
| |
|
| |
|
def plot_clust(df, labels):
    """
    Plot clusters
    --------
    Draw a 3D scatter of the first three columns of the data, coloured by
    cluster label, and log the figure to ML Flow.

    :param df: 2D array-like of data to plot clusters from; columns are
        read by position
    :param labels: cluster labels, one per row of df
    """
    coords = np.asarray(df)
    # The number of PCA components is data driven and can be below three;
    # pad the missing axes with zeros so the 3D plot can still be drawn.
    n_missing = 3 - coords.shape[1]
    if n_missing > 0:
        padding = np.zeros((coords.shape[0], n_missing))
        coords = np.hstack((coords, padding))

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    sc = ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2], c=labels)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    ax.legend(*sc.legend_elements(), title='clusters')
    title = 'Clusters'
    ax.set_title(title, size=20)
    fig.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Release the figure once it has been logged so figures do not pile up.
    plt.close(fig)
| |
|
| |
|
def save_clusters(data_path, run_name, eoy_date, typ, labels):
    """
    Save results from clustering
    --------
    Read 'filled_<typ>.pkl', keep the rows whose 'eoy' equals eoy_date,
    attach the labels as a 'cluster' column and write the result to
    '<run_name>_<typ>_clusters.pkl'. Both files live under data_path.

    :param data_path: path to generated data
    :param run_name: name of run in ML Flow, used as output file prefix
    :param eoy_date: end of year date the labels were produced for
    :param typ: type of datasets - train, val
    :param labels: labels from clustering to add to df, one per kept row
    :return: None
    """
    df_full = pd.read_pickle(data_path + 'filled_' + typ + '.pkl')
    # assign() builds a new frame, so the label column is never written
    # onto a filtered slice of df_full.
    df = df_full[df_full.eoy == eoy_date].assign(cluster=labels)
    df.to_pickle(data_path + '_'.join((run_name, typ, 'clusters.pkl')))
| |
|
| |
|
def main():
    """
    Run the date-specific clustering pipeline and track it in ML Flow
    --------
    Steps: load the config and model parameters, start an ML Flow run, read
    the train/val data for the configured year, reduce the features in two
    PCA stages, fit the clustering model on train and val together, save the
    cluster labels, then train a decision tree on the train labels and log
    its accuracy and confusion matrix on the val labels.
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    eoy_date = config['date']
    data_path = config['model_data_path']
    model_type = config['model_type']

    # Clustering parameters are kept in '<model_type>_params.json'
    with open(model_type + '_params.json') as json_params_file:
        model_params = json.load(json_params_file)

    # Naming for the ML Flow experiment/run and the output artefacts
    stamp = str(pd.Timestamp.now(tz='GMT+0'))[:16].replace(':', '').replace(' ', '_')
    experiment_name = 'Model E - Date Specific: ' + model_type
    run_name = "_".join((str(eoy_date), model_type, stamp))
    description = "Clustering model for COPD data in the year prior to " + str(eoy_date)

    print('Setting up ML Flow run')
    mlflow.set_tracking_uri('http://127.0.0.1:5000/')
    mlflow.set_experiment(experiment_name)

    # The context manager ends the run even if a later step raises, so a
    # failed pipeline does not leave the run open.
    with mlflow.start_run(run_name=run_name, description=description):
        mlflow.set_tag("model.name", model_type)
        mlflow.set_tag("model.training_data", config['extract_data_path'])
        mlflow.set_tag("model.training_date", eoy_date)
        mlflow.log_param("k", model_params['n_clusters'])

        # Read in data for the chosen year (once)
        df_train, train_ids = read_yearly_data(data_path, 'train', eoy_date)
        df_val, val_ids = read_yearly_data(data_path, 'val', eoy_date)
        mlflow.log_param("n_cols", len(df_train.columns))

        # Stage 1: count the components needed for 90% cumulative variance
        # and keep that many features, ranked by absolute PC1 loading.
        print('Feature reduction stage 1')
        pca = plot_variance(df_train, 'full')
        loadings = extract_pca_loadings(df_train, pca)
        pc1_abs_loadings = plot_loadings(loadings)
        variance_full = pca.explained_variance_ratio_.cumsum()
        n_cols = np.argmax(variance_full >= 0.9) + 1
        mlflow.log_param("pca_stage_1", n_cols)
        columns = pc1_abs_loadings.Attribute[:n_cols].values
        np.save(data_path + run_name + '_cols.npy', columns)

        df_train_reduced = df_train[columns]
        df_val_reduced = df_val[columns]

        # Stage 2: PCA on the reduced features, keeping the components
        # needed for 80% cumulative variance.
        print('Feature reduction stage 2')
        pca_n_cols = plot_variance(df_train_reduced, 'reduced')
        variance_reduced = pca_n_cols.explained_variance_ratio_.cumsum()
        n_components = np.argmax(variance_reduced >= 0.8) + 1
        mlflow.log_param("pca_stage_2", n_components)
        pca_reduced = PCA(n_components=n_components)
        data_train = extract_array(
            df_train_reduced, data_path, run_name, pca_reduced, 'train')
        data_val = extract_array(
            df_val_reduced, data_path, run_name, pca_reduced, 'test')

        print('Detecting best cluster number')
        plot_DB(data_train)

        # The clustering model is fitted on train and val stacked together
        print('Cluster model training')
        data = np.concatenate((data_train, data_val))
        if model_type == 'hierarchical':
            cluster_model = AgglomerativeClustering(**model_params)
        else:
            cluster_model = KMeans(**model_params)
        cluster_model.fit(data)
        cluster_model_file = data_path + "_".join((run_name, model_type, 'cluster_model.pkl'))
        with open(cluster_model_file, 'wb') as cluster_model_handle:
            pickle.dump(cluster_model, cluster_model_handle)

        # Split labels back by position: train rows come first in `data`
        labels = cluster_model.labels_
        train_labels = labels[:len(train_ids)]
        val_labels = labels[len(train_ids):]
        save_clusters(data_path, run_name, eoy_date, 'train', train_labels)
        save_clusters(data_path, run_name, eoy_date, 'val', val_labels)

        plot_clust(data, labels)

        with open('dtc_params.json') as dtc_params_file:
            dtc_params = json.load(dtc_params_file)

        # Decision tree is trained on the reduced features to predict the
        # cluster labels assigned above.
        print('Decision tree classifier training')
        clf = DTC(**dtc_params).fit(df_train_reduced.to_numpy(), train_labels)
        clf_model_file = data_path + run_name + '_dtc_model.pkl'
        with open(clf_model_file, 'wb') as clf_model_handle:
            pickle.dump(clf, clf_model_handle)

        val_pred = clf.predict(df_val_reduced.to_numpy())

        accuracy = accuracy_score(val_labels, val_pred)
        mlflow.log_metric('dtc accuracy', accuracy)

        cm = confusion_matrix(val_labels, val_pred, labels=clf.classes_)
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=clf.classes_)
        disp.plot()
        plt.tight_layout()
        mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix' + '.png')
| |
|
| |
|
# Run the pipeline.
# NOTE(review): this call is not wrapped in an `if __name__ == '__main__'`
# guard, so importing this module also executes main().
main()
| |
|