| | """ |
| | Modelling process |
| | """ |
| | import pandas as pd |
| | import numpy as np |
| | import pickle |
| | import matplotlib.pyplot as plt |
| | import mlflow |
| | from matplotlib import rcParams |
| | from sklearn.cluster import AgglomerativeClustering, KMeans |
| | from sklearn.decomposition import PCA |
| | from sklearn.metrics import (davies_bouldin_score, silhouette_score, |
| | accuracy_score, confusion_matrix, |
| | ConfusionMatrixDisplay) |
| | from sklearn.multiclass import OneVsRestClassifier |
| | from sklearn.tree import DecisionTreeClassifier |
| | import os |
| |
|
| |
|
| | |
# Global matplotlib defaults applied to every figure created below:
# wide 20x5 figures with the top and right axis spines hidden.
rcParams.update({
    'figure.figsize': (20, 5),
    'axes.spines.top': False,
    'axes.spines.right': False,
})
| |
|
| | |
# Run configuration shared by the functions below.
year = 2019                  # year of data selected for modelling
model_type = 'hierarchical'  # used in experiment/run names and MLflow tags
data_type = 'train'
k = 3                        # number of clusters passed to the cluster model
# Current GMT time formatted as YYYY-MM-DD_HHMM; used to make run names unique.
stamp = pd.Timestamp.now(tz='GMT+0').strftime('%Y-%m-%d_%H%M')
data_path = '<YOUR_DATA_PATH>/Model_E_Extracts/'
| |
|
| | |
# Default MLflow tracking location (main() later points tracking at a
# local server) and the names/description used for the experiment run.
mlflow.set_tracking_uri("file:/.")
tracking_uri = mlflow.get_tracking_uri()
experiment_name = f'Model E: one vs rest adaption DTC {model_type}'
run_name = f'{year}_{model_type}_{stamp}'
description = f"Clustering model with one vs rest adaption (DTC) for COPD data in {year}"
| |
|
| |
|
def extract_year(df, year):
    """
    Select the rows of a dataframe that belong to one year
    --------
    :param df: dataframe containing a 'year' column
    :param year: year to keep
    :return: rows of df whose 'year' value equals the requested year
    """
    in_year = df['year'] == year
    return df[in_year]
| |
|
| |
|
def read_yearly_data(typ, year):
    """
    Load the min-max pickle for a dataset and keep a single year
    --------
    :param typ: dataset name used in the file name 'min_max_<typ>.pkl'
    :param year: year to select data from
    :return: tuple of (dataframe for the chosen year without the
        'SafeHavenID' and 'year' columns, list of SafeHavenID values)
    """
    pickle_path = ''.join((data_path, 'min_max_', typ, '.pkl'))
    selected = extract_year(pd.read_pickle(pickle_path), year)

    # pop() removes the id column from the frame and hands it back
    ids = selected.pop('SafeHavenID').to_list()
    features = selected.drop(columns='year')

    return features, ids
| |
|
| |
|
def plot_variance(df, typ):
    """
    Fit a PCA and plot its cumulative explained variance
    ---------
    :param df: dataframe to process with PCA
    :param typ: type of plot - for 'full' data or 'reduced'; appended to
        the plot title and to the logged file name
    :return: fitted pca object
    """
    pca = PCA().fit(df)
    evr = pca.explained_variance_ratio_.cumsum()
    # Size the x-axis from the fitted result: PCA keeps at most
    # min(n_samples, n_features) components, which can be fewer than the
    # number of columns in df.
    n = list(range(1, len(evr) + 1))

    title = 'PCA Variance - ' + typ
    fig, ax = plt.subplots()
    ax.plot(n, evr)
    ax.set_title(title, size=20)
    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Cumulative explained variance')
    ax.grid()
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')

    return pca
| |
|
| |
|
def extract_pca_loadings(df, pca_object):
    """
    Extract PCA loadings
    --------
    :param df: dataframe the PCA was fitted on; its columns become the
        row index of the loadings table
    :param pca_object: fitted pca object exposing 'components_' and
        'explained_variance_'
    :return: loadings table with one row per feature and one column
        ('PC1', 'PC2', ...) per fitted component
    """
    components = pca_object.components_
    # Scale each component by the square root of its explained variance
    scaled = components.T * np.sqrt(pca_object.explained_variance_)
    # Name columns after the components actually fitted, which may be
    # fewer than the number of features.
    pc_names = [f'PC{i}' for i in range(1, components.shape[0] + 1)]

    return pd.DataFrame(data=scaled, columns=pc_names, index=df.columns)
| |
|
| |
|
def plot_loadings(loadings):
    """
    Bar-plot the absolute PC1 loadings, largest first
    --------
    :param loadings: loadings table containing a 'PC1' column
    :return: table with columns 'Attribute' and 'AbsCorrWithPC1', sorted
        by descending absolute PC1 loading
    """
    ranked = loadings.abs().sort_values(by='PC1', ascending=False)
    pc1_abs = (
        ranked[['PC1']]
        .reset_index()
        .rename(columns={'index': 'Attribute', 'PC1': 'AbsCorrWithPC1'})
    )

    title = 'PCA loading scores (PC1)'
    fig, ax = plt.subplots()
    pc1_abs.plot(ax=ax, kind='bar')
    ax.set_title(title, size=20)
    # Label each bar with the attribute it belongs to
    ax.set_xticks(ticks=pc1_abs.index, labels=pc1_abs.Attribute, rotation='vertical')
    ax.set_xlabel('Attribute')
    ax.set_ylabel('AbsCorrWithPC1')
    plt.tight_layout()
    mlflow.log_figure(fig, f'fig/{title}.png')

    return pc1_abs
| |
|
| |
|
def extract_array(df, pca_object, typ):
    """
    Extract data to pass to clustering algos
    --------
    :param df: dataframe to convert
    :param pca_object: initialised PCA object
    :param typ: 'train' fits the PCA on df and pickles the fitted object
        to '<data_path><run_name>_pca.pkl'; any other value only applies
        the already-fitted transform
    :return: PCA-transformed data as a numpy array
    """
    is_train = typ == 'train'
    pca_func = pca_object.fit_transform if is_train else pca_object.transform

    pca_data = pd.DataFrame(pca_func(df)).to_numpy()

    if is_train:
        pca_file = data_path + run_name + '_pca.pkl'
        # Context manager guarantees the file handle is flushed and closed
        with open(pca_file, 'wb') as handle:
            pickle.dump(pca_object, handle)

    return pca_data
| |
|
| |
|
def get_kmeans_score(data, k):
    """
    Calculate K-Means Davies Bouldin and Silhouette scores
    --------
    :param data: dataset to fit K-Means to
    :param k: number of centers/clusters
    :return: tuple of (Davies Bouldin score, Silhouette score)
    """
    # fit_predict returns the cluster label assigned to each row of data
    labels = KMeans(n_clusters=k).fit_predict(data)

    return davies_bouldin_score(data, labels), silhouette_score(data, labels)
| |
|
| |
|
def plot_DB(df):
    """
    Compute Davies Bouldin and Silhouette scores for K-Means with
    K = 2..9 and log one line plot per score to MLflow.

    "Davies Bouldin index represents the average 'similarity' of clusters,
    where similarity is a measure that relates cluster distance to cluster
    size" - the lowest score indicates best cluster set.
    --------
    :param df: data to cluster and score
    """
    centers = list(range(2, 10))
    scores = [get_kmeans_score(df, n_clusters) for n_clusters in centers]
    db_scores = [db for db, _ in scores]
    sil_scores = [sil for _, sil in scores]

    # (values, y-axis label, title) for each figure, drawn in this order
    panels = (
        (db_scores, 'Davies Bouldin score', 'Davies Bouldin score vs. K'),
        (sil_scores, 'Silhouette score', 'Silhouette score vs. K'),
    )
    for values, ylabel, title in panels:
        fig, ax = plt.subplots()
        ax.plot(centers, values, linestyle='--', marker='o', color='b')
        ax.set_xlabel('K')
        ax.set_ylabel(ylabel)
        ax.set_title(title, size=20)
        plt.tight_layout()
        mlflow.log_figure(fig, 'fig/' + title + '.png')
| |
|
| |
|
def plot_clust(df, labels):
    """
    Log a 3D scatter plot of the clusters to MLflow
    --------
    :param df: 2-D array; its first three columns are plotted as
        principal components 1 to 3
    :param labels: cluster labels, used to colour the points
    """
    title = 'Clusters'
    pc1, pc2, pc3 = df[:, 0], df[:, 1], df[:, 2]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    points = ax.scatter(pc1, pc2, pc3, c=labels)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    # One legend entry per distinct label colour
    handles, names = points.legend_elements()
    ax.legend(handles, names, title='clusters')
    ax.set_title(title, size=20)
    plt.tight_layout()
    mlflow.log_figure(fig, f'fig/{title}.png')
| |
|
| |
|
def save_clusters(typ, labels):
    """
    Save results from clustering
    --------
    Reads 'filled_<typ>.pkl' from data_path, keeps the rows for the
    configured year, adds the labels as a 'cluster' column and writes the
    result to '<data_path><run_name>_<typ>_clusters.pkl'.

    :param typ: type of dataset - train, val
    :param labels: labels from clustering to add to df; one per row of
        the selected year
    """
    df_full = pd.read_pickle(data_path + 'filled_' + typ + '.pkl')
    # Take an explicit copy so the new column is written to an independent
    # frame rather than to a filtered slice of df_full.
    df = df_full[df_full.year == year].copy()
    df['cluster'] = labels
    df.to_pickle(data_path + '_'.join((run_name, typ, 'clusters.pkl')))
| |
|
| |
|
def _dump_pickle(obj, path):
    """Pickle obj to path, closing the file handle when done."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _log_feature_importances(clf, feature_names):
    """Log one feature-importance bar chart per one-vs-rest estimator."""
    estimators = clf.estimators_
    names = np.asarray(feature_names)
    n_features = len(names)

    # squeeze=False keeps axs two-dimensional even with a single estimator
    fig, axs = plt.subplots(len(estimators), 1,
                            figsize=(10, 5 * len(estimators)), squeeze=False)

    for i, estimator in enumerate(estimators):
        ax = axs[i, 0]
        importance = estimator.feature_importances_
        # Most important feature first
        indices = np.argsort(importance)[::-1]

        ax.bar(range(n_features), importance[indices])
        ax.set_xticks(range(n_features))
        ax.set_xticklabels(names[indices], rotation=90, fontsize=9)
        ax.set_xlabel('Features')
        ax.set_ylabel('Importance')
        ax.set_title('Class {} Feature Importance'.format(i))

    fig.subplots_adjust(hspace=0.5)

    # Save to a temporary file, log it, and always clean the file up.
    # NOTE(review): the second argument of log_artifact is the destination
    # directory inside the run's artifacts, so the image is stored as
    # 'feature_importance.png/plot.png' - kept as in the original.
    tmpfile = "plot.png"
    fig.savefig(tmpfile)
    try:
        mlflow.log_artifact(tmpfile, "feature_importance.png")
    finally:
        os.remove(tmpfile)


def main():
    """
    Run the full modelling process for the configured year:
    two-stage PCA feature reduction, cluster-number diagnostics,
    agglomerative clustering of train + validation data, and a
    one-vs-rest decision tree trained to reproduce the cluster labels.
    Parameters, figures and the validation accuracy are logged to MLflow;
    fitted objects and labelled data are pickled under data_path.
    """
    # Read in data
    df_train, train_ids = read_yearly_data('train', year)
    df_val, _ = read_yearly_data('val', year)

    print('Setting up ML Flow run')
    mlflow.set_tracking_uri('http://127.0.0.1:5000/')
    mlflow.set_experiment(experiment_name)

    # The context manager ends the run even if a later step raises
    with mlflow.start_run(run_name=run_name, description=description):
        mlflow.set_tag("model.name", model_type)
        mlflow.set_tag("model.training_data", "EXAMPLE_STUDY_DATA")
        mlflow.set_tag("model.training_year", year)
        mlflow.log_param("n_cols", len(df_train.columns) - 1)
        mlflow.log_param("k", k)

        print('Feature reduction stage 1')
        pca = plot_variance(df_train, 'full')
        loadings = extract_pca_loadings(df_train, pca)
        pc1_abs_loadings = plot_loadings(loadings)
        variance_full = pca.explained_variance_ratio_.cumsum()

        # Number of components needed to reach 90% cumulative variance
        n_cols = np.argmax(variance_full >= 0.9) + 1

        mlflow.log_param("pca_stage_1", n_cols)
        # Keep that many features, ranked by absolute PC1 loading
        columns = pc1_abs_loadings.Attribute[:n_cols].values
        np.save(data_path + run_name + '_cols.npy', columns)

        df_train_reduced = df_train[columns]
        df_val_reduced = df_val[columns]

        print('Feature reduction stage 2')
        pca_n_cols = plot_variance(df_train_reduced, 'reduced')
        variance_reduced = pca_n_cols.explained_variance_ratio_.cumsum()

        # Number of components needed to reach 80% cumulative variance
        n_components = np.argmax(variance_reduced >= 0.8) + 1
        mlflow.log_param("pca_stage_2", n_components)
        pca_reduced = PCA(n_components=n_components)
        data_train = extract_array(df_train_reduced, pca_reduced, 'train')
        data_val = extract_array(df_val_reduced, pca_reduced, 'test')

        print('Detecting best cluster number')
        plot_DB(data_train)

        print('Cluster model training')
        # Train and validation rows are clustered together, train first
        data = np.concatenate((data_train, data_val))
        cluster_model = AgglomerativeClustering(n_clusters=k, linkage="ward")
        cluster_model.fit(data)
        cluster_model_file = data_path + "_".join(
            (run_name, model_type, 'cluster_model.pkl'))
        _dump_pickle(cluster_model, cluster_model_file)

        # Split the labels back into their train and validation parts
        labels = cluster_model.labels_
        train_labels = labels[:len(train_ids)]
        val_labels = labels[len(train_ids):]
        save_clusters('train', train_labels)
        save_clusters('val', val_labels)

        plot_clust(data, labels)

        print('DTC classifier training')
        clf_pre = DecisionTreeClassifier(random_state=42)
        clf = OneVsRestClassifier(clf_pre)
        clf.fit(df_train_reduced.to_numpy(), train_labels)
        clf_model_file = data_path + run_name + '_dtc_model.pkl'
        _dump_pickle(clf, clf_model_file)

        _log_feature_importances(clf, df_train_reduced.columns)

        # Score the classifier against the validation cluster labels
        val_pred = clf.predict(df_val_reduced.to_numpy())
        accuracy = accuracy_score(val_labels, val_pred)
        mlflow.log_metric('dtc accuracy', accuracy)

        cm = confusion_matrix(val_labels, val_pred, labels=clf.classes_)
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=clf.classes_)
        disp.plot()
        plt.tight_layout()
        mlflow.log_figure(disp.figure_, 'fig/confusion_matrix.png')


# Run the pipeline when this file is executed
main()
| |
|