Spaces:
Build error
Build error
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| import numpy as np | |
| import pandas as pd | |
| import time | |
| import os.path | |
| # from sklearnex import patch_sklearn, unpatch_sklearn | |
| # patch_sklearn() | |
| from sklearn.cluster import KMeans | |
| from sklearn.cluster import MeanShift | |
def modified_kmeans_cluster(values_to_cluster, threshold, k_start, n_clusters=None):
    """Cluster the rows of ``values_to_cluster`` with KMeans.

    When ``n_clusters`` is given, fit exactly that many clusters and return
    the labels. Otherwise search for a cluster count: starting from
    ``k_start``, keep adding one cluster at a time until the relative
    inertia improvement (judged by ``terminate_clustering``) drops below
    ``threshold``, then return the labels of the last accepted fit.

    :param values_to_cluster: 2-D array-like, m n-dimensional vectors.
    :param threshold: relative-improvement cutoff for the elbow search.
    :param k_start: initial number of clusters for the search.
    :param n_clusters: optional fixed cluster count (skips the search).
    :return: array of cluster labels, one per input row.
    """
    if n_clusters is not None:
        fixed = KMeans(n_clusters=n_clusters, n_init="auto", random_state=0).fit(values_to_cluster)
        return fixed.labels_

    n_values = len(values_to_cluster)
    assert n_values > 0
    k = k_start
    accepted = KMeans(n_clusters=k, n_init="auto", random_state=0).fit(values_to_cluster)
    inertias = [accepted.inertia_]
    # Grow k until the improvement stalls or every point is its own cluster.
    while k < n_values:
        candidate = KMeans(n_clusters=k + 1, n_init="auto", random_state=0).fit(values_to_cluster)
        inertias.append(candidate.inertia_)
        if terminate_clustering(inertias, threshold):
            # Candidate gained too little: keep the previously accepted fit.
            break
        accepted = candidate
        k += 1
    return accepted.labels_
def terminate_clustering(inertias, threshold):
    """Decide whether the incremental KMeans search should stop.

    Computes the relative improvement of the newest inertia over the
    previous one, ``1 - inertias[-1] / inertias[-2]``, and stops when it
    falls below ``threshold``.

    Fix: if the previous inertia is already 0 (perfect clustering, e.g.
    duplicate points), the original expression raised ZeroDivisionError;
    no further improvement is possible, so we terminate instead.

    :param inertias: list of inertia values, at least two entries.
    :param threshold: minimum relative improvement required to continue.
    :return: True if the search should stop, False otherwise.
    """
    assert len(inertias) > 1
    if inertias[-2] == 0:
        # Inertia cannot drop below zero; adding clusters cannot help.
        return True
    improvement = 1 - (inertias[-1] / inertias[-2])
    return improvement < threshold
def cluster_existed_features(network_folder_path, classes, layers_indexes, taus):
    """Cluster per-layer, per-class feature CSVs for every tau, reusing prior results.

    For each (layer index, class, correctness-suffix) combination this reads
    ``<network_folder_path>Layer_minus_<i>/class_<y><appendix>``, clusters the
    feature columns with ``modified_kmeans_cluster`` once per new tau, and
    writes/updates ``clustering_results_class_<y><appendix>`` in the same layer
    folder (one label column per tau, named by ``str(tau)``).

    NOTE(review): indentation was reconstructed from a mangled paste — the
    nesting of the cache-loading and save steps should be confirmed against
    the original file.

    :param network_folder_path: base folder path; assumed to end with a path
        separator, since layer names are appended by string concatenation —
        TODO confirm.
    :param classes: iterable of class identifiers used in the file names.
    :param layers_indexes: iterable of layer indexes ("Layer_minus_<i>").
    :param taus: iterable of improvement thresholds to cluster with.
    """
    appendixes = ["_correctly_classified_features.csv", "_incorrectly_classified_features.csv"]
    # Lazy cartesian product over every (layer, class, suffix) file to process.
    product = ((i, y, appendix) for i in layers_indexes for y in classes for appendix in appendixes)
    for i, y, appendix in product:
        start_time = time.time()
        # Load the feature CSV for class y at layer minus i.
        features_file_path = network_folder_path +"Layer_minus_" + str(i) + "/class_" + str(y) + appendix
        df = pd.read_csv(features_file_path)
        index_values = df["index"].to_numpy()  # NOTE(review): currently unused
        # Columns 0-2 are assumed to be index/true_label/pred_label metadata;
        # everything from column 3 on is treated as feature values.
        values_to_cluster = df[df.columns[3:]].to_numpy()
        if len(values_to_cluster):
            # k_and_taus maps str(tau) -> number of clusters found for that tau.
            k_and_taus = dict()
            taus_existed = []
            clustering_results = pd.DataFrame(df, columns=["index", "true_label", "pred_label"])
            clustering_results_path = network_folder_path + "Layer_minus_" + str(i) + "/clustering_results_class_" + str(y) + appendix
            if os.path.exists(clustering_results_path):
                # Resume from previously saved results: each label column's
                # max label + 1 is the cluster count reached for that tau.
                clustering_results = pd.read_csv(clustering_results_path)
                for col in clustering_results.columns[3:]:
                    k_and_taus[col] = clustering_results[col].max() + 1
                # Recover the taus already processed (column names are str(tau)).
                taus_existed = [float(key) for key in k_and_taus.keys()]
            # Only cluster taus that have no saved column yet.
            taus_new = [tau for tau in taus if tau not in taus_existed]
            for tau in taus_new:
                # Warm-start the search: a bigger tau terminates earlier, so the
                # cluster count it reached is a valid lower bound for this tau.
                k_start = 1
                bigger_taus = [x for x in taus_existed if x > tau]
                if len(bigger_taus):
                    tau_closest = min(bigger_taus)
                    k_start = k_and_taus[str(tau_closest)]
                # Run the incremental KMeans search for this tau.
                cluster_labels = modified_kmeans_cluster(values_to_cluster, tau, k_start)
                clustering_results[str(tau)] = cluster_labels
                taus_existed.append(tau)
                k_and_taus[str(tau)] = max(cluster_labels) + 1
            # Persist all (old + new) label columns for this file.
            clustering_results.to_csv(clustering_results_path, index = False)
        elapsed_time = time.time() - start_time
        print("file:" + "Layer_minus_" + str(i) + "_class_" + str(y) + appendix + ",", "lasting time:", elapsed_time, "seconds")
def features_clustering(features, taus, nb_clusters):
    """Cluster an in-memory feature matrix once per tau.

    For each tau in ``taus``, runs ``modified_kmeans_cluster`` on ``features``
    and stores the resulting labels under key ``str(tau)``. Cluster counts
    reached for larger taus are reused to warm-start the search for smaller
    ones within this call.

    Fix: ``clustering_results`` was only assigned inside the
    ``if len(values_to_cluster):`` branch, so empty ``features`` raised
    ``NameError`` at the return; it is now initialized up front and an empty
    dict is returned in that case.

    :param features: 2-D array-like of vectors to cluster (may be empty).
    :param taus: iterable of improvement thresholds.
    :param nb_clusters: fixed cluster count forwarded to
        ``modified_kmeans_cluster`` (None enables the elbow search).
    :return: dict mapping str(tau) -> array of cluster labels.
    """
    start_time = time.time()
    values_to_cluster = features
    # Initialize before the guard so empty input returns an empty result
    # instead of raising NameError.
    clustering_results = dict()
    if len(values_to_cluster):
        # k_and_taus maps str(tau) -> number of clusters found for that tau.
        k_and_taus = dict()
        taus_existed = []
        taus_new = [tau for tau in taus if tau not in taus_existed]
        for tau in taus_new:
            # Warm-start: a bigger tau terminates earlier, so its cluster
            # count is a valid lower bound for this (smaller) tau.
            k_start = 1
            bigger_taus = [x for x in taus_existed if x > tau]
            if len(bigger_taus):
                tau_closest = min(bigger_taus)
                k_start = k_and_taus[str(tau_closest)]
            # Run the clustering for this tau.
            cluster_labels = modified_kmeans_cluster(values_to_cluster, tau, k_start, nb_clusters)
            clustering_results[str(tau)] = cluster_labels
            taus_existed.append(tau)
            k_and_taus[str(tau)] = max(cluster_labels) + 1
    elapsed_time = time.time() - start_time
    return clustering_results