Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from time import time | |
| from sklearn import metrics | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| from huggingface_hub import login | |
| from datasets import load_dataset | |
| import matplotlib.pyplot as plt | |
| # https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py | |
def display_plot(data, n_digits):
    """Cluster the 2-D PCA projection of ``data`` and draw the K-means regions.

    The data is reduced to two principal components, K-means is fitted on the
    reduced points, and every point of a fine mesh covering the data is
    coloured by the cluster it is assigned to. The samples are drawn as black
    dots and the centroids as white crosses.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``PCA(n_components=2).fit_transform``.
    n_digits : int or float
        Number of clusters. Cast to ``int`` before use, because UI sliders may
        deliver the value as a float and ``KMeans`` expects an integer
        ``n_clusters``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the plot.
    """
    # Build the figure directly instead of through pyplot: pyplot keeps every
    # figure it creates in a global registry until it is closed, which leaks
    # one figure per request in a long-running app.
    from matplotlib.figure import Figure

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init="k-means++", n_clusters=int(n_digits), n_init=4)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = 0.02

    # Plot the decision boundary. For that, we assign a color to each point
    # in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain the cluster label of each mesh point and shape it like the mesh.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    fig = Figure()
    ax = fig.subplots()
    ax.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap="Paired",
        aspect="auto",
        origin="lower",
    )
    ax.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    ax.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    ax.set_title(
        "K-means clustering on the digits dataset (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return fig
def bench_k_means(kmeans, name, data, labels):
    """Fit ``kmeans`` behind a standard scaler and score the resulting clustering.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization
        strategy is already configured.
    name : str
        Label of the strategy; stored as the first entry of the result row.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels, needed by the supervised clustering metrics.

    Returns
    -------
    list
        ``[name, fit_time, inertia, homogeneity, completeness, v_measure,
        adjusted_rand, adjusted_mutual_info, silhouette]``.
    """
    # Time the pipeline construction together with the fit.
    started = time()
    pipeline = make_pipeline(StandardScaler(), kmeans)
    pipeline.fit(data)
    elapsed = time() - started

    fitted_kmeans = pipeline[-1]
    predicted = fitted_kmeans.labels_

    row = [name, elapsed, fitted_kmeans.inertia_]

    # Metrics that only compare the true labels with the predicted ones.
    supervised_scores = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    for score in supervised_scores:
        row.append(score(labels, predicted))

    # The silhouette score works on the samples themselves; it is evaluated
    # on the unscaled ``data`` using a sample of 300 points.
    silhouette = metrics.silhouette_score(
        data,
        predicted,
        metric="euclidean",
        sample_size=300,
    )
    row.append(silhouette)
    return row
# Heading text of the demo; passed to gr.Blocks(title=...) and rendered as
# the top-level Markdown heading of the page.
title: str = "A demo of K-Means clustering on the handwritten digits data"
# Row labels of the results table, in the order bench_k_means reports them
# (after the leading strategy name).
_METRIC_NAMES = ["time", "inertia", "homo", "compl", "v-meas", "ARI", "AMI", "silhouette"]

# Per-process cache so the dataset is fetched once instead of on every click.
_DIGITS_CACHE = {}


def _load_digits():
    """Return ``(features, labels)`` of the digits dataset, loading it only once."""
    if "frame" not in _DIGITS_CACHE:
        dataset = load_dataset("sklearn-docs/digits", header=None)
        _DIGITS_CACHE["frame"] = dataset["train"].to_pandas()
    frame = _DIGITS_CACHE["frame"]
    # The first 64 columns are the features, column 64 holds the label.
    return frame.iloc[:, :64], frame.iloc[:, 64]


def do_submit(kmeans_n_digit, random_n_digit, pca_n_digit):
    """Benchmark three K-means initializations and build the demo outputs.

    Parameters
    ----------
    kmeans_n_digit : int or float
        Number of clusters for the ``k-means++`` run and for the plot.
    random_n_digit : int or float
        Number of clusters for the ``random`` initialization run.
    pca_n_digit : int or float
        Number of PCA components, used both as the initial centroids and as
        the number of clusters of the ``PCA-based`` run.

    Returns
    -------
    tuple
        ``(figure, table)``: the figure produced by ``display_plot`` and a
        DataFrame with one row per metric and the columns ``metrics``,
        ``k-means++``, ``random`` and ``PCA-based`` (values rounded to three
        decimals).
    """
    data, labels = _load_digits()

    # Slider values may arrive as floats; the estimators need integers.
    n_kmeans = int(kmeans_n_digit)
    n_random = int(random_n_digit)
    n_pca = int(pca_n_digit)

    table = pd.DataFrame({"metrics": _METRIC_NAMES})

    def add_column(name, estimator):
        # bench_k_means returns [name, <one value per metric>].
        row = bench_k_means(kmeans=estimator, name=name, data=data, labels=labels)
        table[name] = np.round(np.asarray(row[1:], dtype=float), 3)

    add_column(
        "k-means++",
        KMeans(init="k-means++", n_clusters=n_kmeans, n_init=4, random_state=0),
    )
    add_column(
        "random",
        KMeans(init="random", n_clusters=n_random, n_init=4, random_state=0),
    )

    # The PCA components serve as the initial centroids, so a single
    # initialization (n_init=1) is enough.
    pca = PCA(n_components=n_pca).fit(data)
    add_column(
        "PCA-based",
        KMeans(init=pca.components_, n_clusters=n_pca, n_init=1),
    )

    return display_plot(data, n_kmeans), table
# Look and feel of the demo, adapted from the theme used in
# https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    radius_size=gr.themes.sizes.radius_sm,
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
)
# Page layout: introductory text, then the controls and plot next to the
# metrics table, then the submit button that triggers do_submit.
with gr.Blocks(title=title, theme=theme) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("This demo is based on this [scikit-learn example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
    gr.Markdown(
        "Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) "
        "for definitions and discussions of the metrics):"
    )
    # The sentence above announces a list; spell out the shorthands that are
    # used as row labels in the results table.
    gr.Markdown(
        "- `homo`: homogeneity score\n"
        "- `compl`: completeness score\n"
        "- `v-meas`: V measure\n"
        "- `ARI`: adjusted Rand index\n"
        "- `AMI`: adjusted mutual information\n"
        "- `silhouette`: silhouette coefficient"
    )
    gr.Markdown("---")
    gr.Markdown(
        " We will be utilizing [digits](https://huggingface.co/datasets/sklearn-docs/digits) dataset. This dataset contains handwritten digits from 0 to 9. "
        "In the context of clustering, one would like to group images such that the handwritten digits on the image are the same."
    )
    with gr.Row():
        # Column scale is a relative integer weight; equal weights keep the
        # two columns at the same width.
        with gr.Column(scale=1):
            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits", info="n_digits is number of handwritten digits", step=1, value=10)
            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits", step=1, value=10)
            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits", step=1, value=10)
            plt_out = gr.Plot()
        with gr.Column(scale=1):
            # Placeholder with the same rows and columns as the table that
            # do_submit returns, so the layout does not change on first submit.
            sample_df = pd.DataFrame(
                {
                    "metrics": ["time", "inertia", "homo", "compl", "v-meas", "ARI", "AMI", "silhouette"],
                    "k-means++": 0.0,
                    "random": 0.0,
                    "PCA-based": 0.0,
                }
            )
            output = gr.Dataframe(sample_df, label="Clustering Metrics")
    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit, random_n_digit, pca_n_digit], outputs=[plt_out, output])

demo.launch()