Spaces:
Sleeping
Sleeping
| """This dashboard is a live demonstration of the sklearn document at | |
| https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py | |
| """ | |
| import numpy as np | |
| import typing as tp | |
| import gradio as gr | |
| from sklearn.datasets import make_blobs | |
| from sklearn.cluster import KMeans | |
| import matplotlib.pyplot as plt | |
# Page title; used for the Blocks title and for the top-level heading.
title = "Demonstration of k-means assumptions"

# Seed passed to both the blob generators and KMeans.
random_state = 170

# 2x2 matrix that the anisotropic scenario multiplies the blob coordinates by.
transformation = [
    [0.60834549, -0.63667341],
    [-0.40887718, 0.85253229],
]
# Define 4 Apps, one for each demo scenario
class App:
    """Common interface shared by the demo scenarios.

    Concrete scenarios set ``name`` and ``description`` and override both
    methods; the versions defined here only raise ``NotImplementedError``.
    """

    # Label of the scenario.
    name: tp.ClassVar[str]
    # Explanatory text for the scenario.
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Build the data set for the scenario (not implemented here).

        Args:
            n_samples: Number of samples requested.

        Returns:
            A pair of arrays, per the annotation.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Cluster ``X`` and return the predictions (not implemented here).

        Args:
            n_cluster: Number of clusters requested.
            X: Data to cluster.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError
class MixGaussianBlobs(App):
    """Scenario that uses the output of ``make_blobs`` unchanged.

    The other scenarios inherit ``kmeans_predict`` from this class.
    """

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of clusters. "
        "An appropriate number of clusters has to be decided from data-based criteria"
        " and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        """Return the ``(X, y)`` pair generated by ``make_blobs``.

        The module-level ``random_state`` is used as the seed.
        """
        points, labels = make_blobs(n_samples=n_samples, random_state=random_state)
        return points, labels

    def kmeans_predict(self, n_clusters, X):
        """Fit KMeans with ``n_clusters`` clusters on ``X`` and return its labels."""
        model = KMeans(
            n_clusters=n_clusters,
            n_init="auto",
            random_state=random_state,
        )
        predicted = model.fit_predict(X)
        return predicted
class AnisoDistBlobs(MixGaussianBlobs):
    """Scenario whose blobs are multiplied by the ``transformation`` matrix."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        """Return the parent's blobs after applying ``transformation``.

        The labels are passed through untouched; only the coordinates change.
        """
        points, labels = super().make_data(n_samples=n_samples)
        return np.dot(points, transformation), labels
class UnequalVariance(MixGaussianBlobs):
    """Scenario whose blobs are generated with different standard deviations."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
        "of k gaussian distributions with the same variances but with possibly different "
        " means."
    )

    def make_data(self, n_samples):
        """Return ``make_blobs`` output using one ``cluster_std`` value per blob."""
        blob_stds = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples,
            cluster_std=blob_stds,
            random_state=random_state,
        )
class UnevenlySizedBlobs(MixGaussianBlobs):
    """Scenario that keeps a different number of points from each blob."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    # (label, maximum number of points kept for that label)
    _subset_sizes = ((0, 500), (1, 100), (2, 10))

    def make_data(self, n_samples):
        """Return a subsample of the parent's blobs with uneven group sizes.

        At most 500, 100 and 10 points are kept for labels 0, 1 and 2
        respectively.

        Args:
            n_samples: Number of samples generated before subsampling.

        Returns:
            ``(X_filter, y_filter)``: the retained points stacked in label
            order and an integer array with one label per retained point.
        """
        X, y = super().make_data(n_samples=n_samples)
        kept = [X[y == label][:limit] for label, limit in self._subset_sizes]
        X_filter = np.vstack(kept)
        # Derive the labels from the number of rows actually kept: a blob may
        # hold fewer points than its cap, and hard-coded counts would then
        # disagree with the length of X_filter.
        y_filter = np.repeat(
            [label for label, _ in self._subset_sizes],
            [len(part) for part in kept],
        )
        return X_filter, y_filter
# Instantiate each scenario once; the list order is the order of the choices.
_apps = [MixGaussianBlobs(), AnisoDistBlobs(), UnequalVariance(), UnevenlySizedBlobs()]
# Scenario names, offered to the user as choices.
data_choices = [app.name for app in _apps]
# Lookup table from scenario name to its instance.
apps = dict(zip(data_choices, _apps))
# Define the callback to be triggered when a button or a slider is used by the user.
def fn(data_choice, n_samples, n_clusters):
    """Regenerate the data and both plots for the current control values.

    Args:
        data_choice: Name of the scenario; must be a key of ``apps``.
        n_samples: Number of samples to generate.
        n_clusters: Number of clusters to ask KMeans for.

    Returns:
        A tuple ``(markdown, fig_sample, fig_pred)``: the scenario description
        formatted as a level-2 Markdown heading, a scatter plot coloured by the
        generated labels, and a scatter plot coloured by the KMeans predictions.

    Raises:
        KeyError: If ``data_choice`` is not a key of ``apps``.
    """
    # Build figures with the Figure class instead of pyplot: figures created by
    # plt.subplots() stay registered in pyplot's global state until closed, so
    # a long-running server would accumulate one pair per callback invocation.
    from matplotlib.figure import Figure

    # NOTE(review): the slider values are presumably already whole numbers;
    # the casts guard against them arriving as floats - confirm with the
    # Gradio version in use.
    n_samples = int(n_samples)
    n_clusters = int(n_clusters)

    # Find the app and create sample data based on the user choice.
    app = apps[data_choice]
    X, y = app.make_data(n_samples)

    # Execute the KMeans clustering.
    y_pred = app.kmeans_predict(n_clusters, X)

    fig_sample = Figure()
    ax_sample = fig_sample.subplots()
    ax_sample.set_title(app.name)
    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

    fig_pred = Figure()
    ax_pred = fig_pred.subplots()
    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

    return f"## {app.description}", fig_sample, fig_pred
# Define the dashboard layout and buttons
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    # The adjacent literals are concatenated, so the sentence break needs an
    # explicit space after the link.
    gr.Markdown(
        "This demo is based on "
        "[sklearn document](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py). "
        "It is meant to illustrate how K-Means can produce unexpected clusters in 4 different data sets."
    )

    # Scenario selector.
    with gr.Row():
        data_choice = gr.Radio(
            choices=data_choices,
            value=data_choices[0],
        )

    # Numeric controls.
    with gr.Row():
        n_samples = gr.Slider(
            minimum=1500, maximum=3000, step=50, label="Number of Samples"
        )
        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")

    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")

    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    # Every control triggers the same callback with the same inputs and
    # outputs, so register the listener once per control.
    _inputs = [data_choice, n_samples, n_clusters]
    _outputs = [description, plot_sample, plot_kmeans]
    for _control in _inputs:
        _control.change(fn=fn, inputs=_inputs, outputs=_outputs)

demo.launch()