Spaces:

sklearn-docs
/

kmeans-assumptions

Sleeping

App Files Files Community

kmeans-assumptions / app.py

ehengao

add initial version for the kmeans assumption dashboard

2fd8e3d over 2 years ago

raw

history blame contribute delete

5.77 kB

	"""This dashboard is a live demonstration of the sklearn document at
	https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
	"""
	import numpy as np
	import typing as tp
	import gradio as gr
	from sklearn.datasets import make_blobs
	from sklearn.cluster import KMeans
	import matplotlib.pyplot as plt

	# Passed to gr.Blocks(title=...) and rendered as the top-level Markdown heading.
	title = "Demonstration of k-means assumptions"
	# Seed handed to every make_blobs and KMeans call in this file.
	random_state = 170
	# 2x2 matrix that AnisoDistBlobs multiplies into the blob coordinates.
	transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

	# Each demo scenario is an App subclass; four of them are defined below.
	class App:
	    """Interface shared by every demo scenario.

	    Subclasses set ``name`` and ``description`` and override both methods;
	    the versions here only raise ``NotImplementedError``.
	    """

	    # Scenario label; used as the lookup key and as the sample-plot title.
	    name: tp.ClassVar[str]
	    # Explanatory text returned to the UI alongside the plots.
	    description: tp.ClassVar[str]

	    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
	        """Return ``(X, y)``: the sample coordinates and their labels.

	        Raises:
	            NotImplementedError: always, until a subclass overrides it.
	        """
	        raise NotImplementedError()

	    # The parameter is spelled ``n_clusters`` to match every concrete
	    # implementation, so keyword calls work against the declared interface.
	    def kmeans_predict(self, n_clusters: int, X: np.ndarray) -> np.ndarray:
	        """Return the cluster index predicted for each row of ``X``.

	        Raises:
	            NotImplementedError: always, until a subclass overrides it.
	        """
	        raise NotImplementedError()

	class MixGaussianBlobs(App):
	    """Baseline scenario: blobs drawn by ``make_blobs`` with default settings."""

	    name = "Mixture of Gaussian Blobs"
	    description = (
	        "In a real setting there is no uniquely defined true number of clusters. "
	        "An appropriate number of clusters has to be decided from data-based criteria"
	        " and knowledge of the intended goal."
	    )

	    def make_data(self, n_samples):
	        """Generate ``n_samples`` points and their labels with the shared seed."""
	        points, labels = make_blobs(n_samples=n_samples, random_state=random_state)
	        return points, labels

	    def kmeans_predict(self, n_clusters, X):
	        """Fit KMeans with ``n_clusters`` on ``X`` and return the assignments."""
	        model = KMeans(
	            n_clusters=n_clusters,
	            n_init="auto",
	            random_state=random_state,
	        )
	        return model.fit_predict(X)


	class AnisoDistBlobs(MixGaussianBlobs):
	    """Scenario whose blobs are passed through the module-level linear map."""

	    name = "Anisotropically Distributed Blobs"
	    description = (
	        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
	        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
	        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
	    )

	    def make_data(self, n_samples):
	        """Return the parent's blobs multiplied by ``transformation``, plus labels."""
	        points, labels = super().make_data(n_samples=n_samples)
	        return points.dot(transformation), labels


	class UnequalVariance(MixGaussianBlobs):
	    """Scenario whose blobs are generated with different standard deviations."""

	    name = "Unequal Variance"
	    description = (
	        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
	        "of k gaussian distributions with the same variances but with possibly different "
	        " means."
	    )

	    def make_data(self, n_samples):
	        """Generate blobs using one ``cluster_std`` value per cluster."""
	        per_cluster_std = [1.0, 2.5, 0.5]
	        return make_blobs(
	            n_samples=n_samples,
	            cluster_std=per_cluster_std,
	            random_state=random_state,
	        )


	class UnevenlySizedBlobs(MixGaussianBlobs):
	    """Scenario that keeps a very different number of samples from each blob."""

	    name = "Unevenly Sized Blobs"
	    description = (
	        "There is no theoretical result about k-means that states that it requires similar"
	        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
	        " the more sparse and high-dimensional the problem is, the higher is the need to run "
	        "the algorithm with different centroid seeds to ensure a global minimal inertia."
	    )

	    # (label, maximum number of samples kept) for each generated blob.
	    _kept_per_label = ((0, 500), (1, 100), (2, 10))

	    def make_data(self, n_samples):
	        """Return the parent's blobs cut down to at most 500, 100 and 10 rows.

	        The labels are built from the number of rows actually kept, so ``X``
	        and ``y`` always have the same length, even when a blob holds fewer
	        samples than its cap. Both values are returned as NumPy arrays.
	        """
	        X, y = super().make_data(n_samples=n_samples)
	        kept_points = []
	        kept_labels = []
	        for label, cap in self._kept_per_label:
	            rows = X[y == label][:cap]
	            kept_points.append(rows)
	            # One label per row that survived the cut, not per requested row.
	            kept_labels.append(np.full(len(rows), label))
	        return np.vstack(kept_points), np.concatenate(kept_labels)


	# One instance per scenario; this order is also the order of the choices.
	_apps = [
	    MixGaussianBlobs(),
	    AnisoDistBlobs(),
	    UnequalVariance(),
	    UnevenlySizedBlobs(),
	]
	# Scenario names offered to the user.
	data_choices = [app.name for app in _apps]
	# Lookup from scenario name to its instance.
	apps = dict(zip(data_choices, _apps))


	# Callback triggered whenever the user changes the radio button or a slider.
	def fn(data_choice, n_samples, n_clusters):
	    """Build the description and both plots for the selected scenario.

	    Args:
	        data_choice: Name of the scenario; must be a key of ``apps``.
	        n_samples: Number of samples to generate.
	        n_clusters: Number of clusters KMeans is asked to find.

	    Returns:
	        A ``(markdown, sample_figure, prediction_figure)`` tuple: the
	        scenario description as a level-2 Markdown heading, the scatter plot
	        coloured by the generated labels, and the scatter plot coloured by
	        the KMeans assignments.

	    Raises:
	        KeyError: if ``data_choice`` is not a known scenario name.
	    """
	    # Imported here so the block carries its own dependency. Figures built
	    # with Figure() are not registered with pyplot, so they are garbage
	    # collected after use instead of piling up on every callback.
	    from matplotlib.figure import Figure

	    # Slider values may arrive as floats; the generators expect integers.
	    n_samples = int(n_samples)
	    n_clusters = int(n_clusters)

	    # Find the app and create sample data based on the user choice.
	    app = apps[data_choice]
	    X, y = app.make_data(n_samples)

	    # Execute the KMeans clustering.
	    y_pred = app.kmeans_predict(n_clusters, X)

	    fig_sample = Figure()
	    ax_sample = fig_sample.subplots()
	    ax_sample.set_title(app.name)
	    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

	    fig_pred = Figure()
	    ax_pred = fig_pred.subplots()
	    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
	    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

	    return f"## {app.description}", fig_sample, fig_pred


	# Define the dashboard layout, wire the controls to the callback, and launch.
	with gr.Blocks(title=title) as demo:
	    gr.Markdown(f"# {title}")
	    # The second sentence starts with a space so it does not run into the link.
	    gr.Markdown(
	        "This demo is based on "
	        "[sklearn document](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py)."
	        " It is meant to illustrate how K-Means can produce unexpected clusters in 4 different data sets."
	    )
	    with gr.Row():
	        data_choice = gr.Radio(
	            choices=data_choices,
	            value=data_choices[0],
	        )
	    with gr.Row():
	        n_samples = gr.Slider(
	            minimum=1500, maximum=3000, step=50, label="Number of Samples"
	        )
	        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")
	    with gr.Accordion("Description"):
	        description = gr.Markdown(label="Description")
	    with gr.Row():
	        plot_sample = gr.Plot(label="Ground Truth Cluster")
	        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

	    # Every control feeds the same callback with the same inputs and outputs.
	    controls = [data_choice, n_samples, n_clusters]
	    outputs = [description, plot_sample, plot_kmeans]
	    for control in controls:
	        control.change(fn=fn, inputs=controls, outputs=outputs)

	    # Render once on page load so the plots are not blank before the first change.
	    demo.load(fn=fn, inputs=controls, outputs=outputs)


	demo.launch()