Spaces:

sklearn-docs
/

plot-k-means-digits

Sleeping

App Files Files Community

plot-k-means-digits / app.py

merve HF Staff

Update app.py

135ab42 over 2 years ago

raw

history blame contribute delete

7.42 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from time import time
	from sklearn import metrics
	from sklearn.pipeline import make_pipeline
	from sklearn.preprocessing import StandardScaler
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA
	from huggingface_hub import login
	from datasets import load_dataset
	import matplotlib.pyplot as plt


	# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py

	def display_plot(data, n_digits):
	    """Plot the K-means partition of the 2-D PCA projection of ``data``.

	    The data is projected onto its first two principal components, K-means
	    (``k-means++`` initialisation, ``n_init=4``) is fitted on the projected
	    points, and the plane is coloured by the cluster predicted for every
	    point of a fine mesh. Samples are drawn as black dots and centroids as
	    white crosses.

	    Parameters
	    ----------
	    data : array-like of shape (n_samples, n_features)
	        Samples to project and cluster.
	    n_digits : int or float
	        Number of clusters. It is cast to ``int`` so that a whole-number
	        float (for instance a value coming straight from a UI slider) is
	        accepted by ``KMeans``.

	    Returns
	    -------
	    matplotlib.figure.Figure
	        The figure holding the plot.
	    """
	    reduced_data = PCA(n_components=2).fit_transform(data)
	    kmeans = KMeans(init="k-means++", n_clusters=int(n_digits), n_init=4)
	    kmeans.fit(reduced_data)

	    # Step size of the mesh, i.e. the spacing between points of the grid
	    # covering [x_min, x_max] x [y_min, y_max]. Decrease to increase the
	    # quality of the VQ.
	    h = 0.02

	    # Plot the decision boundary. For that, we will assign a color to each
	    # point in the mesh.
	    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
	    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
	    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

	    # Obtain labels for each point in mesh. Use last trained model.
	    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

	    # Put the result into a color plot
	    Z = Z.reshape(xx.shape)

	    # Build a standalone Figure instead of going through plt.figure():
	    # figures created via pyplot stay registered in its global manager until
	    # explicitly closed, which leaks one figure per call in a long-running
	    # app and makes concurrent calls share pyplot's "current figure".
	    fig = plt.Figure()
	    ax = fig.subplots()

	    ax.imshow(
	        Z,
	        interpolation="nearest",
	        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
	        cmap=plt.cm.Paired,
	        aspect="auto",
	        origin="lower",
	    )

	    ax.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
	    # Plot the centroids as a white X
	    centroids = kmeans.cluster_centers_
	    ax.scatter(
	        centroids[:, 0],
	        centroids[:, 1],
	        marker="x",
	        s=169,
	        linewidths=3,
	        color="w",
	        zorder=10,
	    )
	    ax.set_title(
	        "K-means clustering on the digits dataset (PCA-reduced data)\n"
	        "Centroids are marked with white cross"
	    )
	    ax.set_xlim(x_min, x_max)
	    ax.set_ylim(y_min, y_max)
	    # Hide the ticks: the PCA axes have no directly interpretable unit.
	    ax.set_xticks(())
	    ax.set_yticks(())
	    return fig

	def bench_k_means(kmeans, name, data, labels):
	    """Fit one KMeans configuration and collect its benchmark figures.

	    The estimator is wrapped in a pipeline behind a ``StandardScaler``,
	    fitted on ``data`` and then scored against the ground-truth ``labels``.

	    Parameters
	    ----------
	    kmeans : KMeans instance
	        A :class:`~sklearn.cluster.KMeans` instance whose initialization
	        strategy is already configured.
	    name : str
	        Label of the strategy; it becomes the first entry of the result row.
	    data : ndarray of shape (n_samples, n_features)
	        The data to cluster.
	    labels : ndarray of shape (n_samples,)
	        Ground-truth labels, needed by the supervised clustering metrics.

	    Returns
	    -------
	    list
	        ``[name, fit_time, inertia, homogeneity, completeness, v_measure,
	        adjusted_rand, adjusted_mutual_info, silhouette]``.
	    """
	    started = time()
	    pipeline = make_pipeline(StandardScaler(), kmeans)
	    pipeline.fit(data)
	    elapsed = time() - started

	    # The last pipeline step is the fitted KMeans estimator.
	    fitted_kmeans = pipeline[-1]

	    # Metrics that only compare the true labels with the predicted ones.
	    supervised_scorers = (
	        metrics.homogeneity_score,
	        metrics.completeness_score,
	        metrics.v_measure_score,
	        metrics.adjusted_rand_score,
	        metrics.adjusted_mutual_info_score,
	    )
	    supervised_scores = [
	        scorer(labels, fitted_kmeans.labels_) for scorer in supervised_scorers
	    ]

	    # The silhouette score needs the samples themselves; it is evaluated on
	    # the data as passed in, using a 300-sample subsample.
	    silhouette = metrics.silhouette_score(
	        data,
	        fitted_kmeans.labels_,
	        metric="euclidean",
	        sample_size=300,
	    )

	    return [name, elapsed, fitted_kmeans.inertia_, *supervised_scores, silhouette]

	title = "A demo of K-Means clustering on the handwritten digits data"

	# Holds the digits table after the first load so that later submits do not
	# repeat the dataset load.
	_DIGITS_CACHE = {}


	def _load_digits():
	    """Return the digits dataset as a DataFrame, loading it only once."""
	    if "frame" not in _DIGITS_CACHE:
	        dataset = load_dataset("sklearn-docs/digits", header=None)
	        _DIGITS_CACHE["frame"] = dataset['train'].to_pandas()
	    return _DIGITS_CACHE["frame"]


	def do_submit(kmeans_n_digit, random_n_digit, pca_n_digit):
	    """Benchmark three KMeans initialisations and build the demo outputs.

	    Parameters
	    ----------
	    kmeans_n_digit : int or float
	        Number of clusters for the ``k-means++`` run and for the plot.
	    random_n_digit : int or float
	        Number of clusters for the ``random`` initialisation run.
	    pca_n_digit : int or float
	        Number of PCA components, used both as the initial centres and as
	        the number of clusters of the ``PCA-based`` run.

	    Returns
	    -------
	    tuple
	        ``(figure, table)``: the figure returned by ``display_plot`` and a
	        DataFrame with one row per metric and the columns ``metrics``,
	        ``k-means++``, ``random`` and ``PCA-based``.
	    """
	    # Cast once up front so every consumer, including the plot, receives a
	    # true int for the number of clusters.
	    kmeans_n_digit = int(kmeans_n_digit)
	    random_n_digit = int(random_n_digit)
	    pca_n_digit = int(pca_n_digit)

	    df = _load_digits()
	    # The first 64 columns are used as features, column 64 as the labels.
	    data = df.iloc[:, :64]
	    labels = df.iloc[:, 64]

	    pca = PCA(n_components=pca_n_digit).fit(data)
	    strategies = [
	        (
	            "k-means++",
	            KMeans(init="k-means++", n_clusters=kmeans_n_digit, n_init=4, random_state=0),
	        ),
	        (
	            "random",
	            KMeans(init="random", n_clusters=random_n_digit, n_init=4, random_state=0),
	        ),
	        # The initial centres are given explicitly, so a single
	        # initialisation (n_init=1) is requested.
	        (
	            "PCA-based",
	            KMeans(init=pca.components_, n_clusters=pca_n_digit, n_init=1),
	        ),
	    ]
	    rows = [
	        bench_k_means(kmeans=estimator, name=name, data=data, labels=labels)
	        for name, estimator in strategies
	    ]

	    numeric_cols = ['time', 'inertia', 'homo', 'compl', 'v-meas', 'ARI', 'AMI', 'silhouette']
	    table = pd.DataFrame(rows, columns=['init'] + numeric_cols)
	    table[numeric_cols] = table[numeric_cols].astype(float).round(3)

	    # Transpose for display: one row per metric, one column per strategy.
	    table = table.set_index('init').T.reset_index()
	    table.columns = ['metrics', 'k-means++', 'random', 'PCA-based']
	    return display_plot(data, kmeans_n_digit), table

	# Theme adapted from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
	# Font stack: the Google-hosted "Open Sans" first, then generic fallbacks.
	_font_stack = [
	    gr.themes.GoogleFont("Open Sans"),
	    "ui-sans-serif",
	    "system-ui",
	    "sans-serif",
	]
	theme = gr.themes.Monochrome(
	    primary_hue="indigo",
	    secondary_hue="blue",
	    neutral_hue="slate",
	    radius_size=gr.themes.sizes.radius_sm,
	    font=_font_stack,
	)

	# Page layout: introductory text, then a row with the controls and plot next
	# to the metrics table, then the submit button.
	# NOTE(review): the nesting below is reconstructed; the plot is assumed to
	# sit in the left column under the sliders - confirm against the live app.
	with gr.Blocks(title=title, theme=theme) as demo:
	    gr.Markdown(f"## {title}")
	    gr.Markdown("This demo is based on this [scikit-learn example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
	    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
	    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
	    # Adjacent string literals are used instead of backslash continuations
	    # so that source indentation does not end up inside the Markdown text.
	    gr.Markdown(
	        "Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) "
	        "for definitions and discussions of the metrics):"
	    )
	    # The list announced by the sentence above: shorthands used in the
	    # results table and the scores they stand for.
	    gr.Markdown(
	        "| Shorthand | Full name |\n"
	        "| --- | --- |\n"
	        "| homo | homogeneity score |\n"
	        "| compl | completeness score |\n"
	        "| v-meas | V measure |\n"
	        "| ARI | adjusted Rand index |\n"
	        "| AMI | adjusted mutual information |\n"
	        "| silhouette | silhouette coefficient |"
	    )
	    gr.Markdown("---")
	    gr.Markdown(
	        " We will be utilizing [digits](https://huggingface.co/datasets/sklearn-docs/digits) dataset. This dataset contains handwritten digits from 0 to 9. "
	        "In the context of clustering, one would like to group images such that the handwritten digits on the image are the same."
	    )

	    with gr.Row():
	        # Both columns use the same integer scale, i.e. they share the row
	        # width equally.
	        with gr.Column(scale=1):
	            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits", info="n_digits is number of handwritten digits", step=1, value=10)
	            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits", step=1, value=10)
	            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits", step=1, value=10)

	            plt_out = gr.Plot()

	        with gr.Column(scale=1):
	            # Placeholder shown before the first submit; it has the same
	            # rows and columns as the table produced by do_submit.
	            sample_df = pd.DataFrame(
	                {
	                    'metrics': ['time', 'inertia', 'homo', 'compl', 'v-meas', 'ARI', 'AMI', 'silhouette'],
	                    'k-means++': 0.0,
	                    'random': 0.0,
	                    'PCA-based': 0.0,
	                }
	            )

	            output = gr.Dataframe(sample_df, label="Clustering Metrics")

	    with gr.Row():
	        sub_btn = gr.Button("Submit")
	    sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit, random_n_digit, pca_n_digit], outputs=[plt_out, output])

	# Start the server only when the file is executed as a script, so that
	# importing the module does not launch the app as a side effect.
	if __name__ == "__main__":
	    demo.launch()