Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- app.py +142 -0
- requirements.txt +4 -0
app.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
from sklearn import datasets
|
| 6 |
+
from sklearn.preprocessing import StandardScaler
|
| 7 |
+
from sklearn.decomposition import PCA
|
| 8 |
+
from sklearn.cluster import MiniBatchKMeans
|
| 9 |
+
from sklearn.metrics import silhouette_score
|
| 10 |
+
|
| 11 |
+
INTRO_MD = r"""
|
| 12 |
+
### Wat gebeurt hier?
|
| 13 |
+
We laten **unsupervised learning** zien: het algoritme zoekt **vanzelf groepjes** in de data — zónder dat we van tevoren labels geven.
|
| 14 |
+
We gebruiken een bekende dataset (sklearn *diabetes*) met meerdere metingen per persoon (features).
|
| 15 |
+
|
| 16 |
+
- We **schalen** de data (zodat alle metingen vergelijkbaar meewegen).
|
| 17 |
+
- We projecteren alles naar **2D met PCA** om het zichtbaar te maken.
|
| 18 |
+
- We voeren **k-means clustering** uit en **updaten** de centers stap voor stap (mini-batches).
|
| 19 |
+
- Je ziet live:
|
| 20 |
+
- de **punten** (elk een persoon) ingekleurd per **cluster**,
|
| 21 |
+
- de **clustercentra** (kruisjes) die **opschuiven**,
|
| 22 |
+
- en de **inertia-curve** die meestal **daalt** (lager = strakkere clusters).
|
| 23 |
+
|
| 24 |
+
> Educatief voorbeeld. Dit is géén medisch advies en geen diagnose.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def load_diabetes_features():
    """Return the feature matrix of scikit-learn's bundled diabetes dataset.

    Only the ``data`` attribute of the loaded bunch is returned; the
    regression target that ships with the dataset is not used.
    """
    # 10 features per sample.
    return datasets.load_diabetes().data
|
| 31 |
+
|
| 32 |
+
def _draw_cluster_figure(Z, labels, centers_2d, title):
    """Build the 2D scatter of samples coloured by cluster, with centers as crosses."""
    fig = plt.figure(figsize=(7, 4))
    ax = fig.add_subplot(111)
    ax.scatter(Z[:, 0], Z[:, 1], c=labels, s=22, alpha=0.85)
    ax.scatter(centers_2d[:, 0], centers_2d[:, 1], marker="x", s=120, linewidths=2)
    ax.set_title(title)
    ax.set_xlabel("PCA component 1")
    ax.set_ylabel("PCA component 2")
    ax.grid(True, linestyle=":", linewidth=0.6)
    fig.tight_layout()
    return fig


def _draw_inertia_figure(inertias):
    """Build the line plot of the inertia values collected so far."""
    fig = plt.figure(figsize=(7, 3.2))
    ax = fig.add_subplot(111)
    ax.plot(range(1, len(inertias) + 1), inertias, marker="o")
    ax.set_title("Inertia (doelfunctie) per iteratie — lager is beter")
    ax.set_xlabel("Iteratie")
    ax.set_ylabel("Inertia")
    ax.grid(True, linestyle=":", linewidth=0.6)
    fig.tight_layout()
    return fig


def kmeans_live_generator(k, iters, batch_size, seed):
    """Stream a mini-batch k-means run on the diabetes data, one update per step.

    The features are standardised, projected to 2D with PCA (for display
    only; clustering happens in the standardised feature space), and then
    clustered with ``MiniBatchKMeans.partial_fit`` one batch at a time.

    Args:
        k: Number of clusters.
        iters: Number of ``partial_fit`` steps to perform.
        batch_size: Number of samples fed to each step.
        seed: Seed used for PCA, k-means initialisation and the shuffle.
            All four arguments are coerced with ``int()`` so that numeric
            widget values delivered as floats are accepted.

    Yields:
        A ``(cluster_figure, inertia_figure, markdown_text)`` tuple after
        every step. On the last step the markdown additionally contains the
        silhouette score and the cluster sizes.
    """
    # Normalise once: a float batch_size would otherwise end up in a slice
    # index below and raise TypeError.
    k = int(k)
    iters = int(iters)
    batch_size = int(batch_size)
    seed = int(seed)

    # Prepare the data.
    X = load_diabetes_features()
    Xs = StandardScaler(with_mean=True, with_std=True).fit_transform(X)

    pca = PCA(n_components=2, random_state=seed)
    Z = pca.fit_transform(Xs)  # 2D projection used for visualisation

    # MiniBatchKMeans allows incremental updates through partial_fit.
    kmeans = MiniBatchKMeans(
        n_clusters=k,
        random_state=seed,
        n_init=1,
        init="k-means++",
        batch_size=batch_size,
        reassignment_ratio=0.01,
    )

    # Shuffle the sample order once; batches are consecutive slices of it.
    n = Xs.shape[0]
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)

    inertias = []
    for t in range(1, iters + 1):
        # Take a batch, rotating through the shuffled data. A slice that
        # would run past the end is truncated rather than wrapped.
        start = ((t - 1) * batch_size) % n
        end = min(start + batch_size, n)
        Xb = Xs[idx[start:end]]

        # One update step.
        kmeans.partial_fit(Xb)

        # Labels and inertia on the full set. ``kmeans.inertia_`` only
        # reflects the batch passed to partial_fit, so the objective is
        # evaluated explicitly; score() returns its negative.
        labels = kmeans.predict(Xs)
        inertia = -float(kmeans.score(Xs))
        inertias.append(inertia)

        # Project the centers into the same 2D space as the samples.
        centers_2d = pca.transform(kmeans.cluster_centers_)

        fig_main = _draw_cluster_figure(
            Z, labels, centers_2d, f"K-means live — iteratie {t}/{iters} (k={k})"
        )
        fig_inertia = _draw_inertia_figure(inertias)

        # Metrics: the final step also reports silhouette and cluster sizes.
        metrics_lines = [f"**Iteratie:** {t}/{iters} — **Inertia:** {inertia:.2f}"]
        if t == iters:
            try:
                sil = float(silhouette_score(Xs, labels))
            except ValueError:
                # Raised when the labelling is degenerate (e.g. one cluster).
                metrics_lines.append("**Silhouette score:** (n.v.t.)")
            else:
                metrics_lines.append(f"**Silhouette score:** {sil:.3f}")
            sizes = np.bincount(labels, minlength=k)
            size_str = ", ".join(f"cluster {i}: {size}" for i, size in enumerate(sizes[:k]))
            metrics_lines.append(f"**Cluster-groottes:** {size_str}")
            metrics_lines.append("> Tip: probeer een andere *k* en vergelijk de inertia/silhouette.")

        try:
            yield fig_main, fig_inertia, "\n".join(metrics_lines)
        finally:
            # pyplot keeps every figure alive until it is closed; release
            # them once the consumer is done with this step (or abandons
            # the generator) so long runs do not accumulate figures.
            plt.close(fig_main)
            plt.close(fig_inertia)
|
| 112 |
+
|
| 113 |
+
# NOTE(review): the nesting below is reconstructed from a listing whose
# indentation was lost — confirm that `metrics` belongs in the right-hand
# column and that both event bindings sit inside the Blocks context.
with gr.Blocks(title="Unsupervised Learning — Live Clustering (K-means + PCA)") as demo:
    gr.Markdown("# Unsupervised Learning — Live Clustering (K-means + PCA)")
    gr.Markdown(INTRO_MD)

    with gr.Row():
        # Left column: run parameters and the trigger button.
        with gr.Column(scale=1):
            k = gr.Slider(2, 10, value=3, step=1, label="Aantal clusters (k)")
            iters = gr.Slider(5, 200, value=40, step=1, label="Iteraties")
            batch_size = gr.Slider(16, 256, value=128, step=1, label="Batchgrootte")
            seed = gr.Slider(0, 9999, value=42, step=1, label="Random seed")
            run_btn = gr.Button("Cluster live")
        # Right column: the two plots and the metrics text.
        with gr.Column(scale=2):
            plot_main = gr.Plot(label="2D-projectie (PCA) met clusters en centers (live)")
            plot_inertia = gr.Plot(label="Inertia per iteratie")
            metrics = gr.Markdown()

    # The same generator drives both the button click and the initial page
    # load, with identical inputs and outputs.
    for _bind in (run_btn.click, demo.load):
        _bind(
            fn=kmeans_live_generator,
            inputs=[k, iters, batch_size, seed],
            outputs=[plot_main, plot_inertia, metrics],
        )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.36.0
|
| 2 |
+
matplotlib>=3.7.0
|
| 3 |
+
numpy>=1.23.0
|
| 4 |
+
scikit-learn>=1.2.0
|