Marcel0123 committed on
Commit
8406363
·
verified ·
1 Parent(s): f9494a1

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +142 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from sklearn import datasets
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.cluster import MiniBatchKMeans
9
+ from sklearn.metrics import silhouette_score
10
+
11
# Introductory Markdown text (in Dutch) describing what the demo does:
# scaling, PCA projection to 2D, and step-by-step mini-batch k-means.
# Raw string so that backslashes, if ever added, are not treated as escapes.
INTRO_MD = r"""
### Wat gebeurt hier?
We laten **unsupervised learning** zien: het algoritme zoekt **vanzelf groepjes** in de data — zónder dat we van tevoren labels geven.
We gebruiken een bekende dataset (sklearn *diabetes*) met meerdere metingen per persoon (features).

- We **schalen** de data (zodat alle metingen vergelijkbaar meewegen).
- We projecteren alles naar **2D met PCA** om het zichtbaar te maken.
- We voeren **k-means clustering** uit en **updaten** de centers stap voor stap (mini-batches).
- Je ziet live:
- de **punten** (elk een persoon) ingekleurd per **cluster**,
- de **clustercentra** (kruisjes) die **opschuiven**,
- en de **inertia-curve** die meestal **daalt** (lager = strakkere clusters).

> Educatief voorbeeld. Dit is géén medisch advies en geen diagnose.
"""
26
+
27
def load_diabetes_features():
    """Return the feature matrix of scikit-learn's bundled diabetes dataset.

    Only the ``data`` attribute (10 feature columns) is returned; the
    regression target is deliberately ignored because the demo is
    unsupervised.
    """
    return datasets.load_diabetes().data
31
+
32
def kmeans_live_generator(k, iters, batch_size, seed):
    """Run mini-batch k-means one step at a time and yield live plots.

    The diabetes features are standardised, projected to 2D with PCA (for
    display only; clustering happens in the full standardised feature space)
    and clustered with ``MiniBatchKMeans.partial_fit``, one batch per step.

    Args:
        k: Number of clusters. Coerced to ``int``.
        iters: Number of update steps to run. Coerced to ``int``.
        batch_size: Number of samples per step. Coerced to ``int``.
        seed: Seed used for PCA, k-means and the batch shuffle. Coerced to
            ``int``.

    Yields:
        A tuple ``(fig_main, fig_inertia, metrics_markdown)`` after every
        step: the 2D scatter figure, the inertia-curve figure and a Markdown
        string with the current metrics. On the final step the Markdown also
        contains the silhouette score and the cluster sizes.
    """
    # UI sliders may hand over floats; normalise once so that slicing,
    # range() and the displayed numbers are all consistent.
    k = int(k)
    iters = int(iters)
    batch_size = int(batch_size)
    seed = int(seed)

    # Prepare the data.
    X = load_diabetes_features()
    scaler = StandardScaler(with_mean=True, with_std=True)
    Xs = scaler.fit_transform(X)

    pca = PCA(n_components=2, random_state=seed)
    Z = pca.fit_transform(Xs)  # 2D projection used only for visualisation

    # MiniBatchKMeans supports incremental updates through partial_fit.
    kmeans = MiniBatchKMeans(
        n_clusters=k,
        random_state=seed,
        n_init=1,
        init="k-means++",
        batch_size=batch_size,
        reassignment_ratio=0.01,
    )

    # Shuffle the sample order once; batches then rotate through it.
    n = Xs.shape[0]
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)

    inertias = []
    for t in range(1, iters + 1):
        # Take the next batch; the last batch before wrap-around may be short.
        start = ((t - 1) * batch_size) % n
        end = min(start + batch_size, n)
        Xb = Xs[idx[start:end]]

        # One update step.
        kmeans.partial_fit(Xb)

        # Labels and inertia on the FULL data set. ``kmeans.inertia_`` after
        # partial_fit only reflects the batch just seen, which makes the
        # curve incomparable between steps, so compute the objective here.
        labels = kmeans.predict(Xs)
        residuals = Xs - kmeans.cluster_centers_[labels]
        inertia = float(np.sum(residuals * residuals))
        inertias.append(inertia)

        # Project the centers into the same 2D space as the points.
        centers_2d = pca.transform(kmeans.cluster_centers_)

        # Plot 1: 2D scatter with cluster colours and centers.
        fig_main = plt.figure(figsize=(7, 4))
        ax1 = fig_main.add_subplot(111)
        ax1.scatter(Z[:, 0], Z[:, 1], c=labels, s=22, alpha=0.85)
        ax1.scatter(centers_2d[:, 0], centers_2d[:, 1], marker="x", s=120, linewidths=2)
        ax1.set_title(f"K-means live — iteratie {t}/{iters} (k={k})")
        ax1.set_xlabel("PCA component 1")
        ax1.set_ylabel("PCA component 2")
        ax1.grid(True, linestyle=":", linewidth=0.6)
        # Lay out this specific figure rather than pyplot's "current" one.
        fig_main.tight_layout()

        # Plot 2: inertia curve.
        fig_inertia = plt.figure(figsize=(7, 3.2))
        ax2 = fig_inertia.add_subplot(111)
        ax2.plot(range(1, len(inertias) + 1), inertias, marker="o")
        ax2.set_title("Inertia (doelfunctie) per iteratie — lager is beter")
        ax2.set_xlabel("Iteratie")
        ax2.set_ylabel("Inertia")
        ax2.grid(True, linestyle=":", linewidth=0.6)
        fig_inertia.tight_layout()

        # Metrics: on the last step also report silhouette and cluster sizes.
        metrics_lines = [f"**Iteratie:** {t}/{iters} — **Inertia:** {inertia:.2f}"]
        if t == iters:
            try:
                sil = float(silhouette_score(Xs, labels))
                metrics_lines.append(f"**Silhouette score:** {sil:.3f}")
            except ValueError:
                # Raised when the labelling is degenerate (e.g. all points
                # ended up in a single cluster); the score is undefined then.
                metrics_lines.append("**Silhouette score:** (n.v.t.)")
            sizes = np.bincount(labels, minlength=k)
            size_str = ", ".join(f"cluster {i}: {sizes[i]}" for i in range(k))
            metrics_lines.append(f"**Cluster-groottes:** {size_str}")
            metrics_lines.append("> Tip: probeer een andere *k* en vergelijk de inertia/silhouette.")

        try:
            yield fig_main, fig_inertia, "\n".join(metrics_lines)
        finally:
            # Runs once the consumer asks for the next step (or closes the
            # generator). Without this, pyplot keeps every figure alive:
            # two leaked figures per iteration.
            plt.close(fig_main)
            plt.close(fig_inertia)
112
+
113
# Build the Gradio interface: controls on the left, live plots on the right.
with gr.Blocks(title="Unsupervised Learning — Live Clustering (K-means + PCA)") as demo:
    gr.Markdown("# Unsupervised Learning — Live Clustering (K-means + PCA)")
    gr.Markdown(INTRO_MD)

    with gr.Row():
        # Left column: clustering parameters and the trigger button.
        with gr.Column(scale=1):
            k = gr.Slider(minimum=2, maximum=10, value=3, step=1, label="Aantal clusters (k)")
            iters = gr.Slider(minimum=5, maximum=200, value=40, step=1, label="Iteraties")
            batch_size = gr.Slider(minimum=16, maximum=256, value=128, step=1, label="Batchgrootte")
            seed = gr.Slider(minimum=0, maximum=9999, value=42, step=1, label="Random seed")
            run_btn = gr.Button("Cluster live")
        # Right column: outputs streamed from the generator.
        with gr.Column(scale=2):
            plot_main = gr.Plot(label="2D-projectie (PCA) met clusters en centers (live)")
            plot_inertia = gr.Plot(label="Inertia per iteratie")
            metrics = gr.Markdown()

    # The button and the initial page load run the same streaming handler
    # with the same inputs and outputs.
    live_inputs = [k, iters, batch_size, seed]
    live_outputs = [plot_main, plot_inertia, metrics]

    run_btn.click(fn=kmeans_live_generator, inputs=live_inputs, outputs=live_outputs)
    demo.load(fn=kmeans_live_generator, inputs=live_inputs, outputs=live_outputs)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.36.0
2
+ matplotlib>=3.7.0
3
+ numpy>=1.23.0
4
+ scikit-learn>=1.2.0