Marcel0123 committed
Commit 337b68c · verified · 1 Parent(s): b4fd976

Upload 2 files

Files changed (1)
  1. app.py +117 -108
app.py CHANGED
@@ -4,54 +4,57 @@ import matplotlib.pyplot as plt
  from sklearn import datasets
  from sklearn.utils import shuffle

- # ------------------------------
- # Data helpers
- # ------------------------------
- def load_dataset(name: str, n_samples: int = 200, noise: float = 10.0):
-     """Return (x, y, label) with x, y as 1D numpy arrays for easy plotting."""
-     if name == "Synthetic":
-         rng = np.random.RandomState(42)
-         X = np.linspace(-3, 3, n_samples)
-         true_w, true_b = 4.0, -2.0
-         y = true_w * X + true_b + rng.normal(0, noise, size=n_samples)
-         return X, y, "Synthetic data (y = 4x - 2 + noise)"
-     elif name == "Diabetes (BMI vs target)":
-         d = datasets.load_diabetes()
-         X = d.data[:, 2]
-         y = d.target
-         return X, y, "Diabetes: BMI vs. disease progression"
-     elif name == "California Housing (MedInc vs value)":
-         try:
-             ch = datasets.fetch_california_housing()
-             X = ch.data[:, 0]
-             y = ch.target
-             return X, y, "California Housing: MedInc vs. house value"
-         except Exception:
-             X, y, _ = load_dataset("Synthetic", n_samples=n_samples, noise=noise)
-             return X, y, "(Fallback) Synthetic data"
-     else:
-         raise ValueError("Unknown dataset")
-
-
- # ------------------------------
- # Training (SGD) with live plots
- # ------------------------------
- def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise, seed):
-     rng = np.random.RandomState(int(seed))
-     x, y, label = load_dataset(dataset_name, n_samples=n_samples, noise=noise)
-     n = x.shape[0]
-     x = x.astype(np.float64)
-     y = y.astype(np.float64)
 
      w, b = 0.0, 0.0
      x_min, x_max = float(np.min(x)), float(np.max(x))
-     loss_history = []

      for epoch in range(1, int(epochs) + 1):
-         x, y = shuffle(x, y, random_state=rng)
          for start in range(0, n, int(batch_size)):
              end = min(start + int(batch_size), n)
-             xb, yb = x[start:end], y[start:end]
              yhat = w * xb + b
              err = yb - yhat
              dw = -(2.0 / xb.size) * np.sum(xb * err)
@@ -59,82 +62,88 @@ def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise,
              w -= lr * dw
              b -= lr * db

-         y_pred = w * x + b
-         mse = float(np.mean((y - y_pred) ** 2))
-         loss_history.append(mse)
-
-         # Plot scatter + regression line
-         fig1 = plt.figure(figsize=(6, 4))
-         ax1 = fig1.add_subplot(111)
-         ax1.scatter(x, y, alpha=0.6, s=18)
          xs = np.linspace(x_min, x_max, 200)
-         ax1.plot(xs, w * xs + b, linewidth=2)
-         ax1.set_title(f"{label}\nEpoch {epoch}/{epochs} — MSE: {mse:.4f}")
-         ax1.set_xlabel("x")
-         ax1.set_ylabel("y")
          ax1.grid(True, linestyle=":", linewidth=0.6)
          plt.tight_layout()

-         # Plot loss curve
-         fig2 = plt.figure(figsize=(6, 4))
-         ax2 = fig2.add_subplot(111)
-         ax2.plot(range(1, epoch + 1), loss_history, marker="o")
-         ax2.set_title("Loss curve (MSE per epoch)")
          ax2.set_xlabel("Epoch")
          ax2.set_ylabel("MSE")
          ax2.grid(True, linestyle=":", linewidth=0.6)
          plt.tight_layout()

-         yield fig1, fig2, f"w = {w:.4f}, b = {b:.4f}, MSE = {mse:.4f}"
-
-
- # ------------------------------
- # Explanatory text
- # ------------------------------
- THEORY_MD = r"""
- ### What is supervised learning?
- With **supervised learning** you train a model from example pairs *(input → desired output)*. The goal is to find a function that approximates the relationship between input and output well.
-
- ### Linear regression in 1D
- We fit a line \( y = w x + b \) to the data. We minimize the **Mean Squared Error (MSE)**:
- \[ \operatorname{MSE} = \frac{1}{N} \sum_{i=1}^N (y_i - (w x_i + b))^2 \]
- We use **stochastic gradient descent (SGD)** to improve \(w\) and \(b\) step by step.
- """
-
-
- # ------------------------------
- # Gradio UI
- # ------------------------------
- with gr.Blocks(title="Live Supervised Learning: Linear Regression") as demo:
-     gr.Markdown("# Live Supervised Learning — Linear Regression")
-     with gr.Tabs():
-         with gr.TabItem("Explanation"):
-             gr.Markdown(THEORY_MD)
-         with gr.TabItem("Playground"):
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     dataset = gr.Dropdown(
-                         ["Synthetic", "Diabetes (BMI vs target)", "California Housing (MedInc vs value)"],
-                         value="Synthetic",
-                         label="Dataset"
-                     )
-                     lr = gr.Slider(1e-4, 1e-0, value=1e-2, step=1e-4, label="Learning Rate")
-                     epochs = gr.Slider(1, 200, value=50, step=1, label="Epochs")
-                     batch = gr.Slider(1, 512, value=64, step=1, label="Batch size")
-                     n_samples = gr.Slider(50, 2000, value=300, step=10, label="Number of samples (synthetic)")
-                     noise = gr.Slider(0.0, 30.0, value=10.0, step=0.5, label="Noise (synthetic)")
-                     seed = gr.Slider(0, 9999, value=42, step=1, label="Random seed")
-                     train_btn = gr.Button("Train live")
-                 with gr.Column(scale=2):
-                     plot_data = gr.Plot(label="Data & regression line (live)")
-                     plot_loss = gr.Plot(label="Loss curve (MSE)")
-                     metrics = gr.Markdown()
-
-     train_btn.click(
-         fn=sgd_train_generator,
-         inputs=[dataset, lr, epochs, batch, n_samples, noise, seed],
-         outputs=[plot_data, plot_loss, metrics]
-     )

  if __name__ == "__main__":
      demo.launch()
 
  from sklearn import datasets
  from sklearn.utils import shuffle

+ EXPLAIN_MD = """
+ ### What are we testing here?
+ We check whether there is a **linear relationship** between **BMI** and the **diabetes progression score** in a well-known (public) dataset.
+ We do this with *supervised learning*: the model sees examples `(BMI → score)` and learns a line \(y = w x + b\) that approximates this relationship.
+
+ **How do we measure whether that worked?**
+ - We split the data into **train (80%)** and **test (20%)**.
+ - We **train** the model only on the **training set**.
+ - We **evaluate** the result on the **test set**, which the model has not seen.
+ - We report **MSE** (mean squared error) and **R²** (explained variance) on the test set.
+
+ > Note: in this sklearn dataset, BMI is **normalized** (scaled). The slope `w` still indicates the **direction and strength** of the relationship (positive = a higher BMI is associated with a higher score).
+ """
 
+ def load_bmi_diabetes():
+     d = datasets.load_diabetes()
+     X = d.data[:, 2]  # BMI feature (normalized)
+     y = d.target      # progression score
+     return X.astype(np.float64), y.astype(np.float64), "Diabetes: BMI vs. score"
+
+ def train_test_split_1d(x, y, test_size=0.2, seed=42):
+     rng = np.random.RandomState(seed)
+     idx = np.arange(x.shape[0])
+     rng.shuffle(idx)
+     n_test = int(len(idx) * test_size)
+     test_idx = idx[:n_test]
+     train_idx = idx[n_test:]
+     return x[train_idx], y[train_idx], x[test_idx], y[test_idx]
+
+ def sgd_train_generator(lr, epochs, batch_size, seed, split_seed):
+     # Data & split
+     x, y, label = load_bmi_diabetes()
+     x_tr, y_tr, x_te, y_te = train_test_split_1d(x, y, test_size=0.2, seed=int(split_seed))
+
+     n = x_tr.shape[0]
      w, b = 0.0, 0.0
+
      x_min, x_max = float(np.min(x)), float(np.max(x))
+     train_losses = []
+     test_losses = []
+
+     rng = np.random.RandomState(int(seed))

      for epoch in range(1, int(epochs) + 1):
+         # shuffle train set
+         x_tr, y_tr = shuffle(x_tr, y_tr, random_state=rng)
+
+         # SGD over mini-batches
          for start in range(0, n, int(batch_size)):
              end = min(start + int(batch_size), n)
+             xb, yb = x_tr[start:end], y_tr[start:end]
              yhat = w * xb + b
              err = yb - yhat
              dw = -(2.0 / xb.size) * np.sum(xb * err)

              w -= lr * dw
              b -= lr * db
 
+         # Metrics on train and test
+         y_tr_pred = w * x_tr + b
+         y_te_pred = w * x_te + b
+         mse_tr = float(np.mean((y_tr - y_tr_pred)**2))
+         mse_te = float(np.mean((y_te - y_te_pred)**2))
+
+         # R^2 on test
+         ss_res = float(np.sum((y_te - y_te_pred)**2))
+         ss_tot = float(np.sum((y_te - np.mean(y_te))**2))
+         r2_te = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")
+
+         train_losses.append(mse_tr)
+         test_losses.append(mse_te)
+
+         # Plot 1: data (train vs test) + regression line
+         fig_main = plt.figure(figsize=(7, 4))
+         ax1 = fig_main.add_subplot(111)
+         ax1.scatter(x_tr, y_tr, alpha=0.6, s=18, label="train")
+         ax1.scatter(x_te, y_te, alpha=0.8, s=22, marker="x", label="test")
          xs = np.linspace(x_min, x_max, 200)
+         ax1.plot(xs, w * xs + b, linewidth=2, label="model")
+         ax1.set_title(f"{label} — Epoch {epoch}/{epochs}")
+         ax1.set_xlabel("BMI (normalized)")
+         ax1.set_ylabel("Progression score")
+         ax1.legend()
          ax1.grid(True, linestyle=":", linewidth=0.6)
          plt.tight_layout()
 
+         # Plot 2: loss curve (train & test)
+         fig_loss = plt.figure(figsize=(7, 3.5))
+         ax2 = fig_loss.add_subplot(111)
+         ax2.plot(range(1, len(train_losses)+1), train_losses, marker="o", label="Train MSE")
+         ax2.plot(range(1, len(test_losses)+1), test_losses, marker="o", linestyle="--", label="Test MSE")
+         ax2.set_title("Loss curve (MSE per epoch) — lower is better")
          ax2.set_xlabel("Epoch")
          ax2.set_ylabel("MSE")
+         ax2.legend()
          ax2.grid(True, linestyle=":", linewidth=0.6)
          plt.tight_layout()
 
+         # Plain-language results
+         verdict = "positive" if w >= 0 else "negative"
+         summary = (
+             f"**What does this tell us?**\n"
+             f"- Current regression line: `y = {w:.4f} * x + {b:.4f}`\n"
+             f"- Train MSE: `{mse_tr:.2f}` — Test MSE: `{mse_te:.2f}` — Test R²: `{r2_te:.3f}`\n"
+             f"- Interpretation: the relationship between BMI and progression score is **{verdict}** in this dataset "
+             f"(a higher BMI is associated with a higher score when `w > 0`)."
+         )
+
+         yield fig_main, fig_loss, summary
+
+ with gr.Blocks(title="Diabetes: BMI → Progression Score (Live Regression)") as demo:
+     gr.Markdown("# Diabetes: BMI → Progression Score (Live Linear Regression)")
+     gr.Markdown(EXPLAIN_MD)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             lr = gr.Slider(1e-4, 1e-0, value=5e-3, step=1e-4, label="Learning rate")
+             epochs = gr.Slider(5, 200, value=60, step=1, label="Epochs")
+             batch = gr.Slider(8, 256, value=64, step=1, label="Batch size")
+             seed = gr.Slider(0, 9999, value=42, step=1, label="Training seed")
+             split_seed = gr.Slider(0, 9999, value=7, step=1, label="Train/test split seed")
+             train_btn = gr.Button("Train live")
+         with gr.Column(scale=2):
+             plot_main = gr.Plot(label="Data (train/test) & regression line (live)")
+             plot_loss = gr.Plot(label="Loss curve (MSE per epoch) — train vs test")
+             results = gr.Markdown()
+
+     # Button-triggered training
+     train_btn.click(
+         fn=sgd_train_generator,
+         inputs=[lr, epochs, batch, seed, split_seed],
+         outputs=[plot_main, plot_loss, results]
+     )
+
+     # Auto-train on load with defaults
+     demo.load(
+         fn=sgd_train_generator,
+         inputs=[lr, epochs, batch, seed, split_seed],
+         outputs=[plot_main, plot_loss, results]
+     )
 
  if __name__ == "__main__":
      demo.launch()