Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
app.py
CHANGED
|
@@ -4,54 +4,57 @@ import matplotlib.pyplot as plt
|
|
| 4 |
from sklearn import datasets
|
| 5 |
from sklearn.utils import shuffle
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
X = d.data[:, 2]
|
| 21 |
-
y = d.target
|
| 22 |
-
return X, y, "Diabetes: BMI vs. disease progression"
|
| 23 |
-
elif name == "California Housing (MedInc vs value)":
|
| 24 |
-
try:
|
| 25 |
-
ch = datasets.fetch_california_housing()
|
| 26 |
-
X = ch.data[:, 0]
|
| 27 |
-
y = ch.target
|
| 28 |
-
return X, y, "California Housing: MedInc vs. house value"
|
| 29 |
-
except Exception:
|
| 30 |
-
X, y, _ = load_dataset("Synthetisch", n_samples=n_samples, noise=noise)
|
| 31 |
-
return X, y, "(Fallback) Synthetische data"
|
| 32 |
-
else:
|
| 33 |
-
raise ValueError("Onbekende dataset")
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# ------------------------------
|
| 37 |
-
# Training (SGD) met live plots
|
| 38 |
-
# ------------------------------
|
| 39 |
-
def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise, seed):
|
| 40 |
-
rng = np.random.RandomState(int(seed))
|
| 41 |
-
x, y, label = load_dataset(dataset_name, n_samples=n_samples, noise=noise)
|
| 42 |
-
n = x.shape[0]
|
| 43 |
-
x = x.astype(np.float64)
|
| 44 |
-
y = y.astype(np.float64)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
w, b = 0.0, 0.0
|
|
|
|
| 47 |
x_min, x_max = float(np.min(x)), float(np.max(x))
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
for epoch in range(1, int(epochs) + 1):
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
for start in range(0, n, int(batch_size)):
|
| 53 |
end = min(start + int(batch_size), n)
|
| 54 |
-
xb, yb =
|
| 55 |
yhat = w * xb + b
|
| 56 |
err = yb - yhat
|
| 57 |
dw = -(2.0 / xb.size) * np.sum(xb * err)
|
|
@@ -59,82 +62,88 @@ def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise,
|
|
| 59 |
w -= lr * dw
|
| 60 |
b -= lr * db
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
xs = np.linspace(x_min, x_max, 200)
|
| 71 |
-
ax1.plot(xs, w * xs + b, linewidth=2)
|
| 72 |
-
ax1.set_title(f"{label}
|
| 73 |
-
ax1.set_xlabel("
|
| 74 |
-
ax1.set_ylabel("
|
|
|
|
| 75 |
ax1.grid(True, linestyle=":", linewidth=0.6)
|
| 76 |
plt.tight_layout()
|
| 77 |
|
| 78 |
-
# Plot loss
|
| 79 |
-
|
| 80 |
-
ax2 =
|
| 81 |
-
ax2.plot(range(1,
|
| 82 |
-
ax2.
|
|
|
|
| 83 |
ax2.set_xlabel("Epoch")
|
| 84 |
ax2.set_ylabel("MSE")
|
|
|
|
| 85 |
ax2.grid(True, linestyle=":", linewidth=0.6)
|
| 86 |
plt.tight_layout()
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
plot_loss = gr.Plot(label="Loss curve (MSE)")
|
| 131 |
-
metrics = gr.Markdown()
|
| 132 |
-
|
| 133 |
-
train_btn.click(
|
| 134 |
-
fn=sgd_train_generator,
|
| 135 |
-
inputs=[dataset, lr, epochs, batch, n_samples, noise, seed],
|
| 136 |
-
outputs=[plot_data, plot_loss, metrics]
|
| 137 |
-
)
|
| 138 |
|
| 139 |
if __name__ == "__main__":
|
| 140 |
demo.launch()
|
|
|
|
| 4 |
from sklearn import datasets
|
| 5 |
from sklearn.utils import shuffle
|
| 6 |
|
| 7 |
+
EXPLAIN_MD = """
|
| 8 |
+
### Wat testen we hier?
|
| 9 |
+
We bekijken of er een **lineair verband** is tussen **BMI** en de **diabetes-progressiescore** in een bekende (openbare) dataset.
|
| 10 |
+
Dat doen we met *supervised learning*: het model ziet voorbeelden `(BMI β score)` en leert een lijn \(y = w x + b\) die dit verband benadert.
|
| 11 |
+
|
| 12 |
+
**Hoe meten we of dat gelukt is?**
|
| 13 |
+
- We splitsen de data in **train (80%)** en **test (20%)**.
|
| 14 |
+
- We **trainen** het model alleen op de **trainset**.
|
| 15 |
+
- We **toetsen** het resultaat op de **testset** die het model niet gezien heeft.
|
| 16 |
+
- We rapporteren **MSE** (gemiddelde kwadratische fout) en **RΒ²** (uitlegvariantie) op de testset.
|
| 17 |
+
|
| 18 |
+
> Let op: in deze sklearn-dataset is BMI **genormaliseerd** (geschaald). De helling `w` geeft wel de **richting en sterkte** aan (positief = hogere BMI hangt samen met hogere score).
|
| 19 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
def load_bmi_diabetes():
    """Load the sklearn diabetes data as (BMI feature, target score, plot label).

    Returns:
        Tuple of two 1-D float64 arrays (BMI, progression score) and a short
        label string used as the plot title.
    """
    bunch = datasets.load_diabetes()
    # Column 2 of the design matrix is the (normalised) BMI feature.
    bmi = bunch.data[:, 2].astype(np.float64)
    score = bunch.target.astype(np.float64)
    return bmi, score, "Diabetes: BMI vs. score"
|
| 26 |
+
|
| 27 |
+
def train_test_split_1d(x, y, test_size=0.2, seed=42):
    """Split paired 1-D arrays into train/test parts via a seeded shuffle.

    The index order is shuffled with ``np.random.RandomState(seed)``; the
    first ``int(n * test_size)`` shuffled indices become the test set.

    Returns:
        (x_train, y_train, x_test, y_test)
    """
    order = np.arange(x.shape[0])
    np.random.RandomState(seed).shuffle(order)
    # Head of the shuffled order is held out for testing, tail is for training.
    cut = int(order.size * test_size)
    test_part, train_part = order[:cut], order[cut:]
    return x[train_part], y[train_part], x[test_part], y[test_part]
|
| 35 |
+
|
| 36 |
+
def sgd_train_generator(lr, epochs, batch_size, seed, split_seed):
|
| 37 |
+
# Data & split
|
| 38 |
+
x, y, label = load_bmi_diabetes()
|
| 39 |
+
x_tr, y_tr, x_te, y_te = train_test_split_1d(x, y, test_size=0.2, seed=int(split_seed))
|
| 40 |
+
|
| 41 |
+
n = x_tr.shape[0]
|
| 42 |
w, b = 0.0, 0.0
|
| 43 |
+
|
| 44 |
x_min, x_max = float(np.min(x)), float(np.max(x))
|
| 45 |
+
train_losses = []
|
| 46 |
+
test_losses = []
|
| 47 |
+
|
| 48 |
+
rng = np.random.RandomState(int(seed))
|
| 49 |
|
| 50 |
for epoch in range(1, int(epochs) + 1):
|
| 51 |
+
# shuffle train set
|
| 52 |
+
x_tr, y_tr = shuffle(x_tr, y_tr, random_state=rng)
|
| 53 |
+
|
| 54 |
+
# SGD over mini-batches
|
| 55 |
for start in range(0, n, int(batch_size)):
|
| 56 |
end = min(start + int(batch_size), n)
|
| 57 |
+
xb, yb = x_tr[start:end], y_tr[start:end]
|
| 58 |
yhat = w * xb + b
|
| 59 |
err = yb - yhat
|
| 60 |
dw = -(2.0 / xb.size) * np.sum(xb * err)
|
|
|
|
| 62 |
w -= lr * dw
|
| 63 |
b -= lr * db
|
| 64 |
|
| 65 |
+
# Metrics on train and test
|
| 66 |
+
y_tr_pred = w * x_tr + b
|
| 67 |
+
y_te_pred = w * x_te + b
|
| 68 |
+
mse_tr = float(np.mean((y_tr - y_tr_pred)**2))
|
| 69 |
+
mse_te = float(np.mean((y_te - y_te_pred)**2))
|
| 70 |
+
|
| 71 |
+
# R^2 on test
|
| 72 |
+
ss_res = float(np.sum((y_te - y_te_pred)**2))
|
| 73 |
+
ss_tot = float(np.sum((y_te - np.mean(y_te))**2))
|
| 74 |
+
r2_te = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")
|
| 75 |
+
|
| 76 |
+
train_losses.append(mse_tr)
|
| 77 |
+
test_losses.append(mse_te)
|
| 78 |
+
|
| 79 |
+
# Plot 1: data (train vs test) + regressielijn
|
| 80 |
+
fig_main = plt.figure(figsize=(7, 4))
|
| 81 |
+
ax1 = fig_main.add_subplot(111)
|
| 82 |
+
ax1.scatter(x_tr, y_tr, alpha=0.6, s=18, label="train")
|
| 83 |
+
ax1.scatter(x_te, y_te, alpha=0.8, s=22, marker="x", label="test")
|
| 84 |
xs = np.linspace(x_min, x_max, 200)
|
| 85 |
+
ax1.plot(xs, w * xs + b, linewidth=2, label="model")
|
| 86 |
+
ax1.set_title(f"{label} β Epoch {epoch}/{epochs}")
|
| 87 |
+
ax1.set_xlabel("BMI (genormaliseerd)")
|
| 88 |
+
ax1.set_ylabel("Progressiescore")
|
| 89 |
+
ax1.legend()
|
| 90 |
ax1.grid(True, linestyle=":", linewidth=0.6)
|
| 91 |
plt.tight_layout()
|
| 92 |
|
| 93 |
+
# Plot 2: loss-curve (train & test)
|
| 94 |
+
fig_loss = plt.figure(figsize=(7, 3.5))
|
| 95 |
+
ax2 = fig_loss.add_subplot(111)
|
| 96 |
+
ax2.plot(range(1, len(train_losses)+1), train_losses, marker="o", label="Train MSE")
|
| 97 |
+
ax2.plot(range(1, len(test_losses)+1), test_losses, marker="o", linestyle="--", label="Test MSE")
|
| 98 |
+
ax2.set_title("Loss-curve (MSE per epoch) β lager is beter")
|
| 99 |
ax2.set_xlabel("Epoch")
|
| 100 |
ax2.set_ylabel("MSE")
|
| 101 |
+
ax2.legend()
|
| 102 |
ax2.grid(True, linestyle=":", linewidth=0.6)
|
| 103 |
plt.tight_layout()
|
| 104 |
|
| 105 |
+
# Plain-language results
|
| 106 |
+
verdict = "positief" if w >= 0 else "negatief"
|
| 107 |
+
summary = (
|
| 108 |
+
f"**Wat levert dit op?**\n"
|
| 109 |
+
f"- Huidige regressielijn: `y = {w:.4f} * x + {b:.4f}`\n"
|
| 110 |
+
f"- Train MSE: `{mse_tr:.2f}` β Test MSE: `{mse_te:.2f}` β Test RΒ²: `{r2_te:.3f}`\n"
|
| 111 |
+
f"- Interpretatie: het verband tussen BMI en progressiescore is **{verdict}** in deze dataset "
|
| 112 |
+
f"(hogere BMI hangt samen met hogere score als `w > 0`)."
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
yield fig_main, fig_loss, summary
|
| 116 |
+
|
| 117 |
+
# Gradio UI: hyperparameter sliders on the left, live plots + summary on the
# right. Training streams frames from the sgd_train_generator generator.
# NOTE(review): "β" in the title/label strings looks like a mangled arrow/dash —
# left byte-identical (user-facing runtime text); confirm original encoding.
with gr.Blocks(title="Diabetes: BMI β Progressiescore (Live Regressie)") as demo:
    gr.Markdown("# Diabetes: BMI β Progressiescore (Live Lineaire Regressie)")
    gr.Markdown(EXPLAIN_MD)

    with gr.Row():
        with gr.Column(scale=1):
            # SGD hyperparameters; two separate seeds so the train/test split
            # can be held fixed while the training shuffle varies (and vice versa).
            lr = gr.Slider(1e-4, 1e-0, value=5e-3, step=1e-4, label="Learning rate")
            epochs = gr.Slider(5, 200, value=60, step=1, label="Epochs")
            batch = gr.Slider(8, 256, value=64, step=1, label="Batchgrootte")
            seed = gr.Slider(0, 9999, value=42, step=1, label="Training seed")
            split_seed = gr.Slider(0, 9999, value=7, step=1, label="Train/test split seed")
            train_btn = gr.Button("Train live")
        with gr.Column(scale=2):
            # Outputs updated once per epoch by the generator's yields.
            plot_main = gr.Plot(label="Data (train/test) & regressielijn (live)")
            plot_loss = gr.Plot(label="Loss-curve (MSE per epoch) β train vs test")
            results = gr.Markdown()

    # Button-triggered training
    train_btn.click(
        fn=sgd_train_generator,
        inputs=[lr, epochs, batch, seed, split_seed],
        outputs=[plot_main, plot_loss, results]
    )

    # Auto-train on load with defaults
    demo.load(
        fn=sgd_train_generator,
        inputs=[lr, epochs, batch, seed, split_seed],
        outputs=[plot_main, plot_loss, results]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    demo.launch()
|