Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
app.py
CHANGED
|
@@ -4,54 +4,57 @@ import matplotlib.pyplot as plt
|
|
| 4 |
from sklearn import datasets
|
| 5 |
from sklearn.utils import shuffle
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
X = d.data[:, 2]
|
| 21 |
-
y = d.target
|
| 22 |
-
return X, y, "Diabetes: BMI vs. disease progression"
|
| 23 |
-
elif name == "California Housing (MedInc vs value)":
|
| 24 |
-
try:
|
| 25 |
-
ch = datasets.fetch_california_housing()
|
| 26 |
-
X = ch.data[:, 0]
|
| 27 |
-
y = ch.target
|
| 28 |
-
return X, y, "California Housing: MedInc vs. house value"
|
| 29 |
-
except Exception:
|
| 30 |
-
X, y, _ = load_dataset("Synthetisch", n_samples=n_samples, noise=noise)
|
| 31 |
-
return X, y, "(Fallback) Synthetische data"
|
| 32 |
-
else:
|
| 33 |
-
raise ValueError("Onbekende dataset")
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# ------------------------------
|
| 37 |
-
# Training (SGD) met live plots
|
| 38 |
-
# ------------------------------
|
| 39 |
-
def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise, seed):
|
| 40 |
-
rng = np.random.RandomState(int(seed))
|
| 41 |
-
x, y, label = load_dataset(dataset_name, n_samples=n_samples, noise=noise)
|
| 42 |
-
n = x.shape[0]
|
| 43 |
-
x = x.astype(np.float64)
|
| 44 |
-
y = y.astype(np.float64)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
w, b = 0.0, 0.0
|
|
|
|
| 47 |
x_min, x_max = float(np.min(x)), float(np.max(x))
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
for epoch in range(1, int(epochs) + 1):
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
for start in range(0, n, int(batch_size)):
|
| 53 |
end = min(start + int(batch_size), n)
|
| 54 |
-
xb, yb =
|
| 55 |
yhat = w * xb + b
|
| 56 |
err = yb - yhat
|
| 57 |
dw = -(2.0 / xb.size) * np.sum(xb * err)
|
|
@@ -59,82 +62,88 @@ def sgd_train_generator(dataset_name, lr, epochs, batch_size, n_samples, noise,
|
|
| 59 |
w -= lr * dw
|
| 60 |
b -= lr * db
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
xs = np.linspace(x_min, x_max, 200)
|
| 71 |
-
ax1.plot(xs, w * xs + b, linewidth=2)
|
| 72 |
-
ax1.set_title(f"{label}
|
| 73 |
-
ax1.set_xlabel("
|
| 74 |
-
ax1.set_ylabel("
|
|
|
|
| 75 |
ax1.grid(True, linestyle=":", linewidth=0.6)
|
| 76 |
plt.tight_layout()
|
| 77 |
|
| 78 |
-
# Plot loss
|
| 79 |
-
|
| 80 |
-
ax2 =
|
| 81 |
-
ax2.plot(range(1,
|
| 82 |
-
ax2.
|
|
|
|
| 83 |
ax2.set_xlabel("Epoch")
|
| 84 |
ax2.set_ylabel("MSE")
|
|
|
|
| 85 |
ax2.grid(True, linestyle=":", linewidth=0.6)
|
| 86 |
plt.tight_layout()
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
plot_loss = gr.Plot(label="Loss curve (MSE)")
|
| 131 |
-
metrics = gr.Markdown()
|
| 132 |
-
|
| 133 |
-
train_btn.click(
|
| 134 |
-
fn=sgd_train_generator,
|
| 135 |
-
inputs=[dataset, lr, epochs, batch, n_samples, noise, seed],
|
| 136 |
-
outputs=[plot_data, plot_loss, metrics]
|
| 137 |
-
)
|
| 138 |
|
| 139 |
if __name__ == "__main__":
|
| 140 |
demo.launch()
|
|
|
|
| 4 |
from sklearn import datasets
|
| 5 |
from sklearn.utils import shuffle
|
| 6 |
|
| 7 |
+
EXPLAIN_MD = """
|
| 8 |
+
### Wat testen we hier?
|
| 9 |
+
We bekijken of er een **lineair verband** is tussen **BMI** en de **diabetes-progressiescore** in een bekende (openbare) dataset.
|
| 10 |
+
Dat doen we met *supervised learning*: het model ziet voorbeelden `(BMI β score)` en leert een lijn \(y = w x + b\) die dit verband benadert.
|
| 11 |
+
|
| 12 |
+
**Hoe meten we of dat gelukt is?**
|
| 13 |
+
- We splitsen de data in **train (80%)** en **test (20%)**.
|
| 14 |
+
- We **trainen** het model alleen op de **trainset**.
|
| 15 |
+
- We **toetsen** het resultaat op de **testset** die het model niet gezien heeft.
|
| 16 |
+
- We rapporteren **MSE** (gemiddelde kwadratische fout) en **RΒ²** (uitlegvariantie) op de testset.
|
| 17 |
+
|
| 18 |
+
> Let op: in deze sklearn-dataset is BMI **genormaliseerd** (geschaald). De helling `w` geeft wel de **richting en sterkte** aan (positief = hogere BMI hangt samen met hogere score).
|
| 19 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
def load_bmi_diabetes():
    """Load the sklearn diabetes data as (BMI feature, target score, plot label).

    Returns:
        Tuple of two 1-D float64 arrays (BMI, progression score) and a short
        label string used as the plot title.
    """
    bunch = datasets.load_diabetes()
    # Column 2 of the design matrix is the (normalised) BMI feature.
    bmi = bunch.data[:, 2].astype(np.float64)
    score = bunch.target.astype(np.float64)
    return bmi, score, "Diabetes: BMI vs. score"
|
| 26 |
+
|
| 27 |
+
def train_test_split_1d(x, y, test_size=0.2, seed=42):
    """Split paired 1-D arrays into train/test parts via a seeded shuffle.

    The index order is shuffled with ``np.random.RandomState(seed)``; the
    first ``int(n * test_size)`` shuffled indices become the test set.

    Returns:
        (x_train, y_train, x_test, y_test)
    """
    order = np.arange(x.shape[0])
    np.random.RandomState(seed).shuffle(order)
    # Head of the shuffled order is held out for testing, tail is for training.
    cut = int(order.size * test_size)
    test_part, train_part = order[:cut], order[cut:]
    return x[train_part], y[train_part], x[test_part], y[test_part]
|
| 35 |
+
|
| 36 |
+
def sgd_train_generator(lr, epochs, batch_size, seed, split_seed):
|
| 37 |
+
# Data & split
|
| 38 |
+
x, y, label = load_bmi_diabetes()
|
| 39 |
+
x_tr, y_tr, x_te, y_te = train_test_split_1d(x, y, test_size=0.2, seed=int(split_seed))
|
| 40 |
+
|
| 41 |
+
n = x_tr.shape[0]
|
| 42 |
w, b = 0.0, 0.0
|
| 43 |
+
|
| 44 |
x_min, x_max = float(np.min(x)), float(np.max(x))
|
| 45 |
+
train_losses = []
|
| 46 |
+
test_losses = []
|
| 47 |
+
|
| 48 |
+
rng = np.random.RandomState(int(seed))
|
| 49 |
|
| 50 |
for epoch in range(1, int(epochs) + 1):
|
| 51 |
+
# shuffle train set
|
| 52 |
+
x_tr, y_tr = shuffle(x_tr, y_tr, random_state=rng)
|
| 53 |
+
|
| 54 |
+
# SGD over mini-batches
|
| 55 |
for start in range(0, n, int(batch_size)):
|
| 56 |
end = min(start + int(batch_size), n)
|
| 57 |
+
xb, yb = x_tr[start:end], y_tr[start:end]
|
| 58 |
yhat = w * xb + b
|
| 59 |
err = yb - yhat
|
| 60 |
dw = -(2.0 / xb.size) * np.sum(xb * err)
|
|
|
|
| 62 |
w -= lr * dw
|
| 63 |
b -= lr * db
|
| 64 |
|
| 65 |
+
# Metrics on train and test
|
| 66 |
+
y_tr_pred = w * x_tr + b
|
| 67 |
+
y_te_pred = w * x_te + b
|
| 68 |
+
mse_tr = float(np.mean((y_tr - y_tr_pred)**2))
|
| 69 |
+
mse_te = float(np.mean((y_te - y_te_pred)**2))
|
| 70 |
+
|
| 71 |
+
# R^2 on test
|
| 72 |
+
ss_res = float(np.sum((y_te - y_te_pred)**2))
|
| 73 |
+
ss_tot = float(np.sum((y_te - np.mean(y_te))**2))
|
| 74 |
+
r2_te = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")
|
| 75 |
+
|
| 76 |
+
train_losses.append(mse_tr)
|
| 77 |
+
test_losses.append(mse_te)
|
| 78 |
+
|
| 79 |
+
# Plot 1: data (train vs test) + regressielijn
|
| 80 |
+
fig_main = plt.figure(figsize=(7, 4))
|
| 81 |
+
ax1 = fig_main.add_subplot(111)
|
| 82 |
+
ax1.scatter(x_tr, y_tr, alpha=0.6, s=18, label="train")
|
| 83 |
+
ax1.scatter(x_te, y_te, alpha=0.8, s=22, marker="x", label="test")
|
| 84 |
xs = np.linspace(x_min, x_max, 200)
|
| 85 |
+
ax1.plot(xs, w * xs + b, linewidth=2, label="model")
|
| 86 |
+
ax1.set_title(f"{label} β Epoch {epoch}/{epochs}")
|
| 87 |
+
ax1.set_xlabel("BMI (genormaliseerd)")
|
| 88 |
+
ax1.set_ylabel("Progressiescore")
|
| 89 |
+
ax1.legend()
|
| 90 |
ax1.grid(True, linestyle=":", linewidth=0.6)
|
| 91 |
plt.tight_layout()
|
| 92 |
|
| 93 |
+
# Plot 2: loss-curve (train & test)
|
| 94 |
+
fig_loss = plt.figure(figsize=(7, 3.5))
|
| 95 |
+
ax2 = fig_loss.add_subplot(111)
|
| 96 |
+
ax2.plot(range(1, len(train_losses)+1), train_losses, marker="o", label="Train MSE")
|
| 97 |
+
ax2.plot(range(1, len(test_losses)+1), test_losses, marker="o", linestyle="--", label="Test MSE")
|
| 98 |
+
ax2.set_title("Loss-curve (MSE per epoch) β lager is beter")
|
| 99 |
ax2.set_xlabel("Epoch")
|
| 100 |
ax2.set_ylabel("MSE")
|
| 101 |
+
ax2.legend()
|
| 102 |
ax2.grid(True, linestyle=":", linewidth=0.6)
|
| 103 |
plt.tight_layout()
|
| 104 |
|
| 105 |
+
# Plain-language results
|
| 106 |
+
verdict = "positief" if w >= 0 else "negatief"
|
| 107 |
+
summary = (
|
| 108 |
+
f"**Wat levert dit op?**\n"
|
| 109 |
+
f"- Huidige regressielijn: `y = {w:.4f} * x + {b:.4f}`\n"
|
| 110 |
+
f"- Train MSE: `{mse_tr:.2f}` β Test MSE: `{mse_te:.2f}` β Test RΒ²: `{r2_te:.3f}`\n"
|
| 111 |
+
f"- Interpretatie: het verband tussen BMI en progressiescore is **{verdict}** in deze dataset "
|
| 112 |
+
f"(hogere BMI hangt samen met hogere score als `w > 0`)."
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
yield fig_main, fig_loss, summary
|
| 116 |
+
|
| 117 |
+
# Gradio UI: hyperparameter sliders on the left, live plots + summary on the
# right. Training streams frames from the sgd_train_generator generator.
# NOTE(review): "β" in the title/label strings looks like a mangled arrow/dash —
# left byte-identical (user-facing runtime text); confirm original encoding.
with gr.Blocks(title="Diabetes: BMI β Progressiescore (Live Regressie)") as demo:
    gr.Markdown("# Diabetes: BMI β Progressiescore (Live Lineaire Regressie)")
    gr.Markdown(EXPLAIN_MD)

    with gr.Row():
        with gr.Column(scale=1):
            # SGD hyperparameters; two separate seeds so the train/test split
            # can be held fixed while the training shuffle varies (and vice versa).
            lr = gr.Slider(1e-4, 1e-0, value=5e-3, step=1e-4, label="Learning rate")
            epochs = gr.Slider(5, 200, value=60, step=1, label="Epochs")
            batch = gr.Slider(8, 256, value=64, step=1, label="Batchgrootte")
            seed = gr.Slider(0, 9999, value=42, step=1, label="Training seed")
            split_seed = gr.Slider(0, 9999, value=7, step=1, label="Train/test split seed")
            train_btn = gr.Button("Train live")
        with gr.Column(scale=2):
            # Outputs updated once per epoch by the generator's yields.
            plot_main = gr.Plot(label="Data (train/test) & regressielijn (live)")
            plot_loss = gr.Plot(label="Loss-curve (MSE per epoch) β train vs test")
            results = gr.Markdown()

    # Button-triggered training
    train_btn.click(
        fn=sgd_train_generator,
        inputs=[lr, epochs, batch, seed, split_seed],
        outputs=[plot_main, plot_loss, results]
    )

    # Auto-train on load with defaults
    demo.load(
        fn=sgd_train_generator,
        inputs=[lr, epochs, batch, seed, split_seed],
        outputs=[plot_main, plot_loss, results]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    demo.launch()
|