"""Supervised-learning demo: synthetic depression dataset with realtime
SGD training, a 2-D PCA scatter (one marker per patient) and decision
boundaries, served as a single-page Gradio app."""

import time
import json

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import gradio as gr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# ---------- Built-in dataset ----------
def load_builtin_dataset(n=1000, seed=42):
    """Generate a synthetic, self-contained 'depression' dataset.

    Parameters
    ----------
    n : int
        Number of patients (rows) to generate.
    seed : int
        Seed for the random generator, so the data is reproducible.

    Returns
    -------
    (pd.DataFrame, str)
        The dataframe and the name of the binary label column.
    """
    rng = np.random.default_rng(seed)
    age = rng.integers(18, 75, size=n)
    gender = rng.choice([0, 1], size=n)  # dummy feature
    sleep_quality = np.clip(rng.normal(6.5, 1.5, size=n), 1, 10)
    energy = np.clip(rng.normal(6.0, 1.7, size=n), 1, 10)
    anhedonia = np.clip(rng.normal(3.5, 1.8, size=n), 1, 10)
    stress = np.clip(rng.normal(4.5, 2.0, size=n), 1, 10)
    social_support = np.clip(rng.normal(6.0, 1.8, size=n), 1, 10)
    # Activity correlates positively with energy, negatively with stress.
    activity = np.clip(rng.normal(3.0 + 0.4 * energy - 0.2 * stress, 1.5, size=n), 0, 10)
    # PHQ-9-like severity score, clipped to the instrument's 0-27 range.
    phq9 = np.clip(
        0.8 * anhedonia + 0.7 * stress - 0.5 * sleep_quality - 0.4 * energy
        + rng.normal(0, 1.2, size=n) + 5,
        0, 27
    )
    # Linear log-odds for the binary label, centered at the median so the
    # classes are roughly balanced.
    logit = (
        + 0.65 * anhedonia
        + 0.55 * stress
        - 0.45 * sleep_quality
        - 0.40 * energy
        - 0.30 * social_support
        - 0.20 * activity
        + 0.01 * (age - 40)
        + 0.05 * gender
        + rng.normal(0, 0.6, size=n)
    )
    logit -= np.median(logit)
    prob = 1 / (1 + np.exp(-logit))
    depressed = (prob > 0.5).astype(int)

    df = pd.DataFrame({
        "age": age,
        "gender": gender,
        "sleep_quality": sleep_quality,
        "energy": energy,
        "anhedonia": anhedonia,
        "stress": stress,
        "social_support": social_support,
        "activity": activity,
        "phq9": phq9,
        "depressed": depressed,
    })
    return df, "depressed"


# ---------- Helpers ----------
def ensure_min_classes(y):
    """Raise a user-visible Gradio error if the label is not at least binary."""
    if len(np.unique(y)) < 2:
        raise gr.Error("Label heeft minder dan 2 unieke klassen.")


def make_base_fig(coords, y, title):
    """Build the base 2-D PCA scatter, one trace (color) per class.

    Parameters
    ----------
    coords : np.ndarray of shape (n, 2)
        PCA-projected coordinates.
    y : array-like
        Class labels (any dtype; rendered as strings).
    title : str
        Figure title.
    """
    # Bright palette on a white canvas.
    palette = ["#2563eb", "#ef4444", "#10b981", "#f59e0b", "#a855f7",
               "#06b6d4", "#f97316", "#22c55e"]
    fig = go.Figure()
    fig.update_layout(
        title=title,
        xaxis_title="PC1",
        yaxis_title="PC2",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=10, r=10, t=60, b=10),
        template=None,
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        height=520,
    )
    labels = pd.Series(y).astype(str).values
    uniq = list(np.unique(labels))
    for i, lbl in enumerate(uniq):
        mask = labels == lbl
        color = palette[i % len(palette)]
        fig.add_trace(go.Scatter(
            x=coords[mask, 0],
            y=coords[mask, 1],
            mode="markers",
            name=f"Klasse {lbl}",
            marker=dict(size=10, opacity=0.95, color=color,
                        line=dict(width=1, color="#111")),
            # Plotly hovertemplates use <br> for line breaks.
            hovertemplate="PC1: %{x:.2f}<br>PC2: %{y:.2f}<br>" + f"Klasse {lbl}",
        ))
    return fig


def draw_decision_boundary(fig, clf2d, scaler2d, pca2d, X_scaled):
    """Overlay decision lines of a 2-D surrogate classifier on the scatter.

    The grid is built in PCA space, scaled with `scaler2d` (the scaler the
    surrogate `clf2d` was trained on), scored, and drawn as contour lines.
    """
    coords = pca2d.transform(X_scaled)
    x_min, x_max = coords[:, 0].min() - 0.5, coords[:, 0].max() + 0.5
    y_min, y_max = coords[:, 1].min() - 0.5, coords[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                         np.linspace(y_min, y_max, 200))
    grid_2d = np.c_[xx.ravel(), yy.ravel()]
    coords_grid_s = scaler2d.transform(grid_2d)

    if hasattr(clf2d, "predict_proba"):
        Z = clf2d.predict_proba(coords_grid_s)[:, -1]
    else:
        # Normalize decision_function scores to [0, 1] for contouring.
        dec = clf2d.decision_function(coords_grid_s)
        Z = (dec - np.nanmin(dec)) / (np.nanmax(dec) - np.nanmin(dec) + 1e-9)
    Z = np.nan_to_num(Z, nan=0.5, posinf=1.0, neginf=0.0).reshape(xx.shape)

    fig.add_trace(go.Contour(
        x=np.linspace(x_min, x_max, 200),
        y=np.linspace(y_min, y_max, 200),
        z=Z,
        showscale=False,
        contours=dict(coloring="lines", showlines=True),
        line=dict(width=1),
        opacity=0.8,
        name="Beslissingslijnen",
    ))
    return fig


def get_model(model_name, params):
    """Instantiate the selected classifier from its UI hyperparameters.

    Falls back to LogisticRegression for unknown model names.
    """
    if model_name == "SGDClassifier (realtime)":
        # max_iter=1: one pass per partial_fit call, for epoch-wise streaming.
        return SGDClassifier(
            loss=params.get("sgd_loss", "log_loss"),
            alpha=params.get("sgd_alpha", 1e-4),
            learning_rate=params.get("sgd_lr", "optimal"),
            max_iter=1,
            random_state=42,
        )
    elif model_name == "Logistic Regression":
        return LogisticRegression(max_iter=300)
    elif model_name == "Random Forest":
        return RandomForestClassifier(
            n_estimators=int(params.get("rf_n", 250)),
            max_depth=int(params.get("rf_depth", 8)) if params.get("rf_depth", None) else None,
            random_state=42,
        )
    elif model_name == "SVM (RBF)":
        return SVC(probability=True, gamma="scale",
                   C=params.get("svm_c", 1.0), random_state=42)
    return LogisticRegression(max_iter=300)


# ---------- Train & Stream ----------
def train_and_stream(test_size, model_name, params, epochs, pause_s):
    """Train the chosen model and yield (figure, metrics-markdown) updates.

    This is a generator: the realtime SGD branch yields once per epoch;
    every other model yields exactly once after fitting. (A plain `return`
    of values inside a generator would silently discard them, so both
    branches must `yield`.)
    """
    df, ycol = load_builtin_dataset()
    X = df.drop(columns=[ycol]).values
    y = df[ycol].values
    ensure_min_classes(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    scaler = StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)

    pca = PCA(n_components=2, random_state=42).fit(X_train_s)
    coords_train = pca.transform(X_train_s)
    coords_test = pca.transform(X_test_s)

    clf = get_model(model_name, params)

    if model_name == "SGDClassifier (realtime)":
        classes = np.unique(y_train)
        for e in range(1, int(epochs) + 1):
            clf.partial_fit(X_train_s, y_train, classes=classes)
            y_pred = clf.predict(X_test_s)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")
            try:
                y_proba = clf.predict_proba(X_test_s)[:, -1]
                auc = roc_auc_score(y_test, y_proba)
            except Exception:
                # Losses like "hinge" have no predict_proba.
                auc = np.nan

            # 2-D surrogate model for drawing boundaries in PCA space.
            scaler2d = StandardScaler().fit(coords_train)
            coords_train_s = scaler2d.transform(coords_train)
            clf2d = LogisticRegression(max_iter=200).fit(coords_train_s, y_train)

            title = f"Epoch {e}/{epochs} • Acc {acc:.2f} • F1 {f1:.2f}"
            fig_epoch = make_base_fig(coords_train, y_train, title=title)
            fig_epoch = draw_decision_boundary(fig_epoch, clf2d, scaler2d, pca, X_train_s)
            fig_epoch.add_trace(go.Scatter(
                x=coords_test[:, 0], y=coords_test[:, 1],
                mode="markers",
                name="Test set",
                marker=dict(size=10, symbol="circle-open",
                            line=dict(width=2, color="#111")),
                hovertemplate="PC1: %{x:.2f}<br>PC2: %{y:.2f}<br>Test set",
            ))
            metrics_md = (
                f"### Metrieken (testset)\n"
                f"**Accuracy:** {acc:.3f} \n"
                f"**F1 (gewogen):** {f1:.3f} \n"
                f"**ROC AUC:** {auc:.3f}\n"
            )
            # Important: yield a real Plotly Figure.
            yield fig_epoch, metrics_md
            if pause_s and float(pause_s) > 0:
                time.sleep(float(pause_s))
        return
    else:
        clf.fit(X_train_s, y_train)
        y_pred = clf.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        try:
            y_proba = clf.predict_proba(X_test_s)[:, -1]
            auc = roc_auc_score(y_test, y_proba)
        except Exception:
            auc = np.nan

        fig = make_base_fig(coords_train, y_train, title=f"Model: {model_name}")
        scaler2d = StandardScaler().fit(coords_train)
        coords_train_s = scaler2d.transform(coords_train)
        clf2d = LogisticRegression(max_iter=200).fit(coords_train_s, y_train)
        fig = draw_decision_boundary(fig, clf2d, scaler2d, pca, X_train_s)
        fig.add_trace(go.Scatter(
            x=coords_test[:, 0], y=coords_test[:, 1],
            mode="markers",
            name="Test set",
            marker=dict(size=10, symbol="circle-open",
                        line=dict(width=2, color="#111")),
        ))
        metrics_md = (
            f"### Metrieken (testset)\n"
            f"**Accuracy:** {acc:.3f} \n"
            f"**F1 (gewogen):** {f1:.3f} \n"
            f"**ROC AUC:** {auc:.3f}\n"
        )
        # Bugfix: this function is a generator (the SGD branch yields), so a
        # `return fig, metrics_md` here would be discarded by Gradio.
        yield fig, metrics_md


# ---------- UI ----------
DESCRIPTION = """
# 🧠 Supervised Leren – Depressie (synthetisch, ingebouwd)
- **Realtime** training (SGD) met **PCA-scatter** (elk bolletje = patiënt) en **beslissingslijnen**.
- Eén pagina, helder wit canvas. Geen uploads nodig.
"""

with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", neutral_hue="slate")) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            ds_preview = gr.Dataframe(label="Voorbeeld van de data (eerste 10 rijen)")
            btn_preview = gr.Button("📄 Dataset preview vernieuwen", variant="secondary")
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                label="Model",
                choices=["SGDClassifier (realtime)", "Logistic Regression",
                         "Random Forest", "SVM (RBF)"],
                value="SGDClassifier (realtime)",
            )
            with gr.Accordion("Hyperparameters", open=False):
                sgd_loss = gr.Dropdown(["log_loss", "hinge", "modified_huber"],
                                       value="log_loss", label="SGD loss")
                sgd_alpha = gr.Slider(1e-6, 1e-2, value=1e-4, step=1e-6,
                                      label="SGD alpha (L2)")
                sgd_lr = gr.Dropdown(["optimal", "invscaling", "constant", "adaptive"],
                                     value="optimal", label="SGD learning rate")
                rf_n = gr.Slider(50, 500, value=250, step=10,
                                 label="RandomForest n_estimators")
                rf_depth = gr.Slider(0, 20, value=8, step=1,
                                     label="RandomForest max_depth (0 = None)")
                svm_c = gr.Slider(0.1, 5.0, value=1.0, step=0.1, label="SVM C")
            test_size = gr.Slider(0.1, 0.5, value=0.25, step=0.05,
                                  label="Testset proportie")
            with gr.Row():
                epochs = gr.Slider(1, 30, value=12, step=1,
                                   label="Epochs (alleen realtime SGD)")
                pause_s = gr.Slider(0.0, 1.0, value=0.15, step=0.05,
                                    label="Pauze per epoch (s)")
            btn_train = gr.Button("🚀 Train & Visualiseer", variant="primary")

    with gr.Row():
        # Bugfix: stray ")" removed from the label text.
        fig_out = gr.Plot(label="Visualisatie (PCA 2D) met beslissingslijnen")
        metrics_out = gr.Markdown(label="Metrieken")

    with gr.Row():
        with gr.Column():
            row_index = gr.Slider(0, 999, value=0, step=1,
                                  label="Kies een patiënt (rij-index) voor voorspelling")
            btn_predict = gr.Button("🔮 Voorspel voor gekozen patiënt", variant="secondary")
            pred_md = gr.Markdown(label="Voorspelling")

    # Preload: preview first, then train immediately.
    demo.load(lambda: load_builtin_dataset()[0].head(10),
              inputs=None, outputs=[ds_preview])

    def _proxy_train(test_size_v, model_name_v, sgd_loss_v, sgd_alpha_v,
                     sgd_lr_v, rf_n_v, rf_depth_v, svm_c_v, epochs_v, pause_v):
        """Collect UI hyperparameters into a dict and stream training updates."""
        params = dict(
            sgd_loss=sgd_loss_v,
            sgd_alpha=float(sgd_alpha_v),
            sgd_lr=sgd_lr_v,
            rf_n=int(rf_n_v),
            rf_depth=None if int(rf_depth_v) == 0 else int(rf_depth_v),
            svm_c=float(svm_c_v),
        )
        yield from train_and_stream(test_size_v, model_name_v, params, epochs_v, pause_v)

    demo.load(
        _proxy_train,
        inputs=[test_size, model_choice, sgd_loss, sgd_alpha, sgd_lr,
                rf_n, rf_depth, svm_c, epochs, pause_s],
        outputs=[fig_out, metrics_out],
    )
    btn_preview.click(lambda: load_builtin_dataset()[0].head(10),
                      inputs=None, outputs=[ds_preview])
    btn_train.click(
        _proxy_train,
        inputs=[test_size, model_choice, sgd_loss, sgd_alpha, sgd_lr,
                rf_n, rf_depth, svm_c, epochs, pause_s],
        outputs=[fig_out, metrics_out],
    )

    def _predict_row(model_name_v, sgd_loss_v, sgd_alpha_v, sgd_lr_v,
                     rf_n_v, rf_depth_v, svm_c_v, row_idx):
        """Fit the selected model on the full dataset and predict one row.

        Refactored from a nested-lambda expression; behavior is unchanged,
        except that the StandardScaler is now fitted only once.
        """
        df, ycol = load_builtin_dataset()
        idx = int(row_idx)
        X = df.drop(columns=[ycol]).values
        scaler = StandardScaler().fit(X)
        Xs = scaler.transform(X)
        y = df[ycol]

        clf = get_model(model_name_v, dict(
            sgd_loss=sgd_loss_v,
            sgd_alpha=float(sgd_alpha_v),
            sgd_lr=sgd_lr_v,
            rf_n=int(rf_n_v),
            rf_depth=None if int(rf_depth_v) == 0 else int(rf_depth_v),
            svm_c=float(svm_c_v),
        ))
        # The streaming SGD (max_iter=1) is not meant for one-shot fitting;
        # substitute a converged LogisticRegression for single predictions.
        if isinstance(clf, SGDClassifier):
            clf = LogisticRegression(max_iter=300)
        clf.fit(Xs, y.values)

        x_row = Xs[idx].reshape(1, -1)
        pred = clf.predict(x_row)[0]
        proba = (clf.predict_proba(x_row)[0].max()
                 if hasattr(clf, "predict_proba") else None)
        pretty = json.dumps(df.iloc[[idx]].to_dict(orient="records")[0],
                            ensure_ascii=False, indent=2)
        return (
            f"### Gekozen patiënt (rij {idx})\n```json\n{pretty}\n```\n"
            f"**Voorspelling:** {pred} \n"
            + (f"**Zekerheid (max. klasse-prob):** {proba:.3f}" if proba is not None else "")
        )

    btn_predict.click(
        _predict_row,
        inputs=[model_choice, sgd_loss, sgd_alpha, sgd_lr,
                rf_n, rf_depth, svm_c, row_index],
        outputs=[pred_md],
    )

if __name__ == "__main__":
    demo.launch()