Spaces:

Marcel0123
/

supervised-training-Machine-learning-GGZ-Depressie

Sleeping

App Files Files Community

Marcel0123 committed on Sep 22, 2025

Commit

2766592

verified ·

1 Parent(s): fa7883a

Upload 3 files

Browse files

Files changed (2) hide show

app.py +172 -226
ggz_depressie_synth_1000_modeling.csv +0 -0

app.py CHANGED Viewed

@@ -16,80 +16,67 @@ from sklearn.model_selection import train_test_split
 DESCRIPTION = """
 # Interactieve Scatter (2D/3D) — Gradio + Plotly + Live Train
-- **Upload** een CSV/TSV/Parquet (of gebruik de demo data)
 - **Kies** x, y (en z) kolommen voor de scatter
 - **Kleur** op cluster/categorie of continue variabele
 - **Hover** toont gekozen kenmerken
-- **Train (live)**: KMeans clustering of Logistic Regression classificatie
-- **Auto-train bij start**: model traint automatisch wanneer de Space start
 """
 MODEL_PATH = Path("model.joblib")
 # -----------------------------
-# Demo dataset
 # -----------------------------
-def make_demo_df(n=400, seed=7):
-    rng = np.random.default_rng(seed)
-    centers = np.array([
-        [0, 0, 0],
-        [5, 5, 2],
-        [-4, 3, -3],
-    ])
-    labels = rng.integers(0, len(centers), size=n)
-    points = centers[labels] + rng.normal(0, 1.1, size=(n, 3))
-    df = pd.DataFrame(points, columns=["x", "y", "z"])
     df["cluster"] = pd.Categorical(["A" if l == 0 else ("B" if l == 1 else "C") for l in labels])
-    df["age"] = rng.integers(20, 90, size=n)
-    df["sex"] = pd.Categorical(rng.choice(["F", "M"], size=n))
-    df["diagnosis"] = pd.Categorical(rng.choice(["Type I", "Type II", "Control"], size=n, p=[0.35, 0.35, 0.30]))
-    df["patient_id"] = [f"P{1000+i}" for i in range(n)]
     return df
-DEMO_DF = make_demo_df()
 # -----------------------------
-# Helpers
 # -----------------------------
-def parse_file(file_obj):
-    if file_obj is None:
-        return DEMO_DF.copy(), "Demo dataset geladen (geen upload)."
-    name = getattr(file_obj, "name", str(file_obj))
-    path = name
-    if name.lower().endswith(".csv"):
-        df = pd.read_csv(path)
-    elif name.lower().endswith(".tsv"):
-        df = pd.read_csv(path, sep="\t")
-    elif name.lower().endswith(".parquet"):
-        df = pd.read_parquet(path)
-    else:
-        df = pd.read_csv(path)
-    return df, f"Bestand geladen: {Path(name).name} — {df.shape[0]} rijen, {df.shape[1]} kolommen."
-def detect_columns(df):
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    all_cols = df.columns.tolist()
-    x_default = next((c for c in ["x", "X", "dim1", "pc1", "tsne1", "umap1"] if c in df.columns),
-                     (numeric_cols[0] if numeric_cols else None))
-    y_default = next((c for c in ["y", "Y", "dim2", "pc2", "tsne2", "umap2"] if c in df.columns and c != x_default),
-                     (numeric_cols[1] if len(numeric_cols) > 1 else None))
-    z_default = next((c for c in ["z", "Z", "dim3", "pc3", "tsne3", "umap3"] if c in df.columns and c not in {x_default, y_default}),
-                     (numeric_cols[2] if len(numeric_cols) > 2 else None))
-    cat_candidates = [c for c in df.columns if (df[c].dtype == 'object' or str(df[c].dtype).startswith('category'))
-                      and c not in {x_default, y_default, z_default}]
-    color_default = next((c for c in ["cluster", "label", "group", "diagnosis", "category"] if c in df.columns),
-                         (cat_candidates[0] if cat_candidates else (numeric_cols[0] if numeric_cols else None)))
-    return numeric_cols, all_cols, x_default, y_default, z_default, color_default
 def build_hovertemplate(hover_cols):
     if not hover_cols:
         return "%{x}, %{y}<extra></extra>"
     lines = []
     for i, col in enumerate(hover_cols):
         lines.append(f"<b>{col}</b>: %{{customdata[{i}]}}")
@@ -108,30 +95,22 @@ def make_figure(df, x_col, y_col, z_col, color_col, hover_cols, mode_3d, point_s
         fig = go.Figure()
         if color_series is not None and (color_series.dtype == 'object' or str(color_series.dtype).startswith('category')):
             for cat_val, dsub in df.groupby(color_col):
-                fig.add_trace(
-                    go.Scattergl(
-                        x=dsub[x_col], y=dsub[y_col], mode='markers', name=str(cat_val),
-                        marker=dict(size=point_size), opacity=opacity,
-                        customdata=(dsub[hover_cols].to_numpy() if hover_cols else None),
-                        hovertemplate=hovertemplate,
-                    )
-                )
         else:
-            fig.add_trace(
-                go.Scattergl(
-                    x=df[x_col], y=df[y_col], mode='markers', name=color_col if color_col else "data",
-                    marker=dict(size=point_size, color=(color_series if color_series is not None else None), coloraxis='coloraxis'),
-                    opacity=opacity, customdata=customdata, hovertemplate=hovertemplate,
-                )
-            )
             fig.update_layout(coloraxis=dict(colorbar=dict(title=color_col)))
-        fig.update_layout(
-            template="plotly_white",
-            margin=dict(l=10, r=10, t=30, b=10),
-            legend=dict(itemsizing='trace', title=color_col if color_col else None),
-            xaxis_title=x_col, yaxis_title=y_col,
-            dragmode='pan',
-        )
         return fig
     # 3D
@@ -140,66 +119,63 @@ def make_figure(df, x_col, y_col, z_col, color_col, hover_cols, mode_3d, point_s
     if color_series is not None and (color_series.dtype == 'object' or str(color_series.dtype).startswith('category')):
         fig = go.Figure()
         for cat_val, dsub in df.groupby(color_col):
-            fig.add_trace(
-                go.Scatter3d(
-                    x=dsub[x_col], y=dsub[y_col], z=dsub[z_col], mode='markers', name=str(cat_val),
-                    marker=dict(size=point_size), opacity=opacity,
-                    customdata=(dsub[hover_cols].to_numpy() if hover_cols else None),
-                    hovertemplate=hovertemplate,
-                )
-            )
     else:
-        fig = go.Figure(
-            data=[
-                go.Scatter3d(
-                    x=df[x_col], y=df[y_col], z=df[z_col], mode='markers', name=color_col if color_col else "data",
-                    marker=dict(size=point_size, color=(color_series if color_series is not None else None), coloraxis='coloraxis'),
-                    opacity=opacity, customdata=customdata, hovertemplate=hovertemplate,
-                )
-            ]
-        )
         fig.update_layout(coloraxis=dict(colorbar=dict(title=color_col)))
-    fig.update_layout(
-        template="plotly_white",
-        margin=dict(l=10, r=10, t=30, b=10),
-        legend=dict(itemsizing='trace', title=color_col if color_col else None),
-        scene=dict(xaxis_title=x_col, yaxis_title=y_col, zaxis_title=z_col),
-    )
     return fig
 def _plot_confusion_matrix(y_true, y_pred, title="Confusion matrix"):
     labels = sorted(pd.Series(y_true).unique())
     cm = confusion_matrix(y_true, y_pred, labels=labels)
-    fig = go.Figure(
-        data=go.Heatmap(
-            z=cm, x=labels, y=labels, text=cm, texttemplate="%{text}",
-            hovertemplate="Pred=%{x}<br>True=%{y}<br>Count=%{z}<extra></extra>"
-        )
-    )
-    fig.update_layout(title=title, xaxis_title="Predicted", yaxis_title="True",
-                      template="plotly_white", margin=dict(l=10,r=10,t=40,b=10))
     return fig
 def _fmt_metrics(metrics: dict) -> str:
     lines = []
     for k,v in metrics.items():
-        if isinstance(v, float):
-            lines.append(f"{k}: {v:.4f}")
-        else:
-            lines.append(f"{k}: {v}")
     return "\n".join(lines)
-# -----------------------------
-# Training
-# -----------------------------
 def train_live(df, task, feature_cols, label_col, k_clusters, seed):
-    if df is None or len(df) == 0:
         raise gr.Error("Geen data om te trainen.")
     if not feature_cols:
         raise gr.Error("Selecteer minimaal één featurekolom.")
     X = df[feature_cols].select_dtypes(include=[np.number])
-    if X.shape[1] == 0:
         raise gr.Error("De gekozen features bevatten geen numerieke kolommen.")
     log_lines = []
@@ -207,68 +183,41 @@ def train_live(df, task, feature_cols, label_col, k_clusters, seed):
     eval_fig = None
     if task == "Clustering (KMeans)":
-        pipe = Pipeline([
-            ("scaler", StandardScaler()),
-            ("kmeans", KMeans(n_clusters=k_clusters, random_state=seed, n_init="auto")),
-        ])
         pipe.fit(X)
         labels = pipe["kmeans"].labels_
-        alpha = [chr(ord('A') + (i % 26)) for i in labels]
-        df["cluster_model"] = pd.Categorical(alpha)
         color_col_suggestion = "cluster_model"
         dump(pipe, MODEL_PATH)
-        log_lines.append(f"✅ KMeans getraind met k={k_clusters} op {X.shape[0]} rijen en {X.shape[1]} features.")
-        log_lines.append(f"Model opgeslagen: {MODEL_PATH.resolve()}")
     elif task == "Classificatie (Logistic Regression)":
         if not label_col:
             raise gr.Error("Kies een labelkolom voor classificatie.")
         y = df[label_col].astype(str)
-        # Stratified split
-        Xtr, Xva, ytr, yva = train_test_split(
-            X, y, test_size=0.2, random_state=seed, stratify=y if y.nunique()>1 else None
-        )
-        pipe = Pipeline([
-            ("scaler", StandardScaler()),
-            ("logreg", LogisticRegression(max_iter=1000, random_state=seed, class_weight="balanced"))
-        ])
         pipe.fit(Xtr, ytr)
-        # Validatie
         yhat = pipe.predict(Xva)
-        metrics = {
-            "accuracy": accuracy_score(yva, yhat),
-            "f1_weighted": f1_score(yva, yhat, average="weighted"),
-        }
-        if y.nunique() == 2:
             try:
-                proba = pipe.predict_proba(Xva)[:, 1]
                 uniq = list(pd.Series(y).unique())
                 mapping = {uniq[0]:0, uniq[1]:1}
                 metrics["roc_auc"] = roc_auc_score(yva.map(mapping), proba)
             except Exception:
                 pass
         eval_fig = _plot_confusion_matrix(yva, yhat, title="Confusion matrix (validatie)")
-        # Train op alle data en voorspel voor visualisatie
         pipe.fit(X, y)
         preds = pipe.predict(X)
         df["pred_model"] = pd.Categorical(preds)
-        if y.nunique() == 2:
-            try:
-                df["pred_proba"] = pipe.predict_proba(X)[:, 1]
-            except Exception:
-                pass
         color_col_suggestion = "pred_model"
         dump(pipe, MODEL_PATH)
-        log_lines.append(f"✅ LogisticRegression getraind (split 80/20).")
-        log_lines.append(f"Model opgeslagen: {MODEL_PATH.resolve()}")
-        log_lines.append("Metrics (validatie):\n" + _fmt_metrics(metrics))
     else:
         raise gr.Error("Onbekende taak.")
@@ -286,29 +235,49 @@ def try_load_model():
 # -----------------------------
 # Gradio callbacks
 # -----------------------------
 def init_from_file(file_obj):
-    df, status = parse_file(file_obj)
     numeric_cols, all_cols, x_d, y_d, z_d, color_d = detect_columns(df)
-    hover_default = [c for c in ["patient_id", "age", "sex", "diagnosis", "cluster"] if c in df.columns]
-    feat_default = [c for c in numeric_cols]
     return (
         gr.update(choices=all_cols, value=x_d),
         gr.update(choices=all_cols, value=y_d),
         gr.update(choices=all_cols, value=z_d),
-        gr.update(choices=all_cols, value=color_d),
         gr.update(choices=all_cols, value=hover_default),
         status,
         df,
-        gr.update(choices=numeric_cols, value=feat_default),
-        gr.update(choices=all_cols, value=None),
     )
 def update_plot(df, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
-    if df is None or (isinstance(df, (list, tuple)) and len(df) == 0):
-        df = DEMO_DF.copy()
     mode_3d = (mode_dim == "3D")
-    fig = make_figure(df, x_col, y_col, z_col, color_col, hover_cols, mode_3d, size, opacity)
-    return fig
 def on_train_click(df, task, feature_cols, label_col, k_clusters, seed, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
     df2, log_text, color_suggestion, eval_fig = train_live(df.copy(), task, feature_cols, label_col, k_clusters, seed)
@@ -317,8 +286,11 @@ def on_train_click(df, task, feature_cols, label_col, k_clusters, seed, x_col, y
     return df2, log_text, gr.update(value=new_color, choices=df2.columns.tolist()), fig, eval_fig
 def startup_auto_train(df, task_default, feature_cols, label_col, k_clusters, seed, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
     try:
-        df2, log_text, color_suggestion, eval_fig = train_live(df.copy(), task_default, feature_cols, label_col, k_clusters, seed)
         new_color = color_suggestion if color_suggestion else color_col
         fig = update_plot(df2, x_col, y_col, z_col, new_color, hover_cols, mode_dim, size, opacity)
         return df2, log_text, gr.update(value=new_color, choices=df2.columns.tolist()), fig, eval_fig
@@ -334,87 +306,61 @@ with gr.Blocks(css=".gradio-container {max-width: 1200px !important}") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             data_file = gr.File(label="Upload CSV/TSV/Parquet", file_count="single", type="filepath")
-            status_box = gr.Markdown("Gebruik de demo data of upload je eigen bestand.")
             with gr.Accordion("Assen & kleur", open=True):
-                x_dd = gr.Dropdown(choices=DEMO_DF.columns.tolist(), value="x", label="X kolom")
-                y_dd = gr.Dropdown(choices=DEMO_DF.columns.tolist(), value="y", label="Y kolom")
-                z_dd = gr.Dropdown(choices=DEMO_DF.columns.tolist(), value="z", label="Z kolom (voor 3D)")
-                color_dd = gr.Dropdown(choices=DEMO_DF.columns.tolist(), value="cluster", label="Kleur op kolom")
-                hover_ms = gr.Dropdown(
-                    choices=DEMO_DF.columns.tolist(),
-                    value=["patient_id", "age", "sex", "diagnosis", "cluster"],
-                    multiselect=True,
-                    label="Hover info kolommen"
-                )
             with gr.Accordion("Weergave", open=True):
-                mode_dim = gr.Radio(["2D", "3D"], value="2D", label="Dimensie")
                 size_slider = gr.Slider(3, 18, value=8, step=1, label="Puntgrootte")
                 opacity_slider = gr.Slider(0.1, 1.0, value=0.8, step=0.05, label="Transparantie (opacity)")
             with gr.Accordion("Training (live)", open=True):
-                task_radio = gr.Radio(
-                    ["Clustering (KMeans)", "Classificatie (Logistic Regression)"],
-                    value="Clustering (KMeans)",
-                    label="Taak"
-                )
-                feat_ms = gr.Dropdown(choices=DEMO_DF.select_dtypes(include=[np.number]).columns.tolist(),
-                                      value=["x", "y", "z", "age"],
-                                      multiselect=True,
-                                      label="Feature kolommen (numeriek)")
-                label_dd = gr.Dropdown(choices=DEMO_DF.columns.tolist(), value=None, label="Label kolom (alleen voor classificatie)")
                 k_slider = gr.Slider(2, 12, value=3, step=1, label="K (clusters) — KMeans")
                 seed_slider = gr.Slider(0, 10_000, value=7, step=1, label="Random seed")
                 train_btn = gr.Button("🚀 Train (live)")
                 train_log = gr.Textbox(label="Train log", lines=6, interactive=False)
-            hidden_df = gr.State(DEMO_DF.copy())
         with gr.Column(scale=2):
-            # Zelfde visualisatie (bolletjes) en layout behouden
             plot = gr.Plot(label="Scatterplot")
             with gr.Accordion("Evaluatie (validatie)", open=False):
                 cm_plot = gr.Plot(label="Confusion Matrix (validatie)")
     # ===== Events =====
-    data_file.change(
-        fn=init_from_file,
-        inputs=[data_file],
-        outputs=[x_dd, y_dd, z_dd, color_dd, hover_ms, status_box, hidden_df, feat_ms, label_dd],
-        show_progress=False,
-    )
     for comp in [x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider]:
-        comp.change(
-            fn=update_plot,
-            inputs=[hidden_df, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
-            outputs=plot,
-            show_progress=False,
-        )
-    train_btn.click(
-        fn=on_train_click,
-        inputs=[hidden_df, task_radio, feat_ms, label_dd, k_slider, seed_slider,
-                x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
-        outputs=[hidden_df, train_log, color_dd, plot, cm_plot],
-        show_progress=True,
-    )
-    demo.load(
-        fn=update_plot,
-        inputs=[hidden_df, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
-        outputs=plot,
-        show_progress=False,
-    )
-    demo.load(
-        fn=startup_auto_train,
-        inputs=[hidden_df, task_radio, feat_ms, label_dd, k_slider, seed_slider,
-                x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
-        outputs=[hidden_df, train_log, color_dd, plot, cm_plot],
-        show_progress=True,
-    )
 if __name__ == "__main__":
     demo.launch()

 DESCRIPTION = """
 # Interactieve Scatter (2D/3D) — Gradio + Plotly + Live Train
+- **Bundled dataset**: ggz_depressie_synth_1000_modeling.csv (laadt automatisch)
+- **Auto-train (supervised)** bij start met label: `target_respons50`
+- **Upload** een eigen CSV/TSV/Parquet indien gewenst
 - **Kies** x, y (en z) kolommen voor de scatter
 - **Kleur** op cluster/categorie of continue variabele
 - **Hover** toont gekozen kenmerken
+- **Train (live)**: KMeans of Logistic Regression
 """
 MODEL_PATH = Path("model.joblib")
# Bundled dataset. The preferred location is data/<file>, but a copy that was
# uploaded to the repository root is accepted as well.
_DATA_FILENAME = "ggz_depressie_synth_1000_modeling.csv"
_DATA_CANDIDATES = (Path("data") / _DATA_FILENAME, Path(_DATA_FILENAME))
# When no candidate exists, keep the data/ path so that DATA_PATH.exists()
# checks elsewhere keep reporting that no bundled dataset is available.
DATA_PATH = next((p for p in _DATA_CANDIDATES if p.exists()), _DATA_CANDIDATES[0])
# -----------------------------
# Data loading
# -----------------------------
def _make_demo_df(n=400, seed=7):
    """Build the synthetic three-cluster demo frame used as a fallback."""
    rng = np.random.default_rng(seed)
    centers = np.array([[0, 0, 0], [5, 5, 2], [-4, 3, -3]])
    # The order of the rng calls is part of the result: keep it stable so the
    # demo data stays reproducible for a given seed.
    labels = rng.integers(0, len(centers), size=n)
    points = centers[labels] + rng.normal(0, 1.1, size=(n, 3))
    df = pd.DataFrame(points, columns=["x", "y", "z"])
    df["cluster"] = pd.Categorical(["A" if l == 0 else ("B" if l == 1 else "C") for l in labels])
    df["age"] = rng.integers(20, 90, size=n)
    df["sex"] = pd.Categorical(rng.choice(["F", "M"], size=n))
    df["diagnosis"] = pd.Categorical(
        rng.choice(["Type I", "Type II", "Control"], size=n, p=[0.35, 0.35, 0.30])
    )
    df["patient_id"] = [f"P{1000 + i}" for i in range(n)]
    return df


def load_default_df():
    """Return the bundled dataset, or a synthetic demo frame as a fallback.

    The CSV at ``DATA_PATH`` is read when it exists. If the file is missing or
    cannot be read/parsed, the problem is printed and a deterministic
    400-row demo frame is returned instead, so the app can always start.

    Returns:
        pandas.DataFrame: the bundled data or the demo data.
    """
    if DATA_PATH.exists():
        try:
            return pd.read_csv(DATA_PATH)
        except (OSError, ValueError) as e:
            # pandas parser/empty-data errors and decode errors are all
            # ValueError subclasses; best effort: report and fall back.
            print("Kon bundled dataset niet laden:", e)
    return _make_demo_df()


BASE_DF = load_default_df()
# Heuristics for label/features
_PREFERRED_TARGETS = (
    "target_respons50",
    "target_remissie",
    "target_uitval",
    "target_opname6mnd",
    "target_rtw3mnd",
)
_GENERIC_LABEL_NAMES = frozenset({"label", "target", "y", "class", "diagnosis", "outcome"})
# A float column with more distinct values than this is treated as a
# continuous measurement rather than a class label.
_MAX_FLOAT_LABEL_CLASSES = 20


def _looks_continuous(series):
    """Return True for float columns with too many distinct values to be a label."""
    return bool(
        pd.api.types.is_float_dtype(series)
        and series.nunique() > _MAX_FLOAT_LABEL_CLASSES
    )


def pick_default_label(df):
    """Pick a sensible default label column for classification.

    Preference order:
      1. the known target columns, ``target_respons50`` first;
      2. the first column (in frame order) with a generic label-like name,
         skipping continuous float columns -- e.g. a scatter coordinate that
         merely happens to be called ``y``;
      3. the first column whose name starts with ``target_``.

    Args:
        df: pandas.DataFrame to inspect.

    Returns:
        The chosen column name, or ``None`` when nothing matches.
    """
    for name in _PREFERRED_TARGETS:
        if name in df.columns:
            return name
    for col, series in df.items():
        if str(col).lower() in _GENERIC_LABEL_NAMES and not _looks_continuous(series):
            return col
    for col in df.columns:
        if str(col).startswith("target_"):
            return col
    return None
def default_features(df):
    """Return the numeric columns suitable as default model features.

    ``patient_id`` and every column whose name starts with ``target_`` are
    left out; the remaining numeric columns keep their frame order.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    blocked = {"patient_id"}
    blocked.update(col for col in df.columns if str(col).startswith("target_"))
    return [col for col in numeric if col not in blocked]
 # -----------------------------
+# Plot utils (identiek aan eerdere bolletjesvis)
 # -----------------------------
 def build_hovertemplate(hover_cols):
     if not hover_cols:
         return "%{x}, %{y}<extra></extra>"
     lines = []
     for i, col in enumerate(hover_cols):
         lines.append(f"<b>{col}</b>: %{{customdata[{i}]}}")
         fig = go.Figure()
         if color_series is not None and (color_series.dtype == 'object' or str(color_series.dtype).startswith('category')):
             for cat_val, dsub in df.groupby(color_col):
+                fig.add_trace(go.Scattergl(
+                    x=dsub[x_col], y=dsub[y_col], mode='markers', name=str(cat_val),
+                    marker=dict(size=point_size), opacity=opacity,
+                    customdata=(dsub[hover_cols].to_numpy() if hover_cols else None),
+                    hovertemplate=hovertemplate,
+                ))
         else:
+            fig.add_trace(go.Scattergl(
+                x=df[x_col], y=df[y_col], mode='markers', name=color_col if color_col else "data",
+                marker=dict(size=point_size, color=(color_series if color_series is not None else None), coloraxis='coloraxis'),
+                opacity=opacity, customdata=customdata, hovertemplate=hovertemplate,
+            ))
             fig.update_layout(coloraxis=dict(colorbar=dict(title=color_col)))
+        fig.update_layout(template="plotly_white", margin=dict(l=10,r=10,t=30,b=10),
+                          legend=dict(itemsizing='trace', title=color_col if color_col else None),
+                          xaxis_title=x_col, yaxis_title=y_col, dragmode='pan')
         return fig
     # 3D
     if color_series is not None and (color_series.dtype == 'object' or str(color_series.dtype).startswith('category')):
         fig = go.Figure()
         for cat_val, dsub in df.groupby(color_col):
+            fig.add_trace(go.Scatter3d(
+                x=dsub[x_col], y=dsub[y_col], z=dsub[z_col], mode='markers', name=str(cat_val),
+                marker=dict(size=point_size), opacity=opacity,
+                customdata=(dsub[hover_cols].to_numpy() if hover_cols else None),
+                hovertemplate=hovertemplate,
+            ))
     else:
+        fig = go.Figure(data=[go.Scatter3d(
+            x=df[x_col], y=df[y_col], z=df[z_col], mode='markers', name=color_col if color_col else "data",
+            marker=dict(size=point_size, color=(color_series if color_series is not None else None), coloraxis='coloraxis'),
+            opacity=opacity, customdata=customdata, hovertemplate=hovertemplate,
+        )])
         fig.update_layout(coloraxis=dict(colorbar=dict(title=color_col)))
+    fig.update_layout(template="plotly_white", margin=dict(l=10,r=10,t=30,b=10),
+                      legend=dict(itemsizing='trace', title=color_col if color_col else None),
+                      scene=dict(xaxis_title=x_col, yaxis_title=y_col, zaxis_title=z_col))
     return fig
+# -----------------------------
+# App state & defaults
+# -----------------------------
def detect_columns(df):
    """Derive column lists and default axis/color selections for a frame.

    Returns a 6-tuple: ``(numeric_cols, all_cols, x_default, y_default,
    z_default, color_default)``. Axis defaults prefer common embedding column
    names and otherwise fall back to the 1st/2nd/3rd numeric column; a
    default is ``None`` when nothing is available.
    """
    all_cols = df.columns.tolist()
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def nth_numeric(position):
        # Positional fallback into the numeric columns, None when too short.
        return numeric_cols[position] if len(numeric_cols) > position else None

    def first_present(names, taken, fallback):
        # First wanted name that exists in the frame and is not already used.
        for name in names:
            if name in df.columns and name not in taken:
                return name
        return fallback

    # try common embedding columns
    x_default = first_present(
        ["x", "X", "dim1", "pc1", "tsne1", "umap1"], set(), nth_numeric(0)
    )
    y_default = first_present(
        ["y", "Y", "dim2", "pc2", "tsne2", "umap2"], {x_default}, nth_numeric(1)
    )
    z_default = first_present(
        ["z", "Z", "dim3", "pc3", "tsne3", "umap3"], {x_default, y_default}, nth_numeric(2)
    )

    # color default: a well-known name, else the first categorical-looking
    # column that is not an axis, else the first numeric column
    axes = {x_default, y_default, z_default}
    cat_candidates = []
    for col in df.columns:
        is_categorical = (
            df[col].dtype == 'object' or str(df[col].dtype).startswith('category')
        )
        if is_categorical and col not in axes:
            cat_candidates.append(col)
    color_fallback = cat_candidates[0] if cat_candidates else nth_numeric(0)
    color_default = first_present(
        ["cluster", "label", "group", "diagnosis", "category", "pred_model"],
        set(),
        color_fallback,
    )
    return numeric_cols, all_cols, x_default, y_default, z_default, color_default
+# -----------------------------
+# Training
+# -----------------------------
 def _plot_confusion_matrix(y_true, y_pred, title="Confusion matrix"):
     labels = sorted(pd.Series(y_true).unique())
     cm = confusion_matrix(y_true, y_pred, labels=labels)
+    fig = go.Figure(data=go.Heatmap(z=cm, x=labels, y=labels, text=cm, texttemplate="%{text}",
+                                    hovertemplate="Pred=%{x}<br>True=%{y}<br>Count=%{z}<extra></extra>"))
+    fig.update_layout(title=title, xaxis_title="Predicted", yaxis_title="True", template="plotly_white", margin=dict(l=10,r=10,t=40,b=10))
     return fig
 def _fmt_metrics(metrics: dict) -> str:
     lines = []
     for k,v in metrics.items():
+        lines.append(f"{k}: {v:.4f}" if isinstance(v,float) else f"{k}: {v}")
     return "\n".join(lines)
 def train_live(df, task, feature_cols, label_col, k_clusters, seed):
+    if df is None or len(df)==0:
         raise gr.Error("Geen data om te trainen.")
     if not feature_cols:
         raise gr.Error("Selecteer minimaal één featurekolom.")
     X = df[feature_cols].select_dtypes(include=[np.number])
+    if X.shape[1]==0:
         raise gr.Error("De gekozen features bevatten geen numerieke kolommen.")
     log_lines = []
     eval_fig = None
     if task == "Clustering (KMeans)":
+        pipe = Pipeline([("scaler", StandardScaler()), ("kmeans", KMeans(n_clusters=k_clusters, random_state=seed, n_init="auto"))])
         pipe.fit(X)
         labels = pipe["kmeans"].labels_
+        df["cluster_model"] = pd.Categorical([chr(ord('A') + (i % 26)) for i in labels])
         color_col_suggestion = "cluster_model"
         dump(pipe, MODEL_PATH)
+        log_lines += [f"✅ KMeans getraind met k={k_clusters} op {X.shape[0]} rijen.", f"Model opgeslagen: {MODEL_PATH.resolve()}"]
     elif task == "Classificatie (Logistic Regression)":
         if not label_col:
             raise gr.Error("Kies een labelkolom voor classificatie.")
         y = df[label_col].astype(str)
+        Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y if y.nunique()>1 else None)
+        pipe = Pipeline([("scaler", StandardScaler()), ("logreg", LogisticRegression(max_iter=1000, random_state=seed, class_weight="balanced"))])
         pipe.fit(Xtr, ytr)
         yhat = pipe.predict(Xva)
+        metrics = {"accuracy": accuracy_score(yva, yhat), "f1_weighted": f1_score(yva, yhat, average="weighted")}
+        if y.nunique()==2:
             try:
+                proba = pipe.predict_proba(Xva)[:,1]
                 uniq = list(pd.Series(y).unique())
                 mapping = {uniq[0]:0, uniq[1]:1}
                 metrics["roc_auc"] = roc_auc_score(yva.map(mapping), proba)
             except Exception:
                 pass
         eval_fig = _plot_confusion_matrix(yva, yhat, title="Confusion matrix (validatie)")
         pipe.fit(X, y)
         preds = pipe.predict(X)
         df["pred_model"] = pd.Categorical(preds)
+        if y.nunique()==2:
+            try: df["pred_proba"] = pipe.predict_proba(X)[:,1]
+            except Exception: pass
         color_col_suggestion = "pred_model"
         dump(pipe, MODEL_PATH)
+        log_lines += ["✅ LogisticRegression getraind (split 80/20).", f"Model opgeslagen: {MODEL_PATH.resolve()}", "Metrics (validatie):\n"+_fmt_metrics(metrics)]
     else:
         raise gr.Error("Onbekende taak.")
 # -----------------------------
 # Gradio callbacks
 # -----------------------------
def parse_file(file_obj):
    """Load an uploaded file into a DataFrame and describe what was loaded.

    Without an upload, a copy of the default frame is returned. Otherwise
    the reader is chosen by file extension (.csv, .tsv, .parquet); any other
    extension is read as CSV.

    Returns:
        (DataFrame, status message) tuple.
    """
    if file_obj is None:
        if DATA_PATH.exists():
            return BASE_DF.copy(), "Bundled dataset geladen."
        return BASE_DF.copy(), "Demo dataset geladen."

    # Uploads arrive either as an object with a .name or as a plain path.
    name = getattr(file_obj, "name", str(file_obj))
    readers = (
        (".csv", lambda target: pd.read_csv(target)),
        (".tsv", lambda target: pd.read_csv(target, sep="\t")),
        (".parquet", lambda target: pd.read_parquet(target)),
    )
    lowered = name.lower()
    for suffix, reader in readers:
        if lowered.endswith(suffix):
            df = reader(name)
            break
    else:
        # Unknown extension: treat it as CSV.
        df = pd.read_csv(name)

    rows, cols = df.shape[0], df.shape[1]
    return df, f"Bestand geladen: {Path(name).name} — {rows} rijen, {cols} kolommen."
 def init_from_file(file_obj):
+    if file_obj is None:
+        df = BASE_DF.copy()
+        status = "Bundled dataset geladen." if DATA_PATH.exists() else "Demo dataset geladen."
+    else:
+        df, status = parse_file(file_obj)
     numeric_cols, all_cols, x_d, y_d, z_d, color_d = detect_columns(df)
+    hover_default = [c for c in ["patient_id","age","sex","diagnosis","cluster"] if c in df.columns]
+    feat_default = default_features(df)
+    label_default = pick_default_label(df)
     return (
         gr.update(choices=all_cols, value=x_d),
         gr.update(choices=all_cols, value=y_d),
         gr.update(choices=all_cols, value=z_d),
+        gr.update(choices=all_cols, value="pred_model" if "pred_model" in df.columns else color_d),
         gr.update(choices=all_cols, value=hover_default),
         status,
         df,
+        gr.update(choices=df.select_dtypes(include=[np.number]).columns.tolist(), value=feat_default),
+        gr.update(choices=all_cols, value=label_default),
     )
 def update_plot(df, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
+    if df is None or (isinstance(df,(list,tuple)) and len(df)==0):
+        df = BASE_DF.copy()
     mode_3d = (mode_dim == "3D")
+    return make_figure(df, x_col, y_col, z_col, color_col, hover_cols, mode_3d, size, opacity)
 def on_train_click(df, task, feature_cols, label_col, k_clusters, seed, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
     df2, log_text, color_suggestion, eval_fig = train_live(df.copy(), task, feature_cols, label_col, k_clusters, seed)
     return df2, log_text, gr.update(value=new_color, choices=df2.columns.tolist()), fig, eval_fig
 def startup_auto_train(df, task_default, feature_cols, label_col, k_clusters, seed, x_col, y_col, z_col, color_col, hover_cols, mode_dim, size, opacity):
+    # Forceer supervised classificatie bij start
     try:
+        chosen_label = label_col or pick_default_label(df)
+        chosen_feats = feature_cols or default_features(df)
+        df2, log_text, color_suggestion, eval_fig = train_live(df.copy(), "Classificatie (Logistic Regression)", chosen_feats, chosen_label, k_clusters, seed)
         new_color = color_suggestion if color_suggestion else color_col
         fig = update_plot(df2, x_col, y_col, z_col, new_color, hover_cols, mode_dim, size, opacity)
         return df2, log_text, gr.update(value=new_color, choices=df2.columns.tolist()), fig, eval_fig
     with gr.Row():
         with gr.Column(scale=1):
             data_file = gr.File(label="Upload CSV/TSV/Parquet", file_count="single", type="filepath")
+            status_box = gr.Markdown("Bundled dataset wordt standaard geladen en getraind bij start.")
             with gr.Accordion("Assen & kleur", open=True):
+                base_numeric = BASE_DF.select_dtypes(include=[np.number]).columns.tolist()
+                x_default = base_numeric[0] if len(base_numeric)>0 else None
+                y_default = base_numeric[1] if len(base_numeric)>1 else None
+                z_default = base_numeric[2] if len(base_numeric)>2 else None
+                x_dd = gr.Dropdown(choices=BASE_DF.columns.tolist(), value=x_default, label="X kolom")
+                y_dd = gr.Dropdown(choices=BASE_DF.columns.tolist(), value=y_default, label="Y kolom")
+                z_dd = gr.Dropdown(choices=BASE_DF.columns.tolist(), value=z_default, label="Z kolom (voor 3D)")
+                color_dd = gr.Dropdown(choices=BASE_DF.columns.tolist(), value="pred_model" if "pred_model" in BASE_DF.columns else None, label="Kleur op kolom")
+                hover_ms = gr.Dropdown(choices=BASE_DF.columns.tolist(), value=[c for c in ["patient_id","age","sex","diagnosis","cluster"] if c in BASE_DF.columns], multiselect=True, label="Hover info kolommen")
             with gr.Accordion("Weergave", open=True):
+                mode_dim = gr.Radio(["2D","3D"], value="2D", label="Dimensie")
                 size_slider = gr.Slider(3, 18, value=8, step=1, label="Puntgrootte")
                 opacity_slider = gr.Slider(0.1, 1.0, value=0.8, step=0.05, label="Transparantie (opacity)")
             with gr.Accordion("Training (live)", open=True):
+                task_radio = gr.Radio(["Clustering (KMeans)","Classificatie (Logistic Regression)"], value="Classificatie (Logistic Regression)", label="Taak")
+                feat_ms = gr.Dropdown(choices=BASE_DF.select_dtypes(include=[np.number]).columns.tolist(), value=default_features(BASE_DF), multiselect=True, label="Feature kolommen (numeriek)")
+                label_dd = gr.Dropdown(choices=BASE_DF.columns.tolist(), value=pick_default_label(BASE_DF), label="Label kolom (alleen voor classificatie)")
                 k_slider = gr.Slider(2, 12, value=3, step=1, label="K (clusters) — KMeans")
                 seed_slider = gr.Slider(0, 10_000, value=7, step=1, label="Random seed")
                 train_btn = gr.Button("🚀 Train (live)")
                 train_log = gr.Textbox(label="Train log", lines=6, interactive=False)
+            hidden_df = gr.State(BASE_DF.copy())
         with gr.Column(scale=2):
             plot = gr.Plot(label="Scatterplot")
             with gr.Accordion("Evaluatie (validatie)", open=False):
                 cm_plot = gr.Plot(label="Confusion Matrix (validatie)")
     # ===== Events =====
+    data_file.change(fn=init_from_file, inputs=[data_file],
+                     outputs=[x_dd, y_dd, z_dd, color_dd, hover_ms, status_box, hidden_df, feat_ms, label_dd],
+                     show_progress=False)
     for comp in [x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider]:
+        comp.change(fn=update_plot,
+                    inputs=[hidden_df, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
+                    outputs=plot, show_progress=False)
+    train_btn.click(fn=on_train_click,
+        inputs=[hidden_df, task_radio, feat_ms, label_dd, k_slider, seed_slider, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
+        outputs=[hidden_df, train_log, color_dd, plot, cm_plot], show_progress=True)
+    # Initial plot
+    demo.load(fn=update_plot, inputs=[hidden_df, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider], outputs=plot, show_progress=False)
+    # Auto-train bij start (forceer supervised classificatie met default label)
+    demo.load(fn=startup_auto_train,
+        inputs=[hidden_df, task_radio, feat_ms, label_dd, k_slider, seed_slider, x_dd, y_dd, z_dd, color_dd, hover_ms, mode_dim, size_slider, opacity_slider],
+        outputs=[hidden_df, train_log, color_dd, plot, cm_plot], show_progress=True)
 if __name__ == "__main__":
     demo.launch()

ggz_depressie_synth_1000_modeling.csv ADDED Viewed

The diff for this file is too large to render. See raw diff