Spaces:

Marcel0123
/

AGRESSIE-prediction

Sleeping

App Files Files Community

Marcel0123 committed on Oct 23, 2025

Commit

6a40b52

verified ·

1 Parent(s): 39e478b

Upload 3 files

Browse files

Files changed (3) hide show

app.py +225 -0
requirements.txt +6 -0
synthetische_ggz_agressie_dataset_1000.csv +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,225 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.manifold import TSNE
+import plotly.express as px
+DEFAULT_CSV = "synthetische_ggz_agressie_dataset_1000.csv"
+DESCRIPTION = \"\"\"
+# GGZ Agressie (synthetisch) — Auto-train + 2D visualisatie
+Deze Space **traint automatisch** bij het opstarten op een **synthetische Nederlandstalige GGZ-dataset** en toont
+een **2D "bolletjes" plot** (interactief) waarin iedere punt een patiëntvoorval representeert.
+- **Kleur**: op basis van het *werkelijke label* (0/1) of de *voorspelde kans*.
+- **Hover**: toont een korte snippet uit de *rapportage* en relevante features.
+- **Model**: TF‑IDF ➜ Logistic Regression (probabilistisch), standaard train/test split (stratified).
+> ⚠️ **Belangrijk**: dit is **synthetische data** en uitsluitend voor **educatieve doeleinden**.
+> Niet gebruiken voor klinische beslissingen.
+\"\"\"
+FOOTER = \"\"\"
+**Tips**
+- Upload een eigen CSV met minimaal kolommen `rapportage` en `agressie_volgende30d` om opnieuw te trainen.
+- Pas de *threshold* aan om de confusion matrix en metrics live te zien.
+- De 2D-plot gebruikt **TruncatedSVD (50D)** gevolgd door **t-SNE (2D)** op TF‑IDF features (sneller & expressief).
+\"\"\"
+def load_dataset(file_obj=None):
+    if file_obj is None:
+        df = pd.read_csv(DEFAULT_CSV)
+    else:
+        df = pd.read_csv(file_obj.name if hasattr(file_obj, "name") else file_obj)
+    # Basiseisen
+    req = {"rapportage", "agressie_volgende30d"}
+    missing = req - set(df.columns)
+    if missing:
+        raise ValueError(f"CSV mist kolommen: {missing}")
+    df = df.dropna(subset=["rapportage", "agressie_volgende30d"]).copy()
+    df["agressie_volgende30d"] = (df["agressie_volgende30d"].astype(int) > 0).astype(int)
+    return df
+def build_and_train(df, test_size=0.2, random_state=42, max_features=4000, ngram_max=2):
+    X = df["rapportage"].astype(str).values
+    y = df["agressie_volgende30d"].values
+    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
+        X, y, np.arange(len(X)), test_size=test_size, random_state=random_state, stratify=y
+    )
+    vect = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngram_max))
+    clf = LogisticRegression(max_iter=3000)
+    pipe = Pipeline([("tfidf", vect), ("clf", clf)])
+    pipe.fit(X_train, y_train)
+    # Probabilities
+    y_score = pipe.predict_proba(X_test)[:, 1]
+    auroc = float(roc_auc_score(y_test, y_score))
+    auprc = float(average_precision_score(y_test, y_score))
+    # for visualization: compute 2D embedding on ALL data to show full cloud
+    tfidf_all = pipe.named_steps["tfidf"].fit_transform(X)  # fit on all text for viz only
+    svd = TruncatedSVD(n_components=50, random_state=random_state)
+    X50 = svd.fit_transform(tfidf_all)
+    tsne = TSNE(n_components=2, random_state=random_state, perplexity=30, learning_rate="auto", init="pca")
+    X2 = tsne.fit_transform(X50)
+    # scale to 0-1 for nicer plotting sizes
+    x = (X2[:,0] - X2[:,0].min()) / (X2[:,0].ptp() + 1e-9)
+    y2 = (X2[:,1] - X2[:,1].min()) / (X2[:,1].ptp() + 1e-9)
+    # Pred proba on all
+    proba_all = pipe.predict_proba(X)[:, 1]
+    # Build DataFrame for plotting
+    plot_df = pd.DataFrame({
+        "x": x, "y": y2,
+        "label": df["agressie_volgende30d"].values,
+        "kans": proba_all,
+        "rapportage": df["rapportage"].str.slice(0, 180) + "..."
+    })
+    # annotate some optional features if present
+    for col in ["PHQ9_baseline","GAD7_baseline","stress_niveau_1_5","slaap_uren","sociale_steun_0_10","zorgsetting"]:
+        if col in df.columns:
+            plot_df[col] = df[col]
+    # Test set indices mask for highlighting (optional)
+    test_mask = np.zeros(len(plot_df), dtype=bool)
+    test_mask[idx_test] = True
+    plot_df["split"] = np.where(test_mask, "test", "train")
+    return pipe, (X_test, y_test, y_score), plot_df, auroc, auprc
+def make_scatter(plot_df, color_mode="label"):
+    if color_mode == "label":
+        color = plot_df["label"].map({0:"geen agressie", 1:"agressie"})
+        fig = px.scatter(
+            plot_df, x="x", y="y", color=color, hover_data=["rapportage","kans","split"],
+            title="2D projectie van teksten (t‑SNE) — kleur = werkelijk label",
+            opacity=0.8
+        )
+    else:
+        fig = px.scatter(
+            plot_df, x="x", y="y", color="kans", hover_data=["rapportage","kans","split"],
+            color_continuous_scale="Turbo",
+            title="2D projectie van teksten (t‑SNE) — kleur = voorspelde kans",
+            opacity=0.85
+        )
+    fig.update_traces(marker=dict(size=8, line=dict(width=0)))
+    fig.update_layout(margin=dict(l=10,r=10,t=40,b=10), template="simple_white")
+    return fig
+def metrics_table(y_true, y_score, thr):
+    y_pred = (y_score >= thr).astype(int)
+    rep = classification_report(y_true, y_pred, output_dict=True)
+    rep_df = pd.DataFrame(rep).T.round(3)
+    cm = confusion_matrix(y_true, y_pred)
+    cm_df = pd.DataFrame(cm, index=["True 0","True 1"], columns=["Pred 0","Pred 1"])
+    return rep_df, cm_df
+# Module-level cache filled by do_train(): the fitted pipeline, the plot
+# DataFrame, the evaluation tuple (X_test, y_test, y_score) and the two ranking
+# metrics. Every entry is None until the first training run has finished.
+GLOBAL = {"pipe": None, "plot_df": None, "eval": None, "auroc": None, "auprc": None}
+def do_train(file_obj=None, test_size=0.2, seed=42, max_features=4000, ngram_max=2):
+    df = load_dataset(file_obj)
+    pipe, eval_pack, plot_df, auroc, auprc = build_and_train(df, test_size, seed, max_features, ngram_max)
+    GLOBAL["pipe"] = pipe
+    GLOBAL["plot_df"] = plot_df
+    GLOBAL["eval"] = eval_pack
+    GLOBAL["auroc"] = auroc
+    GLOBAL["auprc"] = auprc
+    fig_label = make_scatter(plot_df, color_mode="label")
+    fig_prob = make_scatter(plot_df, color_mode="prob")
+    rep_df, cm_df = metrics_table(eval_pack[1], eval_pack[2], thr=0.5)
+    return (
+        float(auroc), float(auprc),
+        fig_label, fig_prob,
+        rep_df, cm_df
+    )
+def predict_one(text):
+    if GLOBAL["pipe"] is None:
+        return "Nog geen model getraind.", None
+    if not text or text.strip() == "":
+        return "Voer een rapportage in.", None
+    proba = float(GLOBAL["pipe"].predict_proba([text])[:,1][0])
+    label = int(proba >= 0.5)
+    md = f"**Kans op agressie (30d)**: **{proba:.3f}** — voorspelde klasse: **{label}** (drempel 0.50)"
+    return md, proba
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="red", neutral_hue="slate")) as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column(scale=2):
+            auroc_box = gr.Number(label="AUROC", precision=3)
+        with gr.Column(scale=2):
+            auprc_box = gr.Number(label="AUPRC", precision=3)
+    with gr.Tabs():
+        with gr.Tab("Visualisatie"):
+            color_mode = gr.Radio(choices=["label","prob"], value="label", label="Kleurmodus (label of kans)")
+            fig_out = gr.Plot(label="2D bolletjes-plot")
+            def _switch_color(mode):
+                if GLOBAL["plot_df"] is None:
+                    return None
+                return make_scatter(GLOBAL["plot_df"], color_mode=mode)
+            color_mode.change(_switch_color, inputs=color_mode, outputs=fig_out)
+            # Also show both plots on load
+            fig_label_out = gr.Plot(visible=False)
+            fig_prob_out = gr.Plot(visible=False)
+        with gr.Tab("Evaluatie"):
+            thr = gr.Slider(0.05, 0.95, value=0.5, step=0.05, label="Drempel voor classificatie")
+            rep_df = gr.Dataframe(label="Classification report")
+            cm_df = gr.Dataframe(label="Confusion matrix")
+            def _update_eval(t):
+                if GLOBAL["eval"] is None:
+                    return None, None
+                y_true, y_score = GLOBAL["eval"][1], GLOBAL["eval"][2]
+                rep, cm = metrics_table(y_true, y_score, t)
+                return rep, cm
+            thr.release(_update_eval, inputs=thr, outputs=[rep_df, cm_df])
+        with gr.Tab("Predict (vrije tekst)"):
+            txt = gr.Textbox(lines=6, label="Rapportage (NL)")
+            btn = gr.Button("Voorspel")
+            md_out = gr.Markdown()
+            proba_out = gr.Number(label="Kans", precision=3)
+            btn.click(predict_one, inputs=txt, outputs=[md_out, proba_out])
+        with gr.Tab("(Optioneel) Hertrain"):
+            csv_in = gr.File(label="Upload eigen CSV")
+            test_size = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="Test set grootte")
+            seed = gr.Slider(1, 999, value=42, step=1, label="Random seed")
+            max_features = gr.Slider(1000, 12000, value=4000, step=1000, label="TF‑IDF max_features")
+            ngram_max = gr.Radio(choices=[1,2], value=2, label="n‑gram max")
+            train_btn = gr.Button("Train opnieuw")
+            def _train(csv_in, test_size, seed, max_features, ngram_max):
+                return do_train(csv_in, test_size, int(seed), int(max_features), int(ngram_max))
+            train_btn.click(_train, inputs=[csv_in, test_size, seed, max_features, ngram_max],
+                            outputs=[auroc_box, auprc_box, fig_label_out, fig_prob_out, rep_df, cm_df]).then(
+                                lambda: _switch_color(color_mode.value), None, fig_out
+                            )
+    # Auto-train on load using default CSV
+    def _auto_train():
+        return do_train(None, 0.2, 42, 4000, 2)
+    demo.load(_auto_train, inputs=None, outputs=[auroc_box, auprc_box, fig_label_out, fig_prob_out, rep_df, cm_df]).then(
+        lambda: _switch_color("label"), None, fig_out
+    )
+    gr.Markdown(FOOTER)
+# Launch the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio>=4.16.0
+pandas>=2.0.0
+numpy>=1.24.0
+scikit-learn>=1.3.0
+plotly>=5.20.0

synthetische_ggz_agressie_dataset_1000.csv ADDED Viewed

The diff for this file is too large to render. See raw diff