Spaces:

Marcel0123
/

supervised-training-depressie

Configuration error

App Files Community

Marcel0123 committed on Sep 21, 2025

Commit

9eb3654

verified ·

1 Parent(s): 81acd21

Upload 3 files

Browse files

Files changed (3) hide show

README.md +0 -16
app.py +1 -391
requirements.txt +1 -8

README.md CHANGED Viewed

@@ -1,17 +1 @@
----
-title: "Synthetische depressiedata – Supervised ML demo"
-emoji: "🧠"
-colorFrom: "blue"
-colorTo: "purple"
-sdk: gradio
-sdk_version: "4.0.0"
-app_file: app.py
-pinned: false
----
-# Supervised ML demo – synthetische depressiedata
-Volledig synthetische data. Niet voor klinisch gebruik.
-## Gebruik
-Upload `app.py`, `requirements.txt` (en desgewenst `runtime.txt`) naar een nieuwe **Gradio** Space.












1

app.py CHANGED Viewed

@@ -1,391 +1 @@
-import gradio as gr
-import pandas as pd
-import numpy as np
-from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, RocCurveDisplay, precision_recall_curve, average_precision_score
-from sklearn.inspection import permutation_importance
-from sklearn.calibration import CalibratedClassifierCV
-import matplotlib.pyplot as plt
-import io
-import joblib
-# Optionele afhankelijkheid
-try:
-    import shap
-    SHAP_AVAILABLE = True
-except Exception:
-    SHAP_AVAILABLE = False
-# -----------------------------
-# 1) Synthetische datageneratie
-# -----------------------------
-def generate_synthetic_dataset(n_samples=1000, seed=42):
-    rng = np.random.default_rng(seed)
-    age = rng.integers(18, 81, size=n_samples)
-    sex = rng.choice(["man", "vrouw"], size=n_samples, p=[0.48, 0.52])
-    bmi = np.clip(rng.normal(26, 5, size=n_samples), 16, 45)
-    sleep_hours = np.clip(rng.normal(7, 1.5, size=n_samples), 3, 12)
-    activity_min = np.clip(rng.normal(30, 25, size=n_samples), 0, 180)
-    phq9 = np.clip(np.round(rng.normal(9, 6, size=n_samples)), 0, 27)
-    gad7 = np.clip(np.round(rng.normal(7, 5, size=n_samples)), 0, 21)
-    prior_depr = rng.integers(0, 2, size=n_samples)
-    family_hist = rng.integers(0, 2, size=n_samples)
-    chronic_ill = rng.integers(0, 2, size=n_samples)
-    substance_use = rng.integers(0, 2, size=n_samples)
-    stressful_events = np.clip(rng.poisson(1.2, size=n_samples), 0, 6)
-    social_support = rng.integers(1, 6, size=n_samples)
-    employment = rng.choice(["werkend", "student", "werkloos", "ziekverlof"], size=n_samples, p=[0.56, 0.16, 0.18, 0.10])
-    z = (
-        0.35 * (phq9 / 27) +
-        0.12 * (gad7 / 21) +
-        0.18 * (1 - (sleep_hours - 3) / 9) +
-        0.10 * (1 - np.sqrt(np.maximum(activity_min,1e-6) / 180)) +
-        0.10 * (stressful_events / 6) +
-        0.08 * (1 - (social_support - 1) / 4) +
-        0.10 * prior_depr +
-        0.05 * family_hist +
-        0.03 * chronic_ill +
-        0.02 * (bmi - 25) / 20 +
-        0.03 * substance_use
-    )
-    z = z + rng.normal(0, 0.05, size=n_samples)
-    p = 1 / (1 + np.exp(-(z * 4 - 2)))
-    label = (rng.random(n_samples) < p).astype(int)
-    df = pd.DataFrame({
-        "age": age,
-        "sex": sex,
-        "bmi": np.round(bmi, 1),
-        "sleep_hours": np.round(sleep_hours, 1),
-        "activity_minutes": np.round(activity_min, 0).astype(int),
-        "phq9": phq9.astype(int),
-        "gad7": gad7.astype(int),
-        "prior_depression": prior_depr,
-        "family_history": family_hist,
-        "chronic_illness": chronic_ill,
-        "substance_use": substance_use,
-        "stressful_events": stressful_events,
-        "social_support": social_support,
-        "employment_status": employment,
-        "current_depression": label
-    })
-    return df
-# -----------------------------
-# 2) Pipeline helpers
-# -----------------------------
-def make_preprocessor():
-    numeric_cols = [
-        "age","bmi","sleep_hours","activity_minutes","phq9","gad7",
-        "prior_depression","family_history","chronic_illness","substance_use",
-        "stressful_events","social_support"
-    ]
-    cat_cols = ["sex", "employment_status"]
-    pre = ColumnTransformer([
-        ("num", StandardScaler(), numeric_cols),
-        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
-    ])
-    return pre, numeric_cols, cat_cols
-def build_pipeline(model_type="Logistic Regression", seed=42, calibration=None):
-    pre, *_ = make_preprocessor()
-    if model_type == "Random Forest":
-        base_model = RandomForestClassifier(n_estimators=300, random_state=seed)
-    else:
-        base_model = LogisticRegression(max_iter=300)
-    if calibration in ("Platt (sigmoid)", "Isotonic"):
-        method = "sigmoid" if calibration.startswith("Platt") else "isotonic"
-        model = CalibratedClassifierCV(base_model, cv=3, method=method)
-    else:
-        model = base_model
-    return Pipeline([("prep", pre), ("clf", model)])
-def train_model(df, model_type="Logistic Regression", test_size=0.2, seed=42, threshold=0.5, calibration=None):
-    y = df["current_depression"]
-    X = df.drop(columns=["current_depression"])
-    pipe = build_pipeline(model_type, seed, calibration=calibration)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=test_size, random_state=seed, stratify=y
-    )
-    pipe.fit(X_train, y_train)
-    y_proba = pipe.predict_proba(X_test)[:, 1]
-    y_pred = (y_proba >= threshold).astype(int)
-    acc = float(accuracy_score(y_test, y_pred))
-    auc = float(roc_auc_score(y_test, y_proba))
-    ap = float(average_precision_score(y_test, y_proba))
-    cm = confusion_matrix(y_test, y_pred)
-    # ROC
-    fig, ax = plt.subplots()
-    RocCurveDisplay.from_predictions(y_test, y_proba, ax=ax)
-    ax.set_title("ROC-curve")
-    buf = io.BytesIO(); fig.savefig(buf, format="png", bbox_inches="tight"); plt.close(fig)
-    roc_png = buf.getvalue()
-    # PR
-    precision, recall, _ = precision_recall_curve(y_test, y_proba)
-    fig3, ax3 = plt.subplots()
-    ax3.plot(recall, precision)
-    ax3.set_xlabel("Recall"); ax3.set_ylabel("Precision"); ax3.set_title("Precision–Recall curve")
-    buf3 = io.BytesIO(); fig3.savefig(buf3, format="png", bbox_inches="tight"); plt.close(fig3)
-    pr_png = buf3.getvalue()
-    # Confusion matrix
-    fig2, ax2 = plt.subplots()
-    _ = ax2.imshow(cm, interpolation="nearest")
-    ax2.set_title(f"Confusion matrix (thr={threshold:.2f})")
-    ax2.set_xlabel("Voorspeld"); ax2.set_ylabel("Werkelijk")
-    for (i, j), v in np.ndenumerate(cm):
-        ax2.text(j, i, str(v), ha="center", va="center")
-    buf2 = io.BytesIO(); fig2.savefig(buf2, format="png", bbox_inches="tight"); plt.close(fig2)
-    cm_png = buf2.getvalue()
-    # Permutation importance
-    try:
-        r = permutation_importance(pipe, X_test, y_test, n_repeats=10, random_state=seed)
-        importances = r.importances_mean
-        feat_names = pipe.named_steps["prep"].get_feature_names_out()
-        imp_df = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False).head(20)
-        figi, axi = plt.subplots(figsize=(6,4))
-        axi.barh(imp_df["feature"][::-1], imp_df["importance"][::-1])
-        axi.set_title("Permutation importance (top 20)")
-        figbuf = io.BytesIO(); figi.savefig(figbuf, format="png", bbox_inches="tight"); plt.close(figi)
-        imp_png = figbuf.getvalue()
-    except Exception:
-        imp_png = None
-    shap_png = None
-    if SHAP_AVAILABLE:
-        try:
-            sample_idx = np.random.choice(len(X_test), size=min(200, len(X_test)), replace=False)
-            X_sample = X_test.iloc[sample_idx]
-            f = lambda data: pipe.predict_proba(pd.DataFrame(data, columns=X_test.columns))[:,1]
-            explainer = shap.KernelExplainer(f, shap.sample(X_train, 50, random_state=seed))
-            shap_values = explainer.shap_values(X_sample, nsamples=100)
-            figshap = plt.figure()
-            shap.summary_plot(shap_values, X_sample, show=False)
-            bufshap = io.BytesIO(); figshap.savefig(bufshap, format="png", bbox_inches="tight"); plt.close(figshap)
-            shap_png = bufshap.getvalue()
-        except Exception:
-            shap_png = None
-    metrics = {"accuracy": round(acc,3), "roc_auc": round(auc,3), "avg_precision": round(ap,3)}
-    return pipe, metrics, cm, roc_png, cm_png, pr_png, imp_png, shap_png
-def cross_validate(df, model_type="Logistic Regression", seed=42, k=5, calibration=None):
-    y = df["current_depression"]
-    X = df.drop(columns=["current_depression"])
-    pipe = build_pipeline(model_type, seed, calibration=calibration)
-    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
-    aucs = cross_val_score(pipe, X, y, scoring="roc_auc", cv=cv)
-    accs = cross_val_score(pipe, X, y, scoring="accuracy", cv=cv)
-    return {"cv_auc_mean": float(np.mean(aucs)), "cv_auc_std": float(np.std(aucs)),
-            "cv_acc_mean": float(np.mean(accs)), "cv_acc_std": float(np.std(accs))}
-# -----------------------------
-# 3) Gradio UI
-# -----------------------------
-def build_app():
-    with gr.Blocks(title="Synthetische depressiedata – Supervised ML demo") as demo:
-        gr.Markdown(
-            "# Supervised ML demo (synthetische depressiedata)\n"
-            "**Let op:** Deze app gebruikt *volledig synthetische* data en is alleen voor onderwijs/demonstratie. "
-            "Niet gebruiken voor klinische beslissingen."
-        )
-        state_df = gr.State()
-        model_state = gr.State()
-        # Tab 0: Exploratie
-        with gr.Tab("0) Data Exploratie"):
-            gr.Markdown("Genereer eerst een dataset of laad de standaard. Bekijk distributies en correlaties.")
-            init_btn = gr.Button("(Re)genereer standaarddataset")
-            stats_json = gr.JSON(label="Samenvatting")
-            dist_img = gr.Image(label="Histogrammen (kernvariabelen)")
-            corr_img = gr.Image(label="Correlatie heatmap (numeriek)")
-            def init_and_explore():
-                df = generate_synthetic_dataset(1000, 42)
-                desc = df.describe().to_dict()
-                fig, ax = plt.subplots(figsize=(8,6))
-                cols = ["phq9","gad7","sleep_hours","activity_minutes","stressful_events","social_support"]
-                for c in cols:
-                    df[c].plot(kind="hist", alpha=0.5)
-                ax.set_title("Distributies kernvariabelen")
-                buf = io.BytesIO(); fig.savefig(buf, format="png", bbox_inches="tight"); plt.close(fig)
-                hist_png = buf.getvalue()
-                num = df.select_dtypes(include=[np.number]).corr()
-                fig2, ax2 = plt.subplots(figsize=(6,5))
-                _ = ax2.imshow(num, aspect='auto')
-                ax2.set_title("Correlatie (Pearson)")
-                ax2.set_xticks(range(len(num.columns))); ax2.set_xticklabels(num.columns, rotation=90)
-                ax2.set_yticks(range(len(num.index))); ax2.set_yticklabels(num.index)
-                buf2 = io.BytesIO(); fig2.savefig(buf2, format="png", bbox_inches="tight"); plt.close(fig2)
-                corr_png = buf2.getvalue()
-                return df, desc, hist_png, corr_png
-            init_btn.click(init_and_explore, inputs=None, outputs=[state_df, stats_json, dist_img, corr_img])
-        # Tab 1: Data
-        with gr.Tab("1) Data"):
-            n = gr.Slider(200, 5000, value=1000, step=50, label="Aantal voorbeelden")
-            seed = gr.Slider(0, 9999, value=42, step=1, label="Random seed")
-            gen_btn = gr.Button("Genereer dataset")
-            df_out = gr.Dataframe(interactive=False, wrap=True, height=300)
-            csv = gr.File(label="Download CSV", interactive=False)
-            def on_generate(n, seed):
-                df = generate_synthetic_dataset(int(n), int(seed))
-                path = "synthetic_depression.csv"; df.to_csv(path, index=False)
-                return df, df.head(50), path
-            gen_btn.click(on_generate, [n, seed], [state_df, df_out, csv])
-        # Tab 2: Train & Evaluate
-        with gr.Tab("2) Train & Evaluate"):
-            model_type = gr.Radio(["Logistic Regression", "Random Forest"], value="Logistic Regression", label="Model")
-            calibration = gr.Radio(["Geen", "Platt (sigmoid)", "Isotonic"], value="Geen", label="Calibratie")
-            test_size = gr.Slider(0.1, 0.5, value=0.2, step=0.05, label="Test set fractie")
-            threshold = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Beslisdrempel (positief bij p ≥ drempel)")
-            seed2 = gr.Slider(0, 9999, value=42, step=1, label="Random seed")
-            train_btn = gr.Button("Train model")
-            metrics = gr.JSON(label="Metrics (accuracy, ROC AUC, AP)")
-            roc_img = gr.Image(label="ROC-curve")
-            pr_img = gr.Image(label="PR-curve")
-            cm_img = gr.Image(label="Confusion matrix")
-            imp_img = gr.Image(label="Permutation importance (top 20)")
-            shap_img = gr.Image(label="SHAP summary (optioneel)")
-            def on_train(model_type, test_size, seed, threshold, calibration, df):
-                if df is None or len(df)==0:
-                    df = generate_synthetic_dataset(1000, 42)
-                model, metrics_out, cm, roc_png, cm_png, pr_png, imp_png, shap_png = train_model(
-                    df, model_type=model_type, test_size=float(test_size), seed=int(seed), threshold=float(threshold),
-                    calibration=None if calibration=="Geen" else calibration
-                )
-                return model, metrics_out, roc_png, pr_png, cm_png, imp_png, shap_png
-            train_btn.click(on_train, [model_type, test_size, seed2, threshold, calibration, state_df],
-                            [model_state, metrics, roc_img, pr_img, cm_img, imp_img, shap_img])
-            cv_btn = gr.Button("Cross-validation (k=5) – ROC AUC & Accuracy")
-            cv_json = gr.JSON(label="CV-resultaten")
-            def on_cv(model_type, calibration, df):
-                if df is None or len(df)==0:
-                    df = generate_synthetic_dataset(1000, 42)
-                return cross_validate(df, model_type=model_type, calibration=None if calibration=="Geen" else calibration)
-            cv_btn.click(on_cv, [model_type, calibration, state_df], [cv_json])
-            with gr.Row():
-                save_btn = gr.Button("Sla model op (.joblib)")
-                model_file = gr.File(label="Gedownloade model-file", interactive=False)
-                load_file = gr.File(label="Laad model (.joblib)")
-                load_btn = gr.Button("Laad model in app")
-            def on_save(model):
-                if model is None:
-                    return None
-                path = "trained_pipeline.joblib"
-                joblib.dump(model, path)
-                return path
-            save_btn.click(on_save, [model_state], [model_file])
-            def on_load(file_obj):
-                if file_obj is None:
-                    return None
-                model = joblib.load(file_obj.name)
-                return model
-            load_btn.click(on_load, [load_file], [model_state])
-        # Tab 3: Voorspellen
-        with gr.Tab("3) Voorspellen (speels)"):
-            gr.Markdown("Kies kenmerken om een kans op *actuele depressie* te laten berekenen (didactisch, niet klinisch).")
-            with gr.Row():
-                age = gr.Slider(18, 80, value=35, step=1, label="Leeftijd")
-                sex = gr.Radio(["man", "vrouw"], value="vrouw", label="Geslacht")
-                bmi = gr.Slider(16.0, 45.0, value=25.0, step=0.1, label="BMI")
-            with gr.Row():
-                sleep_hours = gr.Slider(3.0, 12.0, value=7.0, step=0.1, label="Slaap (uren/dag)")
-                activity_minutes = gr.Slider(0, 180, value=30, step=5, label="Lichaamsbeweging (min/dag)")
-                employment = gr.Radio(["werkend", "student", "werkloos", "ziekverlof"], value="werkend", label="Werkstatus")
-            with gr.Row():
-                phq9 = gr.Slider(0, 27, value=10, step=1, label="PHQ-9")
-                gad7 = gr.Slider(0, 21, value=7, step=1, label="GAD-7")
-                social_support = gr.Slider(1, 5, value=3, step=1, label="Sociale steun (1-5)")
-            with gr.Row():
-                prior_depr = gr.Checkbox(False, label="Eerder depressieve episode")
-                family_history = gr.Checkbox(False, label="Familiaire voorgeschiedenis")
-                chronic_ill = gr.Checkbox(False, label="Chronische somatische aandoening")
-                substance_use = gr.Checkbox(False, label="Middelengebruik (actueel)")
-                stressful_events = gr.Slider(0, 6, value=1, step=1, label="Belastende levensgebeurtenissen (0-6)")
-            pred_btn = gr.Button("Bereken kans")
-            pred_json = gr.JSON(label="Voorspelling")
-            def predict_fn(age, sex, bmi, sleep_hours, activity_minutes, employment, phq9, gad7, social_support, prior_depr, family_history, chronic_ill, substance_use, stressful_events, model):
-                if model is None:
-                    df = generate_synthetic_dataset(1000, 42)
-                    model, *_ = train_model(df)
-                input_df = pd.DataFrame([{
-                    "age": age,
-                    "sex": sex,
-                    "bmi": bmi,
-                    "sleep_hours": sleep_hours,
-                    "activity_minutes": activity_minutes,
-                    "phq9": phq9,
-                    "gad7": gad7,
-                    "prior_depression": int(prior_depr),
-                    "family_history": int(family_history),
-                    "chronic_illness": int(chronic_ill),
-                    "substance_use": int(substance_use),
-                    "stressful_events": stressful_events,
-                    "social_support": social_support,
-                    "employment_status": employment
-                }])
-                try:
-                    prob = float(model.predict_proba(input_df)[0,1])
-                except Exception:
-                    prob = float(model.predict(input_df)[0])
-                return {"probability_current_depression": round(prob, 3)}
-            pred_inputs = [age, sex, bmi, sleep_hours, activity_minutes, employment, phq9, gad7, social_support,
-                           prior_depr, family_history, chronic_ill, substance_use, stressful_events, model_state]
-            pred_btn.click(predict_fn, pred_inputs, [pred_json])
-        gr.Markdown(
-            "---\n"
-            "### Ethische noot\n"
-            "- Data zijn **geheel synthetisch** en bevatten geen persoonsgegevens.\n"
-            "- Model is **niet** gevalideerd voor klinisch gebruik.\n"
-            "- Gebruik dit uitsluitend voor onderwijs/demonstratie."
-        )
-    return demo
-# Heel belangrijk voor Hugging Face Spaces: maak een **globale** `demo` variabele.
-demo = build_app()
-if __name__ == "__main__":
-    demo.launch()


1	+ # Hugging Face Space — Live Supervised Training Visualizer (Student WOW Edition)

requirements.txt CHANGED Viewed

@@ -1,8 +1 @@
-gradio>=4.0.0
-pandas
-numpy
-scikit-learn
-matplotlib
-shap
-scipy
-joblib


1	+