Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
- ```python
  import os
  import math
  import numpy as np
@@ -13,38 +12,35 @@ from functools import lru_cache
  import torch
  from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

- # -----------------------------
  # Configuration
- # -----------------------------
  TARGET_SR = 16000
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")

- # -----------------------------
- # …
- # -----------------------------
- def _human_seconds(sec: float) -> str:
      if not math.isfinite(sec):
          return "—"
      if sec < 60:
          return f"{sec:.1f}s"
      m = int(sec // 60)
-     s = sec - 60 * m
-     return f"{m}m {s:.1f}s"


- def _cosine(a, b) -> float:
-     a = np.asarray(a, dtype=np.float32)
-     b = np.asarray(b, dtype=np.float32)
-     denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-12
      return float(np.dot(a, b) / denom)


- # -----------------------------
- # Model
- # -----------------------------
  @lru_cache(maxsize=1)
- def load_w2v():
      extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
      model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
      model.eval()
@@ -52,69 +48,55 @@ def load_w2v():


  def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
-     extractor, model = load_w2v()
      if sr != TARGET_SR:
-         y = librosa.resample(y, …
-         sr = TARGET_SR

      if y.size == 0:
-         return np.zeros(…

      y = y.astype(np.float32)
-     …

-     inputs = extractor(y, sampling_rate=sr, return_tensors="pt")
      with torch.no_grad():
-         …
      return emb.astype(np.float32)


- # -----------------------------
  # Feature extraction
- # -----------------------------
  @dataclass
  class Features:
      duration_s: float
      rms_mean: float
      rms_std: float
-     zcr_mean: float
-     pitch_median_hz: float
-     pitch_iqr_hz: float
-     voiced_ratio: float
      n_pauses: int
      pause_total_s: float
      active_ratio: float


  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
-     """Return features + artifacts for plots/inspection."""
-     if y is None or len(y) == 0:
-         f = Features(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0, 0.0, np.nan)
-         return f, {"y": np.array([]), "sr": sr, "times": np.array([]), "pitch": np.array([])}
-
      if sr != TARGET_SR:
-         y = librosa.resample(y, …
          sr = TARGET_SR

-     …
-     hop = 160    # 10 ms at 16k
-     frame = 400  # 25 ms at 16k

      rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
-     zcr = …

-     rms_mean = float(np.mean(rms)) if rms.size else np.nan
-     rms_std = float(np.std(rms)) if rms.size else np.nan
-     zcr_mean = float(np.mean(zcr)) if zcr.size else np.nan
-
-     # Pitch using probabilistic YIN (pyin)
      try:
-         f0, …
              y,
              fmin=librosa.note_to_hz("C2"),
              fmax=librosa.note_to_hz("C7"),
@@ -125,440 +107,127 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
      except Exception:
          f0 = None

-     if f0 is None:
          pitch_median = np.nan
          pitch_iqr = np.nan
-     …
-     # Pause detection using RMS threshold (relative)
-     if rms.size:
-         thr = float(np.percentile(rms, 20)) * 0.8
-         silent = rms < thr
-
-         min_pause_frames = int(0.2 / (hop / sr))  # pauses >= 0.2s
-         pauses = []
-         start = None
-         for i, s in enumerate(silent):
-             if s and start is None:
-                 start = i
-             if (not s) and start is not None:
-                 end = i
-                 if (end - start) >= min_pause_frames:
-                     pauses.append((start, end))
-                 start = None
-         if start is not None:
-             end = len(silent)
-             if (end - start) >= min_pause_frames:
-                 pauses.append((start, end))
-
-         n_pauses = int(len(pauses))
-         pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
-         active_ratio = float(1.0 - (np.mean(silent) if silent.size else 0.0))
-     else:
-         pauses = []
-         n_pauses = 0
-         pause_total_s = 0.0
-         active_ratio = np.nan
-         thr = None

      feats = Features(
          duration_s=duration,
          rms_mean=rms_mean,
          rms_std=rms_std,
-         …
-         n_pauses=n_pauses,
-         pause_total_s=pause_total_s,
          active_ratio=active_ratio,
      )

      artifacts = {
          "y": y,
          "sr": sr,
-         "hop": hop,
-         "frame": frame,
          "rms": rms,
-         …
-         "pitch": pitch,
-         "times": times,
          "pauses": pauses,
-         …
      }
      return feats, artifacts


- # -----------------------------
  # Plotting
- # -----------------------------
- def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
      y = artifacts["y"]
      sr = artifacts["sr"]
-     pauses = artifacts["pauses"]
-     hop = artifacts["hop"]

-     fig = plt.figure(figsize=(10, 3…
      ax = fig.add_subplot(111)

-     …
-         ax.plot(t, y, linewidth=0.8)
-         ax.set_xlim(0, t[-1] if t.size else 1)
-         ax.set_xlabel("Tijd (s)")
-         ax.set_ylabel("Amplitude")
-         ax.set_title("Waveform (met gedetecteerde pauzes)")
-
-         for (s, e) in pauses:
-             ts = s * (hop / sr)
-             te = e * (hop / sr)
-             ax.axvspan(ts, te, alpha=0.2)
-     else:
-         ax.text(0.5, 0.5, "Geen audio", ha="center", va="center")
-         ax.set_axis_off()

-     …
-
-
- def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
-     pitch = artifacts.get("pitch", np.array([]))
-     times = artifacts.get("times", np.array([]))
-
-     fig = plt.figure(figsize=(10, 3.2))
-     ax = fig.add_subplot(111)
-
-     if pitch.size and times.size:
-         ax.plot(times, pitch, linewidth=1.0)
-         ax.set_xlabel("Tijd (s)")
-         ax.set_ylabel("Pitch (Hz)")
-         ax.set_title("Pitch contour (NaN = onvoiced)")
-     else:
-         ax.text(0.5, 0.5, "Pitch niet beschikbaar (te kort / te veel ruis)", ha="center", va="center")
-         ax.set_axis_off()

      fig.tight_layout()
      return fig


- # -----------------------------
- # UI
- # -----------------------------
- def format_features_table(feats: Features):
-     def fmt_float(x):
-         if x is None or (isinstance(x, float) and not math.isfinite(x)):
-             return "—"
-         return f"{float(x):.3f}"
-
-     def fmt_int(x):
-         if x is None:
-             return "—"
-         return str(int(x))
-
-     return [
-         ["Duur", _human_seconds(feats.duration_s)],
-         ["Volume (RMS) gemiddeld", fmt_float(feats.rms_mean)],
-         ["Volume (RMS) variatie", fmt_float(feats.rms_std)],
-         ["ZCR (ruis/‘scherpte’) gemiddeld", fmt_float(feats.zcr_mean)],
-         ["Pitch mediaan", "—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz"],
-         ["Pitch spreiding (IQR)", "—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz"],
-         ["Voiced ratio", "—" if not math.isfinite(feats.voiced_ratio) else f"{feats.voiced_ratio*100:.1f}%"],
-         ["Aantal pauzes (≥ 0.2s)", fmt_int(feats.n_pauses)],
-         ["Totale pauzeduur", _human_seconds(feats.pause_total_s)],
-         ["Actieve-spraak ratio", "—" if not math.isfinite(feats.active_ratio) else f"{feats.active_ratio*100:.1f}%"],
-     ]
-
-
- def explain_panel(feats: Features) -> str:
-     bullets = []
-     if math.isfinite(feats.pause_total_s):
-         bullets.append(f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), samen {_human_seconds(feats.pause_total_s)}.")
-     if math.isfinite(feats.pitch_median_hz):
-         bullets.append(f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding (IQR) {feats.pitch_iqr_hz:.1f} Hz.")
-     if math.isfinite(feats.rms_mean):
-         bullets.append(f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; alleen vergelijken binnen dezelfde setup).")
-     if math.isfinite(feats.active_ratio):
-         bullets.append(f"- **Actieve spraak**: ~ {feats.active_ratio*100:.1f}% van de tijd boven drempel.")
-
-     if not bullets:
-         bullets = ["- Geen features beschikbaar (audio te kort of leeg)."]
-
-     return (
-         "### Wat ‘ziet’ de AI hier?\n"
-         "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* en hoe die veranderen tussen fragmenten.\n\n"
-         + "\n".join(bullets)
-         + "\n\n"
-         "**Belangrijk:** dit systeem geeft **geen diagnose** en is **geen medisch hulpmiddel**. "
-         "Gebruik dit als gespreksstarter of educatieve visualisatie."
-     )
-
-
- # -----------------------------
- # Core callbacks
- # -----------------------------
- def analyze_single(audio: Tuple[int, np.ndarray]):
      if audio is None:
-         return …
      sr, y = audio
      feats, art = compute_features(y, sr)
-     table = format_features_table(feats)
-     wf = plot_waveform_with_pauses(art)
-     pc = plot_pitch(art)
-     expl = explain_panel(feats)
-     return gr.Dataframe(value=table, headers=["Kenmerk", "Waarde"]), wf, pc, expl
-
-
- def analyze_compare(a1, a2):
-     if a1 is None or a2 is None:
-         return "—", gr.Dataframe(value=[["—", "Selecteer twee fragmenten."]]), None
-
-     sr1, y1 = a1
-     sr2, y2 = a2
-
-     f1, art1 = compute_features(y1, sr1)
-     f2, art2 = compute_features(y2, sr2)
-
-     e1 = embed_audio(art1["y"], art1["sr"])
-     e2 = embed_audio(art2["y"], art2["sr"])
-     sim = _cosine(e1, e2)
-
-     def delta(a, b):
-         if (a is None) or (b is None):
-             return "—"
-         if (isinstance(a, float) and not math.isfinite(a)) or (isinstance(b, float) and not math.isfinite(b)):
-             return "—"
-         return f"{(b - a):+.3f}"
-
-     rows = [
-         ["Duur (s)", f1.duration_s, f2.duration_s, delta(f1.duration_s, f2.duration_s)],
-         ["RMS mean", f1.rms_mean, f2.rms_mean, delta(f1.rms_mean, f2.rms_mean)],
-         ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz, delta(f1.pitch_median_hz, f2.pitch_median_hz)],
-         ["Pauzes (#)", float(f1.n_pauses), float(f2.n_pauses), f"{(f2.n_pauses - f1.n_pauses):+d}"],
-         ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s, delta(f1.pause_total_s, f2.pause_total_s)],
-         ["Actieve ratio", f1.active_ratio, f2.active_ratio, delta(f1.active_ratio, f2.active_ratio)],
-     ]

-     …
-         formatted.append([k, fmt(v1), fmt(v2), dv])
-
-     fig = plt.figure(figsize=(10, 3.2))
-     ax = fig.add_subplot(111)

-     …
-         return t, y

-     t1, yy1 = prep_plot(y1, sr1)
-     t2, yy2 = prep_plot(y2, sr2)

-     if yy1.size:
-         ax.plot(t1, yy1, linewidth=0.8, label="Fragment A")
-     if yy2.size:
-         ax.plot(t2, yy2, linewidth=0.8, label="Fragment B", alpha=0.8)

-     …

-     …

- # UI
- # -----------------------------
- CSS = """
- :root{
-   --bg: #0b0f19;
-   --panel: rgba(255,255,255,0.06);
-   --text: rgba(255,255,255,0.92);
-   --muted: rgba(255,255,255,0.70);
-   --border: rgba(255,255,255,0.14);
-   --shadow: 0 10px 30px rgba(0,0,0,0.35);
- }
-
- .gradio-container{
-   background: radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%),
-               radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%),
-               radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%),
-               var(--bg) !important;
-   color: var(--text) !important;
- }
-
- #header-card{
-   background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14));
-   border: 1px solid var(--border);
-   border-radius: 18px;
-   padding: 18px 18px 14px 18px;
-   box-shadow: var(--shadow);
- }
-
- #header-title{
-   font-size: 28px;
-   font-weight: 750;
-   letter-spacing: -0.02em;
-   margin: 0;
- }
-
- #header-sub{
-   margin-top: 6px;
-   color: var(--muted);
-   font-size: 14px;
-   line-height: 1.45;
- }
-
- .badge{
-   display: inline-flex;
-   align-items: center;
-   gap: 8px;
-   padding: 6px 10px;
-   border-radius: 999px;
-   border: 1px solid var(--border);
-   background: rgba(255,255,255,0.05);
-   color: var(--muted);
-   font-size: 12px;
-   margin-right: 10px;
- }
-
- .badge b{
-   color: var(--text);
-   font-weight: 700;
- }
-
- a { color: rgba(255,255,255,0.9) !important; }
- label, .md, .markdown { color: var(--text) !important; }
- """
-
- def build_demo():
-     with gr.Blocks(
-         css=CSS,
-         theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
-         title="Explainable Speech Analytics (Demo)"
-     ) as demo:
-
-         gr.HTML(
-             """
-             <div id="header-card">
-                 <p id="header-title">Explainable Speech Analytics</p>
-                 <div id="header-sub">
-                     <span class="badge"><b>Doel</b> inzicht in meetbare spraaksignalen</span>
-                     <span class="badge"><b>Geen diagnose</b> geen medisch hulpmiddel</span>
-                     <span class="badge"><b>Privacy</b> audio wordt niet opgeslagen door deze demo</span>
-                     <p style="margin-top:12px">
-                         Upload of neem korte audiofragmenten op en bekijk <b>wat het systeem meet</b>: pauzes, pitch,
-                         volume-energie en een algemene <b>audio-embedding</b> om fragmenten te vergelijken.
-                         Gebruik dit als <b>educatieve visualisatie</b> of gespreksstarter — niet als klinische beslissing.
-                     </p>
-                 </div>
-             </div>
-             """
-         )

-         …
-         with gr.TabItem("Analyse (1 fragment)"):
-             with gr.Row():
-                 with gr.Column(scale=5):
-                     input_audio = gr.Audio(
-                         label="Audio",
-                         sources=["upload", "microphone"],
-                         type="numpy",
-                     )
-                     run_btn = gr.Button("Analyseer", variant="primary")
-                     with gr.Accordion("Wat gebeurt er technisch?", open=False):
-                         gr.Markdown(
-                             """
-                             - **Akoestiek**: we extraheren frame-based signalen (RMS, ZCR), schatten **pitch** met *pyin*,
-                               en detecteren **pauzes** met een adaptieve energiedrempel.
-                             - **Embedding**: een vooraf getraind **Wav2Vec2**-model maakt een vaste vector (embedding) van de audio
-                               waarmee we fragmenten **onderling** kunnen vergelijken (cosine similarity).
-                             - **Explainable by design**: we tonen de signalen en deltas, niet alleen een score.
-                             """
-                         )
-                 with gr.Column(scale=7):
-                     feat_df = gr.Dataframe(
-                         headers=["Kenmerk", "Waarde"],
-                         datatype=["str", "str"],
-                         interactive=False,
-                         wrap=True,
-                         label="Meetbare kenmerken",
-                     )
-                     wf_plot = gr.Plot(label="Waveform + pauzes")
-                     pitch_plot = gr.Plot(label="Pitch")
-                     explanation = gr.Markdown("### Upload of neem audio op", elem_id="explain-card")
-
-             run_btn.click(analyze_single, inputs=[input_audio], outputs=[feat_df, wf_plot, pitch_plot, explanation])
-
-         with gr.TabItem("Vergelijk (2 fragmenten)"):
-             with gr.Row():
-                 with gr.Column(scale=5):
-                     a1 = gr.Audio(label="Fragment A", sources=["upload", "microphone"], type="numpy")
-                     a2 = gr.Audio(label="Fragment B", sources=["upload", "microphone"], type="numpy")
-                     compare_btn = gr.Button("Vergelijk", variant="primary")
-                     gr.Markdown(
-                         """
-                         **Interpretatie-tip:** een lagere overeenkomst betekent alleen dat de audio *anders* is
-                         (andere omgeving, microfoon, emotie, vermoeidheid, etc.). Het zegt **niet** *waarom*.
-                         """
-                     )
-                 with gr.Column(scale=7):
-                     sim_out = gr.Textbox(label="Embedding-overeenkomst (cosine similarity)", value="—", interactive=False)
-                     delta_df = gr.Dataframe(
-                         headers=["Kenmerk", "A", "B", "Δ (B−A)"],
-                         datatype=["str", "str", "str", "str"],
-                         interactive=False,
-                         wrap=True,
-                         label="Verschillen (uitlegbaar)",
-                     )
-                     overlay_plot = gr.Plot(label="Waveform overlay")
-
-             compare_btn.click(analyze_compare, inputs=[a1, a2], outputs=[sim_out, delta_df, overlay_plot])
-
-         with gr.Accordion("Ethiek & transparantie (anti–black box)", open=False):
-             gr.Markdown(
-                 """
-                 **Hoe voorkomt deze demo ‘black box’ gedrag?**
-                 - We tonen **de signalen** (pauzes, pitch, energie) in grafieken en tabellen.
-                 - We tonen **verschillen** tussen fragmenten, i.p.v. één eindlabel.
-                 - We geven **geen diagnose** of medische claim; de output is bedoeld als **observatie**.
-                 - In een zorgcontext hoort interpretatie altijd samen te gaan met **context + gesprek + klinisch oordeel**.
-                 """
-             )
-
-     return demo
-
-
- if __name__ == "__main__":
-     demo = build_demo()
-     demo.queue(max_size=32)
-     demo.launch()
- ```
  import os
  import math
  import numpy as np
  …
  import torch
  from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

+ # =========================================================
  # Configuration
+ # =========================================================
  TARGET_SR = 16000
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")

+ # =========================================================
+ # Utility helpers
+ # =========================================================
+ def human_seconds(sec: float) -> str:
      if not math.isfinite(sec):
          return "—"
      if sec < 60:
          return f"{sec:.1f}s"
      m = int(sec // 60)
+     return f"{m}m {sec - 60*m:.1f}s"


+ def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+     denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-9
      return float(np.dot(a, b) / denom)


+ # =========================================================
+ # Model loading (cached)
+ # =========================================================
  @lru_cache(maxsize=1)
+ def load_wav2vec():
      extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
      model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
      model.eval()
  …


  def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
      if sr != TARGET_SR:
+         y = librosa.resample(y, sr, TARGET_SR)

      if y.size == 0:
+         return np.zeros(768, dtype=np.float32)

      y = y.astype(np.float32)
+     y /= np.max(np.abs(y)) + 1e-9
+
+     extractor, model = load_wav2vec()
+     inputs = extractor(y, sampling_rate=TARGET_SR, return_tensors="pt")

      with torch.no_grad():
+         out = model(inputs["input_values"].to(DEVICE))
+         emb = out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+
      return emb.astype(np.float32)


+ # =========================================================
  # Feature extraction
+ # =========================================================
  @dataclass
  class Features:
      duration_s: float
      rms_mean: float
      rms_std: float
+     pitch_median: float
+     pitch_iqr: float
      n_pauses: int
      pause_total_s: float
      active_ratio: float


  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
      if sr != TARGET_SR:
+         y = librosa.resample(y, sr, TARGET_SR)
          sr = TARGET_SR

+     duration = len(y) / sr
+     hop = 160
+     frame = 400

      rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
+     rms_mean = float(np.mean(rms))
+     rms_std = float(np.std(rms))

      try:
+         f0, _, _ = librosa.pyin(
              y,
              fmin=librosa.note_to_hz("C2"),
              fmax=librosa.note_to_hz("C7"),
  …
      except Exception:
          f0 = None

+     if f0 is not None and np.any(np.isfinite(f0)):
+         voiced = f0[np.isfinite(f0)]
+         pitch_median = float(np.median(voiced))
+         pitch_iqr = float(np.percentile(voiced, 75) - np.percentile(voiced, 25))
+     else:
          pitch_median = np.nan
          pitch_iqr = np.nan
+
+     silence = rms < np.percentile(rms, 20)
+     min_pause_frames = int(0.2 / (hop / sr))
+
+     pauses = []
+     start = None
+     for i, s in enumerate(silence):
+         if s and start is None:
+             start = i
+         if not s and start is not None:
+             if i - start >= min_pause_frames:
+                 pauses.append((start, i))
+             start = None
+
+     pause_total = sum((e - s) * hop / sr for s, e in pauses)
+     active_ratio = 1.0 - float(np.mean(silence))

      feats = Features(
          duration_s=duration,
          rms_mean=rms_mean,
          rms_std=rms_std,
+         pitch_median=pitch_median,
+         pitch_iqr=pitch_iqr,
+         n_pauses=len(pauses),
+         pause_total_s=pause_total,
          active_ratio=active_ratio,
      )

      artifacts = {
          "y": y,
          "sr": sr,
          "rms": rms,
+         "pitch": f0,
          "pauses": pauses,
+         "hop": hop,
      }
+
      return feats, artifacts


+ # =========================================================
  # Plotting
+ # =========================================================
+ def plot_waveform(artifacts: Dict[str, Any]):
      y = artifacts["y"]
      sr = artifacts["sr"]
+     pauses = artifacts["pauses"]
+     hop = artifacts["hop"]

+     fig = plt.figure(figsize=(10, 3))
      ax = fig.add_subplot(111)

+     t = np.arange(len(y)) / sr
+     ax.plot(t, y, lw=0.8)

+     for s, e in pauses:
+         ax.axvspan(s * hop / sr, e * hop / sr, alpha=0.2)

+     ax.set_title("Waveform met pauzes")
+     ax.set_xlabel("Tijd (s)")
+     ax.set_ylabel("Amplitude")
      fig.tight_layout()
      return fig


+ # =========================================================
+ # UI callbacks
+ # =========================================================
+ def analyze_single(audio):
      if audio is None:
+         return [], None, "Upload of neem audio op."
+
      sr, y = audio
      feats, art = compute_features(y, sr)

+     table = [
+         ["Duur", human_seconds(feats.duration_s)],
+         ["Gemiddeld volume (RMS)", f"{feats.rms_mean:.3f}"],
+         ["Volume-variatie", f"{feats.rms_std:.3f}"],
+         ["Pitch mediaan", "—" if not math.isfinite(feats.pitch_median) else f"{feats.pitch_median:.1f} Hz"],
+         ["Pitch spreiding (IQR)", "—" if not math.isfinite(feats.pitch_iqr) else f"{feats.pitch_iqr:.1f} Hz"],
+         ["Aantal pauzes ≥0.2s", str(feats.n_pauses)],
+         ["Totale pauzeduur", human_seconds(feats.pause_total_s)],
+         ["Actieve spraakratio", f"{feats.active_ratio*100:.1f}%"],
+     ]

+     fig = plot_waveform(art)
+     explanation = (
+         "### Wat laat dit zien?\n"
+         "- Dit zijn **meetbare spraaksignalen** (pauzes, pitch, volume).\n"
+         "- Er wordt **geen diagnose** gesteld.\n"
+         "- Interpretatie hoort altijd samen met context en gesprek."
+     )

+     return table, fig, explanation


+ # =========================================================
+ # UI
+ # =========================================================
+ with gr.Blocks(title="Explainable Speech Analytics") as demo:
+     gr.Markdown(
+         "## Explainable Speech Analytics\n"
+         "*Educatieve demo – geen medisch hulpmiddel*"
+     )

+     with gr.Row():
+         audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audiofragment")
+         run = gr.Button("Analyseer", variant="primary")

+     table = gr.Dataframe(headers=["Kenmerk", "Waarde"], interactive=False)
+     plot = gr.Plot()
+     explanation = gr.Markdown()

+     run.click(analyze_single, inputs=audio, outputs=[table, plot, explanation])

+ demo.launch()
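A note for anyone running this version locally: both `embed_audio` and `compute_features` call `librosa.resample(y, sr, TARGET_SR)` with positional arguments. Newer librosa releases (0.10 and later) make `orig_sr` and `target_sr` keyword-only, so that call raises a `TypeError` there. The keyword form below is a suggested tweak, not part of this commit, and works on both older and newer releases.

```python
# Version-robust resample call; orig_sr/target_sr are keyword-only in librosa >= 0.10.
y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
```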
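For a quick sanity check of the simplified pipeline, a minimal sketch is shown below. It assumes `compute_features`, `embed_audio` and `cosine_similarity` from the new app.py have been copied into the current session (importing app.py directly would start the Gradio UI, since `demo.launch()` now runs at module level), and it uses a synthetic clip rather than real speech.

```python
import numpy as np

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)

# Two tone bursts separated by 0.5 s of silence, so the pause detector has something to find.
clip = np.concatenate([tone, np.zeros(sr // 2, dtype=np.float32), tone])

feats, art = compute_features(clip, sr)
print("duration_s:", feats.duration_s, "n_pauses:", feats.n_pauses, "pause_total_s:", feats.pause_total_s)

# Embedding comparison (downloads facebook/wav2vec2-base-960h on first use).
e1 = embed_audio(clip, sr)
e2 = embed_audio(tone, sr)
print(f"cosine similarity: {cosine_similarity(e1, e2):.3f}")
```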