Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+```python
 import os
 import math
 import numpy as np
@@ -6,7 +7,8 @@ import librosa
 import matplotlib.pyplot as plt
 
 from dataclasses import dataclass
-from typing import Dict, Any, Tuple,
+from typing import Dict, Any, Tuple, List
+from functools import lru_cache
 
 import torch
 from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
@@ -21,51 +23,43 @@ MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")
 # -----------------------------
 # Lightweight explainability helpers
 # -----------------------------
-def _safe_float(x, default=np.nan):
-    try:
-        if x is None:
-            return default
-        x = float(x)
-        if math.isfinite(x):
-            return x
-        return default
-    except Exception:
-        return default
-
 def _human_seconds(sec: float) -> str:
     if not math.isfinite(sec):
         return "—"
     if sec < 60:
         return f"{sec:.1f}s"
     m = int(sec // 60)
-    s = sec - 60*m
+    s = sec - 60 * m
     return f"{m}m {s:.1f}s"
 
+
 def _cosine(a: np.ndarray, b: np.ndarray) -> float:
     a = np.asarray(a, dtype=np.float32)
     b = np.asarray(b, dtype=np.float32)
     denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-12
     return float(np.dot(a, b) / denom)
 
+
 # -----------------------------
 # Model (audio embedding)
 # -----------------------------
-@
+@lru_cache(maxsize=1)
 def load_w2v():
     extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
     model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
     model.eval()
     return extractor, model
 
+
 def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
     extractor, model = load_w2v()
     if sr != TARGET_SR:
         y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
         sr = TARGET_SR
 
-    # Normalize to [-1, 1]
     if y.size == 0:
         return np.zeros((768,), dtype=np.float32)
+
     y = y.astype(np.float32)
     mx = float(np.max(np.abs(y))) + 1e-9
     y = y / mx
@@ -74,10 +68,10 @@ def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
     with torch.no_grad():
         input_values = inputs["input_values"].to(DEVICE)
         out = model(input_values)
-    # Mean pooling over time
     emb = out.last_hidden_state.mean(dim=1).squeeze(0).detach().cpu().numpy()
     return emb.astype(np.float32)
 
+
 # -----------------------------
 # Feature extraction
 # -----------------------------
@@ -94,6 +88,7 @@ class Features:
     pause_total_s: float
     active_ratio: float
 
+
 def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
     """Return features + artifacts for plots/inspection."""
     if y is None or len(y) == 0:
@@ -105,22 +100,19 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
         sr = TARGET_SR
 
     y = y.astype(np.float32)
-    # Trim leading/trailing silence slightly for stability, but keep for pause detection
     duration = float(len(y) / sr)
 
-    #
-
-    frame = 400 # 25 ms at 16k
+    hop = 160  # 10 ms at 16k
+    frame = 400  # 25 ms at 16k
 
     rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
     zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]
 
     rms_mean = float(np.mean(rms)) if rms.size else np.nan
-    rms_std
+    rms_std = float(np.std(rms)) if rms.size else np.nan
     zcr_mean = float(np.mean(zcr)) if zcr.size else np.nan
 
-    # Pitch using probabilistic YIN (pyin)
-    # f0 contains NaN for unvoiced frames.
+    # Pitch using probabilistic YIN (pyin)
     try:
         f0, voiced_flag, voiced_probs = librosa.pyin(
             y,
@@ -132,7 +124,6 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
         )
     except Exception:
         f0 = None
-        voiced_flag = None
 
     if f0 is None:
         pitch_median = np.nan
@@ -155,13 +146,11 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
         pitch_iqr = np.nan
 
     # Pause detection using RMS threshold (relative)
-    # Convert rms frames -> boolean "silent"
     if rms.size:
-        thr = float(np.percentile(rms, 20)) * 0.8
+        thr = float(np.percentile(rms, 20)) * 0.8
         silent = rms < thr
-
-        min_pause_frames = int(0.2 / (hop / sr))
-        # Run-length encoding
+
+        min_pause_frames = int(0.2 / (hop / sr))  # pauses >= 0.2s
         pauses = []
         start = None
         for i, s in enumerate(silent):
@@ -185,6 +174,7 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
         n_pauses = 0
         pause_total_s = 0.0
         active_ratio = np.nan
+        thr = None
 
     feats = Features(
         duration_s=duration,
@@ -209,10 +199,11 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
         "pitch": pitch,
         "times": times,
         "pauses": pauses,
-        "rms_thr": thr
+        "rms_thr": thr,
     }
     return feats, artifacts
 
+
 # -----------------------------
 # Plotting
 # -----------------------------
@@ -224,6 +215,7 @@ def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
 
     fig = plt.figure(figsize=(10, 3.2))
     ax = fig.add_subplot(111)
+
     if y.size:
         t = np.arange(len(y)) / sr
         ax.plot(t, y, linewidth=0.8)
@@ -232,7 +224,6 @@ def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
     ax.set_ylabel("Amplitude")
     ax.set_title("Waveform (met gedetecteerde pauzes)")
 
-    # Overlay pause regions (convert pause frames to time)
     for (s, e) in pauses:
         ts = s * (hop / sr)
         te = e * (hop / sr)
|
@@ -244,12 +235,14 @@ def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
|
|
| 244 |
fig.tight_layout()
|
| 245 |
return fig
|
| 246 |
|
|
|
|
| 247 |
def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
|
| 248 |
pitch = artifacts.get("pitch", np.array([]))
|
| 249 |
times = artifacts.get("times", np.array([]))
|
| 250 |
|
| 251 |
fig = plt.figure(figsize=(10, 3.2))
|
| 252 |
ax = fig.add_subplot(111)
|
|
|
|
| 253 |
if pitch.size and times.size:
|
| 254 |
ax.plot(times, pitch, linewidth=1.0)
|
| 255 |
ax.set_xlabel("Tijd (s)")
|
|
@@ -262,43 +255,46 @@ def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
     fig.tight_layout()
     return fig
 
+
 # -----------------------------
 # UI helpers
 # -----------------------------
 def format_features_table(feats: Features) -> List[List[str]]:
-    def
-    if x is None or (isinstance(x, float) and
+    def fmt_float(x):
+        if x is None or (isinstance(x, float) and not math.isfinite(x)):
             return "—"
-        if kind == "sec":
-            return _human_seconds(float(x))
-        if kind == "int":
-            return str(int(x))
         return f"{float(x):.3f}"
 
+    def fmt_int(x):
+        if x is None:
+            return "—"
+        return str(int(x))
+
     return [
-        ["Duur",
-        ["Volume (RMS) gemiddeld",
-        ["Volume (RMS) variatie",
-        ["ZCR (ruis/‘scherpte’) gemiddeld",
-        ["Pitch mediaan",
-        ["Pitch spreiding (IQR)",
-        ["Voiced ratio",
-        ["Aantal pauzes (≥ 0.2s)",
-        ["Totale pauzeduur",
-        ["Actieve-spraak ratio",
+        ["Duur", _human_seconds(feats.duration_s)],
+        ["Volume (RMS) gemiddeld", fmt_float(feats.rms_mean)],
+        ["Volume (RMS) variatie", fmt_float(feats.rms_std)],
+        ["ZCR (ruis/‘scherpte’) gemiddeld", fmt_float(feats.zcr_mean)],
+        ["Pitch mediaan", "—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz"],
+        ["Pitch spreiding (IQR)", "—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz"],
+        ["Voiced ratio", "—" if not math.isfinite(feats.voiced_ratio) else f"{feats.voiced_ratio*100:.1f}%"],
+        ["Aantal pauzes (≥ 0.2s)", fmt_int(feats.n_pauses)],
+        ["Totale pauzeduur", _human_seconds(feats.pause_total_s)],
+        ["Actieve-spraak ratio", "—" if not math.isfinite(feats.active_ratio) else f"{feats.active_ratio*100:.1f}%"],
     ]
 
+
 def explain_panel(feats: Features) -> str:
-    # Human-friendly explanation without medical conclusions.
     bullets = []
     if math.isfinite(feats.pause_total_s):
-        bullets.append(f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), samen {
+        bullets.append(f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), samen {_human_seconds(feats.pause_total_s)}.")
     if math.isfinite(feats.pitch_median_hz):
         bullets.append(f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding (IQR) {feats.pitch_iqr_hz:.1f} Hz.")
     if math.isfinite(feats.rms_mean):
         bullets.append(f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; alleen vergelijken binnen dezelfde setup).")
     if math.isfinite(feats.active_ratio):
         bullets.append(f"- **Actieve spraak**: ~ {feats.active_ratio*100:.1f}% van de tijd boven drempel.")
+
     if not bullets:
         bullets = ["- Geen features beschikbaar (audio te kort of leeg)."]
 
@@ -311,6 +307,7 @@ def explain_panel(feats: Features) -> str:
         "Gebruik dit als gespreksstarter of educatieve visualisatie."
     )
 
+
 # -----------------------------
 # Core callbacks
 # -----------------------------
@@ -325,6 +322,7 @@ def analyze_single(audio: Tuple[int, np.ndarray]):
     expl = explain_panel(feats)
     return gr.Dataframe(value=table, headers=["Kenmerk", "Waarde"]), wf, pc, expl
 
+
 def analyze_compare(a1, a2):
     if a1 is None or a2 is None:
         return "—", gr.Dataframe(value=[["—", "Selecteer twee fragmenten."]]), None
@@ -339,8 +337,7 @@ def analyze_compare(a1, a2):
     e2 = embed_audio(art2["y"], art2["sr"])
     sim = _cosine(e1, e2)
 
-
-    def d(a, b):
+    def delta(a, b):
         if (a is None) or (b is None):
             return "—"
         if (isinstance(a, float) and not math.isfinite(a)) or (isinstance(b, float) and not math.isfinite(b)):
@@ -348,35 +345,29 @@ def analyze_compare(a1, a2):
         return f"{(b - a):+.3f}"
 
     rows = [
-        ["Duur (s)", f1.duration_s
-        ["RMS mean", f1.rms_mean, f2.rms_mean,
-        ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz,
+        ["Duur (s)", f1.duration_s, f2.duration_s, delta(f1.duration_s, f2.duration_s)],
+        ["RMS mean", f1.rms_mean, f2.rms_mean, delta(f1.rms_mean, f2.rms_mean)],
+        ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz, delta(f1.pitch_median_hz, f2.pitch_median_hz)],
         ["Pauzes (#)", float(f1.n_pauses), float(f2.n_pauses), f"{(f2.n_pauses - f1.n_pauses):+d}"],
-        ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s,
-        ["Actieve ratio", f1.active_ratio, f2.active_ratio,
+        ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s, delta(f1.pause_total_s, f2.pause_total_s)],
+        ["Actieve ratio", f1.active_ratio, f2.active_ratio, delta(f1.active_ratio, f2.active_ratio)],
     ]
 
-    # Format values nicely
     formatted = []
     for k, v1, v2, dv in rows:
-        def
+        def fmt(v):
             if isinstance(v, float) and math.isfinite(v):
                 if "ratio" in k.lower():
                     return f"{v*100:.1f}%"
                 if "pitch" in k.lower():
                     return f"{v:.1f}"
-                if "duur" in k.lower() or "s)" in k.lower() or "(s)" in k.lower() or "RMS" in k:
-                    return f"{v:.3f}"
                 return f"{v:.3f}"
-            if isinstance(v, (int, np.integer)):
-                return str(int(v))
             return "—"
-        formatted.append([k,
+        formatted.append([k, fmt(v1), fmt(v2), dv])
 
-    # Compare waveform overlay
     fig = plt.figure(figsize=(10, 3.2))
     ax = fig.add_subplot(111)
-
+
     def prep_plot(y, sr):
         if sr != TARGET_SR:
             y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
@@ -388,18 +379,20 @@ def analyze_compare(a1, a2):
 
     t1, yy1 = prep_plot(y1, sr1)
     t2, yy2 = prep_plot(y2, sr2)
+
     if yy1.size:
         ax.plot(t1, yy1, linewidth=0.8, label="Fragment A")
     if yy2.size:
         ax.plot(t2, yy2, linewidth=0.8, label="Fragment B", alpha=0.8)
+
     ax.set_title("Waveform overlay (eerste max 20s)")
     ax.set_xlabel("Tijd (s)")
     ax.set_ylabel("Amplitude")
     ax.legend(loc="upper right")
     fig.tight_layout()
 
-
-
+    return f"{sim*100:.1f}%", gr.Dataframe(value=formatted, headers=["Kenmerk", "A", "B", "Δ (B−A)"]), fig
+
 
 # -----------------------------
 # UI
@@ -408,11 +401,8 @@ CSS = """
 :root{
   --bg: #0b0f19;
   --panel: rgba(255,255,255,0.06);
-  --panel2: rgba(255,255,255,0.09);
   --text: rgba(255,255,255,0.92);
   --muted: rgba(255,255,255,0.70);
-  --accent: #7c3aed;
-  --accent2: #22c55e;
   --border: rgba(255,255,255,0.14);
   --shadow: 0 10px 30px rgba(0,0,0,0.35);
 }
@@ -447,14 +437,6 @@ CSS = """
   line-height: 1.45;
 }
 
-.card{
-  background: var(--panel);
-  border: 1px solid var(--border);
-  border-radius: 18px;
-  padding: 14px;
-  box-shadow: var(--shadow);
-}
-
 .badge{
   display: inline-flex;
   align-items: center;
@@ -523,19 +505,16 @@ def build_demo():
                     """
                 )
             with gr.Column(scale=7):
-
-
-
-
-
-
-
-
-
-
-                with gr.Row():
-                    pitch_plot = gr.Plot(label="Pitch")
-                explanation = gr.Markdown("### Upload of neem audio op", elem_classes=["card"])
+                feat_df = gr.Dataframe(
+                    headers=["Kenmerk", "Waarde"],
+                    datatype=["str", "str"],
+                    interactive=False,
+                    wrap=True,
+                    label="Meetbare kenmerken",
+                )
+                wf_plot = gr.Plot(label="Waveform + pauzes")
+                pitch_plot = gr.Plot(label="Pitch")
+                explanation = gr.Markdown("### Upload of neem audio op", elem_id="explain-card")
 
         run_btn.click(analyze_single, inputs=[input_audio], outputs=[feat_df, wf_plot, pitch_plot, explanation])
 
@@ -558,7 +537,7 @@ def build_demo():
                 datatype=["str", "str", "str", "str"],
                 interactive=False,
                 wrap=True,
-                label="Verschillen (uitlegbaar)"
+                label="Verschillen (uitlegbaar)",
             )
             overlay_plot = gr.Plot(label="Waveform overlay")
 
@@ -572,15 +551,14 @@ def build_demo():
             - We tonen **verschillen** tussen fragmenten, i.p.v. één eindlabel.
             - We geven **geen diagnose** of medische claim; de output is bedoeld als **observatie**.
             - In een zorgcontext hoort interpretatie altijd samen te gaan met **context + gesprek + klinisch oordeel**.
-
-            **Let op:** als je dit ooit richting praktijk wilt brengen, heb je o.a. nodig:
-            governance, dataminimalisatie, DPIA/AVG, bias-audit, modelmonitoring, en duidelijke ‘human-in-the-loop’ afspraken.
             """
         )
 
     return demo
 
+
 if __name__ == "__main__":
     demo = build_demo()
     demo.queue(max_size=32)
     demo.launch()
+```
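
For a quick local check of the repaired feature path, here is a minimal smoke-test sketch. It assumes the new file is saved as app.py with the markdown fence lines on its first and last lines stripped (otherwise the module will not import), that TARGET_SR is 16000 as the inline comments suggest, and that numpy, librosa, torch and transformers are installed; the synthetic tone and the module name `app` are illustrative only.

```python
# Hypothetical smoke test for compute_features / format_features_table from the diff above.
# Assumes the markdown fence lines are removed from app.py and TARGET_SR == 16000.
import numpy as np

import app  # the Space's app.py, importable only once the fence lines are removed

sr = 16000
t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
y = (0.1 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)  # 2 s, 220 Hz test tone

feats, artifacts = app.compute_features(y, sr)
for name, value in app.format_features_table(feats):
    print(f"{name}: {value}")
```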
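
A similar sketch exercises the embedding-comparison path behind the new `delta` and `_cosine` code, under the same assumptions as above. The first `embed_audio` call downloads facebook/wav2vec2-base-960h (or whatever W2V_MODEL_ID points at), and `@lru_cache(maxsize=1)` keeps the loaded extractor/model pair around for later calls.

```python
# Hypothetical check of the embedding path: two short tones compared with _cosine.
# The first call loads the wav2vec2 model once; @lru_cache(maxsize=1) reuses it afterwards.
import numpy as np

import app  # same assumption as the previous sketch: fence lines removed from app.py

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
a = (0.1 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)  # 1 s, 220 Hz
b = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)  # 1 s, 440 Hz

e1 = app.embed_audio(a, sr)
e2 = app.embed_audio(b, sr)
print(f"cosine similarity: {app._cosine(e1, e2):.3f}")  # shown as a percentage in the UI
```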