Spaces:

Marcel0123
/

Explainable-Speech-Analytics

Sleeping

App Files Files Community

Marcel0123 committed on 25 days ago

Commit

68f57e5

verified ·

1 Parent(s): 58cbc82

Upload 2 files

Browse files

Files changed (2) hide show

app .py +586 -0
requirements.txt.txt +8 -0

app .py ADDED Viewed

	@@ -0,0 +1,586 @@

+import functools
+import math
+import os
+from dataclasses import dataclass
+from typing import Dict, Any, Tuple, Optional, List
+
+import gradio as gr
+import librosa
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
+# -----------------------------
+# Configuration
+# -----------------------------
+TARGET_SR = 16000
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")
+# -----------------------------
+# Lightweight explainability helpers
+# -----------------------------
+def _safe_float(x, default=np.nan):
+    try:
+        if x is None:
+            return default
+        x = float(x)
+        if math.isfinite(x):
+            return x
+        return default
+    except Exception:
+        return default
+def _human_seconds(sec: float) -> str:
+    if not math.isfinite(sec):
+        return "—"
+    if sec < 60:
+        return f"{sec:.1f}s"
+    m = int(sec // 60)
+    s = sec - 60*m
+    return f"{m}m {s:.1f}s"
+def _cosine(a: np.ndarray, b: np.ndarray) -> float:
+    a = np.asarray(a, dtype=np.float32)
+    b = np.asarray(b, dtype=np.float32)
+    denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-12
+    return float(np.dot(a, b) / denom)
+# -----------------------------
+# Model (audio embedding)
+# -----------------------------
+@gr.cache()
+def load_w2v():
+    extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
+    model.eval()
+    return extractor, model
+def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
+    extractor, model = load_w2v()
+    if sr != TARGET_SR:
+        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
+        sr = TARGET_SR
+    # Normalize to [-1, 1]
+    if y.size == 0:
+        return np.zeros((768,), dtype=np.float32)
+    y = y.astype(np.float32)
+    mx = float(np.max(np.abs(y))) + 1e-9
+    y = y / mx
+    inputs = extractor(y, sampling_rate=sr, return_tensors="pt")
+    with torch.no_grad():
+        input_values = inputs["input_values"].to(DEVICE)
+        out = model(input_values)
+        # Mean pooling over time
+        emb = out.last_hidden_state.mean(dim=1).squeeze(0).detach().cpu().numpy()
+    return emb.astype(np.float32)
+# -----------------------------
+# Feature extraction
+# -----------------------------
+@dataclass
+class Features:
+    """Scalar acoustic measurements for one audio fragment (filled in by ``compute_features``)."""
+    # NOTE: field order matters; ``compute_features`` also builds this positionally.
+    duration_s: float  # fragment length in seconds (after resampling)
+    rms_mean: float  # mean of the frame-level RMS energy
+    rms_std: float  # standard deviation of the frame-level RMS energy
+    zcr_mean: float  # mean frame-level zero-crossing rate
+    pitch_median_hz: float  # median f0 over voiced frames, NaN if none
+    pitch_iqr_hz: float  # 75th minus 25th percentile of f0 over voiced frames
+    voiced_ratio: float  # fraction of pitch frames with a finite f0
+    n_pauses: int  # number of silent runs of at least 0.2 s
+    pause_total_s: float  # summed duration of those runs, in seconds
+    active_ratio: float  # 1 minus the fraction of frames below the silence threshold
+def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
+    """Return features + artifacts for plots/inspection."""
+    if y is None or len(y) == 0:
+        f = Features(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0, 0.0, np.nan)
+        return f, {"y": np.array([]), "sr": sr, "times": np.array([]), "pitch": np.array([])}
+    if sr != TARGET_SR:
+        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
+        sr = TARGET_SR
+    y = y.astype(np.float32)
+    # Trim leading/trailing silence slightly for stability, but keep for pause detection
+    duration = float(len(y) / sr)
+    # Frame-level features
+    hop = 160  # 10 ms at 16k
+    frame = 400  # 25 ms at 16k
+    rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
+    zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]
+    rms_mean = float(np.mean(rms)) if rms.size else np.nan
+    rms_std  = float(np.std(rms)) if rms.size else np.nan
+    zcr_mean = float(np.mean(zcr)) if zcr.size else np.nan
+    # Pitch using probabilistic YIN (pyin). Can be slow, but OK for short clips.
+    # f0 contains NaN for unvoiced frames.
+    try:
+        f0, voiced_flag, voiced_probs = librosa.pyin(
+            y,
+            fmin=librosa.note_to_hz("C2"),
+            fmax=librosa.note_to_hz("C7"),
+            sr=sr,
+            frame_length=frame,
+            hop_length=hop,
+        )
+    except Exception:
+        f0 = None
+        voiced_flag = None
+    if f0 is None:
+        pitch_median = np.nan
+        pitch_iqr = np.nan
+        voiced_ratio = np.nan
+        pitch = np.array([])
+        times = np.array([])
+    else:
+        pitch = np.asarray(f0, dtype=np.float32)
+        times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
+        voiced = np.isfinite(pitch)
+        voiced_ratio = float(np.mean(voiced)) if voiced.size else np.nan
+        if np.any(voiced):
+            pv = pitch[voiced]
+            pitch_median = float(np.median(pv))
+            q75, q25 = np.percentile(pv, [75, 25])
+            pitch_iqr = float(q75 - q25)
+        else:
+            pitch_median = np.nan
+            pitch_iqr = np.nan
+    # Pause detection using RMS threshold (relative)
+    # Convert rms frames -> boolean "silent"
+    if rms.size:
+        thr = float(np.percentile(rms, 20)) * 0.8  # conservative
+        silent = rms < thr
+        # Count pauses longer than 0.2s
+        min_pause_frames = int(0.2 / (hop / sr))
+        # Run-length encoding
+        pauses = []
+        start = None
+        for i, s in enumerate(silent):
+            if s and start is None:
+                start = i
+            if (not s) and start is not None:
+                end = i
+                if (end - start) >= min_pause_frames:
+                    pauses.append((start, end))
+                start = None
+        if start is not None:
+            end = len(silent)
+            if (end - start) >= min_pause_frames:
+                pauses.append((start, end))
+        n_pauses = int(len(pauses))
+        pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
+        active_ratio = float(1.0 - (np.mean(silent) if silent.size else 0.0))
+    else:
+        pauses = []
+        n_pauses = 0
+        pause_total_s = 0.0
+        active_ratio = np.nan
+    feats = Features(
+        duration_s=duration,
+        rms_mean=rms_mean,
+        rms_std=rms_std,
+        zcr_mean=zcr_mean,
+        pitch_median_hz=pitch_median,
+        pitch_iqr_hz=pitch_iqr,
+        voiced_ratio=voiced_ratio,
+        n_pauses=n_pauses,
+        pause_total_s=pause_total_s,
+        active_ratio=active_ratio,
+    )
+    artifacts = {
+        "y": y,
+        "sr": sr,
+        "hop": hop,
+        "frame": frame,
+        "rms": rms,
+        "zcr": zcr,
+        "pitch": pitch,
+        "times": times,
+        "pauses": pauses,
+        "rms_thr": thr if rms.size else None,
+    }
+    return feats, artifacts
+# -----------------------------
+# Plotting
+# -----------------------------
+def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
+    y = artifacts["y"]
+    sr = artifacts["sr"]
+    pauses = artifacts.get("pauses", [])
+    hop = artifacts.get("hop", 160)
+    fig = plt.figure(figsize=(10, 3.2))
+    ax = fig.add_subplot(111)
+    if y.size:
+        t = np.arange(len(y)) / sr
+        ax.plot(t, y, linewidth=0.8)
+        ax.set_xlim(0, t[-1] if t.size else 1)
+        ax.set_xlabel("Tijd (s)")
+        ax.set_ylabel("Amplitude")
+        ax.set_title("Waveform (met gedetecteerde pauzes)")
+        # Overlay pause regions (convert pause frames to time)
+        for (s, e) in pauses:
+            ts = s * (hop / sr)
+            te = e * (hop / sr)
+            ax.axvspan(ts, te, alpha=0.2)
+    else:
+        ax.text(0.5, 0.5, "Geen audio", ha="center", va="center")
+        ax.set_axis_off()
+    fig.tight_layout()
+    return fig
+def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
+    pitch = artifacts.get("pitch", np.array([]))
+    times = artifacts.get("times", np.array([]))
+    fig = plt.figure(figsize=(10, 3.2))
+    ax = fig.add_subplot(111)
+    if pitch.size and times.size:
+        ax.plot(times, pitch, linewidth=1.0)
+        ax.set_xlabel("Tijd (s)")
+        ax.set_ylabel("Pitch (Hz)")
+        ax.set_title("Pitch contour (NaN = onvoiced)")
+    else:
+        ax.text(0.5, 0.5, "Pitch niet beschikbaar (te kort / te veel ruis)", ha="center", va="center")
+        ax.set_axis_off()
+    fig.tight_layout()
+    return fig
+# -----------------------------
+# UI helpers
+# -----------------------------
+def format_features_table(feats: Features) -> List[List[str]]:
+    def fmt(x, kind="float"):
+        if x is None or (isinstance(x, float) and (not math.isfinite(x))):
+            return "—"
+        if kind == "sec":
+            return _human_seconds(float(x))
+        if kind == "int":
+            return str(int(x))
+        return f"{float(x):.3f}"
+    return [
+        ["Duur", fmt(feats.duration_s, "sec")],
+        ["Volume (RMS) gemiddeld", fmt(feats.rms_mean)],
+        ["Volume (RMS) variatie", fmt(feats.rms_std)],
+        ["ZCR (ruis/‘scherpte’) gemiddeld", fmt(feats.zcr_mean)],
+        ["Pitch mediaan", ("—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz")],
+        ["Pitch spreiding (IQR)", ("—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz")],
+        ["Voiced ratio", ("—" if not math.isfinite(feats.voiced_ratio) else f"{feats.voiced_ratio*100:.1f}%")],
+        ["Aantal pauzes (≥ 0.2s)", fmt(feats.n_pauses, "int")],
+        ["Totale pauzeduur", fmt(feats.pause_total_s, "sec")],
+        ["Actieve-spraak ratio", ("—" if not math.isfinite(feats.active_ratio) else f"{feats.active_ratio*100:.1f}%")],
+    ]
+def explain_panel(feats: Features) -> str:
+    # Human-friendly explanation without medical conclusions.
+    bullets = []
+    if math.isfinite(feats.pause_total_s):
+        bullets.append(f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), samen { _human_seconds(feats.pause_total_s) }.")
+    if math.isfinite(feats.pitch_median_hz):
+        bullets.append(f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding (IQR) {feats.pitch_iqr_hz:.1f} Hz.")
+    if math.isfinite(feats.rms_mean):
+        bullets.append(f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; alleen vergelijken binnen dezelfde setup).")
+    if math.isfinite(feats.active_ratio):
+        bullets.append(f"- **Actieve spraak**: ~ {feats.active_ratio*100:.1f}% van de tijd boven drempel.")
+    if not bullets:
+        bullets = ["- Geen features beschikbaar (audio te kort of leeg)."]
+    return (
+        "### Wat ‘ziet’ de AI hier?\n"
+        "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* en hoe die veranderen tussen fragmenten.\n\n"
+        + "\n".join(bullets)
+        + "\n\n"
+        "**Belangrijk:** dit systeem geeft **geen diagnose** en is **geen medisch hulpmiddel**. "
+        "Gebruik dit als gespreksstarter of educatieve visualisatie."
+    )
+# -----------------------------
+# Core callbacks
+# -----------------------------
+def analyze_single(audio: Tuple[int, np.ndarray]):
+    if audio is None:
+        return gr.Dataframe(value=[["—", "Upload of neem audio op om te starten."]]), None, None, "### Upload of neem audio op"
+    sr, y = audio
+    feats, art = compute_features(y, sr)
+    table = format_features_table(feats)
+    wf = plot_waveform_with_pauses(art)
+    pc = plot_pitch(art)
+    expl = explain_panel(feats)
+    return gr.Dataframe(value=table, headers=["Kenmerk", "Waarde"]), wf, pc, expl
+def analyze_compare(a1, a2):
+    if a1 is None or a2 is None:
+        return "—", gr.Dataframe(value=[["—", "Selecteer twee fragmenten."]]), None
+    sr1, y1 = a1
+    sr2, y2 = a2
+    f1, art1 = compute_features(y1, sr1)
+    f2, art2 = compute_features(y2, sr2)
+    e1 = embed_audio(art1["y"], art1["sr"])
+    e2 = embed_audio(art2["y"], art2["sr"])
+    sim = _cosine(e1, e2)
+    # Delta table
+    def d(a, b):
+        if (a is None) or (b is None):
+            return "—"
+        if (isinstance(a, float) and not math.isfinite(a)) or (isinstance(b, float) and not math.isfinite(b)):
+            return "—"
+        return f"{(b - a):+.3f}"
+    rows = [
+        ["Duur (s)", f1.duration_s if math.isfinite(f1.duration_s) else np.nan, f2.duration_s if math.isfinite(f2.duration_s) else np.nan, d(f1.duration_s, f2.duration_s)],
+        ["RMS mean", f1.rms_mean, f2.rms_mean, d(f1.rms_mean, f2.rms_mean)],
+        ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz, d(f1.pitch_median_hz, f2.pitch_median_hz)],
+        ["Pauzes (#)", float(f1.n_pauses), float(f2.n_pauses), f"{(f2.n_pauses - f1.n_pauses):+d}"],
+        ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s, d(f1.pause_total_s, f2.pause_total_s)],
+        ["Actieve ratio", f1.active_ratio, f2.active_ratio, d(f1.active_ratio, f2.active_ratio)],
+    ]
+    # Format values nicely
+    formatted = []
+    for k, v1, v2, dv in rows:
+        def fmtv(v):
+            if isinstance(v, float) and math.isfinite(v):
+                if "ratio" in k.lower():
+                    return f"{v*100:.1f}%"
+                if "pitch" in k.lower():
+                    return f"{v:.1f}"
+                if "duur" in k.lower() or "s)" in k.lower() or "(s)" in k.lower() or "RMS" in k:
+                    return f"{v:.3f}"
+                return f"{v:.3f}"
+            if isinstance(v, (int, np.integer)):
+                return str(int(v))
+            return "—"
+        formatted.append([k, fmtv(v1), fmtv(v2), dv])
+    # Compare waveform overlay
+    fig = plt.figure(figsize=(10, 3.2))
+    ax = fig.add_subplot(111)
+    # downsample for plotting speed
+    def prep_plot(y, sr):
+        if sr != TARGET_SR:
+            y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
+            sr = TARGET_SR
+        if y.size > sr * 20:
+            y = y[: sr * 20]
+        t = np.arange(len(y)) / sr
+        return t, y
+    t1, yy1 = prep_plot(y1, sr1)
+    t2, yy2 = prep_plot(y2, sr2)
+    if yy1.size:
+        ax.plot(t1, yy1, linewidth=0.8, label="Fragment A")
+    if yy2.size:
+        ax.plot(t2, yy2, linewidth=0.8, label="Fragment B", alpha=0.8)
+    ax.set_title("Waveform overlay (eerste max 20s)")
+    ax.set_xlabel("Tijd (s)")
+    ax.set_ylabel("Amplitude")
+    ax.legend(loc="upper right")
+    fig.tight_layout()
+    sim_txt = f"{sim*100:.1f}%"
+    return sim_txt, gr.Dataframe(value=formatted, headers=["Kenmerk", "A", "B", "Δ (B−A)"]), fig
+# -----------------------------
+# UI
+# -----------------------------
+# Custom stylesheet handed to gr.Blocks(css=...): dark gradient background,
+# the header card, generic ".card" panels and the pill-shaped ".badge" labels.
+CSS = """
+:root{
+  --bg: #0b0f19;
+  --panel: rgba(255,255,255,0.06);
+  --panel2: rgba(255,255,255,0.09);
+  --text: rgba(255,255,255,0.92);
+  --muted: rgba(255,255,255,0.70);
+  --accent: #7c3aed;
+  --accent2: #22c55e;
+  --border: rgba(255,255,255,0.14);
+  --shadow: 0 10px 30px rgba(0,0,0,0.35);
+}
+.gradio-container{
+  background: radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%),
+              radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%),
+              radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%),
+              var(--bg) !important;
+  color: var(--text) !important;
+}
+#header-card{
+  background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14));
+  border: 1px solid var(--border);
+  border-radius: 18px;
+  padding: 18px 18px 14px 18px;
+  box-shadow: var(--shadow);
+}
+#header-title{
+  font-size: 28px;
+  font-weight: 750;
+  letter-spacing: -0.02em;
+  margin: 0;
+}
+#header-sub{
+  margin-top: 6px;
+  color: var(--muted);
+  font-size: 14px;
+  line-height: 1.45;
+}
+.card{
+  background: var(--panel);
+  border: 1px solid var(--border);
+  border-radius: 18px;
+  padding: 14px;
+  box-shadow: var(--shadow);
+}
+.badge{
+  display: inline-flex;
+  align-items: center;
+  gap: 8px;
+  padding: 6px 10px;
+  border-radius: 999px;
+  border: 1px solid var(--border);
+  background: rgba(255,255,255,0.05);
+  color: var(--muted);
+  font-size: 12px;
+  margin-right: 10px;
+}
+.badge b{
+  color: var(--text);
+  font-weight: 700;
+}
+a { color: rgba(255,255,255,0.9) !important; }
+label, .md, .markdown { color: var(--text) !important; }
+"""
+def build_demo():
+    """Assemble the Gradio Blocks UI and return it (without launching).
+
+    The layout is a header card, a two-tab area (single-fragment analysis and
+    two-fragment comparison) and a collapsed ethics/transparency accordion.
+    Components are placed by the order in which they are created inside the
+    nested ``with`` blocks, so the statement order below is the layout.
+    """
+    with gr.Blocks(
+        css=CSS,
+        theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
+        title="Explainable Speech Analytics (Demo)"
+    ) as demo:
+        # Header card: title, three badges and a short introduction.
+        gr.HTML(
+            """
+            <div id="header-card">
+              <p id="header-title">Explainable Speech Analytics</p>
+              <div id="header-sub">
+                <span class="badge"><b>Doel</b> inzicht in meetbare spraaksignalen</span>
+                <span class="badge"><b>Geen diagnose</b> geen medisch hulpmiddel</span>
+                <span class="badge"><b>Privacy</b> audio wordt niet opgeslagen door deze demo</span>
+                <p style="margin-top:12px">
+                  Upload of neem korte audiofragmenten op en bekijk <b>wat het systeem meet</b>: pauzes, pitch,
+                  volume-energie en een algemene <b>audio-embedding</b> om fragmenten te vergelijken.
+                  Gebruik dit als <b>educatieve visualisatie</b> of gespreksstarter — niet als klinische beslissing.
+                </p>
+              </div>
+            </div>
+            """
+        )
+        with gr.Tabs():
+            # Tab 1: analyse one fragment (input on the left, results on the right).
+            with gr.TabItem("Analyse (1 fragment)"):
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        input_audio = gr.Audio(
+                            label="Audio",
+                            sources=["upload", "microphone"],
+                            type="numpy",
+                        )
+                        run_btn = gr.Button("Analyseer", variant="primary")
+                        with gr.Accordion("Wat gebeurt er technisch?", open=False):
+                            gr.Markdown(
+                                """
+                                - **Akoestiek**: we extraheren frame-based signalen (RMS, ZCR), schatten **pitch** met *pyin*,
+                                  en detecteren **pauzes** met een adaptieve energiedrempel.
+                                - **Embedding**: een vooraf getraind **Wav2Vec2**-model maakt een vaste vector (embedding) van de audio
+                                  waarmee we fragmenten **onderling** kunnen vergelijken (cosine similarity).
+                                - **Explainable by design**: we tonen de signalen en deltas, niet alleen een score.
+                                """
+                            )
+                    with gr.Column(scale=7):
+                        with gr.Row():
+                            feat_df = gr.Dataframe(
+                                headers=["Kenmerk", "Waarde"],
+                                datatype=["str", "str"],
+                                interactive=False,
+                                wrap=True,
+                                label="Meetbare kenmerken"
+                            )
+                        with gr.Row():
+                            wf_plot = gr.Plot(label="Waveform + pauzes")
+                        with gr.Row():
+                            pitch_plot = gr.Plot(label="Pitch")
+                        explanation = gr.Markdown("### Upload of neem audio op", elem_classes=["card"])
+                # The four outputs match the 4-tuple returned by analyze_single.
+                run_btn.click(analyze_single, inputs=[input_audio], outputs=[feat_df, wf_plot, pitch_plot, explanation])
+            # Tab 2: compare two fragments.
+            with gr.TabItem("Vergelijk (2 fragmenten)"):
+                with gr.Row():
+                    with gr.Column(scale=5):
+                        a1 = gr.Audio(label="Fragment A", sources=["upload", "microphone"], type="numpy")
+                        a2 = gr.Audio(label="Fragment B", sources=["upload", "microphone"], type="numpy")
+                        compare_btn = gr.Button("Vergelijk", variant="primary")
+                        gr.Markdown(
+                            """
+                            **Interpretatie-tip:** een lagere overeenkomst betekent alleen dat de audio *anders* is
+                            (andere omgeving, microfoon, emotie, vermoeidheid, etc.). Het zegt **niet** *waarom*.
+                            """
+                        )
+                    with gr.Column(scale=7):
+                        sim_out = gr.Textbox(label="Embedding-overeenkomst (cosine similarity)", value="—", interactive=False)
+                        delta_df = gr.Dataframe(
+                            headers=["Kenmerk", "A", "B", "Δ (B−A)"],
+                            datatype=["str", "str", "str", "str"],
+                            interactive=False,
+                            wrap=True,
+                            label="Verschillen (uitlegbaar)"
+                        )
+                        overlay_plot = gr.Plot(label="Waveform overlay")
+                # The three outputs match the 3-tuple returned by analyze_compare.
+                compare_btn.click(analyze_compare, inputs=[a1, a2], outputs=[sim_out, delta_df, overlay_plot])
+        # Collapsed footer explaining the transparency and ethics stance of the demo.
+        with gr.Accordion("Ethiek & transparantie (anti–black box)", open=False):
+            gr.Markdown(
+                """
+                **Hoe voorkomt deze demo ‘black box’ gedrag?**
+                - We tonen **de signalen** (pauzes, pitch, energie) in grafieken en tabellen.
+                - We tonen **verschillen** tussen fragmenten, i.p.v. één eindlabel.
+                - We geven **geen diagnose** of medische claim; de output is bedoeld als **observatie**.
+                - In een zorgcontext hoort interpretatie altijd samen te gaan met **context + gesprek + klinisch oordeel**.
+                **Let op:** als je dit ooit richting praktijk wilt brengen, heb je o.a. nodig:
+                governance, dataminimalisatie, DPIA/AVG, bias-audit, modelmonitoring, en duidelijke ‘human-in-the-loop’ afspraken.
+                """
+            )
+    return demo
+# Script entry point: build the UI, enable the request queue, start the server.
+if __name__ == "__main__":
+    demo = build_demo()
+    # The queue is capped at 32 entries via max_size.
+    demo.queue(max_size=32)
+    demo.launch()

requirements.txt.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio==4.44.1
+numpy>=1.24
+scipy>=1.10
+librosa>=0.10.2.post1
+soundfile>=0.12.1
+matplotlib>=3.7
+torch>=2.1
+transformers>=4.41