Marcel0123 committed on
Commit 7416f7f · verified · 1 Parent(s): 5c4e27d

Update app.py

Files changed (1):
  app.py +114 -445
app.py CHANGED
@@ -1,4 +1,3 @@
- ```python
  import os
  import math
  import numpy as np
@@ -13,38 +12,35 @@ from functools import lru_cache
  import torch
  from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
  
- # -----------------------------
  # Configuration
- # -----------------------------
  TARGET_SR = 16000
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")
  
- # -----------------------------
- # Lightweight explainability helpers
- # -----------------------------
- def _human_seconds(sec: float) -> str:
      if not math.isfinite(sec):
          return "—"
      if sec < 60:
          return f"{sec:.1f}s"
      m = int(sec // 60)
-     s = sec - 60 * m
-     return f"{m}m {s:.1f}s"
  
  
- def _cosine(a: np.ndarray, b: np.ndarray) -> float:
-     a = np.asarray(a, dtype=np.float32)
-     b = np.asarray(b, dtype=np.float32)
-     denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-12
      return float(np.dot(a, b) / denom)
  
  
- # -----------------------------
- # Model (audio embedding)
- # -----------------------------
  @lru_cache(maxsize=1)
- def load_w2v():
      extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
      model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
      model.eval()
@@ -52,69 +48,55 @@ def load_w2v():
  
  
  def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
-     extractor, model = load_w2v()
      if sr != TARGET_SR:
-         y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
-         sr = TARGET_SR
  
      if y.size == 0:
-         return np.zeros((768,), dtype=np.float32)
  
      y = y.astype(np.float32)
-     mx = float(np.max(np.abs(y))) + 1e-9
-     y = y / mx
  
-     inputs = extractor(y, sampling_rate=sr, return_tensors="pt")
      with torch.no_grad():
-         input_values = inputs["input_values"].to(DEVICE)
-         out = model(input_values)
-         emb = out.last_hidden_state.mean(dim=1).squeeze(0).detach().cpu().numpy()
      return emb.astype(np.float32)
  
  
- # -----------------------------
  # Feature extraction
- # -----------------------------
  @dataclass
  class Features:
      duration_s: float
      rms_mean: float
      rms_std: float
-     zcr_mean: float
-     pitch_median_hz: float
-     pitch_iqr_hz: float
-     voiced_ratio: float
      n_pauses: int
      pause_total_s: float
      active_ratio: float
  
  
  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
-     """Return features + artifacts for plots/inspection."""
-     if y is None or len(y) == 0:
-         f = Features(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0, 0.0, np.nan)
-         return f, {"y": np.array([]), "sr": sr, "times": np.array([]), "pitch": np.array([])}
- 
      if sr != TARGET_SR:
-         y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
          sr = TARGET_SR
  
-     y = y.astype(np.float32)
-     duration = float(len(y) / sr)
- 
-     hop = 160  # 10 ms at 16k
-     frame = 400  # 25 ms at 16k
  
      rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
-     zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]
  
-     rms_mean = float(np.mean(rms)) if rms.size else np.nan
-     rms_std = float(np.std(rms)) if rms.size else np.nan
-     zcr_mean = float(np.mean(zcr)) if zcr.size else np.nan
- 
-     # Pitch using probabilistic YIN (pyin)
      try:
-         f0, voiced_flag, voiced_probs = librosa.pyin(
              y,
              fmin=librosa.note_to_hz("C2"),
              fmax=librosa.note_to_hz("C7"),
@@ -125,440 +107,127 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
      except Exception:
          f0 = None
  
-     if f0 is None:
          pitch_median = np.nan
          pitch_iqr = np.nan
-         voiced_ratio = np.nan
-         pitch = np.array([])
-         times = np.array([])
-     else:
-         pitch = np.asarray(f0, dtype=np.float32)
-         times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
-         voiced = np.isfinite(pitch)
-         voiced_ratio = float(np.mean(voiced)) if voiced.size else np.nan
-         if np.any(voiced):
-             pv = pitch[voiced]
-             pitch_median = float(np.median(pv))
-             q75, q25 = np.percentile(pv, [75, 25])
-             pitch_iqr = float(q75 - q25)
-         else:
-             pitch_median = np.nan
-             pitch_iqr = np.nan
- 
-     # Pause detection using RMS threshold (relative)
-     if rms.size:
-         thr = float(np.percentile(rms, 20)) * 0.8
-         silent = rms < thr
- 
-         min_pause_frames = int(0.2 / (hop / sr))  # pauses >= 0.2s
-         pauses = []
-         start = None
-         for i, s in enumerate(silent):
-             if s and start is None:
-                 start = i
-             if (not s) and start is not None:
-                 end = i
-                 if (end - start) >= min_pause_frames:
-                     pauses.append((start, end))
-                 start = None
-         if start is not None:
-             end = len(silent)
-             if (end - start) >= min_pause_frames:
-                 pauses.append((start, end))
- 
-         n_pauses = int(len(pauses))
-         pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
-         active_ratio = float(1.0 - (np.mean(silent) if silent.size else 0.0))
-     else:
-         pauses = []
-         n_pauses = 0
-         pause_total_s = 0.0
-         active_ratio = np.nan
-         thr = None
  
      feats = Features(
          duration_s=duration,
          rms_mean=rms_mean,
          rms_std=rms_std,
-         zcr_mean=zcr_mean,
-         pitch_median_hz=pitch_median,
-         pitch_iqr_hz=pitch_iqr,
-         voiced_ratio=voiced_ratio,
-         n_pauses=n_pauses,
-         pause_total_s=pause_total_s,
          active_ratio=active_ratio,
      )
  
      artifacts = {
          "y": y,
          "sr": sr,
-         "hop": hop,
-         "frame": frame,
          "rms": rms,
-         "zcr": zcr,
-         "pitch": pitch,
-         "times": times,
          "pauses": pauses,
-         "rms_thr": thr,
      }
      return feats, artifacts
  
  
- # -----------------------------
  # Plotting
- # -----------------------------
- def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
      y = artifacts["y"]
      sr = artifacts["sr"]
-     pauses = artifacts.get("pauses", [])
-     hop = artifacts.get("hop", 160)
  
-     fig = plt.figure(figsize=(10, 3.2))
      ax = fig.add_subplot(111)
  
-     if y.size:
-         t = np.arange(len(y)) / sr
-         ax.plot(t, y, linewidth=0.8)
-         ax.set_xlim(0, t[-1] if t.size else 1)
-         ax.set_xlabel("Time (s)")
-         ax.set_ylabel("Amplitude")
-         ax.set_title("Waveform (with detected pauses)")
- 
-         for (s, e) in pauses:
-             ts = s * (hop / sr)
-             te = e * (hop / sr)
-             ax.axvspan(ts, te, alpha=0.2)
-     else:
-         ax.text(0.5, 0.5, "No audio", ha="center", va="center")
-         ax.set_axis_off()
  
-     fig.tight_layout()
-     return fig
- 
- 
- def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
-     pitch = artifacts.get("pitch", np.array([]))
-     times = artifacts.get("times", np.array([]))
- 
-     fig = plt.figure(figsize=(10, 3.2))
-     ax = fig.add_subplot(111)
- 
-     if pitch.size and times.size:
-         ax.plot(times, pitch, linewidth=1.0)
-         ax.set_xlabel("Time (s)")
-         ax.set_ylabel("Pitch (Hz)")
-         ax.set_title("Pitch contour (NaN = unvoiced)")
-     else:
-         ax.text(0.5, 0.5, "Pitch unavailable (too short / too noisy)", ha="center", va="center")
-         ax.set_axis_off()
  
      fig.tight_layout()
      return fig
  
  
- # -----------------------------
- # UI helpers
- # -----------------------------
- def format_features_table(feats: Features) -> List[List[str]]:
-     def fmt_float(x):
-         if x is None or (isinstance(x, float) and not math.isfinite(x)):
-             return "—"
-         return f"{float(x):.3f}"
- 
-     def fmt_int(x):
-         if x is None:
-             return "—"
-         return str(int(x))
- 
-     return [
-         ["Duration", _human_seconds(feats.duration_s)],
-         ["Volume (RMS) mean", fmt_float(feats.rms_mean)],
-         ["Volume (RMS) variation", fmt_float(feats.rms_std)],
-         ["ZCR (noise/'sharpness') mean", fmt_float(feats.zcr_mean)],
-         ["Pitch median", "—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz"],
-         ["Pitch spread (IQR)", "—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz"],
-         ["Voiced ratio", "—" if not math.isfinite(feats.voiced_ratio) else f"{feats.voiced_ratio*100:.1f}%"],
-         ["Number of pauses (≥ 0.2s)", fmt_int(feats.n_pauses)],
-         ["Total pause duration", _human_seconds(feats.pause_total_s)],
-         ["Active speech ratio", "—" if not math.isfinite(feats.active_ratio) else f"{feats.active_ratio*100:.1f}%"],
-     ]
- 
- 
- def explain_panel(feats: Features) -> str:
-     bullets = []
-     if math.isfinite(feats.pause_total_s):
-         bullets.append(f"- **Pauses**: {feats.n_pauses} pauses (≥0.2s), totaling {_human_seconds(feats.pause_total_s)}.")
-     if math.isfinite(feats.pitch_median_hz):
-         bullets.append(f"- **Pitch**: median ~ {feats.pitch_median_hz:.1f} Hz, spread (IQR) {feats.pitch_iqr_hz:.1f} Hz.")
-     if math.isfinite(feats.rms_mean):
-         bullets.append(f"- **Volume**: mean RMS {feats.rms_mean:.3f} (relative; only compare within the same setup).")
-     if math.isfinite(feats.active_ratio):
-         bullets.append(f"- **Active speech**: ~ {feats.active_ratio*100:.1f}% of the time above threshold.")
- 
-     if not bullets:
-         bullets = ["- No features available (audio too short or empty)."]
- 
-     return (
-         "### What does the AI 'see' here?\n"
-         "This is an **explanation demo**: we show *measurable speech signals* and how they change between clips.\n\n"
-         + "\n".join(bullets)
-         + "\n\n"
-         "**Important:** this system gives **no diagnosis** and is **not a medical device**. "
-         "Use it as a conversation starter or educational visualization."
-     )
- 
- 
- # -----------------------------
- # Core callbacks
- # -----------------------------
- def analyze_single(audio: Tuple[int, np.ndarray]):
      if audio is None:
-         return gr.Dataframe(value=[["—", "Upload or record audio to get started."]]), None, None, "### Upload or record audio"
      sr, y = audio
      feats, art = compute_features(y, sr)
-     table = format_features_table(feats)
-     wf = plot_waveform_with_pauses(art)
-     pc = plot_pitch(art)
-     expl = explain_panel(feats)
-     return gr.Dataframe(value=table, headers=["Feature", "Value"]), wf, pc, expl
- 
- 
- def analyze_compare(a1, a2):
327
- if a1 is None or a2 is None:
328
- return "—", gr.Dataframe(value=[["—", "Selecteer twee fragmenten."]]), None
329
-
330
- sr1, y1 = a1
331
- sr2, y2 = a2
332
-
333
- f1, art1 = compute_features(y1, sr1)
334
- f2, art2 = compute_features(y2, sr2)
335
-
336
- e1 = embed_audio(art1["y"], art1["sr"])
337
- e2 = embed_audio(art2["y"], art2["sr"])
338
- sim = _cosine(e1, e2)
339
-
340
- def delta(a, b):
341
- if (a is None) or (b is None):
342
- return "—"
343
- if (isinstance(a, float) and not math.isfinite(a)) or (isinstance(b, float) and not math.isfinite(b)):
344
- return "—"
345
- return f"{(b - a):+.3f}"
346
-
347
- rows = [
348
- ["Duur (s)", f1.duration_s, f2.duration_s, delta(f1.duration_s, f2.duration_s)],
349
- ["RMS mean", f1.rms_mean, f2.rms_mean, delta(f1.rms_mean, f2.rms_mean)],
350
- ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz, delta(f1.pitch_median_hz, f2.pitch_median_hz)],
351
- ["Pauzes (#)", float(f1.n_pauses), float(f2.n_pauses), f"{(f2.n_pauses - f1.n_pauses):+d}"],
352
- ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s, delta(f1.pause_total_s, f2.pause_total_s)],
353
- ["Actieve ratio", f1.active_ratio, f2.active_ratio, delta(f1.active_ratio, f2.active_ratio)],
354
- ]
355
 
356
- formatted = []
357
- for k, v1, v2, dv in rows:
358
- def fmt(v):
359
- if isinstance(v, float) and math.isfinite(v):
360
- if "ratio" in k.lower():
361
- return f"{v*100:.1f}%"
362
- if "pitch" in k.lower():
363
- return f"{v:.1f}"
364
- return f"{v:.3f}"
365
- return "—"
366
- formatted.append([k, fmt(v1), fmt(v2), dv])
367
-
368
- fig = plt.figure(figsize=(10, 3.2))
369
- ax = fig.add_subplot(111)
370
 
371
- def prep_plot(y, sr):
372
- if sr != TARGET_SR:
373
- y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
374
- sr = TARGET_SR
375
- if y.size > sr * 20:
376
- y = y[: sr * 20]
377
- t = np.arange(len(y)) / sr
378
- return t, y
379
 
380
- t1, yy1 = prep_plot(y1, sr1)
381
- t2, yy2 = prep_plot(y2, sr2)
382
 
383
- if yy1.size:
384
- ax.plot(t1, yy1, linewidth=0.8, label="Fragment A")
385
- if yy2.size:
386
- ax.plot(t2, yy2, linewidth=0.8, label="Fragment B", alpha=0.8)
387
 
388
- ax.set_title("Waveform overlay (eerste max 20s)")
389
- ax.set_xlabel("Tijd (s)")
390
- ax.set_ylabel("Amplitude")
391
- ax.legend(loc="upper right")
392
- fig.tight_layout()
 
 
 
393
 
394
- return f"{sim*100:.1f}%", gr.Dataframe(value=formatted, headers=["Kenmerk", "A", "B", "Δ (B−A)"]), fig
 
 
395
 
 
 
 
396
 
397
- # -----------------------------
- # UI
- # -----------------------------
- CSS = """
- :root{
-   --bg: #0b0f19;
-   --panel: rgba(255,255,255,0.06);
-   --text: rgba(255,255,255,0.92);
-   --muted: rgba(255,255,255,0.70);
-   --border: rgba(255,255,255,0.14);
-   --shadow: 0 10px 30px rgba(0,0,0,0.35);
- }
- 
- .gradio-container{
-   background: radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%),
-               radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%),
-               radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%),
-               var(--bg) !important;
-   color: var(--text) !important;
- }
- 
- #header-card{
-   background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14));
-   border: 1px solid var(--border);
-   border-radius: 18px;
-   padding: 18px 18px 14px 18px;
-   box-shadow: var(--shadow);
- }
- 
- #header-title{
-   font-size: 28px;
-   font-weight: 750;
-   letter-spacing: -0.02em;
-   margin: 0;
- }
- 
- #header-sub{
-   margin-top: 6px;
-   color: var(--muted);
-   font-size: 14px;
-   line-height: 1.45;
- }
- 
- .badge{
-   display: inline-flex;
-   align-items: center;
-   gap: 8px;
-   padding: 6px 10px;
-   border-radius: 999px;
-   border: 1px solid var(--border);
-   background: rgba(255,255,255,0.05);
-   color: var(--muted);
-   font-size: 12px;
-   margin-right: 10px;
- }
- 
- .badge b{
-   color: var(--text);
-   font-weight: 700;
- }
- 
- a { color: rgba(255,255,255,0.9) !important; }
- label, .md, .markdown { color: var(--text) !important; }
- """
- 
- def build_demo():
-     with gr.Blocks(
-         css=CSS,
-         theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
-         title="Explainable Speech Analytics (Demo)"
-     ) as demo:
- 
-         gr.HTML(
-             """
-             <div id="header-card">
-                 <p id="header-title">Explainable Speech Analytics</p>
-                 <div id="header-sub">
-                     <span class="badge"><b>Goal</b> insight into measurable speech signals</span>
-                     <span class="badge"><b>No diagnosis</b> not a medical device</span>
-                     <span class="badge"><b>Privacy</b> audio is not stored by this demo</span>
-                     <p style="margin-top:12px">
-                         Upload or record short audio clips and see <b>what the system measures</b>: pauses, pitch,
-                         volume energy, and a general <b>audio embedding</b> for comparing clips.
-                         Use this as an <b>educational visualization</b> or conversation starter, not as a clinical decision.
-                     </p>
-                 </div>
-             </div>
-             """
-         )
- 
-         with gr.Tabs():
-             with gr.TabItem("Analysis (1 fragment)"):
-                 with gr.Row():
-                     with gr.Column(scale=5):
-                         input_audio = gr.Audio(
-                             label="Audio",
-                             sources=["upload", "microphone"],
-                             type="numpy",
-                         )
-                         run_btn = gr.Button("Analyze", variant="primary")
-                         with gr.Accordion("What happens technically?", open=False):
-                             gr.Markdown(
-                                 """
-                                 - **Acoustics**: we extract frame-based signals (RMS, ZCR), estimate **pitch** with *pyin*,
-                                   and detect **pauses** with an adaptive energy threshold.
-                                 - **Embedding**: a pretrained **Wav2Vec2** model turns the audio into a fixed vector (embedding)
-                                   that lets us compare fragments **against each other** (cosine similarity).
-                                 - **Explainable by design**: we show the signals and deltas, not just a score.
-                                 """
-                             )
-                     with gr.Column(scale=7):
-                         feat_df = gr.Dataframe(
-                             headers=["Feature", "Value"],
-                             datatype=["str", "str"],
-                             interactive=False,
-                             wrap=True,
-                             label="Measurable features",
-                         )
-                         wf_plot = gr.Plot(label="Waveform + pauses")
-                         pitch_plot = gr.Plot(label="Pitch")
-                         explanation = gr.Markdown("### Upload or record audio", elem_id="explain-card")
- 
-                 run_btn.click(analyze_single, inputs=[input_audio], outputs=[feat_df, wf_plot, pitch_plot, explanation])
- 
-             with gr.TabItem("Compare (2 fragments)"):
-                 with gr.Row():
-                     with gr.Column(scale=5):
-                         a1 = gr.Audio(label="Fragment A", sources=["upload", "microphone"], type="numpy")
-                         a2 = gr.Audio(label="Fragment B", sources=["upload", "microphone"], type="numpy")
-                         compare_btn = gr.Button("Compare", variant="primary")
-                         gr.Markdown(
-                             """
-                             **Interpretation tip:** a lower similarity only means the audio is *different*
-                             (different environment, microphone, emotion, fatigue, etc.). It does **not** say *why*.
-                             """
-                         )
-                     with gr.Column(scale=7):
-                         sim_out = gr.Textbox(label="Embedding similarity (cosine similarity)", value="—", interactive=False)
-                         delta_df = gr.Dataframe(
-                             headers=["Feature", "A", "B", "Δ (B−A)"],
-                             datatype=["str", "str", "str", "str"],
-                             interactive=False,
-                             wrap=True,
-                             label="Differences (explainable)",
-                         )
-                         overlay_plot = gr.Plot(label="Waveform overlay")
- 
-                 compare_btn.click(analyze_compare, inputs=[a1, a2], outputs=[sim_out, delta_df, overlay_plot])
- 
-         with gr.Accordion("Ethics & transparency (anti black box)", open=False):
-             gr.Markdown(
-                 """
-                 **How does this demo avoid 'black box' behavior?**
-                 - We show **the signals** (pauses, pitch, energy) in plots and tables.
-                 - We show **differences** between fragments instead of a single final label.
-                 - We make **no diagnosis** or medical claim; the output is meant as an **observation**.
-                 - In a care context, interpretation should always go together with **context + conversation + clinical judgment**.
-                 """
-             )
- 
-     return demo
- 
- 
- if __name__ == "__main__":
-     demo = build_demo()
-     demo.queue(max_size=32)
-     demo.launch()
- ```
  
app.py (after):

  import os
  import math
  import numpy as np
  
  import torch
  from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
  
+ # =========================================================
  # Configuration
+ # =========================================================
  TARGET_SR = 16000
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")
  
+ # =========================================================
+ # Utility helpers
+ # =========================================================
+ def human_seconds(sec: float) -> str:
      if not math.isfinite(sec):
          return "—"
      if sec < 60:
          return f"{sec:.1f}s"
      m = int(sec // 60)
+     return f"{m}m {sec - 60*m:.1f}s"
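A quick sanity check of the formatter above (a sketch; the input values are arbitrary):

    human_seconds(7.25)          # -> "7.2s"
    human_seconds(83.0)          # -> "1m 23.0s"
    human_seconds(float("nan"))  # -> "—"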
 
+ def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+     denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-9
      return float(np.dot(a, b) / denom)
  
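For real-valued embeddings the result lies in [-1, 1], with 1.0 meaning identical direction; a minimal sketch with made-up vectors:

    a = np.array([1.0, 0.0], dtype=np.float32)
    b = np.array([1.0, 1.0], dtype=np.float32)
    cosine_similarity(a, a)  # -> ~1.0
    cosine_similarity(a, b)  # -> ~0.707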
 
+ # =========================================================
+ # Model loading (cached)
+ # =========================================================
  @lru_cache(maxsize=1)
+ def load_wav2vec():
      extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
      model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
      model.eval()
  
  
  def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
      if sr != TARGET_SR:
+         y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
  
      if y.size == 0:
+         return np.zeros(768, dtype=np.float32)
  
      y = y.astype(np.float32)
+     y /= np.max(np.abs(y)) + 1e-9
+ 
+     extractor, model = load_wav2vec()
+     inputs = extractor(y, sampling_rate=TARGET_SR, return_tensors="pt")
  
      with torch.no_grad():
+         out = model(inputs["input_values"].to(DEVICE))
+         emb = out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+ 
      return emb.astype(np.float32)
  
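A short usage sketch for the embedding path, assuming the default wav2vec2-base checkpoint (hidden size 768) and a synthetic one-second tone standing in for real speech:

    sr = 16000
    t = np.linspace(0, 1.0, sr, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)
    emb = embed_audio(tone, sr)
    print(emb.shape)  # (768,)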
 
+ # =========================================================
  # Feature extraction
+ # =========================================================
  @dataclass
  class Features:
      duration_s: float
      rms_mean: float
      rms_std: float
+     pitch_median: float
+     pitch_iqr: float
      n_pauses: int
      pause_total_s: float
      active_ratio: float
  
  
  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
      if sr != TARGET_SR:
+         y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
          sr = TARGET_SR
  
+     duration = len(y) / sr
+     hop = 160    # 10 ms at 16 kHz
+     frame = 400  # 25 ms at 16 kHz
  
      rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
+     rms_mean = float(np.mean(rms))
+     rms_std = float(np.std(rms))
  
      try:
+         f0, _, _ = librosa.pyin(
              y,
              fmin=librosa.note_to_hz("C2"),
              fmax=librosa.note_to_hz("C7"),
  
      except Exception:
          f0 = None
  
+     if f0 is not None and np.any(np.isfinite(f0)):
+         voiced = f0[np.isfinite(f0)]
+         pitch_median = float(np.median(voiced))
+         pitch_iqr = float(np.percentile(voiced, 75) - np.percentile(voiced, 25))
+     else:
          pitch_median = np.nan
          pitch_iqr = np.nan
+ 
+     silence = rms < np.percentile(rms, 20)
+     min_pause_frames = int(0.2 / (hop / sr))  # 0.2 s at a 10 ms hop = 20 frames
+ 
+     pauses = []
+     start = None
+     for i, s in enumerate(silence):
+         if s and start is None:
+             start = i
+         if not s and start is not None:
+             if i - start >= min_pause_frames:
+                 pauses.append((start, i))
+             start = None
+ 
+     pause_total = sum((e - s) * hop / sr for s, e in pauses)
+     active_ratio = 1.0 - float(np.mean(silence))
  
      feats = Features(
          duration_s=duration,
          rms_mean=rms_mean,
          rms_std=rms_std,
+         pitch_median=pitch_median,
+         pitch_iqr=pitch_iqr,
+         n_pauses=len(pauses),
+         pause_total_s=pause_total,
          active_ratio=active_ratio,
      )
  
      artifacts = {
          "y": y,
          "sr": sr,
          "rms": rms,
+         "pitch": f0,
          "pauses": pauses,
+         "hop": hop,
      }
+ 
      return feats, artifacts
  
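The pause arithmetic above is worth making concrete: with hop = 160 samples at 16 kHz each RMS frame advances 0.01 s, so the 0.2 s minimum pause equals int(0.2 / 0.01) = 20 consecutive below-threshold frames. A small synthetic check (noise standing in for speech; exact counts depend on the percentile threshold):

    sr = 16000
    loud = (0.5 * np.random.randn(sr)).astype(np.float32)  # ~1 s of "speech"
    quiet = np.zeros(sr // 2, dtype=np.float32)            # 0.5 s of silence
    feats, art = compute_features(np.concatenate([loud, quiet, loud]), sr)
    print(feats.n_pauses, feats.pause_total_s)             # roughly 1 pause of ~0.5 s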
 
+ # =========================================================
  # Plotting
+ # =========================================================
+ def plot_waveform(artifacts: Dict[str, Any]):
      y = artifacts["y"]
      sr = artifacts["sr"]
+     pauses = artifacts["pauses"]
+     hop = artifacts["hop"]
  
+     fig = plt.figure(figsize=(10, 3))
      ax = fig.add_subplot(111)
  
+     t = np.arange(len(y)) / sr
+     ax.plot(t, y, lw=0.8)
  
+     for s, e in pauses:
+         ax.axvspan(s * hop / sr, e * hop / sr, alpha=0.2)
  
+     ax.set_title("Waveform with pauses")
+     ax.set_xlabel("Time (s)")
+     ax.set_ylabel("Amplitude")
      fig.tight_layout()
      return fig
  
 
+ # =========================================================
+ # UI callbacks
+ # =========================================================
+ def analyze_single(audio):
      if audio is None:
+         return [], None, "Upload or record audio."
+ 
      sr, y = audio
      feats, art = compute_features(y, sr)
  
+     table = [
+         ["Duration", human_seconds(feats.duration_s)],
+         ["Mean volume (RMS)", f"{feats.rms_mean:.3f}"],
+         ["Volume variation", f"{feats.rms_std:.3f}"],
+         ["Pitch median", "—" if not math.isfinite(feats.pitch_median) else f"{feats.pitch_median:.1f} Hz"],
+         ["Pitch spread (IQR)", "—" if not math.isfinite(feats.pitch_iqr) else f"{feats.pitch_iqr:.1f} Hz"],
+         ["Pauses ≥ 0.2 s", str(feats.n_pauses)],
+         ["Total pause duration", human_seconds(feats.pause_total_s)],
+         ["Active speech ratio", f"{feats.active_ratio*100:.1f}%"],
+     ]
  
+     fig = plot_waveform(art)
+     explanation = (
+         "### What does this show?\n"
+         "- These are **measurable speech signals** (pauses, pitch, volume).\n"
+         "- No **diagnosis** is made.\n"
+         "- Interpretation always belongs together with context and conversation."
+     )
  
+     return table, fig, explanation
  
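The tuple returned here matches the outputs wired to run.click below; a sketch of driving the callback directly, with a synthetic clip in place of Gradio's (sample_rate, samples) tuple:

    sr = 16000
    clip = (0.1 * np.random.randn(sr)).astype(np.float32)
    table, fig, text = analyze_single((sr, clip))
    print(table[0])  # ['Duration', '1.0s']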
 
+ # =========================================================
+ # UI
+ # =========================================================
+ with gr.Blocks(title="Explainable Speech Analytics") as demo:
+     gr.Markdown(
+         "## Explainable Speech Analytics\n"
+         "*Educational demo – not a medical device*"
+     )
  
+     with gr.Row():
+         audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audio clip")
+         run = gr.Button("Analyze", variant="primary")
  
+     table = gr.Dataframe(headers=["Feature", "Value"], interactive=False)
+     plot = gr.Plot()
+     explanation = gr.Markdown()
  
+     run.click(analyze_single, inputs=audio, outputs=[table, plot, explanation])
  
+ demo.launch()
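The version removed above queued requests before launching; if that behavior is still wanted, a minimal sketch using Gradio's Blocks.queue with the same max_size as before:

    demo.queue(max_size=32)
    demo.launch()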