Marcel0123 commited on
Commit
68f57e5
·
verified ·
1 Parent(s): 58cbc82

Upload 2 files

Browse files
Files changed (2) hide show
  1. app .py +586 -0
  2. requirements.txt.txt +8 -0
app .py ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import functools
import math
import os
from dataclasses import dataclass
from typing import Dict, Any, Tuple, Optional, List

import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
13
+
14
+ # -----------------------------
15
+ # Configuration
16
+ # -----------------------------
17
+ TARGET_SR = 16000
18
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
+ MODEL_ID = os.getenv("W2V_MODEL_ID", "facebook/wav2vec2-base-960h")
20
+
21
+ # -----------------------------
22
+ # Lightweight explainability helpers
23
+ # -----------------------------
24
+ def _safe_float(x, default=np.nan):
25
+ try:
26
+ if x is None:
27
+ return default
28
+ x = float(x)
29
+ if math.isfinite(x):
30
+ return x
31
+ return default
32
+ except Exception:
33
+ return default
34
+
35
def _human_seconds(sec: float) -> str:
    """Render a duration in seconds as a compact human-readable string.

    Non-finite values map to an em-dash placeholder; durations of a minute
    or more are split into whole minutes plus fractional seconds.
    """
    if not math.isfinite(sec):
        return "—"
    if sec >= 60:
        minutes, remainder = divmod(sec, 60)
        return f"{int(minutes)}m {remainder:.1f}s"
    return f"{sec:.1f}s"
43
+
44
def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two vectors.

    A tiny epsilon in the denominator guards against division by zero when
    either vector has zero norm (e.g. all-silent audio embeddings).
    """
    vec_a = np.asarray(a, dtype=np.float32)
    vec_b = np.asarray(b, dtype=np.float32)
    norm_product = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + 1e-12
    return float(np.dot(vec_a, vec_b) / norm_product)
49
+
50
+ # -----------------------------
51
+ # Model (audio embedding)
52
+ # -----------------------------
53
+ @gr.cache()
54
+ def load_w2v():
55
+ extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
56
+ model = Wav2Vec2Model.from_pretrained(MODEL_ID).to(DEVICE)
57
+ model.eval()
58
+ return extractor, model
59
+
60
def embed_audio(y: np.ndarray, sr: int) -> np.ndarray:
    """Embed an audio clip as a fixed-size vector (mean-pooled Wav2Vec2 states).

    The signal is resampled to TARGET_SR if needed and peak-normalized to
    [-1, 1] before being fed to the model.
    """
    extractor, model = load_w2v()
    if sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    if y.size == 0:
        # NOTE(review): 768 matches the wav2vec2-*base* hidden size; this
        # constant would be wrong for a -large checkpoint — TODO confirm.
        return np.zeros((768,), dtype=np.float32)

    # Peak-normalize; epsilon avoids division by zero on pure silence.
    samples = y.astype(np.float32)
    peak = float(np.abs(samples).max()) + 1e-9
    samples = samples / peak

    batch = extractor(samples, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        hidden = model(batch["input_values"].to(DEVICE)).last_hidden_state
        # Mean pooling over the time axis -> one vector per clip.
        pooled = hidden.mean(dim=1).squeeze(0).detach().cpu().numpy()
    return pooled.astype(np.float32)
80
+
81
+ # -----------------------------
82
+ # Feature extraction
83
+ # -----------------------------
84
+ @dataclass
85
+ class Features:
86
+ duration_s: float
87
+ rms_mean: float
88
+ rms_std: float
89
+ zcr_mean: float
90
+ pitch_median_hz: float
91
+ pitch_iqr_hz: float
92
+ voiced_ratio: float
93
+ n_pauses: int
94
+ pause_total_s: float
95
+ active_ratio: float
96
+
97
def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
    """Return features + artifacts for plots/inspection.

    Pipeline: resample to TARGET_SR -> frame-level RMS/ZCR -> pitch via
    librosa pyin -> pause detection via a relative RMS threshold.

    Args:
        y: 1-D audio samples (assumed mono — TODO confirm callers never pass
           multi-channel arrays).
        sr: sample rate of ``y`` in Hz.

    Returns:
        (Features, artifacts) where artifacts carries the raw arrays the
        plotting helpers consume (waveform, rms, pitch contour, pause spans).
    """
    if y is None or len(y) == 0:
        # Empty input: all-NaN features, minimal artifacts.
        # NOTE(review): this dict omits the "pauses"/"hop" keys; the plotting
        # helpers use .get() with defaults, so that is safe today.
        f = Features(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0, 0.0, np.nan)
        return f, {"y": np.array([]), "sr": sr, "times": np.array([]), "pitch": np.array([])}

    if sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    y = y.astype(np.float32)
    # Trim leading/trailing silence slightly for stability, but keep for pause detection
    duration = float(len(y) / sr)

    # Frame-level features
    hop = 160 # 10 ms at 16k
    frame = 400 # 25 ms at 16k

    rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]

    # Guard every aggregate against empty frame arrays (very short clips).
    rms_mean = float(np.mean(rms)) if rms.size else np.nan
    rms_std = float(np.std(rms)) if rms.size else np.nan
    zcr_mean = float(np.mean(zcr)) if zcr.size else np.nan

    # Pitch using probabilistic YIN (pyin). Can be slow, but OK for short clips.
    # f0 contains NaN for unvoiced frames.
    try:
        # NOTE(review): voiced_flag / voiced_probs are captured but unused;
        # voicing is re-derived below from finite f0 values.
        f0, voiced_flag, voiced_probs = librosa.pyin(
            y,
            fmin=librosa.note_to_hz("C2"),
            fmax=librosa.note_to_hz("C7"),
            sr=sr,
            frame_length=frame,
            hop_length=hop,
        )
    except Exception:
        # Deliberate best-effort: pitch is optional, failures degrade to NaN.
        f0 = None
        voiced_flag = None

    if f0 is None:
        pitch_median = np.nan
        pitch_iqr = np.nan
        voiced_ratio = np.nan
        pitch = np.array([])
        times = np.array([])
    else:
        pitch = np.asarray(f0, dtype=np.float32)
        times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
        # Finite f0 == voiced frame (pyin emits NaN for unvoiced frames).
        voiced = np.isfinite(pitch)
        voiced_ratio = float(np.mean(voiced)) if voiced.size else np.nan
        if np.any(voiced):
            pv = pitch[voiced]
            pitch_median = float(np.median(pv))
            q75, q25 = np.percentile(pv, [75, 25])
            pitch_iqr = float(q75 - q25)
        else:
            pitch_median = np.nan
            pitch_iqr = np.nan

    # Pause detection using RMS threshold (relative)
    # Convert rms frames -> boolean "silent"
    if rms.size:
        # Threshold relative to the quiet end of this clip's own energy
        # distribution, so it adapts to recording level.
        thr = float(np.percentile(rms, 20)) * 0.8 # conservative
        silent = rms < thr
        # Count pauses longer than 0.2s
        min_pause_frames = int(0.2 / (hop / sr))
        # Run-length encoding
        pauses = []
        start = None
        for i, s in enumerate(silent):
            if s and start is None:
                start = i
            if (not s) and start is not None:
                end = i
                if (end - start) >= min_pause_frames:
                    pauses.append((start, end))
                start = None
        # Close a pause that runs to the end of the clip.
        if start is not None:
            end = len(silent)
            if (end - start) >= min_pause_frames:
                pauses.append((start, end))

        n_pauses = int(len(pauses))
        pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
        active_ratio = float(1.0 - (np.mean(silent) if silent.size else 0.0))
    else:
        pauses = []
        n_pauses = 0
        pause_total_s = 0.0
        active_ratio = np.nan

    feats = Features(
        duration_s=duration,
        rms_mean=rms_mean,
        rms_std=rms_std,
        zcr_mean=zcr_mean,
        pitch_median_hz=pitch_median,
        pitch_iqr_hz=pitch_iqr,
        voiced_ratio=voiced_ratio,
        n_pauses=n_pauses,
        pause_total_s=pause_total_s,
        active_ratio=active_ratio,
    )

    # Raw arrays for the plotting helpers; pause spans are frame indices.
    artifacts = {
        "y": y,
        "sr": sr,
        "hop": hop,
        "frame": frame,
        "rms": rms,
        "zcr": zcr,
        "pitch": pitch,
        "times": times,
        "pauses": pauses,
        "rms_thr": thr if rms.size else None,
    }
    return feats, artifacts
215
+
216
+ # -----------------------------
217
+ # Plotting
218
+ # -----------------------------
219
+ def plot_waveform_with_pauses(artifacts: Dict[str, Any]) -> plt.Figure:
220
+ y = artifacts["y"]
221
+ sr = artifacts["sr"]
222
+ pauses = artifacts.get("pauses", [])
223
+ hop = artifacts.get("hop", 160)
224
+
225
+ fig = plt.figure(figsize=(10, 3.2))
226
+ ax = fig.add_subplot(111)
227
+ if y.size:
228
+ t = np.arange(len(y)) / sr
229
+ ax.plot(t, y, linewidth=0.8)
230
+ ax.set_xlim(0, t[-1] if t.size else 1)
231
+ ax.set_xlabel("Tijd (s)")
232
+ ax.set_ylabel("Amplitude")
233
+ ax.set_title("Waveform (met gedetecteerde pauzes)")
234
+
235
+ # Overlay pause regions (convert pause frames to time)
236
+ for (s, e) in pauses:
237
+ ts = s * (hop / sr)
238
+ te = e * (hop / sr)
239
+ ax.axvspan(ts, te, alpha=0.2)
240
+ else:
241
+ ax.text(0.5, 0.5, "Geen audio", ha="center", va="center")
242
+ ax.set_axis_off()
243
+
244
+ fig.tight_layout()
245
+ return fig
246
+
247
def plot_pitch(artifacts: Dict[str, Any]) -> plt.Figure:
    """Plot the pyin pitch contour, or a placeholder when pitch is unavailable."""
    contour = artifacts.get("pitch", np.array([]))
    timestamps = artifacts.get("times", np.array([]))

    fig = plt.figure(figsize=(10, 3.2))
    axis = fig.add_subplot(111)
    if contour.size and timestamps.size:
        axis.plot(timestamps, contour, linewidth=1.0)
        axis.set_xlabel("Tijd (s)")
        axis.set_ylabel("Pitch (Hz)")
        axis.set_title("Pitch contour (NaN = onvoiced)")
    else:
        axis.text(0.5, 0.5, "Pitch niet beschikbaar (te kort / te veel ruis)", ha="center", va="center")
        axis.set_axis_off()

    fig.tight_layout()
    return fig
264
+
265
+ # -----------------------------
266
+ # UI helpers
267
+ # -----------------------------
268
+ def format_features_table(feats: Features) -> List[List[str]]:
269
+ def fmt(x, kind="float"):
270
+ if x is None or (isinstance(x, float) and (not math.isfinite(x))):
271
+ return "—"
272
+ if kind == "sec":
273
+ return _human_seconds(float(x))
274
+ if kind == "int":
275
+ return str(int(x))
276
+ return f"{float(x):.3f}"
277
+
278
+ return [
279
+ ["Duur", fmt(feats.duration_s, "sec")],
280
+ ["Volume (RMS) gemiddeld", fmt(feats.rms_mean)],
281
+ ["Volume (RMS) variatie", fmt(feats.rms_std)],
282
+ ["ZCR (ruis/‘scherpte’) gemiddeld", fmt(feats.zcr_mean)],
283
+ ["Pitch mediaan", ("—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz")],
284
+ ["Pitch spreiding (IQR)", ("—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz")],
285
+ ["Voiced ratio", ("—" if not math.isfinite(feats.voiced_ratio) else f"{feats.voiced_ratio*100:.1f}%")],
286
+ ["Aantal pauzes (≥ 0.2s)", fmt(feats.n_pauses, "int")],
287
+ ["Totale pauzeduur", fmt(feats.pause_total_s, "sec")],
288
+ ["Actieve-spraak ratio", ("—" if not math.isfinite(feats.active_ratio) else f"{feats.active_ratio*100:.1f}%")],
289
+ ]
290
+
291
def explain_panel(feats: Features) -> str:
    """Compose the markdown "what does the AI see" panel.

    Only reports measurable signals; deliberately avoids any medical claim.
    """
    bullets: List[str] = []
    if math.isfinite(feats.pause_total_s):
        bullets.append(
            f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), samen {_human_seconds(feats.pause_total_s)}."
        )
    if math.isfinite(feats.pitch_median_hz):
        bullets.append(
            f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding (IQR) {feats.pitch_iqr_hz:.1f} Hz."
        )
    if math.isfinite(feats.rms_mean):
        bullets.append(
            f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; alleen vergelijken binnen dezelfde setup)."
        )
    if math.isfinite(feats.active_ratio):
        bullets.append(
            f"- **Actieve spraak**: ~ {feats.active_ratio*100:.1f}% van de tijd boven drempel."
        )
    if not bullets:
        bullets = ["- Geen features beschikbaar (audio te kort of leeg)."]

    header = (
        "### Wat ‘ziet’ de AI hier?\n"
        "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* en hoe die veranderen tussen fragmenten.\n\n"
    )
    footer = (
        "\n\n"
        "**Belangrijk:** dit systeem geeft **geen diagnose** en is **geen medisch hulpmiddel**. "
        "Gebruik dit als gespreksstarter of educatieve visualisatie."
    )
    return header + "\n".join(bullets) + footer
313
+
314
+ # -----------------------------
315
+ # Core callbacks
316
+ # -----------------------------
317
+ def analyze_single(audio: Tuple[int, np.ndarray]):
318
+ if audio is None:
319
+ return gr.Dataframe(value=[["—", "Upload of neem audio op om te starten."]]), None, None, "### Upload of neem audio op"
320
+ sr, y = audio
321
+ feats, art = compute_features(y, sr)
322
+ table = format_features_table(feats)
323
+ wf = plot_waveform_with_pauses(art)
324
+ pc = plot_pitch(art)
325
+ expl = explain_panel(feats)
326
+ return gr.Dataframe(value=table, headers=["Kenmerk", "Waarde"]), wf, pc, expl
327
+
328
def analyze_compare(a1, a2):
    """Gradio callback: compare two clips.

    Returns (similarity text, delta Dataframe, overlay Figure). Similarity is
    the cosine similarity of Wav2Vec2 embeddings, shown as a percentage.
    """
    if a1 is None or a2 is None:
        return "—", gr.Dataframe(value=[["—", "Selecteer twee fragmenten."]]), None

    # Each Gradio numpy audio input is a (sample_rate, samples) tuple.
    sr1, y1 = a1
    sr2, y2 = a2

    f1, art1 = compute_features(y1, sr1)
    f2, art2 = compute_features(y2, sr2)

    # Embed the already-resampled signals from the artifacts dicts.
    e1 = embed_audio(art1["y"], art1["sr"])
    e2 = embed_audio(art2["y"], art2["sr"])
    sim = _cosine(e1, e2)

    # Delta table
    def d(a, b):
        # Format B−A; em-dash when either side is missing/non-finite.
        if (a is None) or (b is None):
            return "—"
        if (isinstance(a, float) and not math.isfinite(a)) or (isinstance(b, float) and not math.isfinite(b)):
            return "—"
        return f"{(b - a):+.3f}"

    rows = [
        ["Duur (s)", f1.duration_s if math.isfinite(f1.duration_s) else np.nan, f2.duration_s if math.isfinite(f2.duration_s) else np.nan, d(f1.duration_s, f2.duration_s)],
        ["RMS mean", f1.rms_mean, f2.rms_mean, d(f1.rms_mean, f2.rms_mean)],
        ["Pitch mediaan (Hz)", f1.pitch_median_hz, f2.pitch_median_hz, d(f1.pitch_median_hz, f2.pitch_median_hz)],
        ["Pauzes (#)", float(f1.n_pauses), float(f2.n_pauses), f"{(f2.n_pauses - f1.n_pauses):+d}"],
        ["Pauzeduur (s)", f1.pause_total_s, f2.pause_total_s, d(f1.pause_total_s, f2.pause_total_s)],
        ["Actieve ratio", f1.active_ratio, f2.active_ratio, d(f1.active_ratio, f2.active_ratio)],
    ]

    # Format values nicely
    formatted = []
    for k, v1, v2, dv in rows:
        # NOTE(review): fmtv closes over the loop variable `k` (late binding).
        # Safe here because it is called before `k` changes, but binding it as
        # a default argument would be more robust.
        def fmtv(v):
            if isinstance(v, float) and math.isfinite(v):
                if "ratio" in k.lower():
                    return f"{v*100:.1f}%"
                if "pitch" in k.lower():
                    return f"{v:.1f}"
                if "duur" in k.lower() or "s)" in k.lower() or "(s)" in k.lower() or "RMS" in k:
                    return f"{v:.3f}"
                return f"{v:.3f}"
            if isinstance(v, (int, np.integer)):
                return str(int(v))
            return "—"
        formatted.append([k, fmtv(v1), fmtv(v2), dv])

    # Compare waveform overlay
    fig = plt.figure(figsize=(10, 3.2))
    ax = fig.add_subplot(111)
    # downsample for plotting speed
    def prep_plot(y, sr):
        # Resample to TARGET_SR and cap at the first 20 s for plotting.
        if sr != TARGET_SR:
            y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
            sr = TARGET_SR
        if y.size > sr * 20:
            y = y[: sr * 20]
        t = np.arange(len(y)) / sr
        return t, y

    t1, yy1 = prep_plot(y1, sr1)
    t2, yy2 = prep_plot(y2, sr2)
    if yy1.size:
        ax.plot(t1, yy1, linewidth=0.8, label="Fragment A")
    if yy2.size:
        ax.plot(t2, yy2, linewidth=0.8, label="Fragment B", alpha=0.8)
    ax.set_title("Waveform overlay (eerste max 20s)")
    ax.set_xlabel("Tijd (s)")
    ax.set_ylabel("Amplitude")
    ax.legend(loc="upper right")
    fig.tight_layout()

    sim_txt = f"{sim*100:.1f}%"
    return sim_txt, gr.Dataframe(value=formatted, headers=["Kenmerk", "A", "B", "Δ (B−A)"]), fig
403
+
404
+ # -----------------------------
405
+ # UI
406
+ # -----------------------------
407
+ CSS = """
408
+ :root{
409
+ --bg: #0b0f19;
410
+ --panel: rgba(255,255,255,0.06);
411
+ --panel2: rgba(255,255,255,0.09);
412
+ --text: rgba(255,255,255,0.92);
413
+ --muted: rgba(255,255,255,0.70);
414
+ --accent: #7c3aed;
415
+ --accent2: #22c55e;
416
+ --border: rgba(255,255,255,0.14);
417
+ --shadow: 0 10px 30px rgba(0,0,0,0.35);
418
+ }
419
+
420
+ .gradio-container{
421
+ background: radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%),
422
+ radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%),
423
+ radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%),
424
+ var(--bg) !important;
425
+ color: var(--text) !important;
426
+ }
427
+
428
+ #header-card{
429
+ background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14));
430
+ border: 1px solid var(--border);
431
+ border-radius: 18px;
432
+ padding: 18px 18px 14px 18px;
433
+ box-shadow: var(--shadow);
434
+ }
435
+
436
+ #header-title{
437
+ font-size: 28px;
438
+ font-weight: 750;
439
+ letter-spacing: -0.02em;
440
+ margin: 0;
441
+ }
442
+
443
+ #header-sub{
444
+ margin-top: 6px;
445
+ color: var(--muted);
446
+ font-size: 14px;
447
+ line-height: 1.45;
448
+ }
449
+
450
+ .card{
451
+ background: var(--panel);
452
+ border: 1px solid var(--border);
453
+ border-radius: 18px;
454
+ padding: 14px;
455
+ box-shadow: var(--shadow);
456
+ }
457
+
458
+ .badge{
459
+ display: inline-flex;
460
+ align-items: center;
461
+ gap: 8px;
462
+ padding: 6px 10px;
463
+ border-radius: 999px;
464
+ border: 1px solid var(--border);
465
+ background: rgba(255,255,255,0.05);
466
+ color: var(--muted);
467
+ font-size: 12px;
468
+ margin-right: 10px;
469
+ }
470
+
471
+ .badge b{
472
+ color: var(--text);
473
+ font-weight: 700;
474
+ }
475
+
476
+ a { color: rgba(255,255,255,0.9) !important; }
477
+ label, .md, .markdown { color: var(--text) !important; }
478
+ """
479
+
480
def build_demo():
    """Assemble the Gradio Blocks UI.

    Layout: a header card, two tabs (single-clip analysis, two-clip
    comparison), and an ethics/transparency accordion. Returns the Blocks
    instance for the caller to queue/launch.
    """
    with gr.Blocks(
        css=CSS,
        theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
        title="Explainable Speech Analytics (Demo)"
    ) as demo:

        # Header card with purpose / no-diagnosis / privacy badges.
        gr.HTML(
            """
            <div id="header-card">
              <p id="header-title">Explainable Speech Analytics</p>
              <div id="header-sub">
                <span class="badge"><b>Doel</b> inzicht in meetbare spraaksignalen</span>
                <span class="badge"><b>Geen diagnose</b> geen medisch hulpmiddel</span>
                <span class="badge"><b>Privacy</b> audio wordt niet opgeslagen door deze demo</span>
                <p style="margin-top:12px">
                  Upload of neem korte audiofragmenten op en bekijk <b>wat het systeem meet</b>: pauzes, pitch,
                  volume-energie en een algemene <b>audio-embedding</b> om fragmenten te vergelijken.
                  Gebruik dit als <b>educatieve visualisatie</b> of gespreksstarter — niet als klinische beslissing.
                </p>
              </div>
            </div>
            """
        )

        with gr.Tabs():
            # Tab 1: analyze a single clip.
            with gr.TabItem("Analyse (1 fragment)"):
                with gr.Row():
                    with gr.Column(scale=5):
                        input_audio = gr.Audio(
                            label="Audio",
                            sources=["upload", "microphone"],
                            type="numpy",
                        )
                        run_btn = gr.Button("Analyseer", variant="primary")
                        with gr.Accordion("Wat gebeurt er technisch?", open=False):
                            gr.Markdown(
                                """
                                - **Akoestiek**: we extraheren frame-based signalen (RMS, ZCR), schatten **pitch** met *pyin*,
                                  en detecteren **pauzes** met een adaptieve energiedrempel.
                                - **Embedding**: een vooraf getraind **Wav2Vec2**-model maakt een vaste vector (embedding) van de audio
                                  waarmee we fragmenten **onderling** kunnen vergelijken (cosine similarity).
                                - **Explainable by design**: we tonen de signalen en deltas, niet alleen een score.
                                """
                            )
                    with gr.Column(scale=7):
                        with gr.Row():
                            feat_df = gr.Dataframe(
                                headers=["Kenmerk", "Waarde"],
                                datatype=["str", "str"],
                                interactive=False,
                                wrap=True,
                                label="Meetbare kenmerken"
                            )
                        with gr.Row():
                            wf_plot = gr.Plot(label="Waveform + pauzes")
                        with gr.Row():
                            pitch_plot = gr.Plot(label="Pitch")
                        explanation = gr.Markdown("### Upload of neem audio op", elem_classes=["card"])

                run_btn.click(analyze_single, inputs=[input_audio], outputs=[feat_df, wf_plot, pitch_plot, explanation])

            # Tab 2: compare two clips via features + embedding similarity.
            with gr.TabItem("Vergelijk (2 fragmenten)"):
                with gr.Row():
                    with gr.Column(scale=5):
                        a1 = gr.Audio(label="Fragment A", sources=["upload", "microphone"], type="numpy")
                        a2 = gr.Audio(label="Fragment B", sources=["upload", "microphone"], type="numpy")
                        compare_btn = gr.Button("Vergelijk", variant="primary")
                        gr.Markdown(
                            """
                            **Interpretatie-tip:** een lagere overeenkomst betekent alleen dat de audio *anders* is
                            (andere omgeving, microfoon, emotie, vermoeidheid, etc.). Het zegt **niet** *waarom*.
                            """
                        )
                    with gr.Column(scale=7):
                        sim_out = gr.Textbox(label="Embedding-overeenkomst (cosine similarity)", value="—", interactive=False)
                        delta_df = gr.Dataframe(
                            headers=["Kenmerk", "A", "B", "Δ (B−A)"],
                            datatype=["str", "str", "str", "str"],
                            interactive=False,
                            wrap=True,
                            label="Verschillen (uitlegbaar)"
                        )
                        overlay_plot = gr.Plot(label="Waveform overlay")

                compare_btn.click(analyze_compare, inputs=[a1, a2], outputs=[sim_out, delta_df, overlay_plot])

        # Transparency / ethics notes shown below the tabs.
        with gr.Accordion("Ethiek & transparantie (anti–black box)", open=False):
            gr.Markdown(
                """
                **Hoe voorkomt deze demo ‘black box’ gedrag?**
                - We tonen **de signalen** (pauzes, pitch, energie) in grafieken en tabellen.
                - We tonen **verschillen** tussen fragmenten, i.p.v. één eindlabel.
                - We geven **geen diagnose** of medische claim; de output is bedoeld als **observatie**.
                - In een zorgcontext hoort interpretatie altijd samen te gaan met **context + gesprek + klinisch oordeel**.

                **Let op:** als je dit ooit richting praktijk wilt brengen, heb je o.a. nodig:
                governance, dataminimalisatie, DPIA/AVG, bias-audit, modelmonitoring, en duidelijke ‘human-in-the-loop’ afspraken.
                """
            )

    return demo
582
+
583
if __name__ == "__main__":
    # Entry point: build the Blocks UI, enable request queueing (bounds the
    # number of concurrent jobs hitting the model), then start the server.
    demo = build_demo()
    demo.queue(max_size=32)
    demo.launch()
requirements.txt.txt ADDED  (NOTE: doubled “.txt” extension — the file should be named `requirements.txt`, otherwise the dependency list will not be picked up by pip / Hugging Face Spaces)
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ numpy>=1.24
3
+ scipy>=1.10
4
+ librosa>=0.10.2.post1
5
+ soundfile>=0.12.1
6
+ matplotlib>=3.7
7
+ torch>=2.1
8
+ transformers>=4.41