Marcel0123 commited on
Commit
d1034a7
·
verified ·
1 Parent(s): 8358e4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -154
app.py CHANGED
@@ -1,20 +1,23 @@
1
  import os
2
  import math
 
3
  import numpy as np
4
  import gradio as gr
5
- import librosa
6
  import matplotlib.pyplot as plt
7
 
8
  from dataclasses import dataclass
9
  from typing import Dict, Any, Tuple, List, Optional
10
 
 
 
 
 
11
  # =========================================================
12
  # Config
13
  # =========================================================
14
  TARGET_SR = 16000
15
  APP_DIR = os.path.dirname(os.path.abspath(__file__))
16
 
17
-
18
  # =========================================================
19
  # Helpers
20
  # =========================================================
@@ -44,11 +47,37 @@ def list_bundled_audio() -> List[str]:
44
  return files
45
 
46
 
 
 
 
 
 
 
 
 
 
47
  def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
48
- y, sr = librosa.load(path, sr=None, mono=True)
49
- if y is None or len(y) == 0:
50
- return np.array([], dtype=np.float32), TARGET_SR
51
- return y.astype(np.float32), int(sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
  def diagnostics_text() -> str:
@@ -67,10 +96,9 @@ def diagnostics_text() -> str:
67
  lines.append(f"- `{fn}` (size unknown)")
68
  else:
69
  lines.append("- *(none found next to app.py)*")
70
-
71
  lines.append("")
72
- lines.append("**Microphone note:** recording can be blocked by browser permissions / corporate policy.")
73
- lines.append("Try opening the Space in a new tab and allow microphone access.")
74
  return "\n".join(lines)
75
 
76
 
@@ -79,7 +107,7 @@ def _finite(x: float) -> bool:
79
 
80
 
81
  # =========================================================
82
- # Features
83
  # =========================================================
84
  @dataclass
85
  class Features:
@@ -95,8 +123,63 @@ class Features:
95
  active_ratio: float
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
99
- if y is None or len(y) == 0:
100
  f = Features(
101
  duration_s=float("nan"),
102
  rms_mean=float("nan"),
@@ -111,62 +194,45 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
111
  )
112
  return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
113
 
114
- # Resample to stable SR
115
  if sr != TARGET_SR:
116
- y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=TARGET_SR)
117
  sr = TARGET_SR
118
  else:
119
  y = y.astype(np.float32)
120
 
121
- # Normalize
122
  mx = float(np.max(np.abs(y))) + 1e-9
123
  y = y / mx
124
 
125
- duration = float(len(y) / sr)
126
- hop = 160
127
- frame = 400
128
 
129
- rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
130
- zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]
 
 
 
 
131
 
132
  rms_mean = float(np.mean(rms)) if rms.size else float("nan")
133
  rms_std = float(np.std(rms)) if rms.size else float("nan")
134
  zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
135
 
136
- # Pitch via pyin
137
- try:
138
- f0, _, _ = librosa.pyin(
139
- y,
140
- fmin=librosa.note_to_hz("C2"),
141
- fmax=librosa.note_to_hz("C7"),
142
- sr=sr,
143
- frame_length=frame,
144
- hop_length=hop,
145
- )
146
- except Exception:
147
- f0 = None
148
-
149
- if f0 is None:
150
- pitch = np.array([])
151
- times = np.array([])
152
  pitch_median = float("nan")
153
  pitch_iqr = float("nan")
154
- voiced_ratio = float("nan")
155
- else:
156
- pitch = np.asarray(f0, dtype=np.float32)
157
- times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
158
- voiced = np.isfinite(pitch)
159
- voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
160
- if np.any(voiced):
161
- pv = pitch[voiced]
162
- pitch_median = float(np.median(pv))
163
- q75, q25 = np.percentile(pv, [75, 25])
164
- pitch_iqr = float(q75 - q25)
165
- else:
166
- pitch_median = float("nan")
167
- pitch_iqr = float("nan")
168
-
169
- # Pause detection
170
  if rms.size:
171
  thr = float(np.percentile(rms, 20)) * 0.8
172
  silent = rms < thr
@@ -209,7 +275,7 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
209
  active_ratio=active_ratio,
210
  )
211
 
212
- artifacts = {"y": y, "sr": sr, "hop": hop, "rms": rms, "zcr": zcr, "pitch": pitch, "times": times, "pauses": pauses}
213
  return feats, artifacts
214
 
215
 
@@ -247,18 +313,18 @@ def plot_pitch(art: Dict[str, Any]) -> plt.Figure:
247
  ax = fig.add_subplot(111)
248
  if pitch.size and times.size:
249
  ax.plot(times, pitch, linewidth=1.0)
250
- ax.set_title("Pitch contour (NaN = unvoiced)")
251
  ax.set_xlabel("Time (s)")
252
  ax.set_ylabel("Pitch (Hz)")
253
  else:
254
- ax.text(0.5, 0.5, "Pitch not available (too short/noisy)", ha="center", va="center")
255
  ax.set_axis_off()
256
  fig.tight_layout()
257
  return fig
258
 
259
 
260
  # =========================================================
261
- # Tables + Explanations
262
  # =========================================================
263
  def features_table(feats: Features) -> List[List[str]]:
264
  def f3(x):
@@ -277,102 +343,38 @@ def features_table(feats: Features) -> List[List[str]]:
277
  ]
278
 
279
 
280
- def explain_single(feats: Features) -> str:
281
- return (
282
- "### What does the system ‘see’ here?\n"
283
- "- It shows **measurable signals**: pauses, pitch and energy.\n"
284
- "- This is **not a diagnosis** and **not a medical device**.\n"
285
- )
286
-
287
-
288
- def interpret_delta(label: str, delta: float) -> str:
289
- """
290
- Very conservative, explainable interpretation. No clinical claims.
291
- """
292
- if not _finite(delta):
293
- return f"- **{label}**: not available."
294
- # Use direction-only interpretations
295
- if "pause" in label.lower():
296
- if delta > 0:
297
- return f"- **{label}** increased. This can reflect slower speech, more hesitations, fatigue, distraction, or noise/environment changes."
298
- if delta < 0:
299
- return f"- **{label}** decreased. This can reflect more continuous speech or fewer hesitations."
300
- return f"- **{label}** stayed similar."
301
- if "pitch" in label.lower():
302
- if delta > 0:
303
- return f"- **{label}** increased. This can reflect different speaking style, emotion, or prosody changes."
304
- if delta < 0:
305
- return f"- **{label}** decreased. This can reflect a flatter/less variable prosody or a different speaking style."
306
- return f"- **{label}** stayed similar."
307
- if "rms" in label.lower() or "energy" in label.lower():
308
- if delta > 0:
309
- return f"- **{label}** increased. This can reflect speaking louder/closer to mic, or a quieter environment."
310
- if delta < 0:
311
- return f"- **{label}** decreased. This can reflect speaking softer/farther from mic, or a noisier environment."
312
- return f"- **{label}** stayed similar."
313
- if "active speech" in label.lower():
314
- if delta > 0:
315
- return f"- **{label}** increased. More time above the energy threshold (more continuous speech or less silence)."
316
- if delta < 0:
317
- return f"- **{label}** decreased. More time below threshold (more silence/pauses)."
318
- return f"- **{label}** stayed similar."
319
- return f"- **{label}** changed by {delta:+.3f}."
320
-
321
-
322
  def summary_of_changes(first: Features, last: Features) -> str:
323
- """
324
- Compare first vs last recording in the timeline.
325
- Generates an explainable summary + cautious interpretation.
326
- """
327
- # compute deltas (last - first)
328
- d_pause_total = (last.pause_total_s - first.pause_total_s) if (_finite(last.pause_total_s) and _finite(first.pause_total_s)) else float("nan")
329
- d_n_pauses = (last.n_pauses - first.n_pauses) if (last.n_pauses is not None and first.n_pauses is not None) else float("nan")
330
- d_pitch = (last.pitch_median_hz - first.pitch_median_hz) if (_finite(last.pitch_median_hz) and _finite(first.pitch_median_hz)) else float("nan")
331
- d_rms = (last.rms_mean - first.rms_mean) if (_finite(last.rms_mean) and _finite(first.rms_mean)) else float("nan")
332
- d_active = (last.active_ratio - first.active_ratio) if (_finite(last.active_ratio) and _finite(first.active_ratio)) else float("nan")
333
-
334
- # small helper formatting
335
  def fmt(x, unit=""):
336
  if not _finite(x):
337
  return "—"
338
- if unit == "%":
339
- return f"{x*100:+.1f}%"
340
  return f"{x:+.3f}{unit}"
341
 
 
 
 
 
 
 
342
  lines = []
343
  lines.append("### Summary of changes (last vs first)")
344
- lines.append("This compares the **first** and **last** recording you provided (chronological order recommended).")
345
  lines.append("")
346
  lines.append("**Measured differences (Δ = last − first):**")
347
- lines.append(f"- Total pause time: **{fmt(d_pause_total, 's')}**")
348
- lines.append(f"- Number of pauses: **{d_n_pauses:+d}**" if isinstance(d_n_pauses, int) else f"- Number of pauses: **{fmt(d_n_pauses)}**")
349
  lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
350
  lines.append(f"- RMS energy: **{fmt(d_rms)}**")
351
- lines.append(f"- Active speech ratio: **{fmt(d_active, '%')}**")
352
  lines.append("")
353
- lines.append("**Possible (non-clinical) interpretations:**")
354
- lines.append(interpret_delta("Total pause time", d_pause_total))
355
- lines.append(interpret_delta("Number of pauses", float(d_n_pauses) if isinstance(d_n_pauses, int) else d_n_pauses))
356
- lines.append(interpret_delta("Median pitch", d_pitch))
357
- lines.append(interpret_delta("RMS energy", d_rms))
358
- lines.append(interpret_delta("Active speech ratio", d_active))
359
  lines.append("")
360
- lines.append(
361
- "**Important:** these are **speech-signal explanations**, not a diagnosis. "
362
- "Real-world meaning depends on context (device, environment, fatigue, stress, medication, etc.)."
363
- )
364
  return "\n".join(lines)
365
 
366
 
367
- def explain_timeline() -> str:
368
- return (
369
- "### Timeline principle\n"
370
- "- Use **multiple recordings of the same person**.\n"
371
- "- The key is **within-person change over time** relative to baseline.\n"
372
- "- The Summary box explains **what changed** (signals) and gives cautious, non-clinical interpretations.\n"
373
- )
374
-
375
-
376
  # =========================================================
377
  # Callbacks
378
  # =========================================================
@@ -381,7 +383,7 @@ def analyze_one(audio_path: Optional[str]):
381
  return [], None, None, "### Upload or record audio to start."
382
  y, sr = load_audio_file(audio_path)
383
  feats, art = compute_features(y, sr)
384
- return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art), explain_single(feats)
385
 
386
 
387
  def analyze_many_paths(paths: List[str]):
@@ -389,14 +391,12 @@ def analyze_many_paths(paths: List[str]):
389
  return (
390
  [[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
391
  None,
392
- explain_timeline(),
393
- "### Upload/select at least 2 recordings to generate a summary."
394
  )
395
 
396
  rows = []
397
  pause_series, pitch_series, rms_series = [], [], []
398
-
399
- # store first/last features for summary
400
  feats_first = None
401
  feats_last = None
402
 
@@ -445,15 +445,14 @@ def analyze_many_paths(paths: List[str]):
445
  if feats_first is not None and feats_last is not None:
446
  summary = summary_of_changes(feats_first, feats_last)
447
 
448
- return rows, fig, explain_timeline(), summary
449
 
450
 
451
  def analyze_many_uploaded(files):
452
  paths = []
453
  if files:
454
  for f in files:
455
- p = getattr(f, "name", None) or str(f)
456
- paths.append(p)
457
  return analyze_many_paths(paths)
458
 
459
 
@@ -500,16 +499,14 @@ CSS = """
500
  }
501
  .card *{ color: #0b0f19 !important; }
502
 
503
- /* Tabs: make readable on dark background */
504
  div[role="tablist"]{
505
  background: rgba(255,255,255,0.06) !important;
506
  border: 1px solid rgba(255,255,255,0.14) !important;
507
  border-radius: 14px !important;
508
  padding: 6px !important;
509
  }
510
- button[role="tab"]{
511
- color: rgba(255,255,255,0.92) !important;
512
- }
513
  button[role="tab"][aria-selected="true"]{
514
  color: rgba(255,255,255,0.98) !important;
515
  border-bottom: 2px solid rgba(255,255,255,0.65) !important;
@@ -542,7 +539,6 @@ def build_ui():
542
  with gr.Column(scale=5):
543
  audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
544
  run = gr.Button("Analyze", variant="primary")
545
- gr.Markdown("If mic doesn’t work, try upload first. Then check Diagnostics.", elem_classes=["card"])
546
  with gr.Column(scale=7):
547
  feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
548
  wf_plot = gr.Plot(label="Waveform + pauses")
@@ -553,18 +549,16 @@ def build_ui():
553
  with gr.TabItem("Timeline"):
554
  with gr.Row():
555
  with gr.Column(scale=5):
556
- gr.Markdown("#### Option A — Upload from your computer")
557
  files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
558
  run_up = gr.Button("Analyze uploaded timeline", variant="primary")
559
 
560
- gr.Markdown("#### Option B — Use bundled samples (repo root)")
561
  bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
562
  with gr.Row():
563
  refresh_btn = gr.Button("Refresh list", variant="secondary")
564
  run_b = gr.Button("Analyze selected bundled", variant="secondary")
565
 
566
- gr.Markdown("Order matters: first = baseline, last = comparison.", elem_classes=["card"])
567
-
568
  with gr.Column(scale=7):
569
  timeline_df = gr.Dataframe(
570
  headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
@@ -572,8 +566,8 @@ def build_ui():
572
  wrap=True,
573
  )
574
  timeline_plot = gr.Plot(label="Trend plot")
575
- timeline_expl = gr.Markdown(explain_timeline(), elem_classes=["card"])
576
- timeline_summary = gr.Markdown("### Summary will appear here after analysis.", elem_classes=["card"])
577
 
578
  run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
579
  run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
@@ -583,7 +577,6 @@ def build_ui():
583
  diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
584
  diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
585
 
586
- # Refresh bundled choices AND diagnostics
587
  refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
588
 
589
  return demo
 
1
  import os
2
  import math
3
+ import tempfile
4
  import numpy as np
5
  import gradio as gr
 
6
  import matplotlib.pyplot as plt
7
 
8
  from dataclasses import dataclass
9
  from typing import Dict, Any, Tuple, List, Optional
10
 
11
+ import soundfile as sf
12
+ from pydub import AudioSegment
13
+ from scipy.signal import correlate
14
+
15
  # =========================================================
16
  # Config
17
  # =========================================================
18
  TARGET_SR = 16000
19
  APP_DIR = os.path.dirname(os.path.abspath(__file__))
20
 
 
21
  # =========================================================
22
  # Helpers
23
  # =========================================================
 
47
  return files
48
 
49
 
50
+ def _resample_linear(y: np.ndarray, sr: int, target_sr: int) -> np.ndarray:
51
+ if sr == target_sr or y.size == 0:
52
+ return y
53
+ x_old = np.linspace(0.0, 1.0, num=y.size, endpoint=False)
54
+ new_len = int(round(y.size * (target_sr / sr)))
55
+ x_new = np.linspace(0.0, 1.0, num=max(new_len, 1), endpoint=False)
56
+ return np.interp(x_new, x_old, y).astype(np.float32)
57
+
58
+
59
def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
    """Decode an audio file to a mono float32 waveform.

    WAV/FLAC/OGG are read directly with soundfile; any other extension
    (MP3, M4A, ...) is decoded through pydub, which shells out to ffmpeg.

    Returns:
        (samples, sample_rate) where samples is a mono float32 array.
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix in (".wav", ".flac", ".ogg"):
        data, rate = sf.read(path, always_2d=True)
        # always_2d guarantees a channel axis; down-mix by averaging.
        mono = data.mean(axis=1).astype(np.float32)
        return mono, int(rate)

    # Fallback decoder for compressed formats (requires ffmpeg via pydub).
    segment = AudioSegment.from_file(path)
    segment = segment.set_channels(1)
    raw = np.array(segment.get_array_of_samples())
    # Scale signed PCM integers into [-1, 1] based on the sample width.
    full_scale = float(2 ** (8 * segment.sample_width - 1))
    return raw.astype(np.float32) / full_scale, int(segment.frame_rate)
81
 
82
 
83
  def diagnostics_text() -> str:
 
96
  lines.append(f"- `{fn}` (size unknown)")
97
  else:
98
  lines.append("- *(none found next to app.py)*")
 
99
  lines.append("")
100
+ lines.append("**If build hangs:** usually heavy deps (e.g. librosa/numba). This version avoids them.")
101
+ lines.append("**Microphone note:** may be blocked by browser permissions/corporate policy.")
102
  return "\n".join(lines)
103
 
104
 
 
107
 
108
 
109
  # =========================================================
110
+ # Feature extraction (no librosa)
111
  # =========================================================
112
  @dataclass
113
  class Features:
 
123
  active_ratio: float
124
 
125
 
126
+ def _frame_signal(y: np.ndarray, frame: int, hop: int) -> np.ndarray:
127
+ if y.size < frame:
128
+ return np.zeros((0, frame), dtype=np.float32)
129
+ n = 1 + (y.size - frame) // hop
130
+ idx = (np.arange(n)[:, None] * hop) + np.arange(frame)[None, :]
131
+ return y[idx]
132
+
133
+
134
+ def _rms_per_frame(frames: np.ndarray) -> np.ndarray:
135
+ if frames.size == 0:
136
+ return np.array([], dtype=np.float32)
137
+ return np.sqrt(np.mean(frames * frames, axis=1) + 1e-12).astype(np.float32)
138
+
139
+
140
+ def _zcr_per_frame(frames: np.ndarray) -> np.ndarray:
141
+ if frames.size == 0:
142
+ return np.array([], dtype=np.float32)
143
+ signs = np.sign(frames)
144
+ signs[signs == 0] = 1
145
+ zc = np.mean(signs[:, 1:] != signs[:, :-1], axis=1).astype(np.float32)
146
+ return zc
147
+
148
+
149
+ def _pitch_autocorr(frame: np.ndarray, sr: int, fmin: float = 70.0, fmax: float = 350.0) -> float:
150
+ """
151
+ Simple autocorrelation pitch estimate for one frame.
152
+ Returns Hz or NaN.
153
+ """
154
+ if frame.size == 0:
155
+ return float("nan")
156
+ frame = frame - np.mean(frame)
157
+ energy = np.sqrt(np.mean(frame * frame) + 1e-12)
158
+ if energy < 0.01:
159
+ return float("nan")
160
+
161
+ ac = correlate(frame, frame, mode="full")
162
+ ac = ac[ac.size // 2 :]
163
+
164
+ min_lag = int(sr / fmax)
165
+ max_lag = int(sr / fmin)
166
+ if max_lag <= min_lag + 2 or max_lag >= ac.size:
167
+ return float("nan")
168
+
169
+ seg = ac[min_lag:max_lag]
170
+ if seg.size == 0:
171
+ return float("nan")
172
+
173
+ i = int(np.argmax(seg))
174
+ lag = min_lag + i
175
+
176
+ if lag <= 0:
177
+ return float("nan")
178
+ return float(sr / lag)
179
+
180
+
181
  def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
182
+ if y is None or y.size == 0:
183
  f = Features(
184
  duration_s=float("nan"),
185
  rms_mean=float("nan"),
 
194
  )
195
  return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
196
 
197
+ # resample + normalize
198
  if sr != TARGET_SR:
199
+ y = _resample_linear(y.astype(np.float32), sr, TARGET_SR)
200
  sr = TARGET_SR
201
  else:
202
  y = y.astype(np.float32)
203
 
 
204
  mx = float(np.max(np.abs(y))) + 1e-9
205
  y = y / mx
206
 
207
+ duration = float(y.size / sr)
 
 
208
 
209
+ hop = 160 # 10ms
210
+ frame = 400 # 25ms
211
+
212
+ frames = _frame_signal(y, frame=frame, hop=hop)
213
+ rms = _rms_per_frame(frames)
214
+ zcr = _zcr_per_frame(frames)
215
 
216
  rms_mean = float(np.mean(rms)) if rms.size else float("nan")
217
  rms_std = float(np.std(rms)) if rms.size else float("nan")
218
  zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
219
 
220
+ # pitch per frame (simple + explainable)
221
+ pitch = np.array([_pitch_autocorr(frames[i], sr) for i in range(frames.shape[0])], dtype=np.float32)
222
+ times = (np.arange(pitch.size) * hop / sr).astype(np.float32)
223
+
224
+ voiced = np.isfinite(pitch) & (pitch > 0)
225
+ voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
226
+ if np.any(voiced):
227
+ pv = pitch[voiced]
228
+ pitch_median = float(np.median(pv))
229
+ q75, q25 = np.percentile(pv, [75, 25])
230
+ pitch_iqr = float(q75 - q25)
231
+ else:
 
 
 
 
232
  pitch_median = float("nan")
233
  pitch_iqr = float("nan")
234
+
235
+ # pause detection via RMS threshold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  if rms.size:
237
  thr = float(np.percentile(rms, 20)) * 0.8
238
  silent = rms < thr
 
275
  active_ratio=active_ratio,
276
  )
277
 
278
+ artifacts = {"y": y, "sr": sr, "hop": hop, "pauses": pauses, "pitch": pitch, "times": times}
279
  return feats, artifacts
280
 
281
 
 
313
  ax = fig.add_subplot(111)
314
  if pitch.size and times.size:
315
  ax.plot(times, pitch, linewidth=1.0)
316
+ ax.set_title("Pitch contour (simple autocorrelation)")
317
  ax.set_xlabel("Time (s)")
318
  ax.set_ylabel("Pitch (Hz)")
319
  else:
320
+ ax.text(0.5, 0.5, "Pitch not available", ha="center", va="center")
321
  ax.set_axis_off()
322
  fig.tight_layout()
323
  return fig
324
 
325
 
326
  # =========================================================
327
+ # Explanations + summary
328
  # =========================================================
329
  def features_table(feats: Features) -> List[List[str]]:
330
  def f3(x):
 
343
  ]
344
 
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
def summary_of_changes(first: Features, last: Features) -> str:
    """Build a Markdown summary comparing the last recording to the first.

    Reports signed deltas (last - first) for pause time, pause count,
    median pitch, RMS energy and active-speech ratio, plus cautious,
    non-clinical interpretation notes. Missing values render as an
    em dash instead of crashing the report.
    """
    def fmt(x, unit=""):
        if not _finite(x):
            return "—"
        return f"{x:+.3f}{unit}"

    def delta(a, b):
        # Signed difference, NaN when either side is missing.
        return (b - a) if (_finite(a) and _finite(b)) else float("nan")

    d_pause_total = delta(first.pause_total_s, last.pause_total_s)
    d_pitch = delta(first.pitch_median_hz, last.pitch_median_hz)
    d_rms = delta(first.rms_mean, last.rms_mean)
    d_active = delta(first.active_ratio, last.active_ratio)

    # Pause counts may be NaN/None for empty recordings; guard before the
    # integer format spec (":+d" raises ValueError/TypeError otherwise).
    try:
        n_pauses_text = f"{int(last.n_pauses) - int(first.n_pauses):+d}"
    except (TypeError, ValueError):
        n_pauses_text = "—"

    lines = []
    lines.append("### Summary of changes (last vs first)")
    lines.append("This compares the **first** and **last** recording in your selection (upload order).")
    lines.append("")
    lines.append("**Measured differences (Δ = last − first):**")
    lines.append(f"- Total pause time: **{fmt(d_pause_total, ' s')}**")
    lines.append(f"- Number of pauses: **{n_pauses_text}**")
    lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
    lines.append(f"- RMS energy: **{fmt(d_rms)}**")
    lines.append(f"- Active speech ratio: **{fmt(d_active * 100.0, ' %')}**")
    lines.append("")
    lines.append("**How to interpret (non-clinical):**")
    lines.append("- More pauses / lower active ratio can reflect hesitations, slower speech, fatigue, or different environment/microphone setup.")
    lines.append("- Pitch changes can reflect speaking style, prosody, emotion, or recording conditions.")
    lines.append("- Energy changes often reflect distance to microphone / loudness / background noise.")
    lines.append("")
    lines.append("**Important:** not a diagnosis. These are explainable signal-level comparisons.")
    return "\n".join(lines)
376
 
377
 
 
 
 
 
 
 
 
 
 
378
  # =========================================================
379
  # Callbacks
380
  # =========================================================
 
383
  return [], None, None, "### Upload or record audio to start."
384
  y, sr = load_audio_file(audio_path)
385
  feats, art = compute_features(y, sr)
386
+ return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art), "### This shows measurable signals (no diagnosis)."
387
 
388
 
389
  def analyze_many_paths(paths: List[str]):
 
391
  return (
392
  [[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
393
  None,
394
+ "### Select at least 2 recordings to see a trend.",
395
+ "### Summary will appear here."
396
  )
397
 
398
  rows = []
399
  pause_series, pitch_series, rms_series = [], [], []
 
 
400
  feats_first = None
401
  feats_last = None
402
 
 
445
  if feats_first is not None and feats_last is not None:
446
  summary = summary_of_changes(feats_first, feats_last)
447
 
448
+ return rows, fig, "### Trend over time (within-person).", summary
449
 
450
 
451
  def analyze_many_uploaded(files):
452
  paths = []
453
  if files:
454
  for f in files:
455
+ paths.append(getattr(f, "name", None) or str(f))
 
456
  return analyze_many_paths(paths)
457
 
458
 
 
499
  }
500
  .card *{ color: #0b0f19 !important; }
501
 
502
+ /* Tabs readable on dark background */
503
  div[role="tablist"]{
504
  background: rgba(255,255,255,0.06) !important;
505
  border: 1px solid rgba(255,255,255,0.14) !important;
506
  border-radius: 14px !important;
507
  padding: 6px !important;
508
  }
509
+ button[role="tab"]{ color: rgba(255,255,255,0.92) !important; }
 
 
510
  button[role="tab"][aria-selected="true"]{
511
  color: rgba(255,255,255,0.98) !important;
512
  border-bottom: 2px solid rgba(255,255,255,0.65) !important;
 
539
  with gr.Column(scale=5):
540
  audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
541
  run = gr.Button("Analyze", variant="primary")
 
542
  with gr.Column(scale=7):
543
  feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
544
  wf_plot = gr.Plot(label="Waveform + pauses")
 
549
  with gr.TabItem("Timeline"):
550
  with gr.Row():
551
  with gr.Column(scale=5):
552
+ gr.Markdown("#### Option A — Upload")
553
  files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
554
  run_up = gr.Button("Analyze uploaded timeline", variant="primary")
555
 
556
+ gr.Markdown("#### Option B — Bundled samples (repo root)")
557
  bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
558
  with gr.Row():
559
  refresh_btn = gr.Button("Refresh list", variant="secondary")
560
  run_b = gr.Button("Analyze selected bundled", variant="secondary")
561
 
 
 
562
  with gr.Column(scale=7):
563
  timeline_df = gr.Dataframe(
564
  headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
 
566
  wrap=True,
567
  )
568
  timeline_plot = gr.Plot(label="Trend plot")
569
+ timeline_expl = gr.Markdown("### Select at least 2 recordings.", elem_classes=["card"])
570
+ timeline_summary = gr.Markdown("### Summary will appear here.", elem_classes=["card"])
571
 
572
  run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
573
  run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
 
577
  diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
578
  diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
579
 
 
580
  refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
581
 
582
  return demo