Update app.py
Browse files
app.py
CHANGED
|
@@ -1,20 +1,23 @@
|
|
| 1 |
import os
|
| 2 |
import math
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import gradio as gr
|
| 5 |
-
import librosa
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from typing import Dict, Any, Tuple, List, Optional
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# =========================================================
|
| 12 |
# Config
|
| 13 |
# =========================================================
|
| 14 |
TARGET_SR = 16000
|
| 15 |
APP_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 16 |
|
| 17 |
-
|
| 18 |
# =========================================================
|
| 19 |
# Helpers
|
| 20 |
# =========================================================
|
|
@@ -44,11 +47,37 @@ def list_bundled_audio() -> List[str]:
|
|
| 44 |
return files
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def diagnostics_text() -> str:
|
|
@@ -67,10 +96,9 @@ def diagnostics_text() -> str:
|
|
| 67 |
lines.append(f"- `{fn}` (size unknown)")
|
| 68 |
else:
|
| 69 |
lines.append("- *(none found next to app.py)*")
|
| 70 |
-
|
| 71 |
lines.append("")
|
| 72 |
-
lines.append("**
|
| 73 |
-
lines.append("
|
| 74 |
return "\n".join(lines)
|
| 75 |
|
| 76 |
|
|
@@ -79,7 +107,7 @@ def _finite(x: float) -> bool:
|
|
| 79 |
|
| 80 |
|
| 81 |
# =========================================================
|
| 82 |
-
#
|
| 83 |
# =========================================================
|
| 84 |
@dataclass
|
| 85 |
class Features:
|
|
@@ -95,8 +123,63 @@ class Features:
|
|
| 95 |
active_ratio: float
|
| 96 |
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
| 99 |
-
if y is None or
|
| 100 |
f = Features(
|
| 101 |
duration_s=float("nan"),
|
| 102 |
rms_mean=float("nan"),
|
|
@@ -111,62 +194,45 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
|
| 111 |
)
|
| 112 |
return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
|
| 113 |
|
| 114 |
-
#
|
| 115 |
if sr != TARGET_SR:
|
| 116 |
-
y =
|
| 117 |
sr = TARGET_SR
|
| 118 |
else:
|
| 119 |
y = y.astype(np.float32)
|
| 120 |
|
| 121 |
-
# Normalize
|
| 122 |
mx = float(np.max(np.abs(y))) + 1e-9
|
| 123 |
y = y / mx
|
| 124 |
|
| 125 |
-
duration = float(
|
| 126 |
-
hop = 160
|
| 127 |
-
frame = 400
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
rms_mean = float(np.mean(rms)) if rms.size else float("nan")
|
| 133 |
rms_std = float(np.std(rms)) if rms.size else float("nan")
|
| 134 |
zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
|
| 135 |
|
| 136 |
-
#
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
if f0 is None:
|
| 150 |
-
pitch = np.array([])
|
| 151 |
-
times = np.array([])
|
| 152 |
pitch_median = float("nan")
|
| 153 |
pitch_iqr = float("nan")
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
pitch = np.asarray(f0, dtype=np.float32)
|
| 157 |
-
times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
|
| 158 |
-
voiced = np.isfinite(pitch)
|
| 159 |
-
voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
|
| 160 |
-
if np.any(voiced):
|
| 161 |
-
pv = pitch[voiced]
|
| 162 |
-
pitch_median = float(np.median(pv))
|
| 163 |
-
q75, q25 = np.percentile(pv, [75, 25])
|
| 164 |
-
pitch_iqr = float(q75 - q25)
|
| 165 |
-
else:
|
| 166 |
-
pitch_median = float("nan")
|
| 167 |
-
pitch_iqr = float("nan")
|
| 168 |
-
|
| 169 |
-
# Pause detection
|
| 170 |
if rms.size:
|
| 171 |
thr = float(np.percentile(rms, 20)) * 0.8
|
| 172 |
silent = rms < thr
|
|
@@ -209,7 +275,7 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
|
| 209 |
active_ratio=active_ratio,
|
| 210 |
)
|
| 211 |
|
| 212 |
-
artifacts = {"y": y, "sr": sr, "hop": hop, "
|
| 213 |
return feats, artifacts
|
| 214 |
|
| 215 |
|
|
@@ -247,18 +313,18 @@ def plot_pitch(art: Dict[str, Any]) -> plt.Figure:
|
|
| 247 |
ax = fig.add_subplot(111)
|
| 248 |
if pitch.size and times.size:
|
| 249 |
ax.plot(times, pitch, linewidth=1.0)
|
| 250 |
-
ax.set_title("Pitch contour (
|
| 251 |
ax.set_xlabel("Time (s)")
|
| 252 |
ax.set_ylabel("Pitch (Hz)")
|
| 253 |
else:
|
| 254 |
-
ax.text(0.5, 0.5, "Pitch not available
|
| 255 |
ax.set_axis_off()
|
| 256 |
fig.tight_layout()
|
| 257 |
return fig
|
| 258 |
|
| 259 |
|
| 260 |
# =========================================================
|
| 261 |
-
#
|
| 262 |
# =========================================================
|
| 263 |
def features_table(feats: Features) -> List[List[str]]:
|
| 264 |
def f3(x):
|
|
@@ -277,102 +343,38 @@ def features_table(feats: Features) -> List[List[str]]:
|
|
| 277 |
]
|
| 278 |
|
| 279 |
|
| 280 |
-
def explain_single(feats: Features) -> str:
|
| 281 |
-
return (
|
| 282 |
-
"### What does the system ‘see’ here?\n"
|
| 283 |
-
"- It shows **measurable signals**: pauses, pitch and energy.\n"
|
| 284 |
-
"- This is **not a diagnosis** and **not a medical device**.\n"
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
def interpret_delta(label: str, delta: float) -> str:
|
| 289 |
-
"""
|
| 290 |
-
Very conservative, explainable interpretation. No clinical claims.
|
| 291 |
-
"""
|
| 292 |
-
if not _finite(delta):
|
| 293 |
-
return f"- **{label}**: not available."
|
| 294 |
-
# Use direction-only interpretations
|
| 295 |
-
if "pause" in label.lower():
|
| 296 |
-
if delta > 0:
|
| 297 |
-
return f"- **{label}** increased. This can reflect slower speech, more hesitations, fatigue, distraction, or noise/environment changes."
|
| 298 |
-
if delta < 0:
|
| 299 |
-
return f"- **{label}** decreased. This can reflect more continuous speech or fewer hesitations."
|
| 300 |
-
return f"- **{label}** stayed similar."
|
| 301 |
-
if "pitch" in label.lower():
|
| 302 |
-
if delta > 0:
|
| 303 |
-
return f"- **{label}** increased. This can reflect different speaking style, emotion, or prosody changes."
|
| 304 |
-
if delta < 0:
|
| 305 |
-
return f"- **{label}** decreased. This can reflect a flatter/less variable prosody or a different speaking style."
|
| 306 |
-
return f"- **{label}** stayed similar."
|
| 307 |
-
if "rms" in label.lower() or "energy" in label.lower():
|
| 308 |
-
if delta > 0:
|
| 309 |
-
return f"- **{label}** increased. This can reflect speaking louder/closer to mic, or a quieter environment."
|
| 310 |
-
if delta < 0:
|
| 311 |
-
return f"- **{label}** decreased. This can reflect speaking softer/farther from mic, or a noisier environment."
|
| 312 |
-
return f"- **{label}** stayed similar."
|
| 313 |
-
if "active speech" in label.lower():
|
| 314 |
-
if delta > 0:
|
| 315 |
-
return f"- **{label}** increased. More time above the energy threshold (more continuous speech or less silence)."
|
| 316 |
-
if delta < 0:
|
| 317 |
-
return f"- **{label}** decreased. More time below threshold (more silence/pauses)."
|
| 318 |
-
return f"- **{label}** stayed similar."
|
| 319 |
-
return f"- **{label}** changed by {delta:+.3f}."
|
| 320 |
-
|
| 321 |
-
|
| 322 |
def summary_of_changes(first: Features, last: Features) -> str:
|
| 323 |
-
"""
|
| 324 |
-
Compare first vs last recording in the timeline.
|
| 325 |
-
Generates an explainable summary + cautious interpretation.
|
| 326 |
-
"""
|
| 327 |
-
# compute deltas (last - first)
|
| 328 |
-
d_pause_total = (last.pause_total_s - first.pause_total_s) if (_finite(last.pause_total_s) and _finite(first.pause_total_s)) else float("nan")
|
| 329 |
-
d_n_pauses = (last.n_pauses - first.n_pauses) if (last.n_pauses is not None and first.n_pauses is not None) else float("nan")
|
| 330 |
-
d_pitch = (last.pitch_median_hz - first.pitch_median_hz) if (_finite(last.pitch_median_hz) and _finite(first.pitch_median_hz)) else float("nan")
|
| 331 |
-
d_rms = (last.rms_mean - first.rms_mean) if (_finite(last.rms_mean) and _finite(first.rms_mean)) else float("nan")
|
| 332 |
-
d_active = (last.active_ratio - first.active_ratio) if (_finite(last.active_ratio) and _finite(first.active_ratio)) else float("nan")
|
| 333 |
-
|
| 334 |
-
# small helper formatting
|
| 335 |
def fmt(x, unit=""):
|
| 336 |
if not _finite(x):
|
| 337 |
return "—"
|
| 338 |
-
if unit == "%":
|
| 339 |
-
return f"{x*100:+.1f}%"
|
| 340 |
return f"{x:+.3f}{unit}"
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
lines = []
|
| 343 |
lines.append("### Summary of changes (last vs first)")
|
| 344 |
-
lines.append("This compares the **first** and **last** recording
|
| 345 |
lines.append("")
|
| 346 |
lines.append("**Measured differences (Δ = last − first):**")
|
| 347 |
-
lines.append(f"- Total pause time: **{fmt(d_pause_total, 's')}**")
|
| 348 |
-
lines.append(f"- Number of pauses: **{d_n_pauses:+d}**"
|
| 349 |
lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
|
| 350 |
lines.append(f"- RMS energy: **{fmt(d_rms)}**")
|
| 351 |
-
lines.append(f"- Active speech ratio: **{fmt(d_active, '%')}**")
|
| 352 |
lines.append("")
|
| 353 |
-
lines.append("**
|
| 354 |
-
lines.append(
|
| 355 |
-
lines.append(
|
| 356 |
-
lines.append(
|
| 357 |
-
lines.append(interpret_delta("RMS energy", d_rms))
|
| 358 |
-
lines.append(interpret_delta("Active speech ratio", d_active))
|
| 359 |
lines.append("")
|
| 360 |
-
lines.append(
|
| 361 |
-
"**Important:** these are **speech-signal explanations**, not a diagnosis. "
|
| 362 |
-
"Real-world meaning depends on context (device, environment, fatigue, stress, medication, etc.)."
|
| 363 |
-
)
|
| 364 |
return "\n".join(lines)
|
| 365 |
|
| 366 |
|
| 367 |
-
def explain_timeline() -> str:
|
| 368 |
-
return (
|
| 369 |
-
"### Timeline principle\n"
|
| 370 |
-
"- Use **multiple recordings of the same person**.\n"
|
| 371 |
-
"- The key is **within-person change over time** relative to baseline.\n"
|
| 372 |
-
"- The Summary box explains **what changed** (signals) and gives cautious, non-clinical interpretations.\n"
|
| 373 |
-
)
|
| 374 |
-
|
| 375 |
-
|
| 376 |
# =========================================================
|
| 377 |
# Callbacks
|
| 378 |
# =========================================================
|
|
@@ -381,7 +383,7 @@ def analyze_one(audio_path: Optional[str]):
|
|
| 381 |
return [], None, None, "### Upload or record audio to start."
|
| 382 |
y, sr = load_audio_file(audio_path)
|
| 383 |
feats, art = compute_features(y, sr)
|
| 384 |
-
return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art),
|
| 385 |
|
| 386 |
|
| 387 |
def analyze_many_paths(paths: List[str]):
|
|
@@ -389,14 +391,12 @@ def analyze_many_paths(paths: List[str]):
|
|
| 389 |
return (
|
| 390 |
[[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
|
| 391 |
None,
|
| 392 |
-
|
| 393 |
-
"###
|
| 394 |
)
|
| 395 |
|
| 396 |
rows = []
|
| 397 |
pause_series, pitch_series, rms_series = [], [], []
|
| 398 |
-
|
| 399 |
-
# store first/last features for summary
|
| 400 |
feats_first = None
|
| 401 |
feats_last = None
|
| 402 |
|
|
@@ -445,15 +445,14 @@ def analyze_many_paths(paths: List[str]):
|
|
| 445 |
if feats_first is not None and feats_last is not None:
|
| 446 |
summary = summary_of_changes(feats_first, feats_last)
|
| 447 |
|
| 448 |
-
return rows, fig,
|
| 449 |
|
| 450 |
|
| 451 |
def analyze_many_uploaded(files):
|
| 452 |
paths = []
|
| 453 |
if files:
|
| 454 |
for f in files:
|
| 455 |
-
|
| 456 |
-
paths.append(p)
|
| 457 |
return analyze_many_paths(paths)
|
| 458 |
|
| 459 |
|
|
@@ -500,16 +499,14 @@ CSS = """
|
|
| 500 |
}
|
| 501 |
.card *{ color: #0b0f19 !important; }
|
| 502 |
|
| 503 |
-
/* Tabs
|
| 504 |
div[role="tablist"]{
|
| 505 |
background: rgba(255,255,255,0.06) !important;
|
| 506 |
border: 1px solid rgba(255,255,255,0.14) !important;
|
| 507 |
border-radius: 14px !important;
|
| 508 |
padding: 6px !important;
|
| 509 |
}
|
| 510 |
-
button[role="tab"]{
|
| 511 |
-
color: rgba(255,255,255,0.92) !important;
|
| 512 |
-
}
|
| 513 |
button[role="tab"][aria-selected="true"]{
|
| 514 |
color: rgba(255,255,255,0.98) !important;
|
| 515 |
border-bottom: 2px solid rgba(255,255,255,0.65) !important;
|
|
@@ -542,7 +539,6 @@ def build_ui():
|
|
| 542 |
with gr.Column(scale=5):
|
| 543 |
audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
|
| 544 |
run = gr.Button("Analyze", variant="primary")
|
| 545 |
-
gr.Markdown("If mic doesn’t work, try upload first. Then check Diagnostics.", elem_classes=["card"])
|
| 546 |
with gr.Column(scale=7):
|
| 547 |
feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
|
| 548 |
wf_plot = gr.Plot(label="Waveform + pauses")
|
|
@@ -553,18 +549,16 @@ def build_ui():
|
|
| 553 |
with gr.TabItem("Timeline"):
|
| 554 |
with gr.Row():
|
| 555 |
with gr.Column(scale=5):
|
| 556 |
-
gr.Markdown("#### Option A — Upload
|
| 557 |
files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
|
| 558 |
run_up = gr.Button("Analyze uploaded timeline", variant="primary")
|
| 559 |
|
| 560 |
-
gr.Markdown("#### Option B —
|
| 561 |
bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
|
| 562 |
with gr.Row():
|
| 563 |
refresh_btn = gr.Button("Refresh list", variant="secondary")
|
| 564 |
run_b = gr.Button("Analyze selected bundled", variant="secondary")
|
| 565 |
|
| 566 |
-
gr.Markdown("Order matters: first = baseline, last = comparison.", elem_classes=["card"])
|
| 567 |
-
|
| 568 |
with gr.Column(scale=7):
|
| 569 |
timeline_df = gr.Dataframe(
|
| 570 |
headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
|
|
@@ -572,8 +566,8 @@ def build_ui():
|
|
| 572 |
wrap=True,
|
| 573 |
)
|
| 574 |
timeline_plot = gr.Plot(label="Trend plot")
|
| 575 |
-
timeline_expl = gr.Markdown(
|
| 576 |
-
timeline_summary = gr.Markdown("### Summary will appear here
|
| 577 |
|
| 578 |
run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
| 579 |
run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
|
@@ -583,7 +577,6 @@ def build_ui():
|
|
| 583 |
diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
|
| 584 |
diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
|
| 585 |
|
| 586 |
-
# Refresh bundled choices AND diagnostics
|
| 587 |
refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
|
| 588 |
|
| 589 |
return demo
|
|
|
|
| 1 |
import os
|
| 2 |
import math
|
| 3 |
+
import tempfile
|
| 4 |
import numpy as np
|
| 5 |
import gradio as gr
|
|
|
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from typing import Dict, Any, Tuple, List, Optional
|
| 10 |
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
from pydub import AudioSegment
|
| 13 |
+
from scipy.signal import correlate
|
| 14 |
+
|
| 15 |
# =========================================================
|
| 16 |
# Config
|
| 17 |
# =========================================================
|
| 18 |
TARGET_SR = 16000
|
| 19 |
APP_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 20 |
|
|
|
|
| 21 |
# =========================================================
|
| 22 |
# Helpers
|
| 23 |
# =========================================================
|
|
|
|
| 47 |
return files
|
| 48 |
|
| 49 |
|
| 50 |
+
def _resample_linear(y: np.ndarray, sr: int, target_sr: int) -> np.ndarray:
|
| 51 |
+
if sr == target_sr or y.size == 0:
|
| 52 |
+
return y
|
| 53 |
+
x_old = np.linspace(0.0, 1.0, num=y.size, endpoint=False)
|
| 54 |
+
new_len = int(round(y.size * (target_sr / sr)))
|
| 55 |
+
x_new = np.linspace(0.0, 1.0, num=max(new_len, 1), endpoint=False)
|
| 56 |
+
return np.interp(x_new, x_old, y).astype(np.float32)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
    """
    Decode an audio file into a mono float32 waveform.

    WAV/FLAC/OGG are read directly with soundfile; every other container
    (MP3, M4A, ...) is decoded through pydub, which relies on ffmpeg.

    Returns:
        (samples, sample_rate) — samples is mono float32; the pydub path
        scales signed PCM integers into [-1, 1].
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix in [".wav", ".flac", ".ogg"]:
        data, rate = sf.read(path, always_2d=True)
        # always_2d guarantees a (frames, channels) array; average to mono.
        mono = data.mean(axis=1).astype(np.float32)
        return mono, int(rate)

    # Fallback decoder for compressed formats (needs ffmpeg via pydub).
    segment = AudioSegment.from_file(path).set_channels(1)
    raw = np.array(segment.get_array_of_samples())
    # Full-scale value for signed PCM of this width: 2**(bits - 1).
    full_scale = 2 ** (8 * segment.sample_width - 1)
    return raw.astype(np.float32) / full_scale, int(segment.frame_rate)
|
| 81 |
|
| 82 |
|
| 83 |
def diagnostics_text() -> str:
|
|
|
|
| 96 |
lines.append(f"- `{fn}` (size unknown)")
|
| 97 |
else:
|
| 98 |
lines.append("- *(none found next to app.py)*")
|
|
|
|
| 99 |
lines.append("")
|
| 100 |
+
lines.append("**If build hangs:** usually heavy deps (e.g. librosa/numba). This version avoids them.")
|
| 101 |
+
lines.append("**Microphone note:** may be blocked by browser permissions/corporate policy.")
|
| 102 |
return "\n".join(lines)
|
| 103 |
|
| 104 |
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
# =========================================================
|
| 110 |
+
# Feature extraction (no librosa)
|
| 111 |
# =========================================================
|
| 112 |
@dataclass
|
| 113 |
class Features:
|
|
|
|
| 123 |
active_ratio: float
|
| 124 |
|
| 125 |
|
| 126 |
+
def _frame_signal(y: np.ndarray, frame: int, hop: int) -> np.ndarray:
|
| 127 |
+
if y.size < frame:
|
| 128 |
+
return np.zeros((0, frame), dtype=np.float32)
|
| 129 |
+
n = 1 + (y.size - frame) // hop
|
| 130 |
+
idx = (np.arange(n)[:, None] * hop) + np.arange(frame)[None, :]
|
| 131 |
+
return y[idx]
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _rms_per_frame(frames: np.ndarray) -> np.ndarray:
|
| 135 |
+
if frames.size == 0:
|
| 136 |
+
return np.array([], dtype=np.float32)
|
| 137 |
+
return np.sqrt(np.mean(frames * frames, axis=1) + 1e-12).astype(np.float32)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _zcr_per_frame(frames: np.ndarray) -> np.ndarray:
|
| 141 |
+
if frames.size == 0:
|
| 142 |
+
return np.array([], dtype=np.float32)
|
| 143 |
+
signs = np.sign(frames)
|
| 144 |
+
signs[signs == 0] = 1
|
| 145 |
+
zc = np.mean(signs[:, 1:] != signs[:, :-1], axis=1).astype(np.float32)
|
| 146 |
+
return zc
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _pitch_autocorr(frame: np.ndarray, sr: int, fmin: float = 70.0, fmax: float = 350.0) -> float:
|
| 150 |
+
"""
|
| 151 |
+
Simple autocorrelation pitch estimate for one frame.
|
| 152 |
+
Returns Hz or NaN.
|
| 153 |
+
"""
|
| 154 |
+
if frame.size == 0:
|
| 155 |
+
return float("nan")
|
| 156 |
+
frame = frame - np.mean(frame)
|
| 157 |
+
energy = np.sqrt(np.mean(frame * frame) + 1e-12)
|
| 158 |
+
if energy < 0.01:
|
| 159 |
+
return float("nan")
|
| 160 |
+
|
| 161 |
+
ac = correlate(frame, frame, mode="full")
|
| 162 |
+
ac = ac[ac.size // 2 :]
|
| 163 |
+
|
| 164 |
+
min_lag = int(sr / fmax)
|
| 165 |
+
max_lag = int(sr / fmin)
|
| 166 |
+
if max_lag <= min_lag + 2 or max_lag >= ac.size:
|
| 167 |
+
return float("nan")
|
| 168 |
+
|
| 169 |
+
seg = ac[min_lag:max_lag]
|
| 170 |
+
if seg.size == 0:
|
| 171 |
+
return float("nan")
|
| 172 |
+
|
| 173 |
+
i = int(np.argmax(seg))
|
| 174 |
+
lag = min_lag + i
|
| 175 |
+
|
| 176 |
+
if lag <= 0:
|
| 177 |
+
return float("nan")
|
| 178 |
+
return float(sr / lag)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
| 182 |
+
if y is None or y.size == 0:
|
| 183 |
f = Features(
|
| 184 |
duration_s=float("nan"),
|
| 185 |
rms_mean=float("nan"),
|
|
|
|
| 194 |
)
|
| 195 |
return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
|
| 196 |
|
| 197 |
+
# resample + normalize
|
| 198 |
if sr != TARGET_SR:
|
| 199 |
+
y = _resample_linear(y.astype(np.float32), sr, TARGET_SR)
|
| 200 |
sr = TARGET_SR
|
| 201 |
else:
|
| 202 |
y = y.astype(np.float32)
|
| 203 |
|
|
|
|
| 204 |
mx = float(np.max(np.abs(y))) + 1e-9
|
| 205 |
y = y / mx
|
| 206 |
|
| 207 |
+
duration = float(y.size / sr)
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
hop = 160 # 10ms
|
| 210 |
+
frame = 400 # 25ms
|
| 211 |
+
|
| 212 |
+
frames = _frame_signal(y, frame=frame, hop=hop)
|
| 213 |
+
rms = _rms_per_frame(frames)
|
| 214 |
+
zcr = _zcr_per_frame(frames)
|
| 215 |
|
| 216 |
rms_mean = float(np.mean(rms)) if rms.size else float("nan")
|
| 217 |
rms_std = float(np.std(rms)) if rms.size else float("nan")
|
| 218 |
zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
|
| 219 |
|
| 220 |
+
# pitch per frame (simple + explainable)
|
| 221 |
+
pitch = np.array([_pitch_autocorr(frames[i], sr) for i in range(frames.shape[0])], dtype=np.float32)
|
| 222 |
+
times = (np.arange(pitch.size) * hop / sr).astype(np.float32)
|
| 223 |
+
|
| 224 |
+
voiced = np.isfinite(pitch) & (pitch > 0)
|
| 225 |
+
voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
|
| 226 |
+
if np.any(voiced):
|
| 227 |
+
pv = pitch[voiced]
|
| 228 |
+
pitch_median = float(np.median(pv))
|
| 229 |
+
q75, q25 = np.percentile(pv, [75, 25])
|
| 230 |
+
pitch_iqr = float(q75 - q25)
|
| 231 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
pitch_median = float("nan")
|
| 233 |
pitch_iqr = float("nan")
|
| 234 |
+
|
| 235 |
+
# pause detection via RMS threshold
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
if rms.size:
|
| 237 |
thr = float(np.percentile(rms, 20)) * 0.8
|
| 238 |
silent = rms < thr
|
|
|
|
| 275 |
active_ratio=active_ratio,
|
| 276 |
)
|
| 277 |
|
| 278 |
+
artifacts = {"y": y, "sr": sr, "hop": hop, "pauses": pauses, "pitch": pitch, "times": times}
|
| 279 |
return feats, artifacts
|
| 280 |
|
| 281 |
|
|
|
|
| 313 |
ax = fig.add_subplot(111)
|
| 314 |
if pitch.size and times.size:
|
| 315 |
ax.plot(times, pitch, linewidth=1.0)
|
| 316 |
+
ax.set_title("Pitch contour (simple autocorrelation)")
|
| 317 |
ax.set_xlabel("Time (s)")
|
| 318 |
ax.set_ylabel("Pitch (Hz)")
|
| 319 |
else:
|
| 320 |
+
ax.text(0.5, 0.5, "Pitch not available", ha="center", va="center")
|
| 321 |
ax.set_axis_off()
|
| 322 |
fig.tight_layout()
|
| 323 |
return fig
|
| 324 |
|
| 325 |
|
| 326 |
# =========================================================
|
| 327 |
+
# Explanations + summary
|
| 328 |
# =========================================================
|
| 329 |
def features_table(feats: Features) -> List[List[str]]:
|
| 330 |
def f3(x):
|
|
|
|
| 343 |
]
|
| 344 |
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
def summary_of_changes(first: Features, last: Features) -> str:
    """
    Compare the first and last recordings of a timeline.

    Builds a Markdown summary of signal-level deltas (Δ = last − first)
    plus cautious, explicitly non-clinical interpretation notes.

    Returns:
        Markdown string; unavailable values render as an em dash.
    """
    def fmt(x, unit=""):
        # Em-dash placeholder for NaN/inf so the table never crashes.
        if not _finite(x):
            return "—"
        return f"{x:+.3f}{unit}"

    d_pause_total = (last.pause_total_s - first.pause_total_s) if (_finite(last.pause_total_s) and _finite(first.pause_total_s)) else float("nan")
    d_n_pauses = (last.n_pauses - first.n_pauses)
    d_pitch = (last.pitch_median_hz - first.pitch_median_hz) if (_finite(last.pitch_median_hz) and _finite(first.pitch_median_hz)) else float("nan")
    d_rms = (last.rms_mean - first.rms_mean) if (_finite(last.rms_mean) and _finite(first.rms_mean)) else float("nan")
    d_active = (last.active_ratio - first.active_ratio) if (_finite(last.active_ratio) and _finite(first.active_ratio)) else float("nan")

    # Bug fix: ":+d" raises ValueError when the pause-count delta is not an
    # int (e.g. NaN floats from the empty-audio Features path), so format
    # it defensively instead of crashing the whole summary.
    try:
        n_pauses_txt = f"{d_n_pauses:+d}"
    except (TypeError, ValueError):
        n_pauses_txt = "—"

    lines = []
    lines.append("### Summary of changes (last vs first)")
    lines.append("This compares the **first** and **last** recording in your selection (upload order).")
    lines.append("")
    lines.append("**Measured differences (Δ = last − first):**")
    lines.append(f"- Total pause time: **{fmt(d_pause_total, ' s')}**")
    lines.append(f"- Number of pauses: **{n_pauses_txt}**")
    lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
    lines.append(f"- RMS energy: **{fmt(d_rms)}**")
    lines.append(f"- Active speech ratio: **{fmt(d_active * 100.0, ' %')}**")
    lines.append("")
    lines.append("**How to interpret (non-clinical):**")
    lines.append("- More pauses / lower active ratio can reflect hesitations, slower speech, fatigue, or different environment/microphone setup.")
    lines.append("- Pitch changes can reflect speaking style, prosody, emotion, or recording conditions.")
    lines.append("- Energy changes often reflect distance to microphone / loudness / background noise.")
    lines.append("")
    lines.append("**Important:** not a diagnosis. These are explainable signal-level comparisons.")
    return "\n".join(lines)
|
| 376 |
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
# =========================================================
|
| 379 |
# Callbacks
|
| 380 |
# =========================================================
|
|
|
|
| 383 |
return [], None, None, "### Upload or record audio to start."
|
| 384 |
y, sr = load_audio_file(audio_path)
|
| 385 |
feats, art = compute_features(y, sr)
|
| 386 |
+
return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art), "### This shows measurable signals (no diagnosis)."
|
| 387 |
|
| 388 |
|
| 389 |
def analyze_many_paths(paths: List[str]):
|
|
|
|
| 391 |
return (
|
| 392 |
[[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
|
| 393 |
None,
|
| 394 |
+
"### Select at least 2 recordings to see a trend.",
|
| 395 |
+
"### Summary will appear here."
|
| 396 |
)
|
| 397 |
|
| 398 |
rows = []
|
| 399 |
pause_series, pitch_series, rms_series = [], [], []
|
|
|
|
|
|
|
| 400 |
feats_first = None
|
| 401 |
feats_last = None
|
| 402 |
|
|
|
|
| 445 |
if feats_first is not None and feats_last is not None:
|
| 446 |
summary = summary_of_changes(feats_first, feats_last)
|
| 447 |
|
| 448 |
+
return rows, fig, "### Trend over time (within-person).", summary
|
| 449 |
|
| 450 |
|
| 451 |
def analyze_many_uploaded(files):
    """
    Timeline callback for the Files widget.

    Normalizes each uploaded item to a filesystem path and delegates to
    analyze_many_paths. Gradio may hand back tempfile-like objects (with
    a .name path) or plain strings depending on version, so both forms
    are accepted; a falsy payload yields an empty path list.
    """
    if not files:
        return analyze_many_paths([])
    resolved = [getattr(item, "name", None) or str(item) for item in files]
    return analyze_many_paths(resolved)
|
| 457 |
|
| 458 |
|
|
|
|
| 499 |
}
|
| 500 |
.card *{ color: #0b0f19 !important; }
|
| 501 |
|
| 502 |
+
/* Tabs readable on dark background */
|
| 503 |
div[role="tablist"]{
|
| 504 |
background: rgba(255,255,255,0.06) !important;
|
| 505 |
border: 1px solid rgba(255,255,255,0.14) !important;
|
| 506 |
border-radius: 14px !important;
|
| 507 |
padding: 6px !important;
|
| 508 |
}
|
| 509 |
+
button[role="tab"]{ color: rgba(255,255,255,0.92) !important; }
|
|
|
|
|
|
|
| 510 |
button[role="tab"][aria-selected="true"]{
|
| 511 |
color: rgba(255,255,255,0.98) !important;
|
| 512 |
border-bottom: 2px solid rgba(255,255,255,0.65) !important;
|
|
|
|
| 539 |
with gr.Column(scale=5):
|
| 540 |
audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
|
| 541 |
run = gr.Button("Analyze", variant="primary")
|
|
|
|
| 542 |
with gr.Column(scale=7):
|
| 543 |
feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
|
| 544 |
wf_plot = gr.Plot(label="Waveform + pauses")
|
|
|
|
| 549 |
with gr.TabItem("Timeline"):
|
| 550 |
with gr.Row():
|
| 551 |
with gr.Column(scale=5):
|
| 552 |
+
gr.Markdown("#### Option A — Upload")
|
| 553 |
files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
|
| 554 |
run_up = gr.Button("Analyze uploaded timeline", variant="primary")
|
| 555 |
|
| 556 |
+
gr.Markdown("#### Option B — Bundled samples (repo root)")
|
| 557 |
bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
|
| 558 |
with gr.Row():
|
| 559 |
refresh_btn = gr.Button("Refresh list", variant="secondary")
|
| 560 |
run_b = gr.Button("Analyze selected bundled", variant="secondary")
|
| 561 |
|
|
|
|
|
|
|
| 562 |
with gr.Column(scale=7):
|
| 563 |
timeline_df = gr.Dataframe(
|
| 564 |
headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
|
|
|
|
| 566 |
wrap=True,
|
| 567 |
)
|
| 568 |
timeline_plot = gr.Plot(label="Trend plot")
|
| 569 |
+
timeline_expl = gr.Markdown("### Select at least 2 recordings.", elem_classes=["card"])
|
| 570 |
+
timeline_summary = gr.Markdown("### Summary will appear here.", elem_classes=["card"])
|
| 571 |
|
| 572 |
run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
| 573 |
run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
|
|
|
| 577 |
diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
|
| 578 |
diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
|
| 579 |
|
|
|
|
| 580 |
refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
|
| 581 |
|
| 582 |
return demo
|