Update app.py
Browse files
app.py
CHANGED
|
@@ -1,20 +1,23 @@
|
|
| 1 |
import os
|
| 2 |
import math
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import gradio as gr
|
| 5 |
-
import librosa
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from typing import Dict, Any, Tuple, List, Optional
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# =========================================================
|
| 12 |
# Config
|
| 13 |
# =========================================================
|
| 14 |
TARGET_SR = 16000
|
| 15 |
APP_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 16 |
|
| 17 |
-
|
| 18 |
# =========================================================
|
| 19 |
# Helpers
|
| 20 |
# =========================================================
|
|
@@ -44,11 +47,37 @@ def list_bundled_audio() -> List[str]:
|
|
| 44 |
return files
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def diagnostics_text() -> str:
|
|
@@ -67,10 +96,9 @@ def diagnostics_text() -> str:
|
|
| 67 |
lines.append(f"- `{fn}` (size unknown)")
|
| 68 |
else:
|
| 69 |
lines.append("- *(none found next to app.py)*")
|
| 70 |
-
|
| 71 |
lines.append("")
|
| 72 |
-
lines.append("**
|
| 73 |
-
lines.append("
|
| 74 |
return "\n".join(lines)
|
| 75 |
|
| 76 |
|
|
@@ -79,7 +107,7 @@ def _finite(x: float) -> bool:
|
|
| 79 |
|
| 80 |
|
| 81 |
# =========================================================
|
| 82 |
-
#
|
| 83 |
# =========================================================
|
| 84 |
@dataclass
|
| 85 |
class Features:
|
|
@@ -95,8 +123,63 @@ class Features:
|
|
| 95 |
active_ratio: float
|
| 96 |
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
| 99 |
-
if y is None or
|
| 100 |
f = Features(
|
| 101 |
duration_s=float("nan"),
|
| 102 |
rms_mean=float("nan"),
|
|
@@ -111,62 +194,45 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
|
| 111 |
)
|
| 112 |
return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
|
| 113 |
|
| 114 |
-
#
|
| 115 |
if sr != TARGET_SR:
|
| 116 |
-
y =
|
| 117 |
sr = TARGET_SR
|
| 118 |
else:
|
| 119 |
y = y.astype(np.float32)
|
| 120 |
|
| 121 |
-
# Normalize
|
| 122 |
mx = float(np.max(np.abs(y))) + 1e-9
|
| 123 |
y = y / mx
|
| 124 |
|
| 125 |
-
duration = float(
|
| 126 |
-
hop = 160
|
| 127 |
-
frame = 400
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
rms_mean = float(np.mean(rms)) if rms.size else float("nan")
|
| 133 |
rms_std = float(np.std(rms)) if rms.size else float("nan")
|
| 134 |
zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
|
| 135 |
|
| 136 |
-
#
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
if f0 is None:
|
| 150 |
-
pitch = np.array([])
|
| 151 |
-
times = np.array([])
|
| 152 |
pitch_median = float("nan")
|
| 153 |
pitch_iqr = float("nan")
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
pitch = np.asarray(f0, dtype=np.float32)
|
| 157 |
-
times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
|
| 158 |
-
voiced = np.isfinite(pitch)
|
| 159 |
-
voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
|
| 160 |
-
if np.any(voiced):
|
| 161 |
-
pv = pitch[voiced]
|
| 162 |
-
pitch_median = float(np.median(pv))
|
| 163 |
-
q75, q25 = np.percentile(pv, [75, 25])
|
| 164 |
-
pitch_iqr = float(q75 - q25)
|
| 165 |
-
else:
|
| 166 |
-
pitch_median = float("nan")
|
| 167 |
-
pitch_iqr = float("nan")
|
| 168 |
-
|
| 169 |
-
# Pause detection
|
| 170 |
if rms.size:
|
| 171 |
thr = float(np.percentile(rms, 20)) * 0.8
|
| 172 |
silent = rms < thr
|
|
@@ -209,7 +275,7 @@ def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
|
| 209 |
active_ratio=active_ratio,
|
| 210 |
)
|
| 211 |
|
| 212 |
-
artifacts = {"y": y, "sr": sr, "hop": hop, "
|
| 213 |
return feats, artifacts
|
| 214 |
|
| 215 |
|
|
@@ -247,18 +313,18 @@ def plot_pitch(art: Dict[str, Any]) -> plt.Figure:
|
|
| 247 |
ax = fig.add_subplot(111)
|
| 248 |
if pitch.size and times.size:
|
| 249 |
ax.plot(times, pitch, linewidth=1.0)
|
| 250 |
-
ax.set_title("Pitch contour (
|
| 251 |
ax.set_xlabel("Time (s)")
|
| 252 |
ax.set_ylabel("Pitch (Hz)")
|
| 253 |
else:
|
| 254 |
-
ax.text(0.5, 0.5, "Pitch not available
|
| 255 |
ax.set_axis_off()
|
| 256 |
fig.tight_layout()
|
| 257 |
return fig
|
| 258 |
|
| 259 |
|
| 260 |
# =========================================================
|
| 261 |
-
#
|
| 262 |
# =========================================================
|
| 263 |
def features_table(feats: Features) -> List[List[str]]:
|
| 264 |
def f3(x):
|
|
@@ -277,102 +343,38 @@ def features_table(feats: Features) -> List[List[str]]:
|
|
| 277 |
]
|
| 278 |
|
| 279 |
|
| 280 |
-
def explain_single(feats: Features) -> str:
|
| 281 |
-
return (
|
| 282 |
-
"### What does the system ‘see’ here?\n"
|
| 283 |
-
"- It shows **measurable signals**: pauses, pitch and energy.\n"
|
| 284 |
-
"- This is **not a diagnosis** and **not a medical device**.\n"
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
def interpret_delta(label: str, delta: float) -> str:
|
| 289 |
-
"""
|
| 290 |
-
Very conservative, explainable interpretation. No clinical claims.
|
| 291 |
-
"""
|
| 292 |
-
if not _finite(delta):
|
| 293 |
-
return f"- **{label}**: not available."
|
| 294 |
-
# Use direction-only interpretations
|
| 295 |
-
if "pause" in label.lower():
|
| 296 |
-
if delta > 0:
|
| 297 |
-
return f"- **{label}** increased. This can reflect slower speech, more hesitations, fatigue, distraction, or noise/environment changes."
|
| 298 |
-
if delta < 0:
|
| 299 |
-
return f"- **{label}** decreased. This can reflect more continuous speech or fewer hesitations."
|
| 300 |
-
return f"- **{label}** stayed similar."
|
| 301 |
-
if "pitch" in label.lower():
|
| 302 |
-
if delta > 0:
|
| 303 |
-
return f"- **{label}** increased. This can reflect different speaking style, emotion, or prosody changes."
|
| 304 |
-
if delta < 0:
|
| 305 |
-
return f"- **{label}** decreased. This can reflect a flatter/less variable prosody or a different speaking style."
|
| 306 |
-
return f"- **{label}** stayed similar."
|
| 307 |
-
if "rms" in label.lower() or "energy" in label.lower():
|
| 308 |
-
if delta > 0:
|
| 309 |
-
return f"- **{label}** increased. This can reflect speaking louder/closer to mic, or a quieter environment."
|
| 310 |
-
if delta < 0:
|
| 311 |
-
return f"- **{label}** decreased. This can reflect speaking softer/farther from mic, or a noisier environment."
|
| 312 |
-
return f"- **{label}** stayed similar."
|
| 313 |
-
if "active speech" in label.lower():
|
| 314 |
-
if delta > 0:
|
| 315 |
-
return f"- **{label}** increased. More time above the energy threshold (more continuous speech or less silence)."
|
| 316 |
-
if delta < 0:
|
| 317 |
-
return f"- **{label}** decreased. More time below threshold (more silence/pauses)."
|
| 318 |
-
return f"- **{label}** stayed similar."
|
| 319 |
-
return f"- **{label}** changed by {delta:+.3f}."
|
| 320 |
-
|
| 321 |
-
|
| 322 |
def summary_of_changes(first: Features, last: Features) -> str:
|
| 323 |
-
"""
|
| 324 |
-
Compare first vs last recording in the timeline.
|
| 325 |
-
Generates an explainable summary + cautious interpretation.
|
| 326 |
-
"""
|
| 327 |
-
# compute deltas (last - first)
|
| 328 |
-
d_pause_total = (last.pause_total_s - first.pause_total_s) if (_finite(last.pause_total_s) and _finite(first.pause_total_s)) else float("nan")
|
| 329 |
-
d_n_pauses = (last.n_pauses - first.n_pauses) if (last.n_pauses is not None and first.n_pauses is not None) else float("nan")
|
| 330 |
-
d_pitch = (last.pitch_median_hz - first.pitch_median_hz) if (_finite(last.pitch_median_hz) and _finite(first.pitch_median_hz)) else float("nan")
|
| 331 |
-
d_rms = (last.rms_mean - first.rms_mean) if (_finite(last.rms_mean) and _finite(first.rms_mean)) else float("nan")
|
| 332 |
-
d_active = (last.active_ratio - first.active_ratio) if (_finite(last.active_ratio) and _finite(first.active_ratio)) else float("nan")
|
| 333 |
-
|
| 334 |
-
# small helper formatting
|
| 335 |
def fmt(x, unit=""):
|
| 336 |
if not _finite(x):
|
| 337 |
return "—"
|
| 338 |
-
if unit == "%":
|
| 339 |
-
return f"{x*100:+.1f}%"
|
| 340 |
return f"{x:+.3f}{unit}"
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
lines = []
|
| 343 |
lines.append("### Summary of changes (last vs first)")
|
| 344 |
-
lines.append("This compares the **first** and **last** recording
|
| 345 |
lines.append("")
|
| 346 |
lines.append("**Measured differences (Δ = last − first):**")
|
| 347 |
-
lines.append(f"- Total pause time: **{fmt(d_pause_total, 's')}**")
|
| 348 |
-
lines.append(f"- Number of pauses: **{d_n_pauses:+d}**"
|
| 349 |
lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
|
| 350 |
lines.append(f"- RMS energy: **{fmt(d_rms)}**")
|
| 351 |
-
lines.append(f"- Active speech ratio: **{fmt(d_active, '%')}**")
|
| 352 |
lines.append("")
|
| 353 |
-
lines.append("**
|
| 354 |
-
lines.append(
|
| 355 |
-
lines.append(
|
| 356 |
-
lines.append(
|
| 357 |
-
lines.append(interpret_delta("RMS energy", d_rms))
|
| 358 |
-
lines.append(interpret_delta("Active speech ratio", d_active))
|
| 359 |
lines.append("")
|
| 360 |
-
lines.append(
|
| 361 |
-
"**Important:** these are **speech-signal explanations**, not a diagnosis. "
|
| 362 |
-
"Real-world meaning depends on context (device, environment, fatigue, stress, medication, etc.)."
|
| 363 |
-
)
|
| 364 |
return "\n".join(lines)
|
| 365 |
|
| 366 |
|
| 367 |
-
def explain_timeline() -> str:
|
| 368 |
-
return (
|
| 369 |
-
"### Timeline principle\n"
|
| 370 |
-
"- Use **multiple recordings of the same person**.\n"
|
| 371 |
-
"- The key is **within-person change over time** relative to baseline.\n"
|
| 372 |
-
"- The Summary box explains **what changed** (signals) and gives cautious, non-clinical interpretations.\n"
|
| 373 |
-
)
|
| 374 |
-
|
| 375 |
-
|
| 376 |
# =========================================================
|
| 377 |
# Callbacks
|
| 378 |
# =========================================================
|
|
@@ -381,7 +383,7 @@ def analyze_one(audio_path: Optional[str]):
|
|
| 381 |
return [], None, None, "### Upload or record audio to start."
|
| 382 |
y, sr = load_audio_file(audio_path)
|
| 383 |
feats, art = compute_features(y, sr)
|
| 384 |
-
return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art),
|
| 385 |
|
| 386 |
|
| 387 |
def analyze_many_paths(paths: List[str]):
|
|
@@ -389,14 +391,12 @@ def analyze_many_paths(paths: List[str]):
|
|
| 389 |
return (
|
| 390 |
[[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
|
| 391 |
None,
|
| 392 |
-
|
| 393 |
-
"###
|
| 394 |
)
|
| 395 |
|
| 396 |
rows = []
|
| 397 |
pause_series, pitch_series, rms_series = [], [], []
|
| 398 |
-
|
| 399 |
-
# store first/last features for summary
|
| 400 |
feats_first = None
|
| 401 |
feats_last = None
|
| 402 |
|
|
@@ -445,15 +445,14 @@ def analyze_many_paths(paths: List[str]):
|
|
| 445 |
if feats_first is not None and feats_last is not None:
|
| 446 |
summary = summary_of_changes(feats_first, feats_last)
|
| 447 |
|
| 448 |
-
return rows, fig,
|
| 449 |
|
| 450 |
|
| 451 |
def analyze_many_uploaded(files):
|
| 452 |
paths = []
|
| 453 |
if files:
|
| 454 |
for f in files:
|
| 455 |
-
|
| 456 |
-
paths.append(p)
|
| 457 |
return analyze_many_paths(paths)
|
| 458 |
|
| 459 |
|
|
@@ -500,16 +499,14 @@ CSS = """
|
|
| 500 |
}
|
| 501 |
.card *{ color: #0b0f19 !important; }
|
| 502 |
|
| 503 |
-
/* Tabs
|
| 504 |
div[role="tablist"]{
|
| 505 |
background: rgba(255,255,255,0.06) !important;
|
| 506 |
border: 1px solid rgba(255,255,255,0.14) !important;
|
| 507 |
border-radius: 14px !important;
|
| 508 |
padding: 6px !important;
|
| 509 |
}
|
| 510 |
-
button[role="tab"]{
|
| 511 |
-
color: rgba(255,255,255,0.92) !important;
|
| 512 |
-
}
|
| 513 |
button[role="tab"][aria-selected="true"]{
|
| 514 |
color: rgba(255,255,255,0.98) !important;
|
| 515 |
border-bottom: 2px solid rgba(255,255,255,0.65) !important;
|
|
@@ -542,7 +539,6 @@ def build_ui():
|
|
| 542 |
with gr.Column(scale=5):
|
| 543 |
audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
|
| 544 |
run = gr.Button("Analyze", variant="primary")
|
| 545 |
-
gr.Markdown("If mic doesn’t work, try upload first. Then check Diagnostics.", elem_classes=["card"])
|
| 546 |
with gr.Column(scale=7):
|
| 547 |
feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
|
| 548 |
wf_plot = gr.Plot(label="Waveform + pauses")
|
|
@@ -553,18 +549,16 @@ def build_ui():
|
|
| 553 |
with gr.TabItem("Timeline"):
|
| 554 |
with gr.Row():
|
| 555 |
with gr.Column(scale=5):
|
| 556 |
-
gr.Markdown("#### Option A — Upload
|
| 557 |
files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
|
| 558 |
run_up = gr.Button("Analyze uploaded timeline", variant="primary")
|
| 559 |
|
| 560 |
-
gr.Markdown("#### Option B —
|
| 561 |
bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
|
| 562 |
with gr.Row():
|
| 563 |
refresh_btn = gr.Button("Refresh list", variant="secondary")
|
| 564 |
run_b = gr.Button("Analyze selected bundled", variant="secondary")
|
| 565 |
|
| 566 |
-
gr.Markdown("Order matters: first = baseline, last = comparison.", elem_classes=["card"])
|
| 567 |
-
|
| 568 |
with gr.Column(scale=7):
|
| 569 |
timeline_df = gr.Dataframe(
|
| 570 |
headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
|
|
@@ -572,8 +566,8 @@ def build_ui():
|
|
| 572 |
wrap=True,
|
| 573 |
)
|
| 574 |
timeline_plot = gr.Plot(label="Trend plot")
|
| 575 |
-
timeline_expl = gr.Markdown(
|
| 576 |
-
timeline_summary = gr.Markdown("### Summary will appear here
|
| 577 |
|
| 578 |
run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
| 579 |
run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
|
@@ -583,7 +577,6 @@ def build_ui():
|
|
| 583 |
diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
|
| 584 |
diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
|
| 585 |
|
| 586 |
-
# Refresh bundled choices AND diagnostics
|
| 587 |
refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
|
| 588 |
|
| 589 |
return demo
|
|
|
|
| 1 |
import os
|
| 2 |
import math
|
| 3 |
+
import tempfile
|
| 4 |
import numpy as np
|
| 5 |
import gradio as gr
|
|
|
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from typing import Dict, Any, Tuple, List, Optional
|
| 10 |
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
from pydub import AudioSegment
|
| 13 |
+
from scipy.signal import correlate
|
| 14 |
+
|
| 15 |
# =========================================================
|
| 16 |
# Config
|
| 17 |
# =========================================================
|
| 18 |
TARGET_SR = 16000
|
| 19 |
APP_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 20 |
|
|
|
|
| 21 |
# =========================================================
|
| 22 |
# Helpers
|
| 23 |
# =========================================================
|
|
|
|
| 47 |
return files
|
| 48 |
|
| 49 |
|
| 50 |
+
def _resample_linear(y: np.ndarray, sr: int, target_sr: int) -> np.ndarray:
|
| 51 |
+
if sr == target_sr or y.size == 0:
|
| 52 |
+
return y
|
| 53 |
+
x_old = np.linspace(0.0, 1.0, num=y.size, endpoint=False)
|
| 54 |
+
new_len = int(round(y.size * (target_sr / sr)))
|
| 55 |
+
x_new = np.linspace(0.0, 1.0, num=max(new_len, 1), endpoint=False)
|
| 56 |
+
return np.interp(x_new, x_old, y).astype(np.float32)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
    """
    Decode an audio file into a mono float32 waveform.

    WAV/FLAC/OGG are read directly with soundfile; every other container
    (MP3, M4A, ...) is decoded through pydub, which relies on ffmpeg.

    Returns:
        (samples, sample_rate) — samples is mono float32; the pydub path
        scales signed PCM integers into [-1, 1].
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix in [".wav", ".flac", ".ogg"]:
        data, rate = sf.read(path, always_2d=True)
        # always_2d guarantees a (frames, channels) array; average to mono.
        mono = data.mean(axis=1).astype(np.float32)
        return mono, int(rate)

    # Fallback decoder for compressed formats (needs ffmpeg via pydub).
    segment = AudioSegment.from_file(path).set_channels(1)
    raw = np.array(segment.get_array_of_samples())
    # Full-scale value for signed PCM of this width: 2**(bits - 1).
    full_scale = 2 ** (8 * segment.sample_width - 1)
    return raw.astype(np.float32) / full_scale, int(segment.frame_rate)
|
| 81 |
|
| 82 |
|
| 83 |
def diagnostics_text() -> str:
|
|
|
|
| 96 |
lines.append(f"- `{fn}` (size unknown)")
|
| 97 |
else:
|
| 98 |
lines.append("- *(none found next to app.py)*")
|
|
|
|
| 99 |
lines.append("")
|
| 100 |
+
lines.append("**If build hangs:** usually heavy deps (e.g. librosa/numba). This version avoids them.")
|
| 101 |
+
lines.append("**Microphone note:** may be blocked by browser permissions/corporate policy.")
|
| 102 |
return "\n".join(lines)
|
| 103 |
|
| 104 |
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
# =========================================================
|
| 110 |
+
# Feature extraction (no librosa)
|
| 111 |
# =========================================================
|
| 112 |
@dataclass
|
| 113 |
class Features:
|
|
|
|
| 123 |
active_ratio: float
|
| 124 |
|
| 125 |
|
| 126 |
+
def _frame_signal(y: np.ndarray, frame: int, hop: int) -> np.ndarray:
|
| 127 |
+
if y.size < frame:
|
| 128 |
+
return np.zeros((0, frame), dtype=np.float32)
|
| 129 |
+
n = 1 + (y.size - frame) // hop
|
| 130 |
+
idx = (np.arange(n)[:, None] * hop) + np.arange(frame)[None, :]
|
| 131 |
+
return y[idx]
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _rms_per_frame(frames: np.ndarray) -> np.ndarray:
|
| 135 |
+
if frames.size == 0:
|
| 136 |
+
return np.array([], dtype=np.float32)
|
| 137 |
+
return np.sqrt(np.mean(frames * frames, axis=1) + 1e-12).astype(np.float32)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _zcr_per_frame(frames: np.ndarray) -> np.ndarray:
|
| 141 |
+
if frames.size == 0:
|
| 142 |
+
return np.array([], dtype=np.float32)
|
| 143 |
+
signs = np.sign(frames)
|
| 144 |
+
signs[signs == 0] = 1
|
| 145 |
+
zc = np.mean(signs[:, 1:] != signs[:, :-1], axis=1).astype(np.float32)
|
| 146 |
+
return zc
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _pitch_autocorr(frame: np.ndarray, sr: int, fmin: float = 70.0, fmax: float = 350.0) -> float:
|
| 150 |
+
"""
|
| 151 |
+
Simple autocorrelation pitch estimate for one frame.
|
| 152 |
+
Returns Hz or NaN.
|
| 153 |
+
"""
|
| 154 |
+
if frame.size == 0:
|
| 155 |
+
return float("nan")
|
| 156 |
+
frame = frame - np.mean(frame)
|
| 157 |
+
energy = np.sqrt(np.mean(frame * frame) + 1e-12)
|
| 158 |
+
if energy < 0.01:
|
| 159 |
+
return float("nan")
|
| 160 |
+
|
| 161 |
+
ac = correlate(frame, frame, mode="full")
|
| 162 |
+
ac = ac[ac.size // 2 :]
|
| 163 |
+
|
| 164 |
+
min_lag = int(sr / fmax)
|
| 165 |
+
max_lag = int(sr / fmin)
|
| 166 |
+
if max_lag <= min_lag + 2 or max_lag >= ac.size:
|
| 167 |
+
return float("nan")
|
| 168 |
+
|
| 169 |
+
seg = ac[min_lag:max_lag]
|
| 170 |
+
if seg.size == 0:
|
| 171 |
+
return float("nan")
|
| 172 |
+
|
| 173 |
+
i = int(np.argmax(seg))
|
| 174 |
+
lag = min_lag + i
|
| 175 |
+
|
| 176 |
+
if lag <= 0:
|
| 177 |
+
return float("nan")
|
| 178 |
+
return float(sr / lag)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
|
| 182 |
+
if y is None or y.size == 0:
|
| 183 |
f = Features(
|
| 184 |
duration_s=float("nan"),
|
| 185 |
rms_mean=float("nan"),
|
|
|
|
| 194 |
)
|
| 195 |
return f, {"y": np.array([]), "sr": sr, "hop": 160, "pauses": [], "pitch": np.array([]), "times": np.array([])}
|
| 196 |
|
| 197 |
+
# resample + normalize
|
| 198 |
if sr != TARGET_SR:
|
| 199 |
+
y = _resample_linear(y.astype(np.float32), sr, TARGET_SR)
|
| 200 |
sr = TARGET_SR
|
| 201 |
else:
|
| 202 |
y = y.astype(np.float32)
|
| 203 |
|
|
|
|
| 204 |
mx = float(np.max(np.abs(y))) + 1e-9
|
| 205 |
y = y / mx
|
| 206 |
|
| 207 |
+
duration = float(y.size / sr)
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
hop = 160 # 10ms
|
| 210 |
+
frame = 400 # 25ms
|
| 211 |
+
|
| 212 |
+
frames = _frame_signal(y, frame=frame, hop=hop)
|
| 213 |
+
rms = _rms_per_frame(frames)
|
| 214 |
+
zcr = _zcr_per_frame(frames)
|
| 215 |
|
| 216 |
rms_mean = float(np.mean(rms)) if rms.size else float("nan")
|
| 217 |
rms_std = float(np.std(rms)) if rms.size else float("nan")
|
| 218 |
zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
|
| 219 |
|
| 220 |
+
# pitch per frame (simple + explainable)
|
| 221 |
+
pitch = np.array([_pitch_autocorr(frames[i], sr) for i in range(frames.shape[0])], dtype=np.float32)
|
| 222 |
+
times = (np.arange(pitch.size) * hop / sr).astype(np.float32)
|
| 223 |
+
|
| 224 |
+
voiced = np.isfinite(pitch) & (pitch > 0)
|
| 225 |
+
voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
|
| 226 |
+
if np.any(voiced):
|
| 227 |
+
pv = pitch[voiced]
|
| 228 |
+
pitch_median = float(np.median(pv))
|
| 229 |
+
q75, q25 = np.percentile(pv, [75, 25])
|
| 230 |
+
pitch_iqr = float(q75 - q25)
|
| 231 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
pitch_median = float("nan")
|
| 233 |
pitch_iqr = float("nan")
|
| 234 |
+
|
| 235 |
+
# pause detection via RMS threshold
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
if rms.size:
|
| 237 |
thr = float(np.percentile(rms, 20)) * 0.8
|
| 238 |
silent = rms < thr
|
|
|
|
| 275 |
active_ratio=active_ratio,
|
| 276 |
)
|
| 277 |
|
| 278 |
+
artifacts = {"y": y, "sr": sr, "hop": hop, "pauses": pauses, "pitch": pitch, "times": times}
|
| 279 |
return feats, artifacts
|
| 280 |
|
| 281 |
|
|
|
|
| 313 |
ax = fig.add_subplot(111)
|
| 314 |
if pitch.size and times.size:
|
| 315 |
ax.plot(times, pitch, linewidth=1.0)
|
| 316 |
+
ax.set_title("Pitch contour (simple autocorrelation)")
|
| 317 |
ax.set_xlabel("Time (s)")
|
| 318 |
ax.set_ylabel("Pitch (Hz)")
|
| 319 |
else:
|
| 320 |
+
ax.text(0.5, 0.5, "Pitch not available", ha="center", va="center")
|
| 321 |
ax.set_axis_off()
|
| 322 |
fig.tight_layout()
|
| 323 |
return fig
|
| 324 |
|
| 325 |
|
| 326 |
# =========================================================
|
| 327 |
+
# Explanations + summary
|
| 328 |
# =========================================================
|
| 329 |
def features_table(feats: Features) -> List[List[str]]:
|
| 330 |
def f3(x):
|
|
|
|
| 343 |
]
|
| 344 |
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
def summary_of_changes(first: Features, last: Features) -> str:
    """
    Compare the first and last recordings of a timeline.

    Builds a Markdown summary of signal-level deltas (Δ = last − first)
    plus cautious, explicitly non-clinical interpretation notes.

    Returns:
        Markdown string; unavailable values render as an em dash.
    """
    def fmt(x, unit=""):
        # Em-dash placeholder for NaN/inf so the table never crashes.
        if not _finite(x):
            return "—"
        return f"{x:+.3f}{unit}"

    d_pause_total = (last.pause_total_s - first.pause_total_s) if (_finite(last.pause_total_s) and _finite(first.pause_total_s)) else float("nan")
    d_n_pauses = (last.n_pauses - first.n_pauses)
    d_pitch = (last.pitch_median_hz - first.pitch_median_hz) if (_finite(last.pitch_median_hz) and _finite(first.pitch_median_hz)) else float("nan")
    d_rms = (last.rms_mean - first.rms_mean) if (_finite(last.rms_mean) and _finite(first.rms_mean)) else float("nan")
    d_active = (last.active_ratio - first.active_ratio) if (_finite(last.active_ratio) and _finite(first.active_ratio)) else float("nan")

    # Bug fix: ":+d" raises ValueError when the pause-count delta is not an
    # int (e.g. NaN floats from the empty-audio Features path), so format
    # it defensively instead of crashing the whole summary.
    try:
        n_pauses_txt = f"{d_n_pauses:+d}"
    except (TypeError, ValueError):
        n_pauses_txt = "—"

    lines = []
    lines.append("### Summary of changes (last vs first)")
    lines.append("This compares the **first** and **last** recording in your selection (upload order).")
    lines.append("")
    lines.append("**Measured differences (Δ = last − first):**")
    lines.append(f"- Total pause time: **{fmt(d_pause_total, ' s')}**")
    lines.append(f"- Number of pauses: **{n_pauses_txt}**")
    lines.append(f"- Median pitch: **{fmt(d_pitch, ' Hz')}**")
    lines.append(f"- RMS energy: **{fmt(d_rms)}**")
    lines.append(f"- Active speech ratio: **{fmt(d_active * 100.0, ' %')}**")
    lines.append("")
    lines.append("**How to interpret (non-clinical):**")
    lines.append("- More pauses / lower active ratio can reflect hesitations, slower speech, fatigue, or different environment/microphone setup.")
    lines.append("- Pitch changes can reflect speaking style, prosody, emotion, or recording conditions.")
    lines.append("- Energy changes often reflect distance to microphone / loudness / background noise.")
    lines.append("")
    lines.append("**Important:** not a diagnosis. These are explainable signal-level comparisons.")
    return "\n".join(lines)
|
| 376 |
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
# =========================================================
|
| 379 |
# Callbacks
|
| 380 |
# =========================================================
|
|
|
|
| 383 |
return [], None, None, "### Upload or record audio to start."
|
| 384 |
y, sr = load_audio_file(audio_path)
|
| 385 |
feats, art = compute_features(y, sr)
|
| 386 |
+
return features_table(feats), plot_waveform_with_pauses(art), plot_pitch(art), "### This shows measurable signals (no diagnosis)."
|
| 387 |
|
| 388 |
|
| 389 |
def analyze_many_paths(paths: List[str]):
|
|
|
|
| 391 |
return (
|
| 392 |
[[1, "—", "Upload/select at least 2 recordings.", "", "", "", "", ""]],
|
| 393 |
None,
|
| 394 |
+
"### Select at least 2 recordings to see a trend.",
|
| 395 |
+
"### Summary will appear here."
|
| 396 |
)
|
| 397 |
|
| 398 |
rows = []
|
| 399 |
pause_series, pitch_series, rms_series = [], [], []
|
|
|
|
|
|
|
| 400 |
feats_first = None
|
| 401 |
feats_last = None
|
| 402 |
|
|
|
|
| 445 |
if feats_first is not None and feats_last is not None:
|
| 446 |
summary = summary_of_changes(feats_first, feats_last)
|
| 447 |
|
| 448 |
+
return rows, fig, "### Trend over time (within-person).", summary
|
| 449 |
|
| 450 |
|
| 451 |
def analyze_many_uploaded(files):
    """
    Timeline callback for the Files widget.

    Normalizes each uploaded item to a filesystem path and delegates to
    analyze_many_paths. Gradio may hand back tempfile-like objects (with
    a .name path) or plain strings depending on version, so both forms
    are accepted; a falsy payload yields an empty path list.
    """
    if not files:
        return analyze_many_paths([])
    resolved = [getattr(item, "name", None) or str(item) for item in files]
    return analyze_many_paths(resolved)
|
| 457 |
|
| 458 |
|
|
|
|
| 499 |
}
|
| 500 |
.card *{ color: #0b0f19 !important; }
|
| 501 |
|
| 502 |
+
/* Tabs readable on dark background */
|
| 503 |
div[role="tablist"]{
|
| 504 |
background: rgba(255,255,255,0.06) !important;
|
| 505 |
border: 1px solid rgba(255,255,255,0.14) !important;
|
| 506 |
border-radius: 14px !important;
|
| 507 |
padding: 6px !important;
|
| 508 |
}
|
| 509 |
+
button[role="tab"]{ color: rgba(255,255,255,0.92) !important; }
|
|
|
|
|
|
|
| 510 |
button[role="tab"][aria-selected="true"]{
|
| 511 |
color: rgba(255,255,255,0.98) !important;
|
| 512 |
border-bottom: 2px solid rgba(255,255,255,0.65) !important;
|
|
|
|
| 539 |
with gr.Column(scale=5):
|
| 540 |
audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="filepath")
|
| 541 |
run = gr.Button("Analyze", variant="primary")
|
|
|
|
| 542 |
with gr.Column(scale=7):
|
| 543 |
feats_df = gr.Dataframe(headers=["Feature", "Value"], interactive=False, wrap=True)
|
| 544 |
wf_plot = gr.Plot(label="Waveform + pauses")
|
|
|
|
| 549 |
with gr.TabItem("Timeline"):
|
| 550 |
with gr.Row():
|
| 551 |
with gr.Column(scale=5):
|
| 552 |
+
gr.Markdown("#### Option A — Upload")
|
| 553 |
files = gr.Files(label="Upload multiple audio files", file_count="multiple", file_types=["audio"])
|
| 554 |
run_up = gr.Button("Analyze uploaded timeline", variant="primary")
|
| 555 |
|
| 556 |
+
gr.Markdown("#### Option B — Bundled samples (repo root)")
|
| 557 |
bundled_select = gr.CheckboxGroup(choices=bundled0, label="Bundled audio files")
|
| 558 |
with gr.Row():
|
| 559 |
refresh_btn = gr.Button("Refresh list", variant="secondary")
|
| 560 |
run_b = gr.Button("Analyze selected bundled", variant="secondary")
|
| 561 |
|
|
|
|
|
|
|
| 562 |
with gr.Column(scale=7):
|
| 563 |
timeline_df = gr.Dataframe(
|
| 564 |
headers=["#", "File", "Duration", "Pauses", "Pause(s)", "Pitch(Hz)", "RMS", "Active %"],
|
|
|
|
| 566 |
wrap=True,
|
| 567 |
)
|
| 568 |
timeline_plot = gr.Plot(label="Trend plot")
|
| 569 |
+
timeline_expl = gr.Markdown("### Select at least 2 recordings.", elem_classes=["card"])
|
| 570 |
+
timeline_summary = gr.Markdown("### Summary will appear here.", elem_classes=["card"])
|
| 571 |
|
| 572 |
run_up.click(analyze_many_uploaded, inputs=[files], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
| 573 |
run_b.click(analyze_many_bundled, inputs=[bundled_select], outputs=[timeline_df, timeline_plot, timeline_expl, timeline_summary])
|
|
|
|
| 577 |
diag_refresh = gr.Button("Refresh diagnostics", variant="secondary")
|
| 578 |
diag_refresh.click(lambda: diagnostics_text(), inputs=None, outputs=[diag])
|
| 579 |
|
|
|
|
| 580 |
refresh_btn.click(refresh_bundled, inputs=None, outputs=[bundled_select, diag])
|
| 581 |
|
| 582 |
return demo
|