Spaces:

neongeckocom
/

AskJerry

Running

App Files Files Community

NeonClary committed on Apr 7

Commit

f714780

1 Parent(s): e7c7cf6

refactor: simplify TTS pipeline with stable sentence boundaries, remove prefetch cache

Browse files

Files changed (5) hide show

backend/.env.example +3 -0
backend/app/config.py +2 -0
backend/app/voice_routes.py +120 -12
frontend/src/App.css +44 -4
frontend/src/App.jsx +329 -133

backend/.env.example CHANGED Viewed

@@ -12,3 +12,6 @@ CORS_ORIGINS=http://localhost:3006,http://127.0.0.1:3006
 # External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
 # COQUI_BASE_URL=https://coqui.neonaiservices.com
 # WHISPER_BASE_URL=https://whisper.neonaiservices.com

 # External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
 # COQUI_BASE_URL=https://coqui.neonaiservices.com
 # WHISPER_BASE_URL=https://whisper.neonaiservices.com
+# Local speech-to-text: ffmpeg must be on PATH, or set full path to ffmpeg.exe (Windows)
+# FFMPEG_PATH=C:\path\to\ffmpeg.exe

backend/app/config.py CHANGED Viewed

@@ -23,6 +23,8 @@ class Settings(BaseSettings):
     # External voice services (same defaults as CCAI / Neon demos)
     coqui_base_url: str = "https://coqui.neonaiservices.com"
     whisper_base_url: str = "https://whisper.neonaiservices.com"
     @property
     def cors_origin_list(self) -> list[str]:

     # External voice services (same defaults as CCAI / Neon demos)
     coqui_base_url: str = "https://coqui.neonaiservices.com"
     whisper_base_url: str = "https://whisper.neonaiservices.com"
+    # Local STT: ffmpeg must be on PATH, or set absolute path (Windows: where winget puts ffmpeg)
+    ffmpeg_path: str = ""
     @property
     def cors_origin_list(self) -> list[str]:

backend/app/voice_routes.py CHANGED Viewed

@@ -4,10 +4,14 @@ from __future__ import annotations
 import asyncio
 import html as html_module
 import logging
 import re
 import struct
 import subprocess
 import tempfile
 import time
 from pathlib import Path
@@ -41,7 +45,8 @@ router = APIRouter()
 PROBE_TIMEOUT = 12.0
 CACHE_TTL = 120.0
-MAX_CHUNK_CHARS = 160
 _status_cache: Dict[str, Any] = {
     "tts": {"ready": False, "checked_at": 0.0},
@@ -103,6 +108,8 @@ def _md_to_spoken_text(md: str) -> str:
     text = re.sub(r"<[^>]+>", " ", text)
     text = html_module.unescape(text)
     text = _SECTION_HEADERS.sub(" ", text)
     text = re.sub(r"([.!?])\s*\1+", r"\1", text)
     text = re.sub(r"\s*\.\s*\.", ".", text)
     text = re.sub(r"\s+", " ", text).strip()
@@ -128,18 +135,66 @@ def _split_sentences(text: str) -> List[str]:
     return chunks
-async def _synthesize_one(client: httpx.AsyncClient, base: str, chunk: str) -> Optional[bytes]:
     url = f"{base}/synthesize/{quote(chunk, safe='')}"
     try:
         r = await client.get(url)
-        r.raise_for_status()
-        return r.content
     except Exception as exc:
         LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), exc)
-        return None
-def _concat_wav(segments: List[bytes]) -> bytes:
     if len(segments) == 1:
         return segments[0]
     pcm_parts: List[bytes] = []
@@ -164,6 +219,47 @@ def _concat_wav(segments: List[bytes]) -> bytes:
     return bytes(header) + all_pcm
 @router.get("/voice/status")
 async def voice_status() -> Dict[str, bool]:
     tts_ready = _cached_ready("tts")
@@ -201,10 +297,18 @@ async def text_to_speech(req: TTSRequest) -> Response:
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
-            results = await asyncio.gather(*[_synthesize_one(client, base, c) for c in chunks])
-            wav_segments = [r for r in results if r and len(r) > 44]
             if not wav_segments:
-                raise HTTPException(status_code=502, detail="TTS synthesis failed for all chunks")
             combined = _concat_wav(wav_segments)
             _status_cache["tts"] = {"ready": True, "checked_at": time.time()}
             return Response(content=combined, media_type="audio/wav")
@@ -218,6 +322,7 @@ async def text_to_speech(req: TTSRequest) -> Response:
 def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
     with tempfile.TemporaryDirectory() as tmp:
         ext = "webm" if "webm" in (src_mime or "") else "ogg"
         src = Path(tmp) / f"in.{ext}"
@@ -225,7 +330,7 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
         src.write_bytes(audio_bytes)
         result = subprocess.run(
             [
-                "ffmpeg",
                 "-y",
                 "-i",
                 str(src),
@@ -249,9 +354,9 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
 @router.post("/transcribe")
 async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
     contents = await audio.read()
     if not contents:
         return {"text": ""}
-    mime = audio.content_type or "audio/webm"
     LOG.info("STT: received %s bytes (%s)", len(contents), mime)
     try:
@@ -266,9 +371,12 @@ async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
             loop = asyncio.get_running_loop()
             wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
             LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
         except Exception as e:
             LOG.error("STT conversion error: %s", e)
-            raise HTTPException(status_code=500, detail="Audio conversion failed")
     else:
         wav_bytes = contents

 import asyncio
 import html as html_module
+import io
 import logging
+import os
 import re
+import shutil
 import struct
 import subprocess
+import wave
 import tempfile
 import time
 from pathlib import Path
 PROBE_TIMEOUT = 12.0
 CACHE_TTL = 120.0
+# Fewer round-trips to Coqui (longer URLs); still typically under proxy URL limits.
+MAX_CHUNK_CHARS = 380
 _status_cache: Dict[str, Any] = {
     "tts": {"ready": False, "checked_at": 0.0},
     text = re.sub(r"<[^>]+>", " ", text)
     text = html_module.unescape(text)
     text = _SECTION_HEADERS.sub(" ", text)
+    # Remove trailing period after other sentence-ending punctuation ("!." → "!", "?." → "?")
+    text = re.sub(r"([!?])\s*\.+", r"\1", text)
     text = re.sub(r"([.!?])\s*\1+", r"\1", text)
     text = re.sub(r"\s*\.\s*\.", ".", text)
     text = re.sub(r"\s+", " ", text).strip()
     return chunks
+async def _synthesize_one(
+    client: httpx.AsyncClient, base: str, chunk: str
+) -> tuple[Optional[bytes], Optional[str]]:
+    """Synthesize one text chunk through the Coqui HTTP endpoint.
+
+    Sends ``GET {base}/synthesize/<percent-encoded chunk>`` on ``client``.
+
+    Returns:
+        ``(audio_bytes, None)`` for a 2xx response, otherwise ``(None, error)``
+        where ``error`` is a truncated, human-readable description. HTTP and
+        transport failures are logged as warnings and reported through the
+        tuple instead of being raised.
+    """
     url = f"{base}/synthesize/{quote(chunk, safe='')}"
     try:
         r = await client.get(url)
+        # Only a 2xx response carries audio. A redirect (3xx) is not followed by
+        # the client this function is called with, so its body must not be
+        # handed back as WAV data.
+        if not 200 <= r.status_code < 300:
+            err = f"HTTP {r.status_code}: {r.text[:160]}"
+            LOG.warning("TTS chunk failed: %s", err)
+            return None, err
+        return r.content, None
     except Exception as exc:
+        err = str(exc)[:220]
         LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), exc)
+        return None, err
+def _ffmpeg_executable() -> str:
+    """Locate the ffmpeg binary used for local audio conversion.
+
+    Candidates are tried in this order:
+      1. ``settings.ffmpeg_path`` - an existing file, or a command name / path
+         that ``shutil.which`` can resolve;
+      2. ``ffmpeg`` on PATH;
+      3. the WinGet ``Links`` shim, then any ``ffmpeg.exe`` below the WinGet
+         ``Packages`` directory (both under ``%LOCALAPPDATA%``);
+      4. ``ffmpeg/bin/ffmpeg.exe`` under ``ProgramFiles`` and ``ProgramFiles(x86)``.
+
+    Returns:
+        Path of the executable as a string.
+
+    Raises:
+        FileNotFoundError: no candidate exists; the message explains how to
+            install or configure ffmpeg.
+    """
+    raw = (settings.ffmpeg_path or "").strip()
+    if raw:
+        p = Path(raw)
+        if p.is_file():
+            return str(p.resolve())
+        # Not an existing file: still accept a bare command name ("ffmpeg") or
+        # an extension-less Windows path, both of which shutil.which resolves.
+        configured = shutil.which(raw)
+        if configured:
+            return configured
+        # A wrong setting used to be ignored without a trace. Keep falling back
+        # to auto-detection, but tell the operator the value had no effect.
+        LOG.warning("Configured ffmpeg path %r is not usable; trying auto-detection", raw)
+    w = shutil.which("ffmpeg")
+    if w:
+        return w
+    localappdata = os.environ.get("LOCALAPPDATA", "")
+    if localappdata:
+        winget_link = Path(localappdata) / "Microsoft" / "WinGet" / "Links" / "ffmpeg.exe"
+        if winget_link.is_file():
+            return str(winget_link.resolve())
+        pkg_root = Path(localappdata) / "Microsoft" / "WinGet" / "Packages"
+        if pkg_root.is_dir():
+            try:
+                candidates = [p for p in pkg_root.rglob("ffmpeg.exe") if p.is_file()]
+                # Prefer a copy that sits in a "bin" directory, then the shortest path.
+                candidates.sort(
+                    key=lambda p: (
+                        0 if "\\bin\\" in str(p).lower() or "/bin/" in str(p).lower() else 1,
+                        len(str(p)),
+                    )
+                )
+                if candidates:
+                    return str(candidates[0].resolve())
+            except OSError:
+                # Best effort: an unreadable package directory just means no match here.
+                pass
+    for env in ("ProgramFiles", "ProgramFiles(x86)"):
+        base = os.environ.get(env, "")
+        if not base:
+            continue
+        cand = Path(base) / "ffmpeg" / "bin" / "ffmpeg.exe"
+        if cand.is_file():
+            return str(cand.resolve())
+    raise FileNotFoundError(
+        "ffmpeg not found. Install ffmpeg (e.g. winget install Gyan.FFmpeg), add it to PATH, "
+        "or set FFMPEG_PATH in backend/.env to the full path to ffmpeg.exe"
+    )
+def _concat_wav_legacy(segments: List[bytes]) -> bytes:
+    """Best-effort RIFF merge (can confuse decoders); used only as fallback."""
     if len(segments) == 1:
         return segments[0]
     pcm_parts: List[bytes] = []
     return bytes(header) + all_pcm
+def _concat_wav(segments: List[bytes]) -> bytes:
+    """Merge Coqui WAV segments with matching fmt using stdlib wave (reliable playback).
+
+    Segments that are empty or shorter than 44 bytes are dropped first. The
+    remaining ones must share channel count, sample width and frame rate; their
+    frames are written in order into one new WAV.
+
+    Returns:
+        ``b""`` when no usable segment remains, the lone segment unchanged when
+        exactly one remains, otherwise the merged WAV bytes. If the ``wave``
+        based merge fails for any reason (including a format mismatch), a
+        warning is logged and the result of ``_concat_wav_legacy`` is returned.
+    """
+    segs = [s for s in segments if s and len(s) >= 44]
+    if not segs:
+        return b""
+    if len(segs) == 1:
+        return segs[0]
+    readers: List[wave.Wave_read] = []
+    try:
+        for raw in segs:
+            readers.append(wave.open(io.BytesIO(raw), "rb"))
+        r0 = readers[0]
+        ch, sw, fr = r0.getnchannels(), r0.getsampwidth(), r0.getframerate()
+        out_buf = io.BytesIO()
+        wo = wave.open(out_buf, "wb")
+        wo.setnchannels(ch)
+        wo.setsampwidth(sw)
+        wo.setframerate(fr)
+        try:
+            for w in readers:
+                if (
+                    w.getnchannels() != ch
+                    or w.getsampwidth() != sw
+                    or w.getframerate() != fr
+                ):
+                    raise ValueError("WAV format mismatch between segments")
+                wo.writeframes(w.readframes(w.getnframes()))
+        finally:
+            # Closing the writer finalizes the header sizes before the buffer is read.
+            wo.close()
+        return out_buf.getvalue()
+    except Exception as exc:
+        LOG.warning("WAV concat via wave module failed, using legacy merge: %s", exc)
+        # Hand the fallback the same filtered list used above, not the raw
+        # input, so it never sees the empty/too-short entries already rejected.
+        return _concat_wav_legacy(segs)
+    finally:
+        for w in readers:
+            try:
+                w.close()
+            except Exception:
+                pass
 @router.get("/voice/status")
 async def voice_status() -> Dict[str, bool]:
     tts_ready = _cached_ready("tts")
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
+            pairs = await asyncio.gather(*[_synthesize_one(client, base, c) for c in chunks])
+            wav_segments = [seg for seg, _ in pairs if seg and len(seg) > 44]
             if not wav_segments:
+                first_err = next((e for _, e in pairs if e), None)
+                detail = (first_err or "TTS synthesis failed for all chunks")[:500]
+                raise HTTPException(status_code=502, detail=detail)
+            if len(wav_segments) < len(chunks):
+                LOG.warning(
+                    "TTS: partial success — %s/%s chunk(s) synthesized (others failed upstream)",
+                    len(wav_segments),
+                    len(chunks),
+                )
             combined = _concat_wav(wav_segments)
             _status_cache["tts"] = {"ready": True, "checked_at": time.time()}
             return Response(content=combined, media_type="audio/wav")
 def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
+    ff = _ffmpeg_executable()
     with tempfile.TemporaryDirectory() as tmp:
         ext = "webm" if "webm" in (src_mime or "") else "ogg"
         src = Path(tmp) / f"in.{ext}"
         src.write_bytes(audio_bytes)
         result = subprocess.run(
             [
+                ff,
                 "-y",
                 "-i",
                 str(src),
 @router.post("/transcribe")
 async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
     contents = await audio.read()
+    mime = audio.content_type or "audio/webm"
     if not contents:
         return {"text": ""}
     LOG.info("STT: received %s bytes (%s)", len(contents), mime)
     try:
             loop = asyncio.get_running_loop()
             wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
             LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
+        except FileNotFoundError as e:
+            LOG.error("STT: ffmpeg missing: %s", e)
+            raise HTTPException(status_code=503, detail=str(e))
         except Exception as e:
             LOG.error("STT conversion error: %s", e)
+            raise HTTPException(status_code=500, detail=f"Audio conversion failed: {e!s}"[:500])
     else:
         wav_bytes = contents

frontend/src/App.css CHANGED Viewed

@@ -631,24 +631,61 @@ html.aj-hide-pointer * {
 .aj-composer {
   z-index: 40;
-  padding: 6px 6px;
   border-top: 1px solid var(--lc-border);
   background: var(--lc-sidebar);
   flex-shrink: 0;
   flex-grow: 0;
 }
 .aj-composer-inner {
   display: flex;
   gap: 6px;
   align-items: flex-end;
-  max-width: 1040px;
-  margin: 0 auto;
 }
 .aj-composer-field {
   flex: 1;
   min-width: 0;
 }
 .aj-composer-field--listening textarea {
@@ -657,7 +694,10 @@ html.aj-hide-pointer * {
 }
 .aj-composer textarea {
-  flex: 1;
   box-sizing: border-box;
   min-height: 34px;
   max-height: 50vh;

 .aj-composer {
   z-index: 40;
+  padding: 6px 10px;
   border-top: 1px solid var(--lc-border);
   background: var(--lc-sidebar);
   flex-shrink: 0;
   flex-grow: 0;
 }
+.aj-voice-error-banner {
+  display: flex;
+  align-items: flex-start;
+  justify-content: space-between;
+  gap: 12px;
+  padding: 8px 12px;
+  font-size: 13px;
+  line-height: 1.45;
+  color: #7f1d1d;
+  background: #fef2f2;
+  border-bottom: 1px solid #fecaca;
+}
+.aj-voice-error-banner span {
+  flex: 1;
+  min-width: 0;
+}
+.aj-voice-error-dismiss {
+  flex-shrink: 0;
+  border: none;
+  background: transparent;
+  color: #991b1b;
+  font-size: 20px;
+  line-height: 1;
+  cursor: pointer;
+  padding: 0 4px;
+}
+.aj-voice-error-dismiss:hover {
+  color: #450a0a;
+}
 .aj-composer-inner {
   display: flex;
   gap: 6px;
   align-items: flex-end;
+  width: 100%;
+  max-width: none;
+  box-sizing: border-box;
 }
 .aj-composer-field {
   flex: 1;
   min-width: 0;
+  display: flex;
+  flex-direction: column;
+  align-self: stretch;
 }
 .aj-composer-field--listening textarea {
 }
 .aj-composer textarea {
+  width: 100%;
+  min-width: 0;
+  flex: 1 1 auto;
+  align-self: stretch;
   box-sizing: border-box;
   min-height: 34px;
   max-height: 50vh;

frontend/src/App.jsx CHANGED Viewed

@@ -8,6 +8,8 @@ import {
   Loader2,
   MessageSquarePlus,
   Mic,
   RefreshCw,
   Search,
   Send,
@@ -73,11 +75,29 @@ function parseSseBuffer(buf, onEvent) {
   return remainder
 }
-function extractFirstSentence(text) {
   const t = (text || '').trim()
-  if (!t) return null
-  const m = t.match(/^[\s\S]+?[.!?](?=\s|$)/)
-  return m ? m[0].trim() : null
 }
 function appendTokenContent(acc, ev) {
@@ -196,28 +216,72 @@ function AssistantSearchBar({ content, show, speak }) {
   }
   if (!show) return null
-  if (!(content || '').trim() && !speak?.loading && !speak?.playing) return null
   return (
     <div className="aj-msg-actions" ref={wrapRef}>
       <div className="aj-msg-search-wrap">
         {speak && (
-          <button
-            type="button"
-            className="aj-msg-search-btn"
-            onClick={speak.onSpeak}
-            disabled={speak.disabled}
-            data-tip={speak.playing ? 'Stop' : speak.loading ? 'Loading speech…' : 'Read aloud'}
-            aria-label={speak.playing ? 'Stop' : speak.loading ? 'Loading speech' : 'Read aloud'}
-          >
-            {speak.loading ? (
-              <Loader2 size={14} className="aj-spin" aria-hidden />
-            ) : speak.playing ? (
-              <Square size={14} aria-hidden />
-            ) : (
-              <Volume2 size={14} aria-hidden />
             )}
-          </button>
         )}
         <button
           type="button"
@@ -315,15 +379,21 @@ export default function App() {
   const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
   const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
   const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
   const [micListening, setMicListening] = useState(false)
   const [micTranscribing, setMicTranscribing] = useState(false)
   const audioRef = useRef(null)
   const ttsBlobUrlRef = useRef(null)
-  const prefetchAbortRef = useRef(null)
-  const prefetchKeyRef = useRef('')
-  const prefetchBlobRef = useRef(null)
   const streamingWasRef = useRef(false)
   const mediaRecorderRef = useRef(null)
   const audioChunksRef = useRef([])
@@ -486,6 +556,7 @@ export default function App() {
   }, [exportOpen])
   const stopTts = useCallback(() => {
     if (audioRef.current) {
       audioRef.current.pause()
       audioRef.current.src = ''
@@ -497,126 +568,229 @@ export default function App() {
     }
     setTtsLoadingIndex(null)
     setTtsPlayingIndex(null)
   }, [])
-  const playTtsForIndex = useCallback(
-    async (index, text) => {
-      const raw = (text || '').trim()
-      if (!raw) return
-      try {
-        sessionStorage.setItem(STORAGE_TTS_PRIMED, '1')
-      } catch {
-        /* */
       }
-      stopTts()
-      setTtsLoadingIndex(index)
       try {
         const res = await fetch('/api/tts', {
           method: 'POST',
           headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({ text: raw }),
         })
-        if (!res.ok) throw new Error('TTS failed')
         const blob = await res.blob()
-        const url = URL.createObjectURL(blob)
-        ttsBlobUrlRef.current = url
-        const audio = new Audio(url)
-        audioRef.current = audio
-        setTtsLoadingIndex(null)
-        setTtsPlayingIndex(index)
-        audio.onended = () => {
-          stopTts()
         }
-        audio.onerror = () => {
-          stopTts()
         }
-        await audio.play()
-      } catch {
         setTtsLoadingIndex(null)
         setTtsPlayingIndex(null)
-      }
-    },
-    [stopTts],
-  )
-  const onSpeakToggle = useCallback(
-    (index, text) => {
-      if (ttsLoadingIndex === index || ttsPlayingIndex === index) {
-        stopTts()
-        return
       }
-      playTtsForIndex(index, text)
     },
-    [ttsLoadingIndex, ttsPlayingIndex, stopTts, playTtsForIndex],
   )
   useEffect(() => {
-    if (streamingWasRef.current && !streaming && alwaysSpeak) {
-      const last = messages[messages.length - 1]
       if (last?.role === 'assistant' && last.content?.trim()) {
         playTtsForIndex(messages.length - 1, last.content)
       }
     }
     streamingWasRef.current = streaming
   }, [streaming, alwaysSpeak, messages, playTtsForIndex])
   useEffect(() => {
-    try {
-      sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, alwaysSpeak ? '1' : '0')
-    } catch {
-      /* */
-    }
   }, [alwaysSpeak])
   useEffect(() => {
-    return () => {
-      stopTts()
-      prefetchAbortRef.current?.abort()
-      if (prefetchBlobRef.current?.url) {
-        URL.revokeObjectURL(prefetchBlobRef.current.url)
-      }
     }
-  }, [stopTts])
   useEffect(() => {
-    if (!streaming) return
-    let primed = false
-    try {
-      primed = sessionStorage.getItem(STORAGE_TTS_PRIMED) === '1'
-    } catch {
-      /* */
-    }
-    if (!primed) return
-    const last = messages[messages.length - 1]
-    if (!last || last.role !== 'assistant') return
-    const text = last.content
-    const first = extractFirstSentence(text)
-    if (!first || first.length < 8) return
-    const key = `${messages.length - 1}:${first}`
-    if (prefetchKeyRef.current === key) return
-    prefetchKeyRef.current = key
-    prefetchAbortRef.current?.abort()
-    const ac = new AbortController()
-    prefetchAbortRef.current = ac
-    fetch('/api/tts', {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ text: first }),
-      signal: ac.signal,
-    })
-      .then((r) => {
-        if (!r.ok) throw new Error('prefetch')
-        return r.blob()
-      })
-      .then((blob) => {
-        const url = URL.createObjectURL(blob)
-        if (prefetchBlobRef.current?.url) {
-          URL.revokeObjectURL(prefetchBlobRef.current.url)
-        }
-        prefetchBlobRef.current = { index: messages.length - 1, sentence: first, url }
-      })
-      .catch(() => {})
-  }, [streaming, messages])
   const toggleMic = useCallback(async () => {
     if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
@@ -646,12 +820,32 @@ export default function App() {
         const form = new FormData()
         form.append('audio', blob, 'recording.webm')
         fetch('/api/transcribe', { method: 'POST', body: form })
-          .then((r) => r.json())
           .then((data) => {
             const tx = data?.text?.trim()
-            if (tx) setInput((prev) => (prev ? `${prev} ${tx}` : tx))
           })
-          .catch(() => {})
           .finally(() => setMicTranscribing(false))
       }
       mediaRecorderRef.current = mediaRecorder
@@ -853,12 +1047,7 @@ export default function App() {
   const handleNewChat = () => {
     abortRef.current?.abort()
     stopTts()
-    prefetchAbortRef.current?.abort()
-    prefetchKeyRef.current = ''
-    if (prefetchBlobRef.current?.url) {
-      URL.revokeObjectURL(prefetchBlobRef.current.url)
-      prefetchBlobRef.current = null
-    }
     setMessages([])
     setInput('')
     setSummary(null)
@@ -872,12 +1061,7 @@ export default function App() {
   const handleRefresh = () => {
     if (streaming) return
-    prefetchAbortRef.current?.abort()
-    prefetchKeyRef.current = ''
-    if (prefetchBlobRef.current?.url) {
-      URL.revokeObjectURL(prefetchBlobRef.current.url)
-      prefetchBlobRef.current = null
-    }
     let base = [...messages]
     if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
     if (!base.length || base[base.length - 1].role !== 'user') return
@@ -891,12 +1075,7 @@ export default function App() {
     const text = input.trim()
     if (!text || streaming) return
     setHideTypingCursor(false)
-    prefetchAbortRef.current?.abort()
-    prefetchKeyRef.current = ''
-    if (prefetchBlobRef.current?.url) {
-      URL.revokeObjectURL(prefetchBlobRef.current.url)
-      prefetchBlobRef.current = null
-    }
     const userMsg = { role: 'user', content: text }
     const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
     setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
@@ -1149,6 +1328,15 @@ export default function App() {
         </div>
       </header>
       {messages.length > 0 && (
         <div className="aj-context-bar">
           <div className="aj-context-meter">
@@ -1211,12 +1399,20 @@ export default function App() {
                       show={!(streaming && i === messages.length - 1)}
                       speak={{
                         loading: ttsLoadingIndex === i,
-                        playing: ttsPlayingIndex === i,
                         disabled:
                           !(m.content || '').trim()
                           && ttsLoadingIndex !== i
                           && ttsPlayingIndex !== i,
-                        onSpeak: () => onSpeakToggle(i, m.content),
                       }}
                     />
                   )}

   Loader2,
   MessageSquarePlus,
   Mic,
+  Pause,
+  Play,
   RefreshCw,
   Search,
   Send,
   return remainder
 }
+/**
+ * Split text into sentences for incremental TTS playback.
+ *
+ * A sentence is a run of characters other than `.`, `!`, `?` or newline,
+ * followed by one or more of those terminators. Each match is trimmed and
+ * empty matches are skipped. Boundaries are meant to be stable: once a
+ * sentence has been terminated it keeps its position in the result as more
+ * text streams in.
+ *
+ * NOTE(review): a `.` inside a number or abbreviation ("3.14", "e.g.") also
+ * ends a sentence under this pattern - confirm that is acceptable for speech.
+ *
+ * @param {string} text - Text to split; a falsy value is treated as empty.
+ * @param {boolean} [includeTrailing=false] - Also return any text after the
+ *   last sentence terminator (used after streaming ends so nothing is
+ *   silently dropped).
+ * @returns {string[]} Sentences in order. When no terminated sentence is
+ *   found, the result is the whole trimmed text as a single entry if
+ *   includeTrailing is set, otherwise an empty array.
+ */
+function extractSentences(text, includeTrailing = false) {
   const t = (text || '').trim()
+  if (!t) return []
+  const re = /[^.!?\n]+[.!?\n]+/g
+  const sentences = []
+  let match, lastEnd = 0
+  while ((match = re.exec(t)) !== null) {
+    const s = match[0].trim()
+    if (s) sentences.push(s)
+    lastEnd = re.lastIndex // end of the last terminated match; text beyond it is the trailing remainder
+  }
+  if (!sentences.length) return includeTrailing ? [t] : []
+  if (includeTrailing) {
+    const remainder = t.slice(lastEnd).trim()
+    if (remainder) sentences.push(remainder)
+  }
+  return sentences
 }
 function appendTokenContent(acc, ev) {
   }
   if (!show) return null
+  if (
+    !(content || '').trim()
+    && !speak?.loading
+    && !speak?.playing
+    && !speak?.paused
+  ) {
+    return null
+  }
   return (
     <div className="aj-msg-actions" ref={wrapRef}>
       <div className="aj-msg-search-wrap">
         {speak && (
+          <>
+            <button
+              type="button"
+              className="aj-msg-search-btn"
+              onClick={
+                speak.paused
+                  ? speak.onResume
+                  : speak.playing
+                    ? speak.onPause
+                    : speak.onReadAloud
+              }
+              disabled={speak.disabled || speak.loading}
+              data-tip={
+                speak.paused
+                  ? 'Resume'
+                  : speak.playing
+                    ? 'Pause'
+                    : speak.loading
+                      ? 'Loading speech…'
+                      : 'Read aloud'
+              }
+              aria-label={
+                speak.paused
+                  ? 'Resume'
+                  : speak.playing
+                    ? 'Pause'
+                    : speak.loading
+                      ? 'Loading speech'
+                      : 'Read aloud'
+              }
+            >
+              {speak.paused ? (
+                <Play size={14} aria-hidden />
+              ) : speak.loading ? (
+                <Loader2 size={14} className="aj-spin" aria-hidden />
+              ) : speak.playing ? (
+                <Pause size={14} aria-hidden />
+              ) : (
+                <Volume2 size={14} aria-hidden />
+              )}
+            </button>
+            {(speak.playing || speak.paused || (speak.loading && speak.showStop)) && (
+              <button
+                type="button"
+                className="aj-msg-search-btn"
+                onClick={speak.onStopReading}
+                data-tip="Stop reading"
+                aria-label="Stop reading"
+              >
+                <Square size={14} aria-hidden />
+              </button>
             )}
+          </>
         )}
         <button
           type="button"
   const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
   const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
+  const [ttsPaused, setTtsPaused] = useState(false)
   const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
   const [micListening, setMicListening] = useState(false)
   const [micTranscribing, setMicTranscribing] = useState(false)
+  const [voiceError, setVoiceError] = useState(null)
   const audioRef = useRef(null)
   const ttsBlobUrlRef = useRef(null)
+  const ttsSessionRef = useRef(0)
+  const messagesRef = useRef(messages)
+  messagesRef.current = messages
+  const streamingRef = useRef(streaming)
+  streamingRef.current = streaming
+  const ttsContentResolverRef = useRef(null)
+  const ttsPlaybackActiveRef = useRef(false)
   const streamingWasRef = useRef(false)
   const mediaRecorderRef = useRef(null)
   const audioChunksRef = useRef([])
   }, [exportOpen])
   const stopTts = useCallback(() => {
+    ttsSessionRef.current += 1
     if (audioRef.current) {
       audioRef.current.pause()
       audioRef.current.src = ''
     }
     setTtsLoadingIndex(null)
     setTtsPlayingIndex(null)
+    setTtsPaused(false)
   }, [])
+  const pauseTts = useCallback(() => { // pause the current TTS audio element, if one is playing
+    const a = audioRef.current
+    if (a && !a.paused) { // no-op when there is no audio or it is already paused
+      a.pause()
+      setTtsPaused(true)
+    }
+  }, [])
+  const resumeTts = useCallback(async () => { // resume the current TTS audio element
+    const a = audioRef.current
+    if (!a) return
+    try {
+      await a.play()
+      setTtsPaused(false) // cleared only after play() has resolved
+    } catch (e) {
+      setVoiceError(String(e?.message || e)) // a rejected play() is reported through the voice error state
+    }
+  }, [])
+  const playAudioUrlUntilDone = useCallback((url, session) => { // play one audio URL; the promise resolves (never rejects) when playback ends, errors, or the session is superseded
+    return new Promise((resolve) => {
+      if (session !== ttsSessionRef.current) { // session already superseded: do not start playback
+        resolve()
+        return
       }
+      const audio = new Audio(url)
+      audioRef.current = audio // published so pauseTts / resumeTts / stopTts act on this element
+      ttsBlobUrlRef.current = url
+      let settled = false
+      let pollAbort = null
+      const finish = () => { // idempotent: stops the poll, detaches handlers, resolves once
+        if (settled) return
+        settled = true
+        if (pollAbort != null) clearInterval(pollAbort)
+        audio.onended = null
+        audio.onerror = null
+        resolve()
+      }
+      pollAbort = setInterval(() => { // a paused or stopped element fires no "ended" event, so poll for a session change
+        if (session !== ttsSessionRef.current) finish()
+      }, 120)
+      audio.onended = () => {
+        finish()
+      }
+      audio.onerror = () => {
+        finish()
+      }
+      audio.play().catch(() => finish()) // a rejected play() is treated as "done" rather than surfaced
+    })
+  }, [])
+  const fetchTtsAudio = useCallback(async (chunkText) => { // POST one text chunk to /api/tts; resolves to { url } (a blob object URL this function does not revoke) or { error }
+    for (let attempt = 0; attempt < 2; attempt++) { // two attempts in total, i.e. one retry
       try {
         const res = await fetch('/api/tts', {
           method: 'POST',
           headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ text: chunkText }),
         })
+        if (!res.ok) {
+          let detail = `TTS failed (${res.status})`
+          try {
+            const ct = res.headers.get('content-type') || ''
+            if (ct.includes('json')) { // prefer the `detail` field of a JSON error body when present
+              const j = await res.json()
+              if (j?.detail != null) {
+                detail = typeof j.detail === 'string' ? j.detail : JSON.stringify(j.detail)
+              }
+            }
+          } catch { /* */ }
+          if (res.status >= 500 && attempt === 0) { // retry once on a 5xx; other statuses fail immediately
+            continue
+          }
+          return { error: detail }
+        }
         const blob = await res.blob()
+        return { url: URL.createObjectURL(blob) }
+      } catch (e) {
+        if (attempt === 0) { // thrown errors (e.g. a failed fetch) also get one retry
+          continue
         }
+        return { error: String(e?.message || e) }
+      }
+    }
+    return { error: 'TTS failed after retries' }
+  }, [])
+  const playTtsForIndex = useCallback( // speak message `index` sentence by sentence, following the text while it is still streaming
+    async (index, initialText) => {
+      try { sessionStorage.setItem(STORAGE_TTS_PRIMED, '1') } catch { /* */ }
+      setVoiceError(null)
+      stopTts() // increments ttsSessionRef, which cancels any earlier run of this loop
+      const session = ttsSessionRef.current // token for this run; a later stopTts() invalidates it
+      ttsPlaybackActiveRef.current = true
+      setTtsLoadingIndex(index)
+      setTtsPlayingIndex(null)
+      setTtsPaused(false)
+      const LOOKAHEAD = 2 // fetch window: the sentence about to play plus the next one
+      let playedCount = 0 // index of the next sentence to play (skipped sentences count as played)
+      let anyPlayed = false
+      let lastErr = null
+      const inFlight = new Map() // sentence index -> pending fetchTtsAudio promise
+      const getSentencesAndLimit = () => { // re-reads the latest message text through refs on every call
+        const live = streamingRef.current
+        const text = (messagesRef.current[index]?.content || initialText || '').trim()
+        const sentences = extractSentences(text, !live) // the unterminated tail is included only once streaming has ended
+        return { sentences, limit: sentences.length, live }
+      }
+      try {
+        // eslint-disable-next-line no-constant-condition
+        while (true) {
+          if (session !== ttsSessionRef.current) return // NOTE(review): promises still in inFlight may resolve to blob URLs that are never revoked - confirm
+          const { sentences, limit, live } = getSentencesAndLimit()
+          if (playedCount < limit) {
+            for (let ahead = playedCount; ahead < Math.min(playedCount + LOOKAHEAD, limit); ahead++) {
+              if (!inFlight.has(ahead)) {
+                inFlight.set(ahead, fetchTtsAudio(sentences[ahead]))
+              }
+            }
+            if (!anyPlayed) setTtsLoadingIndex(index)
+            const result = await (inFlight.get(playedCount) || fetchTtsAudio(sentences[playedCount]))
+            inFlight.delete(playedCount)
+            if (session !== ttsSessionRef.current) return // NOTE(review): result.url, if set, is not revoked on this path - confirm
+            if (!result?.url) { // fetch failed: remember the error and skip this sentence
+              if (result?.error) lastErr = result.error
+              playedCount++
+              continue
+            }
+            anyPlayed = true
+            setTtsLoadingIndex(null)
+            setTtsPlayingIndex(index)
+            setTtsPaused(false)
+            await playAudioUrlUntilDone(result.url, session)
+            URL.revokeObjectURL(result.url)
+            if (ttsBlobUrlRef.current === result.url) { // clear the refs only if they still point at this clip
+              ttsBlobUrlRef.current = null
+              audioRef.current = null
+            }
+            playedCount++
+            if (session !== ttsSessionRef.current) return
+            continue
+          }
+          if (!live) break // everything played and the stream has ended
+          setTtsLoadingIndex(index)
+          await new Promise(resolve => { // wait for more text; presumably resolved elsewhere through ttsContentResolverRef - verify against the caller
+            ttsContentResolverRef.current = resolve
+            const rechk = getSentencesAndLimit()
+            if (rechk.limit > playedCount || !rechk.live) { // new text arrived or streaming ended in the meantime: do not wait
+              ttsContentResolverRef.current = null
+              resolve()
+            }
+          })
         }
         setTtsLoadingIndex(null)
         setTtsPlayingIndex(null)
+        setTtsPaused(false)
+        if (!anyPlayed && lastErr) setVoiceError(lastErr) // surface an error only when nothing at all could be played
+      } catch (e) {
+        setVoiceError(String(e?.message || e))
+        setTtsLoadingIndex(null)
+        setTtsPlayingIndex(null)
+        setTtsPaused(false)
+      } finally {
+        ttsPlaybackActiveRef.current = false // NOTE(review): unconditional - if a newer run has already set this to true, a superseded run resets it here; confirm
       }
     },
+    [stopTts, playAudioUrlUntilDone, fetchTtsAudio],
   )
   // Auto-speak: when `alwaysSpeak` is on and no playback is active, start
   // reading the last assistant message, either during streaming or right
   // after streaming ends.
   useEffect(() => {
+    const last = messages[messages.length - 1]
     // While streaming: start once the reply holds at least two sentences
     // (presumably so the first one is complete — TODO confirm).
+    if (streaming && alwaysSpeak && !ttsPlaybackActiveRef.current) {
+      if (last?.role === 'assistant' && last.content?.trim()) {
+        const sentences = extractSentences(last.content.trim())
+        if (sentences.length >= 2) {
+          playTtsForIndex(messages.length - 1, last.content)
+        }
+      }
+    }
     // Streaming has just finished (previous value true, current false) and
     // nothing is playing: speak the whole reply.
+    if (streamingWasRef.current && !streaming && alwaysSpeak && !ttsPlaybackActiveRef.current) {
       if (last?.role === 'assistant' && last.content?.trim()) {
         playTtsForIndex(messages.length - 1, last.content)
       }
     }
     // Remember the current value so the next run can detect the transition.
     streamingWasRef.current = streaming
   }, [streaming, alwaysSpeak, messages, playTtsForIndex])
   // Persist the `alwaysSpeak` toggle to sessionStorage as '1' / '0' whenever
   // it changes; storage errors are ignored.
   useEffect(() => {
+    try { sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, alwaysSpeak ? '1' : '0') } catch { /* */ }
   }, [alwaysSpeak])
   // Whenever `messages` or `streaming` changes, call the resolver stored in
   // ttsContentResolverRef (if any) and clear it, so a TTS loop waiting for
   // more content re-checks what is available.
   useEffect(() => {
+    if (ttsContentResolverRef.current) {
+      ttsContentResolverRef.current()
+      ttsContentResolverRef.current = null
     }
+  }, [messages, streaming])
   // Cleanup only: call stopTts when the component unmounts, or before this
   // effect re-runs because `stopTts` changed identity.
   useEffect(() => {
+    return () => { stopTts() }
+  }, [stopTts])
   const toggleMic = useCallback(async () => {
     if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
         const form = new FormData()
         form.append('audio', blob, 'recording.webm')
         fetch('/api/transcribe', { method: 'POST', body: form })
+          .then(async (r) => {
+            const data = await r.json().catch(() => ({}))
+            if (!r.ok) {
+              const d = data?.detail
+              const msg =
+                typeof d === 'string'
+                  ? d
+                  : d != null
+                    ? JSON.stringify(d)
+                    : `Speech-to-text failed (${r.status})`
+              setVoiceError(msg)
+              return null
+            }
+            return data
+          })
           .then((data) => {
+            if (!data) return
             const tx = data?.text?.trim()
+            if (tx) {
+              setVoiceError(null)
+              setInput((prev) => (prev ? `${prev} ${tx}` : tx))
+            }
+          })
+          .catch((e) => {
+            setVoiceError(String(e?.message || e))
           })
           .finally(() => setMicTranscribing(false))
       }
       mediaRecorderRef.current = mediaRecorder
   const handleNewChat = () => {
     abortRef.current?.abort()
     stopTts()
+    setVoiceError(null)
     setMessages([])
     setInput('')
     setSummary(null)
   const handleRefresh = () => {
     if (streaming) return
+    stopTts()
     let base = [...messages]
     if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
     if (!base.length || base[base.length - 1].role !== 'user') return
     const text = input.trim()
     if (!text || streaming) return
     setHideTypingCursor(false)
+    stopTts()
     const userMsg = { role: 'user', content: text }
     const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
     setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
         </div>
       </header>
+      {voiceError && (
+        <div className="aj-voice-error-banner" role="alert">
+          <span>{voiceError}</span>
+          <button type="button" className="aj-voice-error-dismiss" onClick={() => setVoiceError(null)} aria-label="Dismiss">
+            ×
+          </button>
+        </div>
+      )}
       {messages.length > 0 && (
         <div className="aj-context-bar">
           <div className="aj-context-meter">
                       show={!(streaming && i === messages.length - 1)}
                       speak={{
                         loading: ttsLoadingIndex === i,
+                        playing:
+                          ttsPlayingIndex === i
+                          && !ttsPaused
+                          && ttsLoadingIndex !== i,
+                        paused: ttsPlayingIndex === i && ttsPaused,
+                        showStop: ttsLoadingIndex === i || ttsPlayingIndex === i,
                         disabled:
                           !(m.content || '').trim()
                           && ttsLoadingIndex !== i
                           && ttsPlayingIndex !== i,
+                        onReadAloud: () => playTtsForIndex(i, m.content),
+                        onPause: pauseTts,
+                        onResume: resumeTts,
+                        onStopReading: stopTts,
                       }}
                     />
                   )}