NeonClary committed on
Commit
f714780
·
1 Parent(s): e7c7cf6

refactor: simplify TTS pipeline with stable sentence boundaries, remove prefetch cache

Browse files
backend/.env.example CHANGED
@@ -12,3 +12,6 @@ CORS_ORIGINS=http://localhost:3006,http://127.0.0.1:3006
12
  # External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
13
  # COQUI_BASE_URL=https://coqui.neonaiservices.com
14
  # WHISPER_BASE_URL=https://whisper.neonaiservices.com
 
 
 
 
12
  # External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
13
  # COQUI_BASE_URL=https://coqui.neonaiservices.com
14
  # WHISPER_BASE_URL=https://whisper.neonaiservices.com
15
+
16
+ # Local speech-to-text: ffmpeg must be on PATH, or set full path to ffmpeg.exe (Windows)
17
+ # FFMPEG_PATH=C:\path\to\ffmpeg.exe
backend/app/config.py CHANGED
@@ -23,6 +23,8 @@ class Settings(BaseSettings):
23
  # External voice services (same defaults as CCAI / Neon demos)
24
  coqui_base_url: str = "https://coqui.neonaiservices.com"
25
  whisper_base_url: str = "https://whisper.neonaiservices.com"
 
 
26
 
27
  @property
28
  def cors_origin_list(self) -> list[str]:
 
23
  # External voice services (same defaults as CCAI / Neon demos)
24
  coqui_base_url: str = "https://coqui.neonaiservices.com"
25
  whisper_base_url: str = "https://whisper.neonaiservices.com"
26
+ # Local STT: ffmpeg must be on PATH, or set absolute path (Windows: where winget puts ffmpeg)
27
+ ffmpeg_path: str = ""
28
 
29
  @property
30
  def cors_origin_list(self) -> list[str]:
backend/app/voice_routes.py CHANGED
@@ -4,10 +4,14 @@ from __future__ import annotations
4
 
5
  import asyncio
6
  import html as html_module
 
7
  import logging
 
8
  import re
 
9
  import struct
10
  import subprocess
 
11
  import tempfile
12
  import time
13
  from pathlib import Path
@@ -41,7 +45,8 @@ router = APIRouter()
41
 
42
  PROBE_TIMEOUT = 12.0
43
  CACHE_TTL = 120.0
44
- MAX_CHUNK_CHARS = 160
 
45
 
46
  _status_cache: Dict[str, Any] = {
47
  "tts": {"ready": False, "checked_at": 0.0},
@@ -103,6 +108,8 @@ def _md_to_spoken_text(md: str) -> str:
103
  text = re.sub(r"<[^>]+>", " ", text)
104
  text = html_module.unescape(text)
105
  text = _SECTION_HEADERS.sub(" ", text)
 
 
106
  text = re.sub(r"([.!?])\s*\1+", r"\1", text)
107
  text = re.sub(r"\s*\.\s*\.", ".", text)
108
  text = re.sub(r"\s+", " ", text).strip()
@@ -128,18 +135,66 @@ def _split_sentences(text: str) -> List[str]:
128
  return chunks
129
 
130
 
131
- async def _synthesize_one(client: httpx.AsyncClient, base: str, chunk: str) -> Optional[bytes]:
 
 
132
  url = f"{base}/synthesize/{quote(chunk, safe='')}"
133
  try:
134
  r = await client.get(url)
135
- r.raise_for_status()
136
- return r.content
 
 
 
137
  except Exception as exc:
 
138
  LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), exc)
139
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
 
142
- def _concat_wav(segments: List[bytes]) -> bytes:
 
143
  if len(segments) == 1:
144
  return segments[0]
145
  pcm_parts: List[bytes] = []
@@ -164,6 +219,47 @@ def _concat_wav(segments: List[bytes]) -> bytes:
164
  return bytes(header) + all_pcm
165
 
166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  @router.get("/voice/status")
168
  async def voice_status() -> Dict[str, bool]:
169
  tts_ready = _cached_ready("tts")
@@ -201,10 +297,18 @@ async def text_to_speech(req: TTSRequest) -> Response:
201
 
202
  try:
203
  async with httpx.AsyncClient(timeout=120.0) as client:
204
- results = await asyncio.gather(*[_synthesize_one(client, base, c) for c in chunks])
205
- wav_segments = [r for r in results if r and len(r) > 44]
206
  if not wav_segments:
207
- raise HTTPException(status_code=502, detail="TTS synthesis failed for all chunks")
 
 
 
 
 
 
 
 
208
  combined = _concat_wav(wav_segments)
209
  _status_cache["tts"] = {"ready": True, "checked_at": time.time()}
210
  return Response(content=combined, media_type="audio/wav")
@@ -218,6 +322,7 @@ async def text_to_speech(req: TTSRequest) -> Response:
218
 
219
 
220
  def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
 
221
  with tempfile.TemporaryDirectory() as tmp:
222
  ext = "webm" if "webm" in (src_mime or "") else "ogg"
223
  src = Path(tmp) / f"in.{ext}"
@@ -225,7 +330,7 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
225
  src.write_bytes(audio_bytes)
226
  result = subprocess.run(
227
  [
228
- "ffmpeg",
229
  "-y",
230
  "-i",
231
  str(src),
@@ -249,9 +354,9 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
249
  @router.post("/transcribe")
250
  async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
251
  contents = await audio.read()
 
252
  if not contents:
253
  return {"text": ""}
254
- mime = audio.content_type or "audio/webm"
255
  LOG.info("STT: received %s bytes (%s)", len(contents), mime)
256
 
257
  try:
@@ -266,9 +371,12 @@ async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
266
  loop = asyncio.get_running_loop()
267
  wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
268
  LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
 
 
 
269
  except Exception as e:
270
  LOG.error("STT conversion error: %s", e)
271
- raise HTTPException(status_code=500, detail="Audio conversion failed")
272
  else:
273
  wav_bytes = contents
274
 
 
4
 
5
  import asyncio
6
  import html as html_module
7
+ import io
8
  import logging
9
+ import os
10
  import re
11
+ import shutil
12
  import struct
13
  import subprocess
14
+ import wave
15
  import tempfile
16
  import time
17
  from pathlib import Path
 
45
 
46
  PROBE_TIMEOUT = 12.0
47
  CACHE_TTL = 120.0
48
+ # Fewer round-trips to Coqui (longer URLs); still typically under proxy URL limits.
49
+ MAX_CHUNK_CHARS = 380
50
 
51
  _status_cache: Dict[str, Any] = {
52
  "tts": {"ready": False, "checked_at": 0.0},
 
108
  text = re.sub(r"<[^>]+>", " ", text)
109
  text = html_module.unescape(text)
110
  text = _SECTION_HEADERS.sub(" ", text)
111
+ # Remove trailing period after other sentence-ending punctuation ("!." → "!", "?." → "?")
112
+ text = re.sub(r"([!?])\s*\.+", r"\1", text)
113
  text = re.sub(r"([.!?])\s*\1+", r"\1", text)
114
  text = re.sub(r"\s*\.\s*\.", ".", text)
115
  text = re.sub(r"\s+", " ", text).strip()
 
135
  return chunks
136
 
137
 
138
async def _synthesize_one(
    client: httpx.AsyncClient, base: str, chunk: str
) -> tuple[Optional[bytes], Optional[str]]:
    """Synthesize one text chunk via the TTS service's ``/synthesize/<text>`` endpoint.

    Args:
        client: Shared async HTTP client used to issue the GET request.
        base: Base URL of the TTS service (no trailing ``/synthesize``).
        chunk: Text to speak; it is percent-encoded into the URL path.

    Returns:
        ``(audio_bytes, None)`` when the service answers with a status below 400,
        otherwise ``(None, error_message)``. The error message is never empty, so
        callers that pick the first truthy error always see the real cause.
    """
    url = f"{base}/synthesize/{quote(chunk, safe='')}"
    try:
        r = await client.get(url)
        if r.status_code >= 400:
            err = f"HTTP {r.status_code}: {r.text[:160]}"
            LOG.warning("TTS chunk failed: %s", err)
            return None, err
        return r.content, None
    except Exception as exc:
        # Deliberately broad: one failing chunk must not abort the sibling chunks
        # gathered alongside it. Some transport errors (e.g. timeouts) stringify to
        # "", so fall back to the exception class name to keep the message usable.
        detail = str(exc) or type(exc).__name__
        LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), detail)
        return None, detail[:220]
153
+
154
+
155
def _ffmpeg_executable() -> str:
    """Locate the ffmpeg binary used for local audio conversion.

    Lookup order (first hit wins):
      1. ``settings.ffmpeg_path`` when it names an existing file.
      2. ``ffmpeg`` resolved through PATH.
      3. ``%LOCALAPPDATA%/Microsoft/WinGet/Links/ffmpeg.exe``.
      4. Any ``ffmpeg.exe`` below ``%LOCALAPPDATA%/Microsoft/WinGet/Packages``,
         preferring one inside a ``bin`` directory, then the shortest path.
      5. ``<ProgramFiles>/ffmpeg/bin/ffmpeg.exe`` (also the x86 variant).

    Returns:
        Path of the ffmpeg executable as a string.

    Raises:
        FileNotFoundError: If none of the locations above contains ffmpeg.
    """
    raw = (settings.ffmpeg_path or "").strip()
    if raw:
        configured = Path(raw)
        if configured.is_file():
            return str(configured.resolve())
        # A wrong explicit setting is reported instead of being ignored silently;
        # auto-detection below still runs so a working install keeps working.
        LOG.warning("FFMPEG_PATH %r is not an existing file; trying auto-detection", raw)

    on_path = shutil.which("ffmpeg")
    if on_path:
        return on_path

    localappdata = os.environ.get("LOCALAPPDATA", "")
    if localappdata:
        winget_root = Path(localappdata) / "Microsoft" / "WinGet"
        winget_link = winget_root / "Links" / "ffmpeg.exe"
        if winget_link.is_file():
            return str(winget_link.resolve())
        pkg_root = winget_root / "Packages"
        if pkg_root.is_dir():

            def _rank(p: Path) -> tuple[int, int]:
                # Prefer binaries that live in a "bin" directory, then shorter paths.
                lowered = str(p).lower()
                in_bin = "\\bin\\" in lowered or "/bin/" in lowered
                return (0 if in_bin else 1, len(str(p)))

            try:
                candidates = [p for p in pkg_root.rglob("ffmpeg.exe") if p.is_file()]
                if candidates:
                    return str(min(candidates, key=_rank).resolve())
            except OSError as exc:
                # Best effort: an unreadable package folder must not block the
                # remaining fallbacks, but leave a trace for troubleshooting.
                LOG.debug("Scanning %s for ffmpeg.exe failed: %s", pkg_root, exc)

    for env in ("ProgramFiles", "ProgramFiles(x86)"):
        base = os.environ.get(env, "")
        if not base:
            continue
        cand = Path(base) / "ffmpeg" / "bin" / "ffmpeg.exe"
        if cand.is_file():
            return str(cand.resolve())

    raise FileNotFoundError(
        "ffmpeg not found. Install ffmpeg (e.g. winget install Gyan.FFmpeg), add it to PATH, "
        "or set FFMPEG_PATH in backend/.env to the full path to ffmpeg.exe"
    )
194
 
195
 
196
+ def _concat_wav_legacy(segments: List[bytes]) -> bytes:
197
+ """Best-effort RIFF merge (can confuse decoders); used only as fallback."""
198
  if len(segments) == 1:
199
  return segments[0]
200
  pcm_parts: List[bytes] = []
 
219
  return bytes(header) + all_pcm
220
 
221
 
222
+ def _concat_wav(segments: List[bytes]) -> bytes:
223
+ """Merge Coqui WAV segments with matching fmt using stdlib wave (reliable playback)."""
224
+ segs = [s for s in segments if s and len(s) >= 44]
225
+ if not segs:
226
+ return b""
227
+ if len(segs) == 1:
228
+ return segs[0]
229
+ readers: List[wave.Wave_read] = []
230
+ try:
231
+ for raw in segs:
232
+ readers.append(wave.open(io.BytesIO(raw), "rb"))
233
+ r0 = readers[0]
234
+ ch, sw, fr = r0.getnchannels(), r0.getsampwidth(), r0.getframerate()
235
+ out_buf = io.BytesIO()
236
+ wo = wave.open(out_buf, "wb")
237
+ wo.setnchannels(ch)
238
+ wo.setsampwidth(sw)
239
+ wo.setframerate(fr)
240
+ try:
241
+ for w in readers:
242
+ if (
243
+ w.getnchannels() != ch
244
+ or w.getsampwidth() != sw
245
+ or w.getframerate() != fr
246
+ ):
247
+ raise ValueError("WAV format mismatch between segments")
248
+ wo.writeframes(w.readframes(w.getnframes()))
249
+ finally:
250
+ wo.close()
251
+ return out_buf.getvalue()
252
+ except Exception as exc:
253
+ LOG.warning("WAV concat via wave module failed, using legacy merge: %s", exc)
254
+ return _concat_wav_legacy(segments)
255
+ finally:
256
+ for w in readers:
257
+ try:
258
+ w.close()
259
+ except Exception:
260
+ pass
261
+
262
+
263
  @router.get("/voice/status")
264
  async def voice_status() -> Dict[str, bool]:
265
  tts_ready = _cached_ready("tts")
 
297
 
298
  try:
299
  async with httpx.AsyncClient(timeout=120.0) as client:
300
+ pairs = await asyncio.gather(*[_synthesize_one(client, base, c) for c in chunks])
301
+ wav_segments = [seg for seg, _ in pairs if seg and len(seg) > 44]
302
  if not wav_segments:
303
+ first_err = next((e for _, e in pairs if e), None)
304
+ detail = (first_err or "TTS synthesis failed for all chunks")[:500]
305
+ raise HTTPException(status_code=502, detail=detail)
306
+ if len(wav_segments) < len(chunks):
307
+ LOG.warning(
308
+ "TTS: partial success — %s/%s chunk(s) synthesized (others failed upstream)",
309
+ len(wav_segments),
310
+ len(chunks),
311
+ )
312
  combined = _concat_wav(wav_segments)
313
  _status_cache["tts"] = {"ready": True, "checked_at": time.time()}
314
  return Response(content=combined, media_type="audio/wav")
 
322
 
323
 
324
  def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
325
+ ff = _ffmpeg_executable()
326
  with tempfile.TemporaryDirectory() as tmp:
327
  ext = "webm" if "webm" in (src_mime or "") else "ogg"
328
  src = Path(tmp) / f"in.{ext}"
 
330
  src.write_bytes(audio_bytes)
331
  result = subprocess.run(
332
  [
333
+ ff,
334
  "-y",
335
  "-i",
336
  str(src),
 
354
  @router.post("/transcribe")
355
  async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
356
  contents = await audio.read()
357
+ mime = audio.content_type or "audio/webm"
358
  if not contents:
359
  return {"text": ""}
 
360
  LOG.info("STT: received %s bytes (%s)", len(contents), mime)
361
 
362
  try:
 
371
  loop = asyncio.get_running_loop()
372
  wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
373
  LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
374
+ except FileNotFoundError as e:
375
+ LOG.error("STT: ffmpeg missing: %s", e)
376
+ raise HTTPException(status_code=503, detail=str(e))
377
  except Exception as e:
378
  LOG.error("STT conversion error: %s", e)
379
+ raise HTTPException(status_code=500, detail=f"Audio conversion failed: {e!s}"[:500])
380
  else:
381
  wav_bytes = contents
382
 
frontend/src/App.css CHANGED
@@ -631,24 +631,61 @@ html.aj-hide-pointer * {
631
 
632
  .aj-composer {
633
  z-index: 40;
634
- padding: 6px 6px;
635
  border-top: 1px solid var(--lc-border);
636
  background: var(--lc-sidebar);
637
  flex-shrink: 0;
638
  flex-grow: 0;
639
  }
640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
  .aj-composer-inner {
642
  display: flex;
643
  gap: 6px;
644
  align-items: flex-end;
645
- max-width: 1040px;
646
- margin: 0 auto;
 
647
  }
648
 
649
  .aj-composer-field {
650
  flex: 1;
651
  min-width: 0;
 
 
 
652
  }
653
 
654
  .aj-composer-field--listening textarea {
@@ -657,7 +694,10 @@ html.aj-hide-pointer * {
657
  }
658
 
659
  .aj-composer textarea {
660
- flex: 1;
 
 
 
661
  box-sizing: border-box;
662
  min-height: 34px;
663
  max-height: 50vh;
 
631
 
632
  .aj-composer {
633
  z-index: 40;
634
+ padding: 6px 10px;
635
  border-top: 1px solid var(--lc-border);
636
  background: var(--lc-sidebar);
637
  flex-shrink: 0;
638
  flex-grow: 0;
639
  }
640
 
641
+ .aj-voice-error-banner {
642
+ display: flex;
643
+ align-items: flex-start;
644
+ justify-content: space-between;
645
+ gap: 12px;
646
+ padding: 8px 12px;
647
+ font-size: 13px;
648
+ line-height: 1.45;
649
+ color: #7f1d1d;
650
+ background: #fef2f2;
651
+ border-bottom: 1px solid #fecaca;
652
+ }
653
+
654
+ .aj-voice-error-banner span {
655
+ flex: 1;
656
+ min-width: 0;
657
+ }
658
+
659
+ .aj-voice-error-dismiss {
660
+ flex-shrink: 0;
661
+ border: none;
662
+ background: transparent;
663
+ color: #991b1b;
664
+ font-size: 20px;
665
+ line-height: 1;
666
+ cursor: pointer;
667
+ padding: 0 4px;
668
+ }
669
+
670
+ .aj-voice-error-dismiss:hover {
671
+ color: #450a0a;
672
+ }
673
+
674
  .aj-composer-inner {
675
  display: flex;
676
  gap: 6px;
677
  align-items: flex-end;
678
+ width: 100%;
679
+ max-width: none;
680
+ box-sizing: border-box;
681
  }
682
 
683
  .aj-composer-field {
684
  flex: 1;
685
  min-width: 0;
686
+ display: flex;
687
+ flex-direction: column;
688
+ align-self: stretch;
689
  }
690
 
691
  .aj-composer-field--listening textarea {
 
694
  }
695
 
696
  .aj-composer textarea {
697
+ width: 100%;
698
+ min-width: 0;
699
+ flex: 1 1 auto;
700
+ align-self: stretch;
701
  box-sizing: border-box;
702
  min-height: 34px;
703
  max-height: 50vh;
frontend/src/App.jsx CHANGED
@@ -8,6 +8,8 @@ import {
8
  Loader2,
9
  MessageSquarePlus,
10
  Mic,
 
 
11
  RefreshCw,
12
  Search,
13
  Send,
@@ -73,11 +75,29 @@ function parseSseBuffer(buf, onEvent) {
73
  return remainder
74
  }
75
 
76
- function extractFirstSentence(text) {
 
 
 
 
 
 
77
  const t = (text || '').trim()
78
- if (!t) return null
79
- const m = t.match(/^[\s\S]+?[.!?](?=\s|$)/)
80
- return m ? m[0].trim() : null
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
 
83
  function appendTokenContent(acc, ev) {
@@ -196,28 +216,72 @@ function AssistantSearchBar({ content, show, speak }) {
196
  }
197
 
198
  if (!show) return null
199
- if (!(content || '').trim() && !speak?.loading && !speak?.playing) return null
 
 
 
 
 
 
 
200
 
201
  return (
202
  <div className="aj-msg-actions" ref={wrapRef}>
203
  <div className="aj-msg-search-wrap">
204
  {speak && (
205
- <button
206
- type="button"
207
- className="aj-msg-search-btn"
208
- onClick={speak.onSpeak}
209
- disabled={speak.disabled}
210
- data-tip={speak.playing ? 'Stop' : speak.loading ? 'Loading speech…' : 'Read aloud'}
211
- aria-label={speak.playing ? 'Stop' : speak.loading ? 'Loading speech' : 'Read aloud'}
212
- >
213
- {speak.loading ? (
214
- <Loader2 size={14} className="aj-spin" aria-hidden />
215
- ) : speak.playing ? (
216
- <Square size={14} aria-hidden />
217
- ) : (
218
- <Volume2 size={14} aria-hidden />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  )}
220
- </button>
221
  )}
222
  <button
223
  type="button"
@@ -315,15 +379,21 @@ export default function App() {
315
 
316
  const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
317
  const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
 
318
  const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
319
  const [micListening, setMicListening] = useState(false)
320
  const [micTranscribing, setMicTranscribing] = useState(false)
 
321
 
322
  const audioRef = useRef(null)
323
  const ttsBlobUrlRef = useRef(null)
324
- const prefetchAbortRef = useRef(null)
325
- const prefetchKeyRef = useRef('')
326
- const prefetchBlobRef = useRef(null)
 
 
 
 
327
  const streamingWasRef = useRef(false)
328
  const mediaRecorderRef = useRef(null)
329
  const audioChunksRef = useRef([])
@@ -486,6 +556,7 @@ export default function App() {
486
  }, [exportOpen])
487
 
488
  const stopTts = useCallback(() => {
 
489
  if (audioRef.current) {
490
  audioRef.current.pause()
491
  audioRef.current.src = ''
@@ -497,126 +568,229 @@ export default function App() {
497
  }
498
  setTtsLoadingIndex(null)
499
  setTtsPlayingIndex(null)
 
500
  }, [])
501
 
502
- const playTtsForIndex = useCallback(
503
- async (index, text) => {
504
- const raw = (text || '').trim()
505
- if (!raw) return
506
- try {
507
- sessionStorage.setItem(STORAGE_TTS_PRIMED, '1')
508
- } catch {
509
- /* */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  }
511
- stopTts()
512
- setTtsLoadingIndex(index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  try {
514
  const res = await fetch('/api/tts', {
515
  method: 'POST',
516
  headers: { 'Content-Type': 'application/json' },
517
- body: JSON.stringify({ text: raw }),
518
  })
519
- if (!res.ok) throw new Error('TTS failed')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  const blob = await res.blob()
521
- const url = URL.createObjectURL(blob)
522
- ttsBlobUrlRef.current = url
523
- const audio = new Audio(url)
524
- audioRef.current = audio
525
- setTtsLoadingIndex(null)
526
- setTtsPlayingIndex(index)
527
- audio.onended = () => {
528
- stopTts()
529
  }
530
- audio.onerror = () => {
531
- stopTts()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  }
533
- await audio.play()
534
- } catch {
535
  setTtsLoadingIndex(null)
536
  setTtsPlayingIndex(null)
537
- }
538
- },
539
- [stopTts],
540
- )
541
 
542
- const onSpeakToggle = useCallback(
543
- (index, text) => {
544
- if (ttsLoadingIndex === index || ttsPlayingIndex === index) {
545
- stopTts()
546
- return
 
 
 
547
  }
548
- playTtsForIndex(index, text)
549
  },
550
- [ttsLoadingIndex, ttsPlayingIndex, stopTts, playTtsForIndex],
551
  )
552
 
553
  useEffect(() => {
554
- if (streamingWasRef.current && !streaming && alwaysSpeak) {
555
- const last = messages[messages.length - 1]
 
 
 
 
 
 
 
 
 
 
556
  if (last?.role === 'assistant' && last.content?.trim()) {
557
  playTtsForIndex(messages.length - 1, last.content)
558
  }
559
  }
 
560
  streamingWasRef.current = streaming
561
  }, [streaming, alwaysSpeak, messages, playTtsForIndex])
562
 
563
  useEffect(() => {
564
- try {
565
- sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, alwaysSpeak ? '1' : '0')
566
- } catch {
567
- /* */
568
- }
569
  }, [alwaysSpeak])
570
 
571
  useEffect(() => {
572
- return () => {
573
- stopTts()
574
- prefetchAbortRef.current?.abort()
575
- if (prefetchBlobRef.current?.url) {
576
- URL.revokeObjectURL(prefetchBlobRef.current.url)
577
- }
578
  }
579
- }, [stopTts])
580
 
581
  useEffect(() => {
582
- if (!streaming) return
583
- let primed = false
584
- try {
585
- primed = sessionStorage.getItem(STORAGE_TTS_PRIMED) === '1'
586
- } catch {
587
- /* */
588
- }
589
- if (!primed) return
590
- const last = messages[messages.length - 1]
591
- if (!last || last.role !== 'assistant') return
592
- const text = last.content
593
- const first = extractFirstSentence(text)
594
- if (!first || first.length < 8) return
595
- const key = `${messages.length - 1}:${first}`
596
- if (prefetchKeyRef.current === key) return
597
- prefetchKeyRef.current = key
598
- prefetchAbortRef.current?.abort()
599
- const ac = new AbortController()
600
- prefetchAbortRef.current = ac
601
- fetch('/api/tts', {
602
- method: 'POST',
603
- headers: { 'Content-Type': 'application/json' },
604
- body: JSON.stringify({ text: first }),
605
- signal: ac.signal,
606
- })
607
- .then((r) => {
608
- if (!r.ok) throw new Error('prefetch')
609
- return r.blob()
610
- })
611
- .then((blob) => {
612
- const url = URL.createObjectURL(blob)
613
- if (prefetchBlobRef.current?.url) {
614
- URL.revokeObjectURL(prefetchBlobRef.current.url)
615
- }
616
- prefetchBlobRef.current = { index: messages.length - 1, sentence: first, url }
617
- })
618
- .catch(() => {})
619
- }, [streaming, messages])
620
 
621
  const toggleMic = useCallback(async () => {
622
  if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
@@ -646,12 +820,32 @@ export default function App() {
646
  const form = new FormData()
647
  form.append('audio', blob, 'recording.webm')
648
  fetch('/api/transcribe', { method: 'POST', body: form })
649
- .then((r) => r.json())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
650
  .then((data) => {
 
651
  const tx = data?.text?.trim()
652
- if (tx) setInput((prev) => (prev ? `${prev} ${tx}` : tx))
 
 
 
 
 
 
653
  })
654
- .catch(() => {})
655
  .finally(() => setMicTranscribing(false))
656
  }
657
  mediaRecorderRef.current = mediaRecorder
@@ -853,12 +1047,7 @@ export default function App() {
853
  const handleNewChat = () => {
854
  abortRef.current?.abort()
855
  stopTts()
856
- prefetchAbortRef.current?.abort()
857
- prefetchKeyRef.current = ''
858
- if (prefetchBlobRef.current?.url) {
859
- URL.revokeObjectURL(prefetchBlobRef.current.url)
860
- prefetchBlobRef.current = null
861
- }
862
  setMessages([])
863
  setInput('')
864
  setSummary(null)
@@ -872,12 +1061,7 @@ export default function App() {
872
 
873
  const handleRefresh = () => {
874
  if (streaming) return
875
- prefetchAbortRef.current?.abort()
876
- prefetchKeyRef.current = ''
877
- if (prefetchBlobRef.current?.url) {
878
- URL.revokeObjectURL(prefetchBlobRef.current.url)
879
- prefetchBlobRef.current = null
880
- }
881
  let base = [...messages]
882
  if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
883
  if (!base.length || base[base.length - 1].role !== 'user') return
@@ -891,12 +1075,7 @@ export default function App() {
891
  const text = input.trim()
892
  if (!text || streaming) return
893
  setHideTypingCursor(false)
894
- prefetchAbortRef.current?.abort()
895
- prefetchKeyRef.current = ''
896
- if (prefetchBlobRef.current?.url) {
897
- URL.revokeObjectURL(prefetchBlobRef.current.url)
898
- prefetchBlobRef.current = null
899
- }
900
  const userMsg = { role: 'user', content: text }
901
  const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
902
  setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
@@ -1149,6 +1328,15 @@ export default function App() {
1149
  </div>
1150
  </header>
1151
 
 
 
 
 
 
 
 
 
 
1152
  {messages.length > 0 && (
1153
  <div className="aj-context-bar">
1154
  <div className="aj-context-meter">
@@ -1211,12 +1399,20 @@ export default function App() {
1211
  show={!(streaming && i === messages.length - 1)}
1212
  speak={{
1213
  loading: ttsLoadingIndex === i,
1214
- playing: ttsPlayingIndex === i,
 
 
 
 
 
1215
  disabled:
1216
  !(m.content || '').trim()
1217
  && ttsLoadingIndex !== i
1218
  && ttsPlayingIndex !== i,
1219
- onSpeak: () => onSpeakToggle(i, m.content),
 
 
 
1220
  }}
1221
  />
1222
  )}
 
8
  Loader2,
9
  MessageSquarePlus,
10
  Mic,
11
+ Pause,
12
+ Play,
13
  RefreshCw,
14
  Search,
15
  Send,
 
75
  return remainder
76
  }
77
 
78
/**
 * Split text into complete sentences for sentence-by-sentence TTS.
 *
 * A sentence is a run of characters ended by one or more of `.`, `!`, `?` or a
 * newline. A sentence that has been terminated keeps its position in the
 * returned array as more text is appended, so callers can track progress by
 * index while the text is still streaming.
 *
 * Fragments without any letter or digit (e.g. "!", "---") are dropped: they
 * contain nothing to speak and would only cost a TTS request.
 *
 * @param {string} text - Text to split; null/undefined is treated as empty.
 * @param {boolean} [includeTrailing=false] - When true, text after the last
 *   terminator is returned as a final element (used once streaming has ended so
 *   nothing is silently dropped).
 * @returns {string[]} Trimmed sentences in order of appearance.
 */
function extractSentences(text, includeTrailing = false) {
  const t = (text || '').trim()
  if (!t) return []
  // "Speakable" means: contains at least one Unicode letter or digit.
  const isSpeakable = (s) => /[\p{L}\p{N}]/u.test(s)
  const re = /[^.!?\n]+[.!?\n]+/g
  const sentences = []
  let lastEnd = 0
  let match
  while ((match = re.exec(t)) !== null) {
    const s = match[0].trim()
    if (isSpeakable(s)) sentences.push(s)
    lastEnd = re.lastIndex
  }
  if (includeTrailing) {
    // With no terminator at all lastEnd is still 0, so the whole text is returned.
    const remainder = t.slice(lastEnd).trim()
    if (isSpeakable(remainder)) sentences.push(remainder)
  }
  return sentences
}
102
 
103
  function appendTokenContent(acc, ev) {
 
216
  }
217
 
218
  if (!show) return null
219
+ if (
220
+ !(content || '').trim()
221
+ && !speak?.loading
222
+ && !speak?.playing
223
+ && !speak?.paused
224
+ ) {
225
+ return null
226
+ }
227
 
228
  return (
229
  <div className="aj-msg-actions" ref={wrapRef}>
230
  <div className="aj-msg-search-wrap">
231
  {speak && (
232
+ <>
233
+ <button
234
+ type="button"
235
+ className="aj-msg-search-btn"
236
+ onClick={
237
+ speak.paused
238
+ ? speak.onResume
239
+ : speak.playing
240
+ ? speak.onPause
241
+ : speak.onReadAloud
242
+ }
243
+ disabled={speak.disabled || speak.loading}
244
+ data-tip={
245
+ speak.paused
246
+ ? 'Resume'
247
+ : speak.playing
248
+ ? 'Pause'
249
+ : speak.loading
250
+ ? 'Loading speech…'
251
+ : 'Read aloud'
252
+ }
253
+ aria-label={
254
+ speak.paused
255
+ ? 'Resume'
256
+ : speak.playing
257
+ ? 'Pause'
258
+ : speak.loading
259
+ ? 'Loading speech'
260
+ : 'Read aloud'
261
+ }
262
+ >
263
+ {speak.paused ? (
264
+ <Play size={14} aria-hidden />
265
+ ) : speak.loading ? (
266
+ <Loader2 size={14} className="aj-spin" aria-hidden />
267
+ ) : speak.playing ? (
268
+ <Pause size={14} aria-hidden />
269
+ ) : (
270
+ <Volume2 size={14} aria-hidden />
271
+ )}
272
+ </button>
273
+ {(speak.playing || speak.paused || (speak.loading && speak.showStop)) && (
274
+ <button
275
+ type="button"
276
+ className="aj-msg-search-btn"
277
+ onClick={speak.onStopReading}
278
+ data-tip="Stop reading"
279
+ aria-label="Stop reading"
280
+ >
281
+ <Square size={14} aria-hidden />
282
+ </button>
283
  )}
284
+ </>
285
  )}
286
  <button
287
  type="button"
 
379
 
380
  const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
381
  const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
382
+ const [ttsPaused, setTtsPaused] = useState(false)
383
  const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
384
  const [micListening, setMicListening] = useState(false)
385
  const [micTranscribing, setMicTranscribing] = useState(false)
386
+ const [voiceError, setVoiceError] = useState(null)
387
 
388
  const audioRef = useRef(null)
389
  const ttsBlobUrlRef = useRef(null)
390
+ const ttsSessionRef = useRef(0)
391
+ const messagesRef = useRef(messages)
392
+ messagesRef.current = messages
393
+ const streamingRef = useRef(streaming)
394
+ streamingRef.current = streaming
395
+ const ttsContentResolverRef = useRef(null)
396
+ const ttsPlaybackActiveRef = useRef(false)
397
  const streamingWasRef = useRef(false)
398
  const mediaRecorderRef = useRef(null)
399
  const audioChunksRef = useRef([])
 
556
  }, [exportOpen])
557
 
558
  const stopTts = useCallback(() => {
559
+ ttsSessionRef.current += 1
560
  if (audioRef.current) {
561
  audioRef.current.pause()
562
  audioRef.current.src = ''
 
568
  }
569
  setTtsLoadingIndex(null)
570
  setTtsPlayingIndex(null)
571
+ setTtsPaused(false)
572
  }, [])
573
 
574
+ const pauseTts = useCallback(() => {
575
+ const a = audioRef.current
576
+ if (a && !a.paused) {
577
+ a.pause()
578
+ setTtsPaused(true)
579
+ }
580
+ }, [])
581
+
582
+ const resumeTts = useCallback(async () => {
583
+ const a = audioRef.current
584
+ if (!a) return
585
+ try {
586
+ await a.play()
587
+ setTtsPaused(false)
588
+ } catch (e) {
589
+ setVoiceError(String(e?.message || e))
590
+ }
591
+ }, [])
592
+
593
+ const playAudioUrlUntilDone = useCallback((url, session) => {
594
+ return new Promise((resolve) => {
595
+ if (session !== ttsSessionRef.current) {
596
+ resolve()
597
+ return
598
  }
599
+ const audio = new Audio(url)
600
+ audioRef.current = audio
601
+ ttsBlobUrlRef.current = url
602
+ let settled = false
603
+ let pollAbort = null
604
+ const finish = () => {
605
+ if (settled) return
606
+ settled = true
607
+ if (pollAbort != null) clearInterval(pollAbort)
608
+ audio.onended = null
609
+ audio.onerror = null
610
+ resolve()
611
+ }
612
+ pollAbort = setInterval(() => {
613
+ if (session !== ttsSessionRef.current) finish()
614
+ }, 120)
615
+ audio.onended = () => {
616
+ finish()
617
+ }
618
+ audio.onerror = () => {
619
+ finish()
620
+ }
621
+ audio.play().catch(() => finish())
622
+ })
623
+ }, [])
624
+
625
+ const fetchTtsAudio = useCallback(async (chunkText) => {
626
+ for (let attempt = 0; attempt < 2; attempt++) {
627
  try {
628
  const res = await fetch('/api/tts', {
629
  method: 'POST',
630
  headers: { 'Content-Type': 'application/json' },
631
+ body: JSON.stringify({ text: chunkText }),
632
  })
633
+ if (!res.ok) {
634
+ let detail = `TTS failed (${res.status})`
635
+ try {
636
+ const ct = res.headers.get('content-type') || ''
637
+ if (ct.includes('json')) {
638
+ const j = await res.json()
639
+ if (j?.detail != null) {
640
+ detail = typeof j.detail === 'string' ? j.detail : JSON.stringify(j.detail)
641
+ }
642
+ }
643
+ } catch { /* */ }
644
+ if (res.status >= 500 && attempt === 0) {
645
+ continue
646
+ }
647
+ return { error: detail }
648
+ }
649
  const blob = await res.blob()
650
+ return { url: URL.createObjectURL(blob) }
651
+ } catch (e) {
652
+ if (attempt === 0) {
653
+ continue
 
 
 
 
654
  }
655
+ return { error: String(e?.message || e) }
656
+ }
657
+ }
658
+ return { error: 'TTS failed after retries' }
659
+ }, [])
660
+
661
+ const playTtsForIndex = useCallback(
662
+ async (index, initialText) => {
663
+ try { sessionStorage.setItem(STORAGE_TTS_PRIMED, '1') } catch { /* */ }
664
+ setVoiceError(null)
665
+ stopTts()
666
+ const session = ttsSessionRef.current
667
+ ttsPlaybackActiveRef.current = true
668
+
669
+ setTtsLoadingIndex(index)
670
+ setTtsPlayingIndex(null)
671
+ setTtsPaused(false)
672
+
673
+ const LOOKAHEAD = 2
674
+ let playedCount = 0
675
+ let anyPlayed = false
676
+ let lastErr = null
677
+ const inFlight = new Map()
678
+
679
+ const getSentencesAndLimit = () => {
680
+ const live = streamingRef.current
681
+ const text = (messagesRef.current[index]?.content || initialText || '').trim()
682
+ const sentences = extractSentences(text, !live)
683
+ return { sentences, limit: sentences.length, live }
684
+ }
685
+
686
+ try {
687
+ // eslint-disable-next-line no-constant-condition
688
+ while (true) {
689
+ if (session !== ttsSessionRef.current) return
690
+
691
+ const { sentences, limit, live } = getSentencesAndLimit()
692
+
693
+ if (playedCount < limit) {
694
+ for (let ahead = playedCount; ahead < Math.min(playedCount + LOOKAHEAD, limit); ahead++) {
695
+ if (!inFlight.has(ahead)) {
696
+ inFlight.set(ahead, fetchTtsAudio(sentences[ahead]))
697
+ }
698
+ }
699
+
700
+ if (!anyPlayed) setTtsLoadingIndex(index)
701
+ const result = await (inFlight.get(playedCount) || fetchTtsAudio(sentences[playedCount]))
702
+ inFlight.delete(playedCount)
703
+
704
+ if (session !== ttsSessionRef.current) return
705
+
706
+ if (!result?.url) {
707
+ if (result?.error) lastErr = result.error
708
+ playedCount++
709
+ continue
710
+ }
711
+
712
+ anyPlayed = true
713
+ setTtsLoadingIndex(null)
714
+ setTtsPlayingIndex(index)
715
+ setTtsPaused(false)
716
+
717
+ await playAudioUrlUntilDone(result.url, session)
718
+ URL.revokeObjectURL(result.url)
719
+ if (ttsBlobUrlRef.current === result.url) {
720
+ ttsBlobUrlRef.current = null
721
+ audioRef.current = null
722
+ }
723
+
724
+ playedCount++
725
+ if (session !== ttsSessionRef.current) return
726
+ continue
727
+ }
728
+
729
+ if (!live) break
730
+
731
+ setTtsLoadingIndex(index)
732
+ await new Promise(resolve => {
733
+ ttsContentResolverRef.current = resolve
734
+ const rechk = getSentencesAndLimit()
735
+ if (rechk.limit > playedCount || !rechk.live) {
736
+ ttsContentResolverRef.current = null
737
+ resolve()
738
+ }
739
+ })
740
  }
741
+
 
742
  setTtsLoadingIndex(null)
743
  setTtsPlayingIndex(null)
744
+ setTtsPaused(false)
 
 
 
745
 
746
+ if (!anyPlayed && lastErr) setVoiceError(lastErr)
747
+ } catch (e) {
748
+ setVoiceError(String(e?.message || e))
749
+ setTtsLoadingIndex(null)
750
+ setTtsPlayingIndex(null)
751
+ setTtsPaused(false)
752
+ } finally {
753
+ ttsPlaybackActiveRef.current = false
754
  }
 
755
  },
756
+ [stopTts, playAudioUrlUntilDone, fetchTtsAudio],
757
  )
758
 
759
  useEffect(() => {
760
+ const last = messages[messages.length - 1]
761
+
762
+ if (streaming && alwaysSpeak && !ttsPlaybackActiveRef.current) {
763
+ if (last?.role === 'assistant' && last.content?.trim()) {
764
+ const sentences = extractSentences(last.content.trim())
765
+ if (sentences.length >= 2) {
766
+ playTtsForIndex(messages.length - 1, last.content)
767
+ }
768
+ }
769
+ }
770
+
771
+ if (streamingWasRef.current && !streaming && alwaysSpeak && !ttsPlaybackActiveRef.current) {
772
  if (last?.role === 'assistant' && last.content?.trim()) {
773
  playTtsForIndex(messages.length - 1, last.content)
774
  }
775
  }
776
+
777
  streamingWasRef.current = streaming
778
  }, [streaming, alwaysSpeak, messages, playTtsForIndex])
779
 
780
  useEffect(() => {
781
+ try { sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, alwaysSpeak ? '1' : '0') } catch { /* */ }
 
 
 
 
782
  }, [alwaysSpeak])
783
 
784
  useEffect(() => {
785
+ if (ttsContentResolverRef.current) {
786
+ ttsContentResolverRef.current()
787
+ ttsContentResolverRef.current = null
 
 
 
788
  }
789
+ }, [messages, streaming])
790
 
791
  useEffect(() => {
792
+ return () => { stopTts() }
793
+ }, [stopTts])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
 
795
  const toggleMic = useCallback(async () => {
796
  if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
 
820
  const form = new FormData()
821
  form.append('audio', blob, 'recording.webm')
822
  fetch('/api/transcribe', { method: 'POST', body: form })
823
+ .then(async (r) => {
824
+ const data = await r.json().catch(() => ({}))
825
+ if (!r.ok) {
826
+ const d = data?.detail
827
+ const msg =
828
+ typeof d === 'string'
829
+ ? d
830
+ : d != null
831
+ ? JSON.stringify(d)
832
+ : `Speech-to-text failed (${r.status})`
833
+ setVoiceError(msg)
834
+ return null
835
+ }
836
+ return data
837
+ })
838
  .then((data) => {
839
+ if (!data) return
840
  const tx = data?.text?.trim()
841
+ if (tx) {
842
+ setVoiceError(null)
843
+ setInput((prev) => (prev ? `${prev} ${tx}` : tx))
844
+ }
845
+ })
846
+ .catch((e) => {
847
+ setVoiceError(String(e?.message || e))
848
  })
 
849
  .finally(() => setMicTranscribing(false))
850
  }
851
  mediaRecorderRef.current = mediaRecorder
 
1047
  const handleNewChat = () => {
1048
  abortRef.current?.abort()
1049
  stopTts()
1050
+ setVoiceError(null)
 
 
 
 
 
1051
  setMessages([])
1052
  setInput('')
1053
  setSummary(null)
 
1061
 
1062
  const handleRefresh = () => {
1063
  if (streaming) return
1064
+ stopTts()
 
 
 
 
 
1065
  let base = [...messages]
1066
  if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
1067
  if (!base.length || base[base.length - 1].role !== 'user') return
 
1075
  const text = input.trim()
1076
  if (!text || streaming) return
1077
  setHideTypingCursor(false)
1078
+ stopTts()
 
 
 
 
 
1079
  const userMsg = { role: 'user', content: text }
1080
  const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
1081
  setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
 
1328
  </div>
1329
  </header>
1330
 
1331
+ {voiceError && (
1332
+ <div className="aj-voice-error-banner" role="alert">
1333
+ <span>{voiceError}</span>
1334
+ <button type="button" className="aj-voice-error-dismiss" onClick={() => setVoiceError(null)} aria-label="Dismiss">
1335
+ ×
1336
+ </button>
1337
+ </div>
1338
+ )}
1339
+
1340
  {messages.length > 0 && (
1341
  <div className="aj-context-bar">
1342
  <div className="aj-context-meter">
 
1399
  show={!(streaming && i === messages.length - 1)}
1400
  speak={{
1401
  loading: ttsLoadingIndex === i,
1402
+ playing:
1403
+ ttsPlayingIndex === i
1404
+ && !ttsPaused
1405
+ && ttsLoadingIndex !== i,
1406
+ paused: ttsPlayingIndex === i && ttsPaused,
1407
+ showStop: ttsLoadingIndex === i || ttsPlayingIndex === i,
1408
  disabled:
1409
  !(m.content || '').trim()
1410
  && ttsLoadingIndex !== i
1411
  && ttsPlayingIndex !== i,
1412
+ onReadAloud: () => playTtsForIndex(i, m.content),
1413
+ onPause: pauseTts,
1414
+ onResume: resumeTts,
1415
+ onStopReading: stopTts,
1416
  }}
1417
  />
1418
  )}