Spaces:
Running
Running
NeonClary committed on
Commit ·
f714780
1
Parent(s): e7c7cf6
refactor: simplify TTS pipeline with stable sentence boundaries, remove prefetch cache
Browse files- backend/.env.example +3 -0
- backend/app/config.py +2 -0
- backend/app/voice_routes.py +120 -12
- frontend/src/App.css +44 -4
- frontend/src/App.jsx +329 -133
backend/.env.example
CHANGED
|
@@ -12,3 +12,6 @@ CORS_ORIGINS=http://localhost:3006,http://127.0.0.1:3006
|
|
| 12 |
# External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
|
| 13 |
# COQUI_BASE_URL=https://coqui.neonaiservices.com
|
| 14 |
# WHISPER_BASE_URL=https://whisper.neonaiservices.com
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# External TTS/STT (Coqui + Whisper) — same defaults as other Neon demos
|
| 13 |
# COQUI_BASE_URL=https://coqui.neonaiservices.com
|
| 14 |
# WHISPER_BASE_URL=https://whisper.neonaiservices.com
|
| 15 |
+
|
| 16 |
+
# Local speech-to-text: ffmpeg must be on PATH, or set full path to ffmpeg.exe (Windows)
|
| 17 |
+
# FFMPEG_PATH=C:\path\to\ffmpeg.exe
|
backend/app/config.py
CHANGED
|
@@ -23,6 +23,8 @@ class Settings(BaseSettings):
|
|
| 23 |
# External voice services (same defaults as CCAI / Neon demos)
|
| 24 |
coqui_base_url: str = "https://coqui.neonaiservices.com"
|
| 25 |
whisper_base_url: str = "https://whisper.neonaiservices.com"
|
|
|
|
|
|
|
| 26 |
|
| 27 |
@property
|
| 28 |
def cors_origin_list(self) -> list[str]:
|
|
|
|
| 23 |
# External voice services (same defaults as CCAI / Neon demos)
|
| 24 |
coqui_base_url: str = "https://coqui.neonaiservices.com"
|
| 25 |
whisper_base_url: str = "https://whisper.neonaiservices.com"
|
| 26 |
+
# Local STT: ffmpeg must be on PATH, or set this to the absolute path of the ffmpeg executable (on Windows, e.g. the ffmpeg.exe that winget installed)
|
| 27 |
+
ffmpeg_path: str = ""
|
| 28 |
|
| 29 |
@property
|
| 30 |
def cors_origin_list(self) -> list[str]:
|
backend/app/voice_routes.py
CHANGED
|
@@ -4,10 +4,14 @@ from __future__ import annotations
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import html as html_module
|
|
|
|
| 7 |
import logging
|
|
|
|
| 8 |
import re
|
|
|
|
| 9 |
import struct
|
| 10 |
import subprocess
|
|
|
|
| 11 |
import tempfile
|
| 12 |
import time
|
| 13 |
from pathlib import Path
|
|
@@ -41,7 +45,8 @@ router = APIRouter()
|
|
| 41 |
|
| 42 |
PROBE_TIMEOUT = 12.0
|
| 43 |
CACHE_TTL = 120.0
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
_status_cache: Dict[str, Any] = {
|
| 47 |
"tts": {"ready": False, "checked_at": 0.0},
|
|
@@ -103,6 +108,8 @@ def _md_to_spoken_text(md: str) -> str:
|
|
| 103 |
text = re.sub(r"<[^>]+>", " ", text)
|
| 104 |
text = html_module.unescape(text)
|
| 105 |
text = _SECTION_HEADERS.sub(" ", text)
|
|
|
|
|
|
|
| 106 |
text = re.sub(r"([.!?])\s*\1+", r"\1", text)
|
| 107 |
text = re.sub(r"\s*\.\s*\.", ".", text)
|
| 108 |
text = re.sub(r"\s+", " ", text).strip()
|
|
@@ -128,18 +135,66 @@ def _split_sentences(text: str) -> List[str]:
|
|
| 128 |
return chunks
|
| 129 |
|
| 130 |
|
| 131 |
-
async def _synthesize_one(
|
|
|
|
|
|
|
| 132 |
url = f"{base}/synthesize/{quote(chunk, safe='')}"
|
| 133 |
try:
|
| 134 |
r = await client.get(url)
|
| 135 |
-
r.
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
| 137 |
except Exception as exc:
|
|
|
|
| 138 |
LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), exc)
|
| 139 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
|
| 142 |
-
def
|
|
|
|
| 143 |
if len(segments) == 1:
|
| 144 |
return segments[0]
|
| 145 |
pcm_parts: List[bytes] = []
|
|
@@ -164,6 +219,47 @@ def _concat_wav(segments: List[bytes]) -> bytes:
|
|
| 164 |
return bytes(header) + all_pcm
|
| 165 |
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
@router.get("/voice/status")
|
| 168 |
async def voice_status() -> Dict[str, bool]:
|
| 169 |
tts_ready = _cached_ready("tts")
|
|
@@ -201,10 +297,18 @@ async def text_to_speech(req: TTSRequest) -> Response:
|
|
| 201 |
|
| 202 |
try:
|
| 203 |
async with httpx.AsyncClient(timeout=120.0) as client:
|
| 204 |
-
|
| 205 |
-
wav_segments = [
|
| 206 |
if not wav_segments:
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
combined = _concat_wav(wav_segments)
|
| 209 |
_status_cache["tts"] = {"ready": True, "checked_at": time.time()}
|
| 210 |
return Response(content=combined, media_type="audio/wav")
|
|
@@ -218,6 +322,7 @@ async def text_to_speech(req: TTSRequest) -> Response:
|
|
| 218 |
|
| 219 |
|
| 220 |
def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
|
|
|
|
| 221 |
with tempfile.TemporaryDirectory() as tmp:
|
| 222 |
ext = "webm" if "webm" in (src_mime or "") else "ogg"
|
| 223 |
src = Path(tmp) / f"in.{ext}"
|
|
@@ -225,7 +330,7 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
|
|
| 225 |
src.write_bytes(audio_bytes)
|
| 226 |
result = subprocess.run(
|
| 227 |
[
|
| 228 |
-
|
| 229 |
"-y",
|
| 230 |
"-i",
|
| 231 |
str(src),
|
|
@@ -249,9 +354,9 @@ def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
|
|
| 249 |
@router.post("/transcribe")
|
| 250 |
async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
|
| 251 |
contents = await audio.read()
|
|
|
|
| 252 |
if not contents:
|
| 253 |
return {"text": ""}
|
| 254 |
-
mime = audio.content_type or "audio/webm"
|
| 255 |
LOG.info("STT: received %s bytes (%s)", len(contents), mime)
|
| 256 |
|
| 257 |
try:
|
|
@@ -266,9 +371,12 @@ async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
|
|
| 266 |
loop = asyncio.get_running_loop()
|
| 267 |
wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
|
| 268 |
LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
|
|
|
|
|
|
|
|
|
|
| 269 |
except Exception as e:
|
| 270 |
LOG.error("STT conversion error: %s", e)
|
| 271 |
-
raise HTTPException(status_code=500, detail="Audio conversion failed")
|
| 272 |
else:
|
| 273 |
wav_bytes = contents
|
| 274 |
|
|
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import html as html_module
|
| 7 |
+
import io
|
| 8 |
import logging
|
| 9 |
+
import os
|
| 10 |
import re
|
| 11 |
+
import shutil
|
| 12 |
import struct
|
| 13 |
import subprocess
|
| 14 |
+
import wave
|
| 15 |
import tempfile
|
| 16 |
import time
|
| 17 |
from pathlib import Path
|
|
|
|
| 45 |
|
| 46 |
PROBE_TIMEOUT = 12.0
|
| 47 |
CACHE_TTL = 120.0
|
| 48 |
+
# Fewer round-trips to Coqui (longer URLs); still typically under proxy URL limits.
|
| 49 |
+
MAX_CHUNK_CHARS = 380
|
| 50 |
|
| 51 |
_status_cache: Dict[str, Any] = {
|
| 52 |
"tts": {"ready": False, "checked_at": 0.0},
|
|
|
|
| 108 |
text = re.sub(r"<[^>]+>", " ", text)
|
| 109 |
text = html_module.unescape(text)
|
| 110 |
text = _SECTION_HEADERS.sub(" ", text)
|
| 111 |
+
# Remove trailing period after other sentence-ending punctuation ("!." → "!", "?." → "?")
|
| 112 |
+
text = re.sub(r"([!?])\s*\.+", r"\1", text)
|
| 113 |
text = re.sub(r"([.!?])\s*\1+", r"\1", text)
|
| 114 |
text = re.sub(r"\s*\.\s*\.", ".", text)
|
| 115 |
text = re.sub(r"\s+", " ", text).strip()
|
|
|
|
| 135 |
return chunks
|
| 136 |
|
| 137 |
|
| 138 |
+
async def _synthesize_one(
    client: httpx.AsyncClient, base: str, chunk: str
) -> tuple[Optional[bytes], Optional[str]]:
    """Synthesize one text chunk through the TTS service.

    Issues ``GET {base}/synthesize/{chunk}`` with the chunk fully
    percent-encoded into the URL path.

    Args:
        client: HTTP client used for the request.
        base: Base URL of the TTS service (no trailing ``/synthesize``).
        chunk: Text to synthesize.

    Returns:
        ``(body_bytes, None)`` when the service answers with a status below
        400, otherwise ``(None, error_message)``. Request failures are
        reported through the second element rather than raised, so one bad
        chunk does not abort sibling requests.
    """
    url = f"{base}/synthesize/{quote(chunk, safe='')}"
    try:
        r = await client.get(url)
        if r.status_code >= 400:
            err = f"HTTP {r.status_code}: {r.text[:160]}"
            LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), err)
            return None, err
        return r.content, None
    except Exception as exc:
        # Some exceptions (timeouts in particular) stringify to "". An empty
        # error string is falsy, so callers that pick the first non-empty
        # error would lose the cause; fall back to the exception type name.
        err = (str(exc) or type(exc).__name__)[:220]
        LOG.warning("TTS chunk failed (%s chars): %s", len(chunk), err)
        return None, err
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _ffmpeg_executable() -> str:
    """Return the path of the ffmpeg binary to use for audio conversion.

    Lookup order:
      1. ``settings.ffmpeg_path`` when it names an existing file.
      2. ``ffmpeg`` on ``PATH`` (via ``shutil.which``).
      3. Under ``%LOCALAPPDATA%/Microsoft/WinGet``: the ``Links/ffmpeg.exe``
         shim, then any ``ffmpeg.exe`` found below ``Packages``.
      4. ``<ProgramFiles>/ffmpeg/bin/ffmpeg.exe`` and the ``(x86)`` variant.

    Raises:
        FileNotFoundError: if no ffmpeg binary is found in any location.
    """
    configured = (settings.ffmpeg_path or "").strip()
    if configured:
        configured_path = Path(configured)
        if configured_path.is_file():
            return str(configured_path.resolve())
        # A bad explicit setting would otherwise be ignored silently and a
        # different ffmpeg (or none) used; make the misconfiguration visible.
        LOG.warning(
            "FFMPEG_PATH %r is not an existing file; falling back to auto-detection",
            configured,
        )

    on_path = shutil.which("ffmpeg")
    if on_path:
        return on_path

    localappdata = os.environ.get("LOCALAPPDATA", "")
    if localappdata:
        winget_root = Path(localappdata) / "Microsoft" / "WinGet"
        winget_link = winget_root / "Links" / "ffmpeg.exe"
        if winget_link.is_file():
            return str(winget_link.resolve())

        pkg_root = winget_root / "Packages"
        if pkg_root.is_dir():

            def _rank(candidate: Path) -> tuple:
                # Prefer binaries inside a "bin" directory, then shorter paths.
                lowered = str(candidate).lower()
                in_bin = "\\bin\\" in lowered or "/bin/" in lowered
                return (0 if in_bin else 1, len(str(candidate)))

            try:
                candidates = [c for c in pkg_root.rglob("ffmpeg.exe") if c.is_file()]
                if candidates:
                    # min() returns the first minimal element, which is the
                    # same entry a stable sort would place first.
                    return str(min(candidates, key=_rank).resolve())
            except OSError as exc:
                # Best effort: an unreadable package tree must not block the
                # remaining probes.
                LOG.debug("WinGet package scan for ffmpeg failed: %s", exc)

    for env in ("ProgramFiles", "ProgramFiles(x86)"):
        base = os.environ.get(env, "")
        if not base:
            continue
        cand = Path(base) / "ffmpeg" / "bin" / "ffmpeg.exe"
        if cand.is_file():
            return str(cand.resolve())

    raise FileNotFoundError(
        "ffmpeg not found. Install ffmpeg (e.g. winget install Gyan.FFmpeg), add it to PATH, "
        "or set FFMPEG_PATH in backend/.env to the full path to ffmpeg.exe"
    )
|
| 194 |
|
| 195 |
|
| 196 |
+
def _concat_wav_legacy(segments: List[bytes]) -> bytes:
|
| 197 |
+
"""Best-effort RIFF merge (can confuse decoders); used only as fallback."""
|
| 198 |
if len(segments) == 1:
|
| 199 |
return segments[0]
|
| 200 |
pcm_parts: List[bytes] = []
|
|
|
|
| 219 |
return bytes(header) + all_pcm
|
| 220 |
|
| 221 |
|
| 222 |
+
def _concat_wav(segments: List[bytes]) -> bytes:
    """Merge WAV segments into a single WAV using the stdlib ``wave`` module.

    Segments that are empty or shorter than 44 bytes are dropped first.
    All remaining segments must share channel count, sample width and frame
    rate; this is checked before any audio is written.

    Args:
        segments: Complete WAV files as bytes, in playback order.

    Returns:
        ``b""`` when no usable segment remains, the segment itself when only
        one remains, otherwise a new WAV containing every segment's frames in
        order. If parsing or merging fails, the result of
        ``_concat_wav_legacy`` on the usable segments is returned instead.
    """
    segs = [s for s in segments if s and len(s) >= 44]
    if not segs:
        return b""
    if len(segs) == 1:
        return segs[0]

    readers: List[wave.Wave_read] = []
    try:
        for raw in segs:
            readers.append(wave.open(io.BytesIO(raw), "rb"))

        first = readers[0]
        fmt = (first.getnchannels(), first.getsampwidth(), first.getframerate())
        # Validate every segment up front so a mismatch is detected before
        # any frames are copied.
        for reader in readers[1:]:
            if (reader.getnchannels(), reader.getsampwidth(), reader.getframerate()) != fmt:
                raise ValueError("WAV format mismatch between segments")

        out_buf = io.BytesIO()
        # Closing the writer patches the header sizes; the BytesIO itself is
        # left open, so getvalue() remains valid afterwards.
        with wave.open(out_buf, "wb") as writer:
            writer.setnchannels(fmt[0])
            writer.setsampwidth(fmt[1])
            writer.setframerate(fmt[2])
            for reader in readers:
                writer.writeframes(reader.readframes(reader.getnframes()))
        return out_buf.getvalue()
    except Exception as exc:
        LOG.warning("WAV concat via wave module failed, using legacy merge: %s", exc)
        # Hand over the filtered list, not the raw input, so segments that
        # were just rejected as unusable are not merged back in.
        return _concat_wav_legacy(segs)
    finally:
        for reader in readers:
            try:
                reader.close()
            except Exception:
                # Cleanup only; a close failure must not mask the result.
                pass
|
| 261 |
+
|
| 262 |
+
|
| 263 |
@router.get("/voice/status")
|
| 264 |
async def voice_status() -> Dict[str, bool]:
|
| 265 |
tts_ready = _cached_ready("tts")
|
|
|
|
| 297 |
|
| 298 |
try:
|
| 299 |
async with httpx.AsyncClient(timeout=120.0) as client:
|
| 300 |
+
pairs = await asyncio.gather(*[_synthesize_one(client, base, c) for c in chunks])
|
| 301 |
+
wav_segments = [seg for seg, _ in pairs if seg and len(seg) > 44]
|
| 302 |
if not wav_segments:
|
| 303 |
+
first_err = next((e for _, e in pairs if e), None)
|
| 304 |
+
detail = (first_err or "TTS synthesis failed for all chunks")[:500]
|
| 305 |
+
raise HTTPException(status_code=502, detail=detail)
|
| 306 |
+
if len(wav_segments) < len(chunks):
|
| 307 |
+
LOG.warning(
|
| 308 |
+
"TTS: partial success — %s/%s chunk(s) synthesized (others failed upstream)",
|
| 309 |
+
len(wav_segments),
|
| 310 |
+
len(chunks),
|
| 311 |
+
)
|
| 312 |
combined = _concat_wav(wav_segments)
|
| 313 |
_status_cache["tts"] = {"ready": True, "checked_at": time.time()}
|
| 314 |
return Response(content=combined, media_type="audio/wav")
|
|
|
|
| 322 |
|
| 323 |
|
| 324 |
def _convert_to_wav(audio_bytes: bytes, src_mime: str) -> bytes:
|
| 325 |
+
ff = _ffmpeg_executable()
|
| 326 |
with tempfile.TemporaryDirectory() as tmp:
|
| 327 |
ext = "webm" if "webm" in (src_mime or "") else "ogg"
|
| 328 |
src = Path(tmp) / f"in.{ext}"
|
|
|
|
| 330 |
src.write_bytes(audio_bytes)
|
| 331 |
result = subprocess.run(
|
| 332 |
[
|
| 333 |
+
ff,
|
| 334 |
"-y",
|
| 335 |
"-i",
|
| 336 |
str(src),
|
|
|
|
| 354 |
@router.post("/transcribe")
|
| 355 |
async def transcribe_audio(audio: UploadFile = File(...)) -> Dict[str, str]:
|
| 356 |
contents = await audio.read()
|
| 357 |
+
mime = audio.content_type or "audio/webm"
|
| 358 |
if not contents:
|
| 359 |
return {"text": ""}
|
|
|
|
| 360 |
LOG.info("STT: received %s bytes (%s)", len(contents), mime)
|
| 361 |
|
| 362 |
try:
|
|
|
|
| 371 |
loop = asyncio.get_running_loop()
|
| 372 |
wav_bytes = await loop.run_in_executor(None, _convert_to_wav, contents, mime)
|
| 373 |
LOG.info("STT: converted to WAV (%s bytes)", len(wav_bytes))
|
| 374 |
+
except FileNotFoundError as e:
|
| 375 |
+
LOG.error("STT: ffmpeg missing: %s", e)
|
| 376 |
+
raise HTTPException(status_code=503, detail=str(e))
|
| 377 |
except Exception as e:
|
| 378 |
LOG.error("STT conversion error: %s", e)
|
| 379 |
+
raise HTTPException(status_code=500, detail=f"Audio conversion failed: {e!s}"[:500])
|
| 380 |
else:
|
| 381 |
wav_bytes = contents
|
| 382 |
|
frontend/src/App.css
CHANGED
|
@@ -631,24 +631,61 @@ html.aj-hide-pointer * {
|
|
| 631 |
|
| 632 |
.aj-composer {
|
| 633 |
z-index: 40;
|
| 634 |
-
padding: 6px
|
| 635 |
border-top: 1px solid var(--lc-border);
|
| 636 |
background: var(--lc-sidebar);
|
| 637 |
flex-shrink: 0;
|
| 638 |
flex-grow: 0;
|
| 639 |
}
|
| 640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
.aj-composer-inner {
|
| 642 |
display: flex;
|
| 643 |
gap: 6px;
|
| 644 |
align-items: flex-end;
|
| 645 |
-
|
| 646 |
-
|
|
|
|
| 647 |
}
|
| 648 |
|
| 649 |
.aj-composer-field {
|
| 650 |
flex: 1;
|
| 651 |
min-width: 0;
|
|
|
|
|
|
|
|
|
|
| 652 |
}
|
| 653 |
|
| 654 |
.aj-composer-field--listening textarea {
|
|
@@ -657,7 +694,10 @@ html.aj-hide-pointer * {
|
|
| 657 |
}
|
| 658 |
|
| 659 |
.aj-composer textarea {
|
| 660 |
-
|
|
|
|
|
|
|
|
|
|
| 661 |
box-sizing: border-box;
|
| 662 |
min-height: 34px;
|
| 663 |
max-height: 50vh;
|
|
|
|
| 631 |
|
| 632 |
.aj-composer {
|
| 633 |
z-index: 40;
|
| 634 |
+
padding: 6px 10px;
|
| 635 |
border-top: 1px solid var(--lc-border);
|
| 636 |
background: var(--lc-sidebar);
|
| 637 |
flex-shrink: 0;
|
| 638 |
flex-grow: 0;
|
| 639 |
}
|
| 640 |
|
| 641 |
+
.aj-voice-error-banner {
|
| 642 |
+
display: flex;
|
| 643 |
+
align-items: flex-start;
|
| 644 |
+
justify-content: space-between;
|
| 645 |
+
gap: 12px;
|
| 646 |
+
padding: 8px 12px;
|
| 647 |
+
font-size: 13px;
|
| 648 |
+
line-height: 1.45;
|
| 649 |
+
color: #7f1d1d;
|
| 650 |
+
background: #fef2f2;
|
| 651 |
+
border-bottom: 1px solid #fecaca;
|
| 652 |
+
}
|
| 653 |
+
|
| 654 |
+
.aj-voice-error-banner span {
|
| 655 |
+
flex: 1;
|
| 656 |
+
min-width: 0;
|
| 657 |
+
}
|
| 658 |
+
|
| 659 |
+
.aj-voice-error-dismiss {
|
| 660 |
+
flex-shrink: 0;
|
| 661 |
+
border: none;
|
| 662 |
+
background: transparent;
|
| 663 |
+
color: #991b1b;
|
| 664 |
+
font-size: 20px;
|
| 665 |
+
line-height: 1;
|
| 666 |
+
cursor: pointer;
|
| 667 |
+
padding: 0 4px;
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
.aj-voice-error-dismiss:hover {
|
| 671 |
+
color: #450a0a;
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
.aj-composer-inner {
|
| 675 |
display: flex;
|
| 676 |
gap: 6px;
|
| 677 |
align-items: flex-end;
|
| 678 |
+
width: 100%;
|
| 679 |
+
max-width: none;
|
| 680 |
+
box-sizing: border-box;
|
| 681 |
}
|
| 682 |
|
| 683 |
.aj-composer-field {
|
| 684 |
flex: 1;
|
| 685 |
min-width: 0;
|
| 686 |
+
display: flex;
|
| 687 |
+
flex-direction: column;
|
| 688 |
+
align-self: stretch;
|
| 689 |
}
|
| 690 |
|
| 691 |
.aj-composer-field--listening textarea {
|
|
|
|
| 694 |
}
|
| 695 |
|
| 696 |
.aj-composer textarea {
|
| 697 |
+
width: 100%;
|
| 698 |
+
min-width: 0;
|
| 699 |
+
flex: 1 1 auto;
|
| 700 |
+
align-self: stretch;
|
| 701 |
box-sizing: border-box;
|
| 702 |
min-height: 34px;
|
| 703 |
max-height: 50vh;
|
frontend/src/App.jsx
CHANGED
|
@@ -8,6 +8,8 @@ import {
|
|
| 8 |
Loader2,
|
| 9 |
MessageSquarePlus,
|
| 10 |
Mic,
|
|
|
|
|
|
|
| 11 |
RefreshCw,
|
| 12 |
Search,
|
| 13 |
Send,
|
|
@@ -73,11 +75,29 @@ function parseSseBuffer(buf, onEvent) {
|
|
| 73 |
return remainder
|
| 74 |
}
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
const t = (text || '').trim()
|
| 78 |
-
if (!t) return
|
| 79 |
-
const
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
}
|
| 82 |
|
| 83 |
function appendTokenContent(acc, ev) {
|
|
@@ -196,28 +216,72 @@ function AssistantSearchBar({ content, show, speak }) {
|
|
| 196 |
}
|
| 197 |
|
| 198 |
if (!show) return null
|
| 199 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
return (
|
| 202 |
<div className="aj-msg-actions" ref={wrapRef}>
|
| 203 |
<div className="aj-msg-search-wrap">
|
| 204 |
{speak && (
|
| 205 |
-
<
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
)}
|
| 220 |
-
</
|
| 221 |
)}
|
| 222 |
<button
|
| 223 |
type="button"
|
|
@@ -315,15 +379,21 @@ export default function App() {
|
|
| 315 |
|
| 316 |
const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
|
| 317 |
const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
|
|
|
|
| 318 |
const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
|
| 319 |
const [micListening, setMicListening] = useState(false)
|
| 320 |
const [micTranscribing, setMicTranscribing] = useState(false)
|
|
|
|
| 321 |
|
| 322 |
const audioRef = useRef(null)
|
| 323 |
const ttsBlobUrlRef = useRef(null)
|
| 324 |
-
const
|
| 325 |
-
const
|
| 326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
const streamingWasRef = useRef(false)
|
| 328 |
const mediaRecorderRef = useRef(null)
|
| 329 |
const audioChunksRef = useRef([])
|
|
@@ -486,6 +556,7 @@ export default function App() {
|
|
| 486 |
}, [exportOpen])
|
| 487 |
|
| 488 |
const stopTts = useCallback(() => {
|
|
|
|
| 489 |
if (audioRef.current) {
|
| 490 |
audioRef.current.pause()
|
| 491 |
audioRef.current.src = ''
|
|
@@ -497,126 +568,229 @@ export default function App() {
|
|
| 497 |
}
|
| 498 |
setTtsLoadingIndex(null)
|
| 499 |
setTtsPlayingIndex(null)
|
|
|
|
| 500 |
}, [])
|
| 501 |
|
| 502 |
-
const
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
}
|
| 511 |
-
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
try {
|
| 514 |
const res = await fetch('/api/tts', {
|
| 515 |
method: 'POST',
|
| 516 |
headers: { 'Content-Type': 'application/json' },
|
| 517 |
-
body: JSON.stringify({ text:
|
| 518 |
})
|
| 519 |
-
if (!res.ok)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
const blob = await res.blob()
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
setTtsLoadingIndex(null)
|
| 526 |
-
setTtsPlayingIndex(index)
|
| 527 |
-
audio.onended = () => {
|
| 528 |
-
stopTts()
|
| 529 |
}
|
| 530 |
-
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
}
|
| 533 |
-
|
| 534 |
-
} catch {
|
| 535 |
setTtsLoadingIndex(null)
|
| 536 |
setTtsPlayingIndex(null)
|
| 537 |
-
|
| 538 |
-
},
|
| 539 |
-
[stopTts],
|
| 540 |
-
)
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
|
|
|
|
|
|
|
|
|
| 547 |
}
|
| 548 |
-
playTtsForIndex(index, text)
|
| 549 |
},
|
| 550 |
-
[
|
| 551 |
)
|
| 552 |
|
| 553 |
useEffect(() => {
|
| 554 |
-
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
if (last?.role === 'assistant' && last.content?.trim()) {
|
| 557 |
playTtsForIndex(messages.length - 1, last.content)
|
| 558 |
}
|
| 559 |
}
|
|
|
|
| 560 |
streamingWasRef.current = streaming
|
| 561 |
}, [streaming, alwaysSpeak, messages, playTtsForIndex])
|
| 562 |
|
| 563 |
useEffect(() => {
|
| 564 |
-
try {
|
| 565 |
-
sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, alwaysSpeak ? '1' : '0')
|
| 566 |
-
} catch {
|
| 567 |
-
/* */
|
| 568 |
-
}
|
| 569 |
}, [alwaysSpeak])
|
| 570 |
|
| 571 |
useEffect(() => {
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
if (prefetchBlobRef.current?.url) {
|
| 576 |
-
URL.revokeObjectURL(prefetchBlobRef.current.url)
|
| 577 |
-
}
|
| 578 |
}
|
| 579 |
-
}, [
|
| 580 |
|
| 581 |
useEffect(() => {
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
try {
|
| 585 |
-
primed = sessionStorage.getItem(STORAGE_TTS_PRIMED) === '1'
|
| 586 |
-
} catch {
|
| 587 |
-
/* */
|
| 588 |
-
}
|
| 589 |
-
if (!primed) return
|
| 590 |
-
const last = messages[messages.length - 1]
|
| 591 |
-
if (!last || last.role !== 'assistant') return
|
| 592 |
-
const text = last.content
|
| 593 |
-
const first = extractFirstSentence(text)
|
| 594 |
-
if (!first || first.length < 8) return
|
| 595 |
-
const key = `${messages.length - 1}:${first}`
|
| 596 |
-
if (prefetchKeyRef.current === key) return
|
| 597 |
-
prefetchKeyRef.current = key
|
| 598 |
-
prefetchAbortRef.current?.abort()
|
| 599 |
-
const ac = new AbortController()
|
| 600 |
-
prefetchAbortRef.current = ac
|
| 601 |
-
fetch('/api/tts', {
|
| 602 |
-
method: 'POST',
|
| 603 |
-
headers: { 'Content-Type': 'application/json' },
|
| 604 |
-
body: JSON.stringify({ text: first }),
|
| 605 |
-
signal: ac.signal,
|
| 606 |
-
})
|
| 607 |
-
.then((r) => {
|
| 608 |
-
if (!r.ok) throw new Error('prefetch')
|
| 609 |
-
return r.blob()
|
| 610 |
-
})
|
| 611 |
-
.then((blob) => {
|
| 612 |
-
const url = URL.createObjectURL(blob)
|
| 613 |
-
if (prefetchBlobRef.current?.url) {
|
| 614 |
-
URL.revokeObjectURL(prefetchBlobRef.current.url)
|
| 615 |
-
}
|
| 616 |
-
prefetchBlobRef.current = { index: messages.length - 1, sentence: first, url }
|
| 617 |
-
})
|
| 618 |
-
.catch(() => {})
|
| 619 |
-
}, [streaming, messages])
|
| 620 |
|
| 621 |
const toggleMic = useCallback(async () => {
|
| 622 |
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
|
|
@@ -646,12 +820,32 @@ export default function App() {
|
|
| 646 |
const form = new FormData()
|
| 647 |
form.append('audio', blob, 'recording.webm')
|
| 648 |
fetch('/api/transcribe', { method: 'POST', body: form })
|
| 649 |
-
.then((r) =>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
.then((data) => {
|
|
|
|
| 651 |
const tx = data?.text?.trim()
|
| 652 |
-
if (tx)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
})
|
| 654 |
-
.catch(() => {})
|
| 655 |
.finally(() => setMicTranscribing(false))
|
| 656 |
}
|
| 657 |
mediaRecorderRef.current = mediaRecorder
|
|
@@ -853,12 +1047,7 @@ export default function App() {
|
|
| 853 |
const handleNewChat = () => {
|
| 854 |
abortRef.current?.abort()
|
| 855 |
stopTts()
|
| 856 |
-
|
| 857 |
-
prefetchKeyRef.current = ''
|
| 858 |
-
if (prefetchBlobRef.current?.url) {
|
| 859 |
-
URL.revokeObjectURL(prefetchBlobRef.current.url)
|
| 860 |
-
prefetchBlobRef.current = null
|
| 861 |
-
}
|
| 862 |
setMessages([])
|
| 863 |
setInput('')
|
| 864 |
setSummary(null)
|
|
@@ -872,12 +1061,7 @@ export default function App() {
|
|
| 872 |
|
| 873 |
const handleRefresh = () => {
|
| 874 |
if (streaming) return
|
| 875 |
-
|
| 876 |
-
prefetchKeyRef.current = ''
|
| 877 |
-
if (prefetchBlobRef.current?.url) {
|
| 878 |
-
URL.revokeObjectURL(prefetchBlobRef.current.url)
|
| 879 |
-
prefetchBlobRef.current = null
|
| 880 |
-
}
|
| 881 |
let base = [...messages]
|
| 882 |
if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
|
| 883 |
if (!base.length || base[base.length - 1].role !== 'user') return
|
|
@@ -891,12 +1075,7 @@ export default function App() {
|
|
| 891 |
const text = input.trim()
|
| 892 |
if (!text || streaming) return
|
| 893 |
setHideTypingCursor(false)
|
| 894 |
-
|
| 895 |
-
prefetchKeyRef.current = ''
|
| 896 |
-
if (prefetchBlobRef.current?.url) {
|
| 897 |
-
URL.revokeObjectURL(prefetchBlobRef.current.url)
|
| 898 |
-
prefetchBlobRef.current = null
|
| 899 |
-
}
|
| 900 |
const userMsg = { role: 'user', content: text }
|
| 901 |
const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
|
| 902 |
setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
|
|
@@ -1149,6 +1328,15 @@ export default function App() {
|
|
| 1149 |
</div>
|
| 1150 |
</header>
|
| 1151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1152 |
{messages.length > 0 && (
|
| 1153 |
<div className="aj-context-bar">
|
| 1154 |
<div className="aj-context-meter">
|
|
@@ -1211,12 +1399,20 @@ export default function App() {
|
|
| 1211 |
show={!(streaming && i === messages.length - 1)}
|
| 1212 |
speak={{
|
| 1213 |
loading: ttsLoadingIndex === i,
|
| 1214 |
-
playing:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1215 |
disabled:
|
| 1216 |
!(m.content || '').trim()
|
| 1217 |
&& ttsLoadingIndex !== i
|
| 1218 |
&& ttsPlayingIndex !== i,
|
| 1219 |
-
|
|
|
|
|
|
|
|
|
|
| 1220 |
}}
|
| 1221 |
/>
|
| 1222 |
)}
|
|
|
|
| 8 |
Loader2,
|
| 9 |
MessageSquarePlus,
|
| 10 |
Mic,
|
| 11 |
+
Pause,
|
| 12 |
+
Play,
|
| 13 |
RefreshCw,
|
| 14 |
Search,
|
| 15 |
Send,
|
|
|
|
| 75 |
return remainder
|
| 76 |
}
|
| 77 |
|
| 78 |
+
/**
 * Split text into sentences for incremental text-to-speech.
 *
 * A sentence is a run of characters that are not `.`, `!`, `?` or newline,
 * followed by one or more of those terminators. Each sentence is returned
 * trimmed; runs that trim to an empty string are skipped.
 *
 * Text after the last terminator is treated as incomplete and is omitted
 * unless `includeTrailing` is true (used once streaming has finished so the
 * tail is not silently dropped).
 *
 * Note: every `.` counts as a terminator, so decimals and abbreviations
 * ("3.14", "e.g.") are split at the dot.
 *
 * @param {string|null|undefined} text Text to split; other values are coerced with String().
 * @param {boolean} [includeTrailing=false] Also return the unterminated tail.
 * @returns {string[]} Sentences in document order.
 */
function extractSentences(text, includeTrailing = false) {
  // Coerce defensively: calling .trim() on a truthy non-string would throw.
  const source = typeof text === 'string' ? text : String(text || '')
  const t = source.trim()
  if (!t) return []

  const sentenceRe = /[^.!?\n]+[.!?\n]+/g
  const sentences = []
  let lastEnd = 0
  let match
  while ((match = sentenceRe.exec(t)) !== null) {
    const sentence = match[0].trim()
    if (sentence) sentences.push(sentence)
    lastEnd = sentenceRe.lastIndex
  }

  // No terminated sentence at all: the whole text is the trailing fragment.
  if (!sentences.length) return includeTrailing ? [t] : []

  if (includeTrailing) {
    const remainder = t.slice(lastEnd).trim()
    if (remainder) sentences.push(remainder)
  }
  return sentences
}
|
| 102 |
|
| 103 |
function appendTokenContent(acc, ev) {
|
|
|
|
| 216 |
}
|
| 217 |
|
| 218 |
if (!show) return null
|
| 219 |
+
if (
|
| 220 |
+
!(content || '').trim()
|
| 221 |
+
&& !speak?.loading
|
| 222 |
+
&& !speak?.playing
|
| 223 |
+
&& !speak?.paused
|
| 224 |
+
) {
|
| 225 |
+
return null
|
| 226 |
+
}
|
| 227 |
|
| 228 |
return (
|
| 229 |
<div className="aj-msg-actions" ref={wrapRef}>
|
| 230 |
<div className="aj-msg-search-wrap">
|
| 231 |
{speak && (
|
| 232 |
+
<>
|
| 233 |
+
<button
|
| 234 |
+
type="button"
|
| 235 |
+
className="aj-msg-search-btn"
|
| 236 |
+
onClick={
|
| 237 |
+
speak.paused
|
| 238 |
+
? speak.onResume
|
| 239 |
+
: speak.playing
|
| 240 |
+
? speak.onPause
|
| 241 |
+
: speak.onReadAloud
|
| 242 |
+
}
|
| 243 |
+
disabled={speak.disabled || speak.loading}
|
| 244 |
+
data-tip={
|
| 245 |
+
speak.paused
|
| 246 |
+
? 'Resume'
|
| 247 |
+
: speak.playing
|
| 248 |
+
? 'Pause'
|
| 249 |
+
: speak.loading
|
| 250 |
+
? 'Loading speech…'
|
| 251 |
+
: 'Read aloud'
|
| 252 |
+
}
|
| 253 |
+
aria-label={
|
| 254 |
+
speak.paused
|
| 255 |
+
? 'Resume'
|
| 256 |
+
: speak.playing
|
| 257 |
+
? 'Pause'
|
| 258 |
+
: speak.loading
|
| 259 |
+
? 'Loading speech'
|
| 260 |
+
: 'Read aloud'
|
| 261 |
+
}
|
| 262 |
+
>
|
| 263 |
+
{speak.paused ? (
|
| 264 |
+
<Play size={14} aria-hidden />
|
| 265 |
+
) : speak.loading ? (
|
| 266 |
+
<Loader2 size={14} className="aj-spin" aria-hidden />
|
| 267 |
+
) : speak.playing ? (
|
| 268 |
+
<Pause size={14} aria-hidden />
|
| 269 |
+
) : (
|
| 270 |
+
<Volume2 size={14} aria-hidden />
|
| 271 |
+
)}
|
| 272 |
+
</button>
|
| 273 |
+
{(speak.playing || speak.paused || (speak.loading && speak.showStop)) && (
|
| 274 |
+
<button
|
| 275 |
+
type="button"
|
| 276 |
+
className="aj-msg-search-btn"
|
| 277 |
+
onClick={speak.onStopReading}
|
| 278 |
+
data-tip="Stop reading"
|
| 279 |
+
aria-label="Stop reading"
|
| 280 |
+
>
|
| 281 |
+
<Square size={14} aria-hidden />
|
| 282 |
+
</button>
|
| 283 |
)}
|
| 284 |
+
</>
|
| 285 |
)}
|
| 286 |
<button
|
| 287 |
type="button"
|
|
|
|
| 379 |
|
| 380 |
const [ttsLoadingIndex, setTtsLoadingIndex] = useState(null)
|
| 381 |
const [ttsPlayingIndex, setTtsPlayingIndex] = useState(null)
|
| 382 |
+
const [ttsPaused, setTtsPaused] = useState(false)
|
| 383 |
const [alwaysSpeak, setAlwaysSpeak] = useState(initialAlwaysSpeak)
|
| 384 |
const [micListening, setMicListening] = useState(false)
|
| 385 |
const [micTranscribing, setMicTranscribing] = useState(false)
|
| 386 |
+
const [voiceError, setVoiceError] = useState(null)
|
| 387 |
|
| 388 |
const audioRef = useRef(null)
|
| 389 |
const ttsBlobUrlRef = useRef(null)
|
| 390 |
+
const ttsSessionRef = useRef(0)
|
| 391 |
+
const messagesRef = useRef(messages)
|
| 392 |
+
messagesRef.current = messages
|
| 393 |
+
const streamingRef = useRef(streaming)
|
| 394 |
+
streamingRef.current = streaming
|
| 395 |
+
const ttsContentResolverRef = useRef(null)
|
| 396 |
+
const ttsPlaybackActiveRef = useRef(false)
|
| 397 |
const streamingWasRef = useRef(false)
|
| 398 |
const mediaRecorderRef = useRef(null)
|
| 399 |
const audioChunksRef = useRef([])
|
|
|
|
| 556 |
}, [exportOpen])
|
| 557 |
|
| 558 |
const stopTts = useCallback(() => {
|
| 559 |
+
ttsSessionRef.current += 1
|
| 560 |
if (audioRef.current) {
|
| 561 |
audioRef.current.pause()
|
| 562 |
audioRef.current.src = ''
|
|
|
|
| 568 |
}
|
| 569 |
setTtsLoadingIndex(null)
|
| 570 |
setTtsPlayingIndex(null)
|
| 571 |
+
setTtsPaused(false)
|
| 572 |
}, [])
|
| 573 |
|
| 574 |
+
// Pause the current TTS audio element, if one is playing, and record the paused state.
const pauseTts = useCallback(() => {
  const audio = audioRef.current
  // Nothing to do when there is no element or it is already paused.
  if (!audio || audio.paused) return
  audio.pause()
  setTtsPaused(true)
}, [])
|
| 581 |
+
|
| 582 |
+
// Resume the current TTS audio element; a rejected play() is reported via voiceError.
const resumeTts = useCallback(async () => {
  const audio = audioRef.current
  if (!audio) return
  try {
    await audio.play()
  } catch (err) {
    setVoiceError(String(err?.message || err))
    return
  }
  // Only clear the paused flag once playback has actually restarted.
  setTtsPaused(false)
}, [])
|
| 592 |
+
|
| 593 |
+
// Play `url` through a fresh Audio element and return a promise that resolves
// (never rejects) once playback ends, errors, fails to start, or the TTS
// session counter no longer matches `session`.
const playAudioUrlUntilDone = useCallback(
  (url, session) =>
    new Promise((resolve) => {
      const isStale = () => session !== ttsSessionRef.current

      // The session was already superseded before playback began.
      if (isStale()) {
        resolve()
        return
      }

      const clip = new Audio(url)
      audioRef.current = clip
      ttsBlobUrlRef.current = url

      let done = false
      let watchdog = null

      // Idempotent completion: stop polling, detach handlers, resolve once.
      const settle = () => {
        if (done) return
        done = true
        if (watchdog != null) clearInterval(watchdog)
        clip.onended = null
        clip.onerror = null
        resolve()
      }

      // Poll the session counter so a bumped session unblocks the awaiting caller.
      watchdog = setInterval(() => {
        if (isStale()) settle()
      }, 120)

      clip.onended = () => settle()
      clip.onerror = () => settle()
      clip.play().catch(() => settle())
    }),
  [],
)
|
| 624 |
+
|
| 625 |
+
// POST `chunkText` to /api/tts and resolve to `{ url }` (an object URL for the
// returned audio blob) or `{ error }` (a human-readable message). The first
// attempt is retried once after a 5xx response or a thrown fetch error.
// This function resolves with an error object instead of rejecting.
const fetchTtsAudio = useCallback(async (chunkText) => {
  const MAX_TRIES = 2

  // Build the error text for a non-OK response, preferring a JSON `detail` field.
  const describeFailure = async (res) => {
    let message = `TTS failed (${res.status})`
    try {
      const contentType = res.headers.get('content-type') || ''
      if (contentType.includes('json')) {
        const payload = await res.json()
        if (payload?.detail != null) {
          message = typeof payload.detail === 'string' ? payload.detail : JSON.stringify(payload.detail)
        }
      }
    } catch { /* */ }
    return message
  }

  let tryNo = 0
  while (tryNo < MAX_TRIES) {
    const firstTry = tryNo === 0
    tryNo += 1
    try {
      const res = await fetch('/api/tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: chunkText }),
      })
      if (res.ok) {
        const blob = await res.blob()
        return { url: URL.createObjectURL(blob) }
      }
      const detail = await describeFailure(res)
      // Only a server-side failure on the first attempt earns a retry.
      const retry = res.status >= 500 && firstTry
      if (!retry) return { error: detail }
    } catch (e) {
      if (!firstTry) return { error: String(e?.message || e) }
    }
  }
  return { error: 'TTS failed after retries' }
}, [])
|
| 660 |
+
|
| 661 |
+
// Read message `index` aloud, one sentence at a time.
//
// Parameters:
//   index       - position of the message in `messages`.
//   initialText - fallback text used when the message has no content yet.
//
// Behaviour:
//   * Stops any current playback (which bumps ttsSessionRef) and captures the
//     new session number; every await is followed by a stale-session check so
//     a later stopTts()/playTtsForIndex() cancels this run.
//   * Sentences are re-extracted from messagesRef on every pass, so text that
//     is still streaming is picked up as it arrives. Up to LOOKAHEAD sentences
//     are requested from fetchTtsAudio ahead of the one being played.
//   * A sentence whose audio fails to load is skipped; the last error is only
//     reported through setVoiceError if nothing at all could be played.
//   * Every object URL produced by fetchTtsAudio is revoked on every exit
//     path, including cancelled runs and unplayed prefetches.
//
// NOTE(review): the `finally` clears ttsPlaybackActiveRef unconditionally, so a
// superseded run finishing late can clear the flag set by the run that
// replaced it. Left as-is because the ref's other readers are not all visible
// here; confirm before tightening.
const playTtsForIndex = useCallback(
  async (index, initialText) => {
    // Best-effort write; storage access may throw and is deliberately ignored.
    try { sessionStorage.setItem(STORAGE_TTS_PRIMED, '1') } catch { /* */ }
    setVoiceError(null)
    stopTts()
    const session = ttsSessionRef.current
    ttsPlaybackActiveRef.current = true

    setTtsLoadingIndex(index)
    setTtsPlayingIndex(null)
    setTtsPaused(false)

    const LOOKAHEAD = 2
    let playedCount = 0
    let anyPlayed = false
    let lastErr = null
    // sentence index -> pending fetchTtsAudio promise
    const inFlight = new Map()

    // Free the blob behind a fetch result, if it produced one.
    const releaseClip = (result) => {
      if (result?.url) URL.revokeObjectURL(result.url)
    }

    // Snapshot the sentences currently available for this message.
    const getSentencesAndLimit = () => {
      const live = streamingRef.current
      const text = (messagesRef.current[index]?.content || initialText || '').trim()
      const sentences = extractSentences(text, !live)
      return { sentences, limit: sentences.length, live }
    }

    try {
      // eslint-disable-next-line no-constant-condition
      while (true) {
        if (session !== ttsSessionRef.current) return

        const { sentences, limit, live } = getSentencesAndLimit()

        if (playedCount < limit) {
          // Kick off requests for the current sentence and the look-ahead window.
          for (let ahead = playedCount; ahead < Math.min(playedCount + LOOKAHEAD, limit); ahead++) {
            if (!inFlight.has(ahead)) {
              inFlight.set(ahead, fetchTtsAudio(sentences[ahead]))
            }
          }

          if (!anyPlayed) setTtsLoadingIndex(index)
          const result = await (inFlight.get(playedCount) || fetchTtsAudio(sentences[playedCount]))
          inFlight.delete(playedCount)

          if (session !== ttsSessionRef.current) {
            // Cancelled while the audio was loading: this clip will never be
            // played, so release its blob before bailing out.
            releaseClip(result)
            return
          }

          if (!result?.url) {
            // Skip a sentence that could not be synthesized, remembering why.
            if (result?.error) lastErr = result.error
            playedCount++
            continue
          }

          anyPlayed = true
          setTtsLoadingIndex(null)
          setTtsPlayingIndex(index)
          setTtsPaused(false)

          await playAudioUrlUntilDone(result.url, session)
          URL.revokeObjectURL(result.url)
          // Only clear the shared refs if they still point at this clip.
          if (ttsBlobUrlRef.current === result.url) {
            ttsBlobUrlRef.current = null
            audioRef.current = null
          }

          playedCount++
          if (session !== ttsSessionRef.current) return
          continue
        }

        // Everything available has been played; finished unless still streaming.
        if (!live) break

        // Park until the messages/streaming effect calls the resolver. The
        // re-check after registering avoids missing an update that landed
        // between the snapshot above and the registration.
        setTtsLoadingIndex(index)
        await new Promise(resolve => {
          ttsContentResolverRef.current = resolve
          const rechk = getSentencesAndLimit()
          if (rechk.limit > playedCount || !rechk.live) {
            ttsContentResolverRef.current = null
            resolve()
          }
        })
      }

      setTtsLoadingIndex(null)
      setTtsPlayingIndex(null)
      setTtsPaused(false)

      if (!anyPlayed && lastErr) setVoiceError(lastErr)
    } catch (e) {
      setVoiceError(String(e?.message || e))
      setTtsLoadingIndex(null)
      setTtsPlayingIndex(null)
      setTtsPaused(false)
    } finally {
      ttsPlaybackActiveRef.current = false
      // Prefetched clips that were never played still own a blob URL once
      // their request settles; release them so cancelled runs do not leak.
      for (const pending of inFlight.values()) {
        pending.then(releaseClip).catch(() => {})
      }
      inFlight.clear()
    }
  },
  [stopTts, playAudioUrlUntilDone, fetchTtsAudio],
)
|
| 758 |
|
| 759 |
// Auto-speak: with "always speak" on and no TTS run active, start reading the
// newest assistant message either (a) while it is still streaming, once it
// contains at least two extracted sentences, or (b) on the transition from
// streaming to not streaming.
useEffect(() => {
  const lastIndex = messages.length - 1
  const last = messages[lastIndex]

  // Evaluated lazily so the ref is read at the same points as the checks below.
  const idle = () => !ttsPlaybackActiveRef.current
  const speakable = () => last?.role === 'assistant' && last.content?.trim()
  const speakLast = () => { playTtsForIndex(lastIndex, last.content) }

  if (streaming && alwaysSpeak && idle() && speakable()) {
    const sentenceCount = extractSentences(last.content.trim()).length
    if (sentenceCount >= 2) speakLast()
  }

  const justFinished = streamingWasRef.current && !streaming
  if (justFinished && alwaysSpeak && idle() && speakable()) {
    speakLast()
  }

  // Remember this render's streaming state for the next transition check.
  streamingWasRef.current = streaming
}, [streaming, alwaysSpeak, messages, playTtsForIndex])
|
| 779 |
|
| 780 |
// Mirror the "always speak" toggle into sessionStorage as '1' / '0'.
useEffect(() => {
  try {
    const flag = alwaysSpeak ? '1' : '0'
    sessionStorage.setItem(STORAGE_ALWAYS_SPEAK, flag)
  } catch {
    /* storage write failures are ignored */
  }
}, [alwaysSpeak])
|
| 783 |
|
| 784 |
// Whenever messages or the streaming flag change, release a resolver parked
// in ttsContentResolverRef (if any) and clear the ref.
useEffect(() => {
  const wake = ttsContentResolverRef.current
  if (!wake) return
  wake()
  ttsContentResolverRef.current = null
}, [messages, streaming])
|
| 790 |
|
| 791 |
// Register stopTts as this effect's cleanup, so it runs on unmount and
// whenever the stopTts identity changes.
useEffect(
  () => () => {
    stopTts()
  },
  [stopTts],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
|
| 795 |
const toggleMic = useCallback(async () => {
|
| 796 |
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
|
|
|
|
| 820 |
const form = new FormData()
|
| 821 |
form.append('audio', blob, 'recording.webm')
|
| 822 |
fetch('/api/transcribe', { method: 'POST', body: form })
|
| 823 |
+
.then(async (r) => {
|
| 824 |
+
const data = await r.json().catch(() => ({}))
|
| 825 |
+
if (!r.ok) {
|
| 826 |
+
const d = data?.detail
|
| 827 |
+
const msg =
|
| 828 |
+
typeof d === 'string'
|
| 829 |
+
? d
|
| 830 |
+
: d != null
|
| 831 |
+
? JSON.stringify(d)
|
| 832 |
+
: `Speech-to-text failed (${r.status})`
|
| 833 |
+
setVoiceError(msg)
|
| 834 |
+
return null
|
| 835 |
+
}
|
| 836 |
+
return data
|
| 837 |
+
})
|
| 838 |
.then((data) => {
|
| 839 |
+
if (!data) return
|
| 840 |
const tx = data?.text?.trim()
|
| 841 |
+
if (tx) {
|
| 842 |
+
setVoiceError(null)
|
| 843 |
+
setInput((prev) => (prev ? `${prev} ${tx}` : tx))
|
| 844 |
+
}
|
| 845 |
+
})
|
| 846 |
+
.catch((e) => {
|
| 847 |
+
setVoiceError(String(e?.message || e))
|
| 848 |
})
|
|
|
|
| 849 |
.finally(() => setMicTranscribing(false))
|
| 850 |
}
|
| 851 |
mediaRecorderRef.current = mediaRecorder
|
|
|
|
| 1047 |
const handleNewChat = () => {
|
| 1048 |
abortRef.current?.abort()
|
| 1049 |
stopTts()
|
| 1050 |
+
setVoiceError(null)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1051 |
setMessages([])
|
| 1052 |
setInput('')
|
| 1053 |
setSummary(null)
|
|
|
|
| 1061 |
|
| 1062 |
const handleRefresh = () => {
|
| 1063 |
if (streaming) return
|
| 1064 |
+
stopTts()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1065 |
let base = [...messages]
|
| 1066 |
if (base.length && base[base.length - 1].role === 'assistant') base = base.slice(0, -1)
|
| 1067 |
if (!base.length || base[base.length - 1].role !== 'user') return
|
|
|
|
| 1075 |
const text = input.trim()
|
| 1076 |
if (!text || streaming) return
|
| 1077 |
setHideTypingCursor(false)
|
| 1078 |
+
stopTts()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1079 |
const userMsg = { role: 'user', content: text }
|
| 1080 |
const apiMsgs = [...messages, userMsg].map(({ role, content }) => ({ role, content }))
|
| 1081 |
setMessages([...messages, userMsg, { role: 'assistant', content: '' }])
|
|
|
|
| 1328 |
</div>
|
| 1329 |
</header>
|
| 1330 |
|
| 1331 |
+
{voiceError && (
|
| 1332 |
+
<div className="aj-voice-error-banner" role="alert">
|
| 1333 |
+
<span>{voiceError}</span>
|
| 1334 |
+
<button type="button" className="aj-voice-error-dismiss" onClick={() => setVoiceError(null)} aria-label="Dismiss">
|
| 1335 |
+
×
|
| 1336 |
+
</button>
|
| 1337 |
+
</div>
|
| 1338 |
+
)}
|
| 1339 |
+
|
| 1340 |
{messages.length > 0 && (
|
| 1341 |
<div className="aj-context-bar">
|
| 1342 |
<div className="aj-context-meter">
|
|
|
|
| 1399 |
show={!(streaming && i === messages.length - 1)}
|
| 1400 |
speak={{
|
| 1401 |
loading: ttsLoadingIndex === i,
|
| 1402 |
+
playing:
|
| 1403 |
+
ttsPlayingIndex === i
|
| 1404 |
+
&& !ttsPaused
|
| 1405 |
+
&& ttsLoadingIndex !== i,
|
| 1406 |
+
paused: ttsPlayingIndex === i && ttsPaused,
|
| 1407 |
+
showStop: ttsLoadingIndex === i || ttsPlayingIndex === i,
|
| 1408 |
disabled:
|
| 1409 |
!(m.content || '').trim()
|
| 1410 |
&& ttsLoadingIndex !== i
|
| 1411 |
&& ttsPlayingIndex !== i,
|
| 1412 |
+
onReadAloud: () => playTtsForIndex(i, m.content),
|
| 1413 |
+
onPause: pauseTts,
|
| 1414 |
+
onResume: resumeTts,
|
| 1415 |
+
onStopReading: stopTts,
|
| 1416 |
}}
|
| 1417 |
/>
|
| 1418 |
)}
|