import { useCallback, useEffect, useRef, useState } from 'react';
import { apiBlobFetch } from '../api.js';
import { spokenReplyText, splitSpeechSegments } from '../app-voice-utils.js';

const DEFAULT_SPEECH_LANG = 'zh-CN';
const DEFAULT_SPEECH_RATE = 1;
const DEFAULT_SPEECH_PITCH = 1;

/**
 * Produce a loggable description of a caught value without assuming it is an
 * Error (a rejection value may be null, undefined, or a string).
 *
 * @param {unknown} error - The caught value.
 * @returns {unknown} `error.message` when it is truthy, otherwise the value itself.
 */
function describeSpeechError(error) {
  return error?.message || error;
}

/**
 * Lazily create the shared Audio element and cache it on `audioRef`.
 *
 * @param {{ current: HTMLAudioElement | null }} audioRef - Holder for the shared element.
 * @returns {HTMLAudioElement} The cached (or newly created) element.
 */
function ensureMessageSpeechAudio(audioRef) {
  if (!audioRef.current) {
    const audio = new Audio();
    audio.preload = 'auto';
    audio.playsInline = true;
    audioRef.current = audio;
  }
  return audioRef.current;
}

/**
 * Stop whatever is currently audible and free the associated resources.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {object} [options]
 * @param {boolean} [options.release=false] - Also drop the cached Audio element.
 * @param {boolean} [options.abortRequest=false] - Also abort the in-flight speech request.
 */
function clearMessageSpeechAudio(ctx, { release = false, abortRequest = false } = {}) {
  if (abortRequest) {
    ctx.abortRef.current?.abort?.();
    ctx.abortRef.current = null;
  }

  // Settles the promise of the playback that is currently registered, if any.
  ctx.stopPlaybackRef.current?.();

  const audio = ctx.audioRef.current;
  if (audio) {
    audio.pause();
    // Handlers are detached before the source is removed so the reset below
    // cannot be reported through them.
    audio.onended = null;
    audio.onerror = null;
    audio.removeAttribute('src');
    audio.load?.();
    if (release) {
      ctx.audioRef.current = null;
    }
  }

  if (ctx.audioUrlRef.current) {
    URL.revokeObjectURL(ctx.audioUrlRef.current);
    ctx.audioUrlRef.current = '';
  }

  window.speechSynthesis?.cancel?.();
}

/**
 * Invalidate the current run, stop playback, abort the pending request and
 * optionally reset the React state.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {object} [options]
 * @param {boolean} [options.release=false] - Drop the cached Audio element.
 * @param {boolean} [options.resetState=true] - Clear the speaking/loading message ids.
 */
function stopMessageSpeech(ctx, { release = false, resetState = true } = {}) {
  // Bumping the counter makes every in-flight async step see a stale runId.
  ctx.runRef.current += 1;
  clearMessageSpeechAudio(ctx, { release, abortRequest: true });
  if (resetState) {
    ctx.setSpeakingMessageId('');
    ctx.setSpeechLoadingMessageId('');
  }
}

/**
 * Speak `text` through the browser's SpeechSynthesis API.
 *
 * While speaking, `stopPlaybackRef.current` holds a function that cancels the
 * utterance and resolves the returned promise.
 *
 * @param {string} text - Text to speak.
 * @param {{ current: (() => void) | null }} stopPlaybackRef - Receives the stop function.
 * @param {{ lang?: string, rate?: number, pitch?: number }} [options] - Utterance settings;
 *   missing or non-finite values fall back to the module defaults.
 * @returns {Promise<void>} Resolves when the utterance ends or is stopped; rejects when
 *   speech synthesis is unsupported or the utterance reports an error.
 */
function speakWithBrowser(text, stopPlaybackRef, options = {}) {
  return new Promise((resolve, reject) => {
    if (!window.speechSynthesis || !window.SpeechSynthesisUtterance) {
      reject(new Error('Browser speech synthesis is not supported'));
      return;
    }

    const utterance = new SpeechSynthesisUtterance(text);
    let settled = false;

    const finish = (error) => {
      if (settled) {
        return;
      }
      settled = true;
      // Only unregister if a newer playback has not replaced the stop function.
      if (stopPlaybackRef.current === stopBrowserSpeech) {
        stopPlaybackRef.current = null;
      }
      utterance.onend = null;
      utterance.onerror = null;
      if (error) {
        reject(error);
      } else {
        resolve();
      }
    };

    function stopBrowserSpeech() {
      window.speechSynthesis.cancel();
      finish();
    }

    utterance.lang = options.lang || DEFAULT_SPEECH_LANG;
    utterance.rate = Number.isFinite(options.rate) ? options.rate : DEFAULT_SPEECH_RATE;
    utterance.pitch = Number.isFinite(options.pitch) ? options.pitch : DEFAULT_SPEECH_PITCH;
    utterance.onend = () => finish();
    utterance.onerror = () => finish(new Error('Browser speech synthesis failed'));

    stopPlaybackRef.current = stopBrowserSpeech;
    // Drop anything still queued before speaking the new utterance.
    window.speechSynthesis.cancel();
    window.speechSynthesis.speak(utterance);
  });
}

/**
 * Play an audio blob on the shared Audio element.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {Blob} blob - Audio data to play.
 * @returns {Promise<void>} Resolves when playback ends or is stopped through
 *   `ctx.stopPlaybackRef`; rejects when the element reports an error or `play()` fails.
 */
function playMessageAudioBlob(ctx, blob) {
  return new Promise((resolve, reject) => {
    const url = URL.createObjectURL(blob);
    const audio = ensureMessageSpeechAudio(ctx.audioRef);
    let settled = false;

    const finish = (error) => {
      if (settled) {
        return;
      }
      settled = true;
      if (ctx.stopPlaybackRef.current === finish) {
        ctx.stopPlaybackRef.current = null;
      }
      audio.onended = null;
      audio.onerror = null;
      if (error) {
        reject(error);
      } else {
        resolve();
      }
    };

    // Stops the previous playback and revokes its object URL before the new
    // URL is recorded on the context.
    clearMessageSpeechAudio(ctx);
    ctx.audioUrlRef.current = url;
    ctx.stopPlaybackRef.current = finish;

    audio.muted = false;
    audio.src = url;
    audio.playsInline = true;
    audio.onended = () => finish();
    audio.onerror = () => finish(new Error('Message speech playback failed'));
    audio.load?.();

    try {
      // play() returns nothing in legacy engines, so only chain when a promise
      // comes back; a synchronous throw is routed through finish() so the
      // handlers and stop function are cleaned up as well.
      audio.play()?.catch(finish);
    } catch (error) {
      finish(error);
    }
  });
}

/**
 * Request synthesized speech for `text` from `/api/voice/speech`.
 *
 * The request's AbortController is stored on `ctx.abortRef` for the duration
 * of the call so it can be cancelled from outside.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {string} text - Text to synthesize.
 * @param {number} runId - Run the request belongs to.
 * @returns {Promise<*>} Whatever `apiBlobFetch` resolves with (passed to
 *   `playMessageAudioBlob` by the caller).
 * @throws {Error} When `runId` is no longer the current run, or when the request fails.
 */
async function fetchMessageSpeechBlob(ctx, text, runId) {
  if (ctx.runRef.current !== runId) {
    throw new Error('Message speech was stopped');
  }

  const controller = new AbortController();
  ctx.abortRef.current = controller;
  try {
    return await apiBlobFetch('/api/voice/speech', {
      method: 'POST',
      body: { text },
      signal: controller.signal
    });
  } finally {
    // Leave the ref alone if a newer request has already replaced it.
    if (ctx.abortRef.current === controller) {
      ctx.abortRef.current = null;
    }
  }
}

/**
 * Wrap a provider failure so the caller knows which segment to resume from.
 *
 * @param {unknown} error - The original failure.
 * @param {number} fallbackStartIndex - Index of the first segment that was not fully played.
 * @returns {Error} New error that keeps the original name and message, exposes
 *   `fallbackStartIndex`, and references the original via `cause`.
 */
function messageSpeechProviderError(error, fallbackStartIndex) {
  const wrappedError = new Error(error?.message || String(error || 'Message speech failed'));
  wrappedError.name = error?.name || 'MessageSpeechProviderError';
  wrappedError.fallbackStartIndex = fallbackStartIndex;
  // Keep the original error (and its stack) reachable for debugging.
  wrappedError.cause = error;
  return wrappedError;
}

/**
 * Play `segments` one after another using provider audio, fetching the next
 * segment while the current one is playing.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {object} params
 * @param {string} params.messageId - Id of the message being spoken.
 * @param {string[]} params.segments - Non-empty list of text segments.
 * @param {number} params.runId - Run this playback belongs to.
 * @returns {Promise<{ stopped: boolean, fallbackStartIndex: number }>} `stopped` is true
 *   when the run was superseded before all segments were played.
 * @throws {Error} The raw request error for the first segment, or an error created by
 *   `messageSpeechProviderError` (carrying `fallbackStartIndex`) for later failures.
 */
async function playProviderSegments(ctx, { messageId, segments, runId }) {
  let blob = await fetchMessageSpeechBlob(ctx, segments[0], runId);
  if (ctx.runRef.current !== runId) {
    return { stopped: true, fallbackStartIndex: 0 };
  }

  ctx.setSpeechLoadingMessageId('');
  ctx.setSpeakingMessageId(messageId);

  let fallbackStartIndex = 0;
  try {
    for (let index = 0; index < segments.length; index += 1) {
      fallbackStartIndex = index;
      const nextIndex = index + 1;

      // Start fetching the next segment now; the outcome is captured as a value
      // so a failure does not surface as an unhandled rejection during playback.
      const nextBlobPromise = nextIndex < segments.length
        ? fetchMessageSpeechBlob(ctx, segments[nextIndex], runId)
          .then((nextBlob) => ({ blob: nextBlob }))
          .catch((error) => ({ error }))
        : null;

      await playMessageAudioBlob(ctx, blob);
      // The segment finished, so a later fallback should resume after it.
      fallbackStartIndex = index + 1;

      if (ctx.runRef.current !== runId || !nextBlobPromise) {
        break;
      }

      ctx.setSpeechLoadingMessageId(messageId);
      const next = await nextBlobPromise;
      // The run may have been superseded while waiting for the prefetch; do not
      // touch state or start another segment on behalf of a stale run.
      if (ctx.runRef.current !== runId) {
        break;
      }
      if (next.error) {
        throw next.error;
      }

      blob = next.blob;
      ctx.setSpeechLoadingMessageId('');
      ctx.setSpeakingMessageId(messageId);
    }
  } catch (error) {
    throw messageSpeechProviderError(error, fallbackStartIndex);
  }

  return { stopped: ctx.runRef.current !== runId, fallbackStartIndex };
}

/**
 * Speak the remaining segments with the browser's speech synthesis.
 *
 * Failures are logged (only while the run is still current) and never rethrown.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {object} params
 * @param {string} params.messageId - Id of the message being spoken.
 * @param {string[]} params.segments - All text segments of the message.
 * @param {number} [params.fallbackStartIndex] - First segment to speak; invalid or
 *   negative values are treated as 0.
 * @param {number} params.runId - Run this playback belongs to.
 * @returns {Promise<void>}
 */
async function fallbackMessageSpeech(ctx, { messageId, segments, fallbackStartIndex, runId }) {
  const startIndex = Math.max(0, Number(fallbackStartIndex) || 0);
  const fallbackText = segments.slice(startIndex).join(' ');

  try {
    ctx.setSpeechLoadingMessageId('');
    ctx.setSpeakingMessageId(messageId);
    if (fallbackText) {
      await speakWithBrowser(fallbackText, ctx.stopPlaybackRef);
    }
  } catch (fallbackError) {
    if (ctx.runRef.current === runId) {
      console.warn('[voice] browser message speech failed:', describeSpeechError(fallbackError));
    }
  }
}

/**
 * Toggle speech for an assistant message: stop it if it is already speaking or
 * loading, otherwise start a new run that plays provider audio and falls back
 * to browser speech synthesis when the provider fails.
 *
 * @param {object} ctx - Speech context built by `useMessageSpeech`.
 * @param {{ id?: *, role?: string, content?: * }} message - Message to speak; ignored
 *   unless it has an id and `role === 'assistant'`.
 * @returns {Promise<void>}
 */
async function speakMessageFromContext(ctx, message) {
  const messageId = String(message?.id || '');
  if (!messageId || message?.role !== 'assistant') {
    return;
  }

  if (ctx.speakingMessageId === messageId || ctx.speechLoadingMessageId === messageId) {
    stopMessageSpeech(ctx);
    return;
  }

  const segments = splitSpeechSegments(spokenReplyText(message.content));
  if (!segments.length) {
    return;
  }

  ctx.runRef.current += 1;
  const runId = ctx.runRef.current;
  clearMessageSpeechAudio(ctx, { abortRequest: true });
  ctx.setSpeechLoadingMessageId(messageId);
  ctx.setSpeakingMessageId('');

  try {
    const result = await playProviderSegments(ctx, { messageId, segments, runId });
    if (result.stopped) {
      return;
    }
  } catch (error) {
    // An aborted request or a superseded run is a deliberate stop, not a failure.
    if (error?.name === 'AbortError' || ctx.runRef.current !== runId) {
      return;
    }
    console.warn('[voice] message speech failed, using browser fallback:', describeSpeechError(error));
    await fallbackMessageSpeech(ctx, {
      messageId,
      segments,
      runId,
      fallbackStartIndex: error?.fallbackStartIndex
    });
  } finally {
    // Only the run that is still current may reset shared audio and state.
    if (ctx.runRef.current === runId) {
      clearMessageSpeechAudio(ctx);
      ctx.setSpeakingMessageId('');
      ctx.setSpeechLoadingMessageId('');
    }
  }
}

/**
 * React hook that speaks assistant messages aloud.
 *
 * Speech is stopped when `selectedSessionId` changes and when the component
 * unmounts.
 *
 * @param {*} selectedSessionId - Currently selected session.
 * @returns {{
 *   speakingMessageId: string,
 *   speechLoadingMessageId: string,
 *   speakMessage: (message: object) => void,
 *   stopSpeech: (options?: { release?: boolean, resetState?: boolean }) => void
 * }}
 */
export function useMessageSpeech(selectedSessionId) {
  const [speakingMessageId, setSpeakingMessageId] = useState('');
  const [speechLoadingMessageId, setSpeechLoadingMessageId] = useState('');

  // The context object is rebuilt on every render so it carries the latest
  // state values, while the ref-like holders are carried over from the
  // previous object so they keep their identity.
  const ctxRef = useRef({});
  ctxRef.current = {
    speakingMessageId,
    speechLoadingMessageId,
    setSpeakingMessageId,
    setSpeechLoadingMessageId,
    audioRef: ctxRef.current.audioRef || { current: null },
    audioUrlRef: ctxRef.current.audioUrlRef || { current: '' },
    abortRef: ctxRef.current.abortRef || { current: null },
    stopPlaybackRef: ctxRef.current.stopPlaybackRef || { current: null },
    runRef: ctxRef.current.runRef || { current: 0 }
  };

  const stopSpeech = useCallback((options) => {
    stopMessageSpeech(ctxRef.current, options);
  }, []);

  const speakMessage = useCallback((message) => {
    // Failures inside the playback path are handled there; this catch covers
    // anything thrown before it (for example while splitting the text) so the
    // promise never becomes an unhandled rejection.
    speakMessageFromContext(ctxRef.current, message).catch((error) => {
      console.warn('[voice] message speech failed:', describeSpeechError(error));
    });
  }, []);

  // On unmount: release the Audio element and skip state updates.
  useEffect(() => () => stopSpeech({ release: true, resetState: false }), [stopSpeech]);

  useEffect(() => {
    stopSpeech();
  }, [selectedSessionId, stopSpeech]);

  return {
    speakingMessageId,
    speechLoadingMessageId,
    speakMessage,
    stopSpeech
  };
}