// TTS facade — mirrors runtime.js (the LLM facade). Picks the active TTS engine
// (Kokoro / Kitten / Web Speech) and voice, and exposes makeNarrator(): a streaming
// reader that speaks sentence-by-sentence so a war diary can narrate itself while the
// LLM is still writing. Panels + the TTS bar import only from here.
import { engine as kokoro } from '/web/ttsKokoro.js';
import { engine as qwen3, engineLocal as qwen3local } from '/web/ttsQwen3.js';
import { engine as voxcpm } from '/web/ttsVoxcpm.js';
import { engine as kitten } from '/web/ttsKitten.js';
import { engine as webspeech } from '/web/ttsWebSpeech.js';
import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js';
import { ensurePersistentStorage } from '/web/storage.js';

// Every engine this facade can select, looked up by its `id`.
const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech];

// Active voice provider. The id saved under TTS_ENGINE_KEY is restored when it names a
// known engine whose available() is truthy; anything else falls back to 'voxcpm'.
// Cloud Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
// NOTE(review): this comment used to describe the default as local-GPU Qwen3-TTS on
// localhost and in-browser Kokoro in prod, but the code hard-codes 'voxcpm' and does not
// check that engine's availability — confirm which default is intended.
const TTS_ENGINE_KEY = 'tinyarmy.ttsEngine';
let activeId = (() => {
  let saved = '';
  try {
    saved = localStorage.getItem(TTS_ENGINE_KEY) || '';
  } catch {
    /* ignore — storage access failed, use the fallback below */
  }
  const e = ENGINES.find((x) => x.id === saved);
  return e && e.available() ? saved : 'voxcpm';
})();

// Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
// Panels set it before narrating; previewVoice() plays a one-off sample.
// The description is forwarded to the qwen3 and voxcpm engines regardless of which
// engine is active.
// NOTE(review): qwen3local is not given the description here — presumably it shares
// state with qwen3 (same module); verify against ttsQwen3.js.
export function setVoiceDescription(desc) {
  qwen3.setDesc(desc);
  voxcpm.setDesc(desc);
}

// Play a one-off sample: hands `desc` to the active engine's setDesc() when it has one,
// synthesizes `text` with the voice id 'persona', and returns playSamples()' result.
export async function previewVoice(desc, text) {
  const e = eng();
  if (e.setDesc) e.setDesc(desc);
  const { audio, sampleRate } = await e.synth(text, 'persona');
  return playSamples(audio, sampleRate);
}

// Stop a preview started by previewVoice() (delegates to stopAudio()).
export const stopPreview = () => stopAudio();

// Create a persona's voice FILE: synth the line in the designed voice and return the
// raw WAV (ArrayBuffer) so it can be cached + replayed verbatim.
// Caller plays it.
// `desc` is handed to the active engine's setDesc() when it has one; the result is
// whatever that engine's synthWav(text, 'persona') resolves to.
export async function createVoiceWav(desc, text) {
  const e = eng();
  if (e.setDesc) e.setDesc(desc);
  return e.synthWav(text, 'persona');
}

// Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
// voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
  const e = eng();
  if (e.setDesc) e.setDesc(desc);
  return e.cloneWav(text, refArrayBuffer, refText, desc);
}

// Decode an encoded audio buffer with decodeAudio() and play the resulting samples;
// returns playSamples()' result.
export async function playWav(arrayBuffer) {
  const { audio, sampleRate } = await decodeAudio(arrayBuffer);
  return playSamples(audio, sampleRate);
}

// ── Fixed-voice engines (Kokoro / Kitten / Web Speech) ───────────────────────
// These don't "design" a voice from text; a hero picks one of the engine's named
// voices. The persona panel uses these when the active engine is NOT Qwen3.
export const activeEngineIsDesign = () => !!eng().design; // Qwen3 → designs from a description
export const activeEngineIsNative = () => eng().mode === 'native'; // Web Speech → speaks live, no WAV
export const activeEngineId = () => activeId;
export const activeVoices = () => eng().listVoices();
export const activeDefaultVoice = () => eng().defaultVoice;

// Synthesize `text` in a NAMED voice with the active PCM engine → a cacheable WAV
// (encode Kokoro/Kitten PCM, or pass through an engine that already returns WAV).
// Engines flagged `needsDownload` get persistent storage requested and ensure() awaited
// before any synthesis call.
export async function synthVoiceWav(voiceId, text) {
  const e = eng();
  if (e.needsDownload) {
    await ensurePersistentStorage();
    await e.ensure();
  }
  if (e.synthWav) return e.synthWav(text, voiceId);
  const { audio, sampleRate } = await e.synth(text, voiceId);
  return encodeWav(audio, sampleRate);
}

// Speak `text` live in a named voice (native engines that can't render to a file).
export async function speakVoiceLive(voiceId, text) {
  const e = eng();
  // Engines exposing speak() talk directly; the rest synthesize samples that are played here.
  if (e.speak) return e.speak(text, voiceId);
  const { audio, sampleRate } = await e.synth(text, voiceId);
  return playSamples(audio, sampleRate);
}

// Stop live speech: the active engine's own stop() when it has one, then stopAudio().
export function stopVoiceLive() {
  const e = eng();
  if (e.stop) e.stop();
  stopAudio();
}

const voiceSel = {}; // engineId -> chosen voice id

// The active engine object; falls back to the first registered engine when the active
// id matches none.
const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0];

// Plain descriptors for every engine (id, label, availability, experimental flag, note).
export const listTtsEngines = () =>
  ENGINES.map((e) => ({
    id: e.id,
    label: e.label,
    available: e.available(),
    experimental: !!e.experimental,
    note: e.note || '',
  }));

export const getTtsEngineId = () => activeId;

// Notify listeners (e.g. the persona panel, on another tab) when the provider changes,
// so they can re-render voice controls without polling or relying on tab visibility.
const _engineListeners = new Set();

// Register `fn` to be called with the new engine id; returns a function that
// unregisters it again.
export function onTtsEngineChange(fn) {
  _engineListeners.add(fn);
  return () => _engineListeners.delete(fn);
}

// Switch the active engine. Unknown ids and the already-active id are ignored. The
// choice is written to localStorage (failures ignored) and every listener is called
// with the new id; a throwing listener does not stop the remaining ones.
export function setTtsEngine(id) {
  if (!ENGINES.some((e) => e.id === id) || id === activeId) return;
  activeId = id;
  try {
    localStorage.setItem(TTS_ENGINE_KEY, id);
  } catch {
    /* ignore */
  }
  for (const fn of _engineListeners) {
    try {
      fn(id);
    } catch {
      /* ignore */
    }
  }
}

export const listVoices = () => eng().listVoices();

// Voice chosen for the active engine, or that engine's default when none was chosen.
export const currentVoiceId = () =>
  (voiceSel[activeId] !== undefined ? voiceSel[activeId] : eng().defaultVoice);

// Remember `id` as the chosen voice for the currently active engine only.
export function setVoice(id) {
  voiceSel[activeId] = id;
}

export const ttsNeedsDownload = () => !!eng().needsDownload;
export const ttsBackendLabel = () => eng().backendLabel();
export const ttsNetworked = () => !!eng().networked;

// "Narrate as it writes" — global now that the picker lives in Settings (the diary
// reads it; the settings voice bar sets it).
let _autoNarrate = false;
export const getAutoNarrate = () => _autoNarrate;
export const setAutoNarrate = (v) => {
  _autoNarrate = !!v;
};

// Prepare the active engine: request persistent storage first when the engine is
// flagged `needsDownload`, then return the engine's ensure(onProgress).
export async function ensureTts(onProgress) {
  // Resolve the engine once so the readiness check and ensure() hit the same engine
  // even if the selection changes while the storage request is pending.
  const engine = eng();
  if (engine.needsDownload) await ensurePersistentStorage();
  return engine.ensure(onProgress);
}

// Speak text sentence-by-sentence. push() text as it streams; end() to flush the
// tail; stop() to abort. PCM engines pre-generate the next sentence while the current
// one plays; native engines (Web Speech) just speak each sentence in order.
//
// The engine and voice are captured when the narrator is created. `onState`, when
// given, receives 'speaking' each time the speaking loop starts and 'stopped' or 'done'
// when it exits. A sentence whose synthesis, playback or speech fails is skipped;
// narration carries on with the next one.
export function makeNarrator({ onState } = {}) {
  const engine = eng();
  const voice = currentVoiceId();
  let pending = '';
  let sentences = [];
  let closed = false;
  let stopped = false;
  let running = false;
  const SENT = /[\s\S]*?[.!?…]["')\]]*(?:\s+|$)/g;
  const wait = (ms) => new Promise((r) => setTimeout(r, ms));

  // Move every complete sentence from `pending` into the queue. With `force`, whatever
  // is left over is queued too, terminated or not.
  function drain(force) {
    SENT.lastIndex = 0; // the /g regex is stateful; always rescan from the start
    let m;
    let last = 0;
    while ((m = SENT.exec(pending)) !== null) {
      // Mid-stream, a terminator sitting at the very end of the buffer is not proof of
      // a sentence end: the next chunk may continue it ("3." + "14", "…stop." + '"').
      // Hold it until whitespace follows or end() forces a flush.
      if (!force && SENT.lastIndex === pending.length && !/\s$/.test(m[0])) break;
      const s = m[0].trim();
      if (s) sentences.push(s);
      last = SENT.lastIndex;
    }
    pending = pending.slice(last);
    if (force && pending.trim()) {
      sentences.push(pending.trim());
      pending = '';
    }
  }

  // Start synthesizing the next queued sentence. Returns null when the queue is empty,
  // otherwise a promise that never rejects (it resolves to null on failure), so a
  // pre-generated sentence that fails while another is playing — or after stop() —
  // cannot surface as an unhandled rejection.
  function startNext() {
    if (!sentences.length) return null;
    const sentence = sentences.shift();
    try {
      return Promise.resolve(engine.synth(sentence, voice, {})).catch(() => null);
    } catch {
      return Promise.resolve(null);
    }
  }

  async function loop() {
    running = true;
    if (onState) onState('speaking');
    try {
      if (engine.mode === 'native') {
        while (!stopped) {
          if (sentences.length) {
            try {
              await engine.speak(sentences.shift(), voice);
            } catch {
              /* ignore — skip this sentence, same as a failed synth on the PCM path */
            }
          } else if (closed) {
            break;
          } else {
            await wait(60); // poll for more streamed text
          }
        }
      } else {
        let synthP = null;
        while (!stopped) {
          if (!synthP) synthP = startNext();
          if (!synthP) {
            if (closed) break;
            await wait(60); // poll for more streamed text
            continue;
          }
          const cur = await synthP;
          synthP = startNext(); // pre-generate next while current plays
          if (cur && !stopped) {
            try {
              await playSamples(cur.audio, cur.sampleRate);
            } catch {
              /* ignore */
            }
          }
        }
      }
    } finally {
      running = false;
      if (onState) onState(stopped ? 'stopped' : 'done');
    }
  }

  // Start the loop unless it is already running or the narrator was stopped. Engine
  // failures are handled inside loop(), so a rejection here can only come from
  // `onState` throwing; report it instead of leaving a floating promise.
  function kick() {
    if (running || stopped) return;
    loop().catch((err) => console.error('[tts] narrator loop failed', err));
  }

  return {
    push(text) {
      if (stopped) return; // nothing will ever be spoken again; don't buffer
      pending += text;
      drain(false);
      kick();
    },
    end() {
      closed = true;
      if (stopped) return;
      drain(true);
      kick();
    },
    stop() {
      stopped = true;
      sentences = [];
      pending = '';
      if (engine.stop) engine.stop();
      stopAudio();
    },
  };
}