tiny-army / web /tts.js
polats's picture
Add Build Small recommended preset
de78f87
// TTS facade — mirrors runtime.js (the LLM facade). Picks the active TTS engine
// (Kokoro / Qwen3-TTS / VoxCPM / Kitten / Web Speech) and voice, and exposes
// makeNarrator(): a streaming reader that speaks sentence-by-sentence so a war diary
// can narrate itself while the LLM is still writing. Panels + the TTS bar import only
// from here.
import { engine as kokoro } from '/web/ttsKokoro.js'
import { engine as qwen3, engineLocal as qwen3local } from '/web/ttsQwen3.js'
import { engine as voxcpm } from '/web/ttsVoxcpm.js'
import { engine as kitten } from '/web/ttsKitten.js'
import { engine as webspeech } from '/web/ttsWebSpeech.js'
import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
import { ensurePersistentStorage } from '/web/storage.js'
// Every TTS engine this facade knows about, in the order listTtsEngines() reports them.
const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech]

// Active engine id. The user's choice is persisted in localStorage under
// TTS_ENGINE_KEY so it survives refreshes. On load the saved id is honoured only when
// it names a known engine that reports itself available(); in every other case
// (nothing saved, storage inaccessible, unknown or unavailable engine) the id falls
// back to the literal 'voxcpm'.
// NOTE(review): the fallback is not itself checked with available() — confirm that
// 'voxcpm' is the intended default in every environment.
const TTS_ENGINE_KEY = 'tinyarmy.ttsEngine'
let activeId = (() => {
  let storedId
  try {
    storedId = localStorage.getItem(TTS_ENGINE_KEY) || ''
  } catch {
    // Storage can throw (blocked / unavailable); treat that as "nothing saved".
    storedId = ''
  }
  const match = ENGINES.find(({ id }) => id === storedId)
  if (match && match.available()) return storedId
  return 'voxcpm'
})()
// Some engines design a voice from a free-form description (the persona's `voice`).
// Panels call this before narrating; it forwards the description to the Qwen3-TTS and
// VoxCPM engines, in that order. previewVoice() plays a one-off sample.
export function setVoiceDescription(desc) {
  for (const designer of [qwen3, voxcpm]) {
    designer.setDesc(desc)
  }
}
// Play a one-off sample of `text` with the active engine. When the engine exposes
// setDesc() it is first given `desc`; synthesis always requests the 'persona' voice.
// Resolves with whatever playSamples() returns.
export async function previewVoice(desc, text) {
  const active = eng()
  if (active.setDesc) {
    active.setDesc(desc)
  }
  const clip = await active.synth(text, 'persona')
  return playSamples(clip.audio, clip.sampleRate)
}
// Halt preview playback by delegating to stopAudio() (no arguments are forwarded).
export const stopPreview = () => {
  return stopAudio()
}
// Build a persona's voice FILE: hand `desc` to the active engine (when it accepts a
// description), then ask it for `text` rendered as a WAV in the 'persona' voice. The
// engine's synthWav() result is returned as-is so the caller can cache + replay it;
// nothing is played here.
export async function createVoiceWav(desc, text) {
  const active = eng()
  if (active.setDesc) {
    active.setDesc(desc)
  }
  const wav = active.synthWav(text, 'persona')
  return wav
}
// Re-speak `text` in the timbre of a reference voice file (same voice, new words).
// `desc` is the voice design: it is set on the engine when supported and also passed
// to cloneWav() as a fallback so prod (no clone model) can re-design instead.
// Returns the engine's cloneWav() result (a WAV).
export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
  const active = eng()
  if (active.setDesc) {
    active.setDesc(desc)
  }
  const wav = active.cloneWav(text, refArrayBuffer, refText, desc)
  return wav
}
// Decode an encoded audio buffer with decodeAudio() and play the resulting samples.
// Resolves with whatever playSamples() returns.
export async function playWav(arrayBuffer) {
  const decoded = await decodeAudio(arrayBuffer)
  return playSamples(decoded.audio, decoded.sampleRate)
}
// ── Fixed-voice engines (Kokoro / Kitten / Web Speech) ───────────────────────
// Fixed-voice engines don't "design" a voice from text; a hero picks one of the
// engine's named voices instead. The persona panel reads these getters to decide which
// controls to show for the currently active engine.

// True when the active engine carries a truthy `design` flag (designs from a description).
export const activeEngineIsDesign = () => Boolean(eng().design)

// True when the active engine's mode is 'native' (Web Speech → speaks live, no WAV).
export const activeEngineIsNative = () => {
  const { mode } = eng()
  return mode === 'native'
}

// Id of the currently selected engine.
export const activeEngineId = () => {
  return activeId
}

// Named voices offered by the active engine.
export const activeVoices = () => {
  return eng().listVoices()
}

// The active engine's default voice.
export const activeDefaultVoice = () => {
  const { defaultVoice } = eng()
  return defaultVoice
}
// Render `text` in a NAMED voice with the active engine and return a cacheable WAV.
// Engines flagged needsDownload are prepared first (persistent storage, then
// ensure()). An engine that offers synthWav() is used directly; otherwise its PCM
// output (Kokoro/Kitten) is wrapped with encodeWav().
export async function synthVoiceWav(voiceId, text) {
  const active = eng()
  if (active.needsDownload) {
    await ensurePersistentStorage()
    await active.ensure()
  }
  if (!active.synthWav) {
    const pcm = await active.synth(text, voiceId)
    return encodeWav(pcm.audio, pcm.sampleRate)
  }
  return active.synthWav(text, voiceId)
}
// Speak `text` right now in a named voice. Engines with a speak() method (native ones
// that can't render to a file) talk directly; any other engine is asked for PCM via
// synth(), which is then played with playSamples().
export async function speakVoiceLive(voiceId, text) {
  const active = eng()
  if (!active.speak) {
    const pcm = await active.synth(text, voiceId)
    return playSamples(pcm.audio, pcm.sampleRate)
  }
  return active.speak(text, voiceId)
}
// Silence live speech: let the active engine stop itself when it has a stop() method,
// then stop any sample playback via stopAudio().
export function stopVoiceLive() {
  const active = eng()
  if (active.stop) {
    active.stop()
  }
  stopAudio()
}
// Per-engine voice choice: engineId -> chosen voice id.
const voiceSel = {}

// Resolve the engine object for the current activeId; if no engine carries that id,
// fall back to the first entry of ENGINES.
const eng = () => {
  const match = ENGINES.find((candidate) => candidate.id === activeId)
  return match || ENGINES[0]
}
// Describe every registered engine for pickers: id, label, whether it is available
// right now, whether it is flagged experimental, and its note ('' when it has none).
export const listTtsEngines = () =>
  ENGINES.map((e) => {
    return {
      id: e.id,
      label: e.label,
      available: e.available(),
      experimental: Boolean(e.experimental),
      note: e.note || '',
    }
  })

// Id of the currently selected engine.
export const getTtsEngineId = () => {
  return activeId
}
// Change notifications: listeners (e.g. the persona panel, on another tab) are told
// when the provider changes, so they can re-render voice controls without polling or
// relying on tab visibility.
const _engineListeners = new Set()

// Register `fn` as a listener. Returns an unsubscribe function that removes it again
// (yielding Set.delete's boolean result).
export function onTtsEngineChange(fn) {
  _engineListeners.add(fn)
  return () => {
    return _engineListeners.delete(fn)
  }
}
// Switch the active engine to `id`. Unknown ids and re-selecting the current engine
// are no-ops. On a real change the id is persisted (best effort — storage errors are
// ignored) and every registered listener is called with the new id; a listener that
// throws does not prevent the remaining listeners from running.
export function setTtsEngine(id) {
  const known = ENGINES.some((e) => e.id === id)
  if (!known || id === activeId) {
    return
  }
  activeId = id
  try {
    localStorage.setItem(TTS_ENGINE_KEY, id)
  } catch {
    /* ignore */
  }
  _engineListeners.forEach((listener) => {
    try {
      listener(id)
    } catch {
      /* ignore */
    }
  })
}
// Named voices offered by the active engine.
export const listVoices = () => {
  return eng().listVoices()
}

// Voice chosen for the active engine, or that engine's default when none was chosen
// (i.e. nothing other than `undefined` has been stored for it).
export const currentVoiceId = () => {
  const chosen = voiceSel[activeId]
  return chosen === undefined ? eng().defaultVoice : chosen
}

// Remember `id` as the chosen voice for the active engine.
export function setVoice(id) {
  voiceSel[activeId] = id
}

// Flags and labels forwarded from the active engine.
export const ttsNeedsDownload = () => Boolean(eng().needsDownload)
export const ttsBackendLabel = () => {
  return eng().backendLabel()
}
export const ttsNetworked = () => Boolean(eng().networked)
// "Narrate as it writes" toggle. It is module-global now that the picker lives in
// Settings: the diary reads it, the settings voice bar sets it. Starts off.
let _autoNarrate = false

// Current value of the toggle.
export const getAutoNarrate = () => {
  return _autoNarrate
}

// Store the toggle; any value is coerced to a boolean.
export const setAutoNarrate = (v) => {
  _autoNarrate = Boolean(v)
}
// Get the active engine ready to speak. Engines flagged needsDownload first request
// persistent storage; then the engine's ensure() is called with `onProgress` and its
// result returned.
export async function ensureTts(onProgress) {
  const mustDownload = eng().needsDownload
  if (mustDownload) {
    await ensurePersistentStorage()
  }
  // The engine is looked up again here, after the await above.
  return eng().ensure(onProgress)
}
// Speak text sentence-by-sentence. push() text as it streams; end() to flush the
// tail; stop() to abort. PCM engines pre-generate the next sentence while the current
// one plays; native engines (Web Speech) just speak each sentence in order.
//
// The engine and voice are captured once, when the narrator is created; switching
// engine or voice afterwards does not affect an existing narrator.
//
// Narration is best effort in BOTH modes: a sentence whose speak()/synth()/playback
// fails is skipped and narration carries on with the next one.
//
// @param {{ onState?: (state: 'speaking' | 'done' | 'stopped') => void }} [options]
//   onState is called with 'speaking' each time the speak loop starts and with
//   'done' (queue drained after end()) or 'stopped' (after stop()) when it exits.
// @returns {{ push(text: string): void, end(): void, stop(): void }}
export function makeNarrator({ onState } = {}) {
  const engine = eng()
  const voice = currentVoiceId()
  let pending = '', sentences = [], closed = false, stopped = false, running = false
  // One sentence: shortest run up to a terminator (. ! ? …), any closing quotes or
  // brackets, then whitespace or the end of the buffered text.
  // NOTE(review): because `$` matches the end of the *current* buffer, a chunk that
  // ends exactly on a terminator is emitted immediately, even if the next chunk
  // continues it (e.g. "3." followed by "14"). Kept as-is for latency.
  const SENT = /[\s\S]*?[.!?…]["')\]]*(?:\s+|$)/g
  const wait = (ms) => new Promise((r) => setTimeout(r, ms))

  // Move every complete sentence from `pending` to the queue; with `force`, the
  // unterminated remainder is queued as a final sentence too.
  function drain(force) {
    SENT.lastIndex = 0 // the /g regex is stateful — always scan from the start
    let m, last = 0
    while ((m = SENT.exec(pending)) !== null) {
      const s = m[0].trim()
      if (s) sentences.push(s)
      last = SENT.lastIndex
    }
    pending = pending.slice(last)
    if (force && pending.trim()) { sentences.push(pending.trim()); pending = '' }
  }

  // Start synthesizing the next queued sentence (or return null when the queue is
  // empty). The promise gets a no-op rejection handler because it may sit un-awaited
  // while the current sentence plays — or forever if stop() lands first — and must
  // not surface as an unhandled rejection. The loop still sees the failure when it
  // awaits the promise. The async wrapper also turns a synchronous throw from
  // synth() into a rejection, so it is skipped like any other failed sentence.
  function startNext() {
    if (!sentences.length) return null
    const p = (async () => engine.synth(sentences.shift(), voice, {}))()
    p.catch(() => { /* handled where the loop awaits it */ })
    return p
  }

  async function loop() {
    running = true
    onState && onState('speaking')
    try {
      if (engine.mode === 'native') {
        while (!stopped) {
          if (sentences.length) {
            // Skip a sentence the engine fails to speak instead of aborting the whole
            // narration (same policy as the PCM branch below).
            try { await engine.speak(sentences.shift(), voice) } catch { /* skip sentence */ }
          } else if (closed) break
          else await wait(60) // nothing queued yet — poll for more streamed text
        }
      } else {
        let synthP = null
        while (!stopped) {
          if (!synthP) synthP = startNext()
          if (!synthP) { if (closed) break; await wait(60); continue }
          let cur = null
          try { cur = await synthP } catch { cur = null }
          synthP = startNext() // pre-generate next while current plays
          if (cur && !stopped) { try { await playSamples(cur.audio, cur.sampleRate) } catch { /* ignore */ } }
        }
      }
    } finally {
      running = false
      onState && onState(stopped ? 'stopped' : 'done')
    }
  }

  return {
    // Append streamed text and start speaking if idle. Ignored after stop().
    push(text) {
      if (stopped) return // don't buffer text that can never be spoken
      pending += text
      drain(false)
      if (!running) loop()
    },
    // Flush the unterminated tail and let the loop finish once the queue is empty.
    end() {
      if (stopped) return
      drain(true)
      closed = true
      if (!running) loop()
    },
    // Abort: drop everything queued and silence the engine and sample playback.
    stop() { stopped = true; sentences = []; pending = ''; if (engine.stop) engine.stop(); stopAudio() },
  }
}