Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

tiny-army / web /tts.js

polats's picture

Add Build Small recommended preset

de78f87 3 days ago

history blame contribute delete

8.08 kB

	// TTS facade — mirrors runtime.js (the LLM facade). Picks the active TTS engine
	// (Kokoro / Qwen3-TTS / VoxCPM / Kitten / Web Speech) and voice, and exposes
	// makeNarrator(): a streaming reader that speaks sentence-by-sentence so a war diary
	// can narrate itself while the LLM is still writing. Panels + the TTS bar import only
	// from here.
	import { engine as kokoro } from '/web/ttsKokoro.js'
	import { engine as qwen3, engineLocal as qwen3local } from '/web/ttsQwen3.js'
	import { engine as voxcpm } from '/web/ttsVoxcpm.js'
	import { engine as kitten } from '/web/ttsKitten.js'
	import { engine as webspeech } from '/web/ttsWebSpeech.js'
	import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
	import { ensurePersistentStorage } from '/web/storage.js'

	const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech]
	// Active voice provider, persisted across refreshes under TTS_ENGINE_KEY. On load the
	// saved id is restored only if it still names a registered engine that reports itself
	// available; in every other case (nothing saved, storage unreadable, engine unknown or
	// unavailable) the provider falls back to 'voxcpm'.
	const TTS_ENGINE_KEY = 'tinyarmy.ttsEngine'
	let activeId = (() => {
	  let saved = ''
	  // Reading localStorage can throw; treat that the same as "nothing saved".
	  try { saved = localStorage.getItem(TTS_ENGINE_KEY) || '' } catch { /* ignore */ }
	  const e = ENGINES.find((x) => x.id === saved)
	  return e && e.available() ? saved : 'voxcpm'
	})()

	// Voice-design engines build a voice from a free-form description (the persona's
	// `voice`). Panels set it before narrating; previewVoice() plays a one-off sample.
	// The description is handed to the qwen3 engine first, then to voxcpm.
	export function setVoiceDescription(desc) {
	  for (const designer of [qwen3, voxcpm]) {
	    designer.setDesc(desc)
	  }
	}
	// Play a one-off sample: pass `desc` to the active engine when it accepts one,
	// synthesize `text` with the 'persona' voice id, and return playSamples()' result.
	export async function previewVoice(desc, text) {
	  const active = eng()
	  if (active.setDesc) {
	    active.setDesc(desc)
	  }
	  const clip = await active.synth(text, 'persona')
	  return playSamples(clip.audio, clip.sampleRate)
	}
	// Stop a running preview by delegating to stopAudio(); its result is passed through.
	export const stopPreview = () => {
	  return stopAudio()
	}

	// Build a persona's voice FILE: hand `desc` to the active engine (when it takes one),
	// then ask it to render `text` via synthWav() with the 'persona' voice id. Resolves to
	// whatever synthWav() yields (the raw WAV, per the engine contract) so it can be cached
	// and replayed verbatim. Playback is left to the caller.
	export async function createVoiceWav(desc, text) {
	  const active = eng()
	  if (active.setDesc) {
	    active.setDesc(desc)
	  }
	  return active.synthWav(text, 'persona')
	}
	// Speak `text` in the timbre of a reference voice file (same voice, new words).
	// `desc` is the voice design: it is set on the active engine (when supported) and also
	// forwarded to cloneWav(), so an engine without a clone model can re-design instead.
	// Resolves to the result of the engine's cloneWav() (a WAV, per the engine contract).
	export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
	  const active = eng()
	  if (active.setDesc) {
	    active.setDesc(desc)
	  }
	  return active.cloneWav(text, refArrayBuffer, refText, desc)
	}
	// Decode an encoded audio buffer with decodeAudio() and play the resulting samples.
	export async function playWav(arrayBuffer) {
	  const decoded = await decodeAudio(arrayBuffer)
	  return playSamples(decoded.audio, decoded.sampleRate)
	}

	// ── Fixed-voice engines (Kokoro / Kitten / Web Speech) ───────────────────────
	// Engines of this kind do not "design" a voice from text; a hero picks one of the
	// engine's named voices instead. The persona panel uses the accessors below when the
	// active engine is not a voice-design engine.

	// True when the active engine has a truthy `design` flag (designs from a description).
	export const activeEngineIsDesign = () => Boolean(eng().design)
	// True when the active engine's mode is 'native' (speaks live, no WAV).
	export const activeEngineIsNative = () => {
	  return eng().mode === 'native'
	}
	// Id of the engine currently selected.
	export const activeEngineId = () => {
	  return activeId
	}
	// Voices offered by the active engine, as reported by its listVoices().
	export const activeVoices = () => {
	  return eng().listVoices()
	}
	// The active engine's `defaultVoice`.
	export const activeDefaultVoice = () => {
	  return eng().defaultVoice
	}

	// Render `text` in a NAMED voice with the active engine and resolve to a cacheable WAV.
	// Engines flagged `needsDownload` first get persistent storage requested and ensure()
	// awaited. An engine exposing synthWav() is used directly; otherwise the PCM returned by
	// synth() is run through encodeWav().
	export async function synthVoiceWav(voiceId, text) {
	  const active = eng()
	  if (active.needsDownload) {
	    await ensurePersistentStorage()
	    await active.ensure()
	  }
	  if (!active.synthWav) {
	    const pcm = await active.synth(text, voiceId)
	    return encodeWav(pcm.audio, pcm.sampleRate)
	  }
	  return active.synthWav(text, voiceId)
	}
	// Say `text` out loud in a named voice. An engine with speak() (native engines that
	// cannot render to a file) speaks directly; any other engine synthesizes samples that
	// are then played with playSamples().
	export async function speakVoiceLive(voiceId, text) {
	  const active = eng()
	  if (!active.speak) {
	    const pcm = await active.synth(text, voiceId)
	    return playSamples(pcm.audio, pcm.sampleRate)
	  }
	  return active.speak(text, voiceId)
	}
	// Halt live speech: call the active engine's stop() when it has one, then stopAudio().
	export function stopVoiceLive() {
	  const active = eng()
	  if (active.stop) {
	    active.stop()
	  }
	  stopAudio()
	}

	// Voice chosen per engine: engineId -> chosen voice id.
	const voiceSel = {}

	// Resolve the active engine object. If activeId matches no registered engine, fall
	// back to the first entry of ENGINES.
	const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]

	// Describe every registered engine: id, label, current availability (available() is
	// called on each invocation), experimental flag, and note ('' when the engine has none).
	export const listTtsEngines = () =>
	  ENGINES.map((e) => ({
	    id: e.id,
	    label: e.label,
	    available: e.available(),
	    experimental: !!e.experimental,
	    note: e.note || '',
	  }))
	// Id of the engine currently selected.
	export const getTtsEngineId = () => activeId
	// Listeners (e.g. the persona panel, on another tab) are told when the provider
	// changes, so they can re-render voice controls without polling or relying on tab
	// visibility.
	const _engineListeners = new Set()
	// Register `fn` as a listener; the returned function removes it again.
	export function onTtsEngineChange(fn) {
	  _engineListeners.add(fn)
	  return () => _engineListeners.delete(fn)
	}
	// Switch the active TTS engine. Unknown ids and the already-active id are ignored.
	// Otherwise the choice is written to localStorage under TTS_ENGINE_KEY (best-effort:
	// storage errors are swallowed) and every registered listener is called with the new
	// id; a listener that throws does not prevent the remaining ones from running.
	export function setTtsEngine(id) {
	  if (!ENGINES.some((e) => e.id === id) || id === activeId) return
	  activeId = id
	  try { localStorage.setItem(TTS_ENGINE_KEY, id) } catch { /* ignore */ }
	  for (const fn of _engineListeners) { try { fn(id) } catch { /* ignore */ } }
	}

	// Voices offered by the active engine, as reported by its listVoices().
	export const listVoices = () => {
	  return eng().listVoices()
	}
	// Voice chosen for the active engine, or the engine's defaultVoice when nothing has
	// been chosen yet (only `undefined` counts as "not chosen").
	export const currentVoiceId = () => {
	  const chosen = voiceSel[activeId]
	  return chosen === undefined ? eng().defaultVoice : chosen
	}
	// Remember `id` as the chosen voice for the active engine.
	export function setVoice(id) {
	  voiceSel[activeId] = id
	}

	// True when the active engine has a truthy `needsDownload` flag.
	export const ttsNeedsDownload = () => Boolean(eng().needsDownload)
	// Backend label reported by the active engine's backendLabel().
	export const ttsBackendLabel = () => {
	  return eng().backendLabel()
	}
	// True when the active engine has a truthy `networked` flag.
	export const ttsNetworked = () => Boolean(eng().networked)

	// "Narrate as it writes" — global now that the picker lives in Settings (the diary
	// reads it; the settings voice bar sets it).
	// Module-wide flag, off until set; any value passed to the setter is coerced to a
	// boolean.
	let _autoNarrate = false
	export const getAutoNarrate = () => {
	  return _autoNarrate
	}
	export const setAutoNarrate = (v) => {
	  _autoNarrate = Boolean(v)
	}

	// Prepare the active engine for use. Engines flagged `needsDownload` first have
	// persistent storage requested; then ensure(onProgress) is called and its result
	// returned.
	export async function ensureTts(onProgress) {
	  const mustDownload = eng().needsDownload
	  if (mustDownload) {
	    await ensurePersistentStorage()
	  }
	  // The engine is looked up again here, i.e. after the await above has settled.
	  return eng().ensure(onProgress)
	}

	// Speak text sentence-by-sentence. push() text as it streams; end() to flush the
	// tail; stop() to abort. PCM engines pre-generate the next sentence while the current
	// one plays; native engines (mode === 'native') just speak each sentence in order.
	//
	// Options: `onState` (optional) is called with 'speaking' whenever the playback loop
	// starts, and with 'done' or 'stopped' when it exits.
	// Returns { push(text), end(), stop() }. The engine and voice are captured when the
	// narrator is created and are not re-read afterwards.
	export function makeNarrator({ onState } = {}) {
	  const engine = eng()
	  const voice = currentVoiceId()
	  let pending = ''
	  let sentences = []
	  let closed = false
	  let stopped = false
	  let running = false
	  // One sentence = the shortest run of text up to a terminator (. ! ? …), any closing
	  // quotes/brackets that follow it, then whitespace or the end of the buffered text.
	  const SENT = /[\s\S]*?[.!?…]["')\]]*(?:\s+|$)/g
	  const wait = (ms) => new Promise((r) => setTimeout(r, ms))

	  // Move every complete sentence from `pending` into the queue. With `force`, whatever
	  // text is left over (no terminator yet) is queued as a final sentence too.
	  function drain(force) {
	    SENT.lastIndex = 0 // the regex is global and reused, so rewind it for each scan
	    let m, last = 0
	    while ((m = SENT.exec(pending)) !== null) {
	      const s = m[0].trim()
	      if (s) sentences.push(s)
	      last = SENT.lastIndex
	    }
	    pending = pending.slice(last)
	    if (force && pending.trim()) { sentences.push(pending.trim()); pending = '' }
	  }

	  // Synthesize one sentence and resolve to its result, or to null on failure. The
	  // failure is handled here rather than at the await site, so a pre-generated sentence
	  // that fails while another is playing, or that is abandoned after stop(), never
	  // becomes an unhandled rejection.
	  async function synthOne(text) {
	    try { return await engine.synth(text, voice, {}) } catch { return null }
	  }

	  async function loop() {
	    running = true
	    onState && onState('speaking')
	    try {
	      if (engine.mode === 'native') {
	        while (!stopped) {
	          if (sentences.length) {
	            // A sentence that fails to speak is skipped, like a failed synth below.
	            try { await engine.speak(sentences.shift(), voice) } catch { /* ignore */ }
	          } else if (closed) break
	          else await wait(60) // nothing queued yet: poll until more text or end()
	        }
	      } else {
	        const startNext = () => (sentences.length ? synthOne(sentences.shift()) : null)
	        let synthP = null
	        while (!stopped) {
	          if (!synthP) synthP = startNext()
	          if (!synthP) { if (closed) break; await wait(60); continue }
	          const cur = await synthP
	          synthP = startNext() // pre-generate next while current plays
	          if (cur && !stopped) { try { await playSamples(cur.audio, cur.sampleRate) } catch { /* ignore */ } }
	        }
	      }
	    } finally {
	      running = false
	      onState && onState(stopped ? 'stopped' : 'done')
	    }
	  }

	  return {
	    push(text) { pending += text; drain(false); if (!running && !stopped) loop() },
	    end() { drain(true); closed = true; if (!running && !stopped) loop() },
	    stop() { stopped = true; sentences = []; pending = ''; if (engine.stop) engine.stop(); stopAudio() },
	  }
	}