Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

tiny-army / web /ttsKitten.js

polats's picture

Add in-browser TTS: read war diaries aloud (Kokoro / Kitten / Web Speech)

9c371b5 5 days ago

history blame contribute delete

3.93 kB

	// TTS engine: KittenTTS Nano (15M, ~24MB ONNX) — the smallest usable browser TTS.
	// Pipeline (mirrors clowerweb/kitten-tts-web-demo): phonemize → map to token ids →
	// ONNX run (input_ids/style/speed → waveform @ 24kHz). Runs on WASM for robustness
	// (the WebGPU EP produces NaNs on some drivers). Experimental: depends on
	// onnxruntime-web + phonemizer loading from CDN, so it degrades to an error if either
	// fails. Model from HF; tokenizer + voice embeddings vendored under /web/kitten/.
	// Pinned onnxruntime-web release; also used when pointing the WASM loader at the CDN.
	const ORT_VER = '1.26.0'
	// ES-module entry point of the WASM-only onnxruntime-web build on jsDelivr.
	const ORT_URL = ['https://cdn.jsdelivr.net/npm', `onnxruntime-web@${ORT_VER}`, 'dist/ort.wasm.mjs'].join('/')
	// KittenTTS Nano ONNX weights, fetched straight from the Hugging Face model repo.
	const MODEL_URL = [
		'https://huggingface.co/KittenML/kitten-tts-nano-0.1', // model repository
		'resolve/main', // branch
		'kitten_tts_nano_v0_1.onnx', // weights file
	].join('/')
	// Sample rate (Hz) reported alongside the synthesized waveform.
	const SAMPLE_RATE = 24 * 1000

	// Selectable voices: a female and a male variant for each of voice numbers 2–5,
	// listed in ascending number order with the female variant first.
	const VOICES = [2, 3, 4, 5].flatMap((n) => [
		{ id: `expr-voice-${n}-f`, label: `Voice ${n} · ♀` },
		{ id: `expr-voice-${n}-m`, label: `Voice ${n} · ♂` },
	])

	// Lazily populated module state; everything stays null until first use.
	let _ort = null // onnxruntime-web module namespace (set by ensure)
	let _session = null // ONNX inference session for the KittenTTS model (set by ensure)
	let _vocab = null // character → token id map read from tokenizer.json (set by ensure)
	let _voiceEmb = null // voice id → embedding data read from voices.json (set by ensure)
	let _phon = null // phonemize function from the phonemizer package (set by phonemes)
	let _p = null // in-flight load promise, so concurrent ensure() calls share one load

	/**
	 * Lazily loads onnxruntime-web, the vendored tokenizer vocabulary and voice
	 * embeddings, and the KittenTTS ONNX model, then creates the inference session.
	 *
	 * The result is memoized: once a session exists it is returned directly, and
	 * callers that arrive while a load is in flight share the same promise (only
	 * the first caller's `onProgress` is used). A failed load clears the in-flight
	 * promise so a later call can retry.
	 *
	 * @param {(fraction: number) => void} [onProgress] Called with the downloaded
	 *   fraction (0–1) of the model file as chunks arrive. When the server sends no
	 *   usable content-length it is called once with 1 after the download finishes.
	 * @returns {Promise<object>} The session returned by `InferenceSession.create`.
	 * @throws {Error} If a fetch answers with a non-OK HTTP status, or if an import,
	 *   download or session-creation step fails.
	 */
	async function ensure(onProgress) {
		if (_session) return _session
		if (_p) return _p
		_p = (async () => {
			_ort = await import(ORT_URL)
			_ort.env.wasm.numThreads = 1 // HF Space lacks COOP/COEP → no SharedArrayBuffer
			_ort.env.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_VER}/dist/`

			// Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
			const fetchJson = async (url) => {
				const r = await fetch(url)
				if (!r.ok) throw new Error(`Kitten TTS: failed to fetch ${url} (HTTP ${r.status})`)
				return r.json()
			}
			const [tok, voices] = await Promise.all([
				fetchJson('/web/kitten/tokenizer.json'),
				fetchJson('/web/kitten/voices.json'),
			])
			_vocab = tok.model.vocab
			_voiceEmb = voices

			const resp = await fetch(MODEL_URL)
			// Without this check an error body would be handed to ONNX as if it were a model.
			if (!resp.ok) throw new Error(`Kitten TTS: model download failed (HTTP ${resp.status})`)
			const total = Number(resp.headers.get('content-length')) || 0
			let loaded = 0
			const reader = resp.body.getReader()
			const parts = []
			// Stream the body chunk by chunk so progress can be reported while downloading.
			for (;;) {
				const { done, value } = await reader.read()
				if (done) break
				parts.push(value)
				loaded += value.length
				// Clamped: the header may not match the decoded byte count (e.g. compressed transfer).
				if (onProgress && total) onProgress(Math.min(1, loaded / total))
			}
			// No usable content-length: still tell the caller that the download has finished.
			if (onProgress && !total) onProgress(1)

			// Stitch the chunks into one contiguous buffer for the runtime.
			const bytes = new Uint8Array(loaded)
			let off = 0
			for (const part of parts) {
				bytes.set(part, off)
				off += part.length
			}

			_session = await _ort.InferenceSession.create(bytes.buffer, {
				executionProviders: [{ name: 'wasm', simd: true }],
			})
			return _session
		})().catch((e) => {
			// Drop the cached promise so the next call starts a fresh attempt.
			_p = null
			throw e
		})
		return _p
	}

	/**
	 * Converts text to a phoneme string using the `phonemizer` package, which is
	 * imported from the CDN on first use and cached in `_phon`.
	 *
	 * @param {string} text Text to phonemize (phonemized as 'en-us').
	 * @returns {Promise<string>} The phonemes; an array result is joined with spaces,
	 *   anything else is converted with `String()`.
	 */
	async function phonemes(text) {
		if (!_phon) {
			const { phonemize } = await import('https://esm.run/phonemizer@1.2.1')
			_phon = phonemize
		}
		const result = await _phon(text, 'en-us')
		if (Array.isArray(result)) return result.join(' ')
		return String(result)
	}

	/**
	 * Synthesizes speech for `text` with the KittenTTS session.
	 *
	 * Steps: phonemize the text, wrap the phonemes in `$` markers, map each
	 * character to its vocabulary id (unknown characters map to 0), run the model
	 * with the chosen voice embedding and speed, then scrub the waveform.
	 *
	 * @param {string} text Text to speak.
	 * @param {string} voice Voice id; falls back to the first entry of `VOICES`
	 *   when no embedding exists for it.
	 * @param {{ speed?: number }} [options] `speed` is passed to the model's
	 *   `speed` input (default 1).
	 * @returns {Promise<{ audio: Float32Array, sampleRate: number }>} Samples and
	 *   the sample rate they should be played at.
	 * @throws {Error} If neither the requested voice nor the fallback voice has an
	 *   embedding, or if loading, phonemization or inference fails.
	 */
	async function synth(text, voice, { speed = 1 } = {}) {
		// Load on demand so calling synth() before ensure() does not fail on null state.
		if (!_session) await ensure()

		const ph = await phonemes(text)
		const ids = `$${ph}$`.split('').map((ch) => _vocab[ch] ?? 0)

		const emb = _voiceEmb[voice] || _voiceEmb[VOICES[0].id]
		if (!emb) throw new Error(`Kitten TTS: no voice embedding found for "${voice}"`)

		const inputs = {
			input_ids: new _ort.Tensor('int64', BigInt64Array.from(ids.map((i) => BigInt(i))), [1, ids.length]),
			style: new _ort.Tensor('float32', new Float32Array(emb[0]), [1, emb[0].length]),
			speed: new _ort.Tensor('float32', new Float32Array([speed]), [1]),
		}
		const out = await _session.run(inputs)

		// Scrub NaN and ±Infinity to silence so a bad frame can't poison the whole clip.
		const audio = Float32Array.from(out.waveform.data, (v) => (Number.isFinite(v) ? v : 0))
		return { audio, sampleRate: SAMPLE_RATE }
	}

	// Engine descriptor exported to the host app: identity and capability flags,
	// the voice list, and the ensure/synth entry points defined in this module.
	export const engine = {
		id: 'kitten',
		label: 'Kitten TTS 15M · smallest (experimental)',
		// NOTE(review): presumably tells the host that synth() returns raw samples
		// ({ audio, sampleRate }) rather than playing audio itself — confirm against the caller.
		mode: 'pcm',
		experimental: true,
		needsDownload: true,
		// Always reports available; load failures surface later from ensure().
		available() {
			return true
		},
		listVoices() {
			return VOICES
		},
		defaultVoice: 'expr-voice-2-f',
		ensure: ensure,
		synth: synth,
		// Inference runs on the WASM execution provider.
		backendLabel() {
			return 'CPU (WASM)'
		},
	}