// TTS engine: KittenTTS Nano (15M, ~24MB ONNX) — the smallest usable browser TTS.
// Pipeline (mirrors clowerweb/kitten-tts-web-demo): phonemize → map to token ids →
// ONNX run (input_ids/style/speed → waveform @ 24kHz). Runs on WASM for robustness
// (the WebGPU EP produces NaNs on some drivers). Experimental: depends on
// onnxruntime-web + phonemizer loading from CDN, so it degrades to an error if either
// fails. Model from HF; tokenizer + voice embeddings vendored under /web/kitten/.

const ORT_VER = '1.26.0'
const ORT_URL = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_VER}/dist/ort.wasm.mjs`
const MODEL_URL = 'https://huggingface.co/KittenML/kitten-tts-nano-0.1/resolve/main/kitten_tts_nano_v0_1.onnx'
const SAMPLE_RATE = 24000

// Voice ids double as keys into voices.json; labels are what the UI shows.
const VOICES = [
  { id: 'expr-voice-2-f', label: 'Voice 2 · ♀' },
  { id: 'expr-voice-2-m', label: 'Voice 2 · ♂' },
  { id: 'expr-voice-3-f', label: 'Voice 3 · ♀' },
  { id: 'expr-voice-3-m', label: 'Voice 3 · ♂' },
  { id: 'expr-voice-4-f', label: 'Voice 4 · ♀' },
  { id: 'expr-voice-4-m', label: 'Voice 4 · ♂' },
  { id: 'expr-voice-5-f', label: 'Voice 5 · ♀' },
  { id: 'expr-voice-5-m', label: 'Voice 5 · ♂' },
]

// Lazily-populated module state. `_p` is the in-flight load promise so that
// concurrent ensure() calls share one download; it is cleared on failure.
let _ort = null
let _session = null
let _vocab = null
let _voiceEmb = null
let _phon = null
let _p = null

/**
 * fetch() that rejects on a non-2xx status instead of handing back an error page.
 * @param {string} url
 * @returns {Promise<Response>}
 * @throws {Error} when the response status is not ok.
 */
async function fetchOk(url) {
  const resp = await fetch(url)
  if (!resp.ok) throw new Error(`Kitten TTS: failed to fetch ${url} (HTTP ${resp.status})`)
  return resp
}

/**
 * Drain a response body into one Uint8Array, reporting download progress.
 * Progress is only reported when the server sent a content-length, and is clamped
 * to 1 because the header may describe the compressed size while the stream
 * yields decoded bytes.
 * @param {Response} resp
 * @param {(fraction: number) => void} [onProgress]
 * @returns {Promise<Uint8Array>}
 */
async function readWithProgress(resp, onProgress) {
  const total = +(resp.headers.get('content-length') || 0)
  if (!resp.body) {
    // No readable stream available: fall back to a single buffered read.
    const whole = new Uint8Array(await resp.arrayBuffer())
    if (onProgress) onProgress(1)
    return whole
  }
  const reader = resp.body.getReader()
  const parts = []
  let loaded = 0
  for (;;) {
    const { done, value } = await reader.read()
    if (done) break
    parts.push(value)
    loaded += value.length
    if (onProgress && total) onProgress(Math.min(1, loaded / total))
  }
  const bytes = new Uint8Array(loaded)
  let off = 0
  for (const part of parts) {
    bytes.set(part, off)
    off += part.length
  }
  return bytes
}

/**
 * Map a phoneme string to token ids, wrapped in `$` boundary markers.
 * Characters missing from the vocabulary map to id 0.
 * @param {string} ph
 * @param {Record<string, number>} vocab
 * @returns {number[]}
 */
function toTokenIds(ph, vocab) {
  return `$${ph}$`.split('').map((ch) => vocab[ch] ?? 0)
}

/**
 * Copy samples into a fresh Float32Array, replacing NaN with 0.
 * @param {ArrayLike<number>} data
 * @returns {Float32Array}
 */
function scrubNaNs(data) {
  const audio = new Float32Array(data.length)
  for (let i = 0; i < data.length; i++) audio[i] = Number.isNaN(data[i]) ? 0 : data[i]
  return audio
}

/**
 * Load the ONNX runtime, tokenizer, voice embeddings and model (once).
 * Concurrent callers share the same in-flight promise; only the first caller's
 * `onProgress` is used. A failed load is not cached, so a later call retries.
 * @param {(fraction: number) => void} [onProgress] model download progress in [0, 1].
 * @returns {Promise<object>} the onnxruntime InferenceSession.
 * @throws {Error} if a CDN import, a fetch, or session creation fails.
 */
async function ensure(onProgress) {
  if (_session) return _session
  if (_p) return _p
  _p = (async () => {
    _ort = await import(ORT_URL)
    _ort.env.wasm.numThreads = 1 // HF Space lacks COOP/COEP → no SharedArrayBuffer
    _ort.env.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_VER}/dist/`
    const [tok, voices] = await Promise.all([
      fetchOk('/web/kitten/tokenizer.json').then((r) => r.json()),
      fetchOk('/web/kitten/voices.json').then((r) => r.json()),
    ])
    _vocab = tok.model.vocab
    _voiceEmb = voices
    const bytes = await readWithProgress(await fetchOk(MODEL_URL), onProgress)
    _session = await _ort.InferenceSession.create(bytes.buffer, {
      executionProviders: [{ name: 'wasm', simd: true }],
    })
    return _session
  })().catch((e) => {
    _p = null // allow a retry after a failed load
    throw e
  })
  return _p
}

/**
 * Phonemize English text via the CDN-loaded `phonemizer` package (imported once).
 * @param {string} text
 * @returns {Promise<string>} phonemes; array results are joined with spaces.
 */
async function phonemes(text) {
  if (!_phon) {
    const m = await import('https://esm.run/phonemizer@1.2.1')
    _phon = m.phonemize
  }
  const ph = await _phon(text, 'en-us')
  return Array.isArray(ph) ? ph.join(' ') : String(ph)
}

/**
 * Synthesize speech for `text`.
 * @param {string} text
 * @param {string} voice key into the voice embeddings; unknown ids fall back to the first voice.
 * @param {{ speed?: number }} [options]
 * @returns {Promise<{ audio: Float32Array, sampleRate: number }>}
 */
async function synth(text, voice, { speed = 1 } = {}) {
  // Guarantees _ort/_vocab/_voiceEmb/_session are populated even if the host
  // skipped ensure(); it is a no-op once the session exists.
  await ensure()
  const ph = await phonemes(text)
  const ids = toTokenIds(ph, _vocab)
  const emb = _voiceEmb[voice] || _voiceEmb[VOICES[0].id]
  const inputs = {
    input_ids: new _ort.Tensor('int64', BigInt64Array.from(ids.map((i) => BigInt(i))), [1, ids.length]),
    // The first row of the voice's embedding is fed as the style vector.
    style: new _ort.Tensor('float32', new Float32Array(emb[0]), [1, emb[0].length]),
    speed: new _ort.Tensor('float32', new Float32Array([speed]), [1]),
  }
  const out = await _session.run(inputs)
  // Scrub NaNs to silence so a bad frame can't poison the whole clip.
  const audio = scrubNaNs(out.waveform.data)
  return { audio, sampleRate: SAMPLE_RATE }
}

// Engine descriptor exposed to the host app.
export const engine = {
  id: 'kitten',
  label: 'Kitten TTS 15M · smallest (experimental)',
  mode: 'pcm',
  experimental: true,
  needsDownload: true,
  available: () => true,
  listVoices: () => VOICES,
  defaultVoice: 'expr-voice-2-f',
  ensure,
  synth,
  backendLabel: () => 'CPU (WASM)',
}