// Svara TTS WebGPU worker.
//
// Architecture:
// 1) Llama-3.2-3B causal LM (loaded via @huggingface/transformers v4) emits
//    audio token IDs in the range [128266, 156938).
// 2) We group every 7-token bundle into a SNAC frame.
// 3) Offline decode mirrors Kenpath's streaming path: decode a sliding
//    4-frame SNAC window and keep samples [2048:4096] from each window.
//    That matches the codec's stable synthesis region and avoids the
//    "behind a fan" smear seen when decoding the whole sequence in one shot.

import {
  AutoTokenizer,
  AutoModelForCausalLM,
  LogitsProcessor,
  LogitsProcessorList,
  Tensor,
} from "@huggingface/transformers";
import * as ort from "onnxruntime-web/webgpu";

// ORT-Web's .wasm/.mjs files aren't served by Vite by default; vite.config.js
// copies them from node_modules to /ort-wasm/ via vite-plugin-static-copy.
ort.env.wasm.wasmPaths = "/ort-wasm/";
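// A minimal vite.config.js sketch of that copy step (illustrative; the exact
// glob depends on the installed onnxruntime-web layout):
//
//   import { viteStaticCopy } from "vite-plugin-static-copy";
//   export default {
//     plugins: [
//       viteStaticCopy({
//         targets: [{ src: "node_modules/onnxruntime-web/dist/*.{wasm,mjs}", dest: "ort-wasm" }],
//       }),
//     ],
//   };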

// --- WebGPU feature detection -----------------------------------------------
let fp16_supported = false;
try {
  const adapter = await navigator.gpu.requestAdapter();
  if (!adapter) throw new Error("WebGPU is not supported (no adapter found)");
  fp16_supported = adapter.features.has("shader-f16");
  self.postMessage({ status: "feature-success", fp16: fp16_supported });
} catch (e) {
  self.postMessage({ status: "feature-error", data: e.toString() });
  throw e;
}

// --- Constants matching upstream Svara inference -----------------------------
const EOT = 128009;
const SOS = 128257, EOS = 128258;
const SOH = 128259, EOH = 128260;
const SOAI = 128261;
const AUDIO_OFFSET = 128266;
const AUDIO_END = AUDIO_OFFSET + 7 * 4096;
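// 7 codebook bands × 4096 codes each = 28672 ids, so AUDIO_END = 128266 + 28672
// = 156938, matching the [128266, 156938) range noted in the header comment.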
const WINDOW_FRAMES = 4;
const WINDOW_AUDIO_START = 2048;
const WINDOW_AUDIO_END = 4096;
const SAMPLE_RATE = 24000;
const SVARA_REPO = "shreyask/svara-tts-v1-ONNX";
const SNAC_REPO = "onnx-community/snac_24khz-ONNX";
const SUPPORTED_DTYPES = new Set(["q4f16", "q8"]);

// Lazy load the tokenizer once -- it's the same across dtypes.
let tokenizerPromise = null;
function getTokenizer() {
  return (tokenizerPromise ??= AutoTokenizer.from_pretrained(SVARA_REPO));
}

// SNAC decoder is small (~26 MB at fp16); load once, share across LM dtypes.
let snacPromise = null;
function getSnac() {
  return (snacPromise ??= (async () => {
    const url = `https://huggingface.co/${SNAC_REPO}/resolve/main/onnx/decoder_model${fp16_supported ? "_fp16" : ""}.onnx`;
    return ort.InferenceSession.create(url, { executionProviders: ["webgpu"] });
  })());
}

// LM is per-dtype. Cache by dtype string so switching back is instant.
const lmCache = new Map();
function getLM(dtype) {
  if (!lmCache.has(dtype)) {
    lmCache.set(
      dtype,
      AutoModelForCausalLM.from_pretrained(SVARA_REPO, {
        dtype,
        device: "webgpu",
        // Number of external data chunks to fetch alongside the .onnx graph.
        // q4f16 is one .onnx_data file; q8 is sharded into 3 chunks
        // (.onnx_data, _data_1, _data_2) to stay under the ~2 GB browser
        // ArrayBuffer ceiling. transformers.js v4 accepts a number here per
        // its types: `false` | `true` (=1) | <number of chunks>.
        use_external_data_format: dtype === "q8" ? 3 : true,
      }),
    );
  }
  return lmCache.get(dtype);
}

// --- Generation guards ------------------------------------------------------
// Svara should only emit 7-band audio tokens followed by END_OF_SPEECH. If we
// let the sampler wander into the text/control vocab, the rest of the clip
// turns phasey/robotic because frame alignment is lost.
class SvaraLogitsProcessor extends LogitsProcessor {
  constructor(promptLength) {
    super();
    this.promptLength = promptLength;
  }
  _call(inputIds, logits) {
    for (let i = 0; i < inputIds.length; i++) {
      const data = logits[i].data;
      const step = inputIds[i].length - this.promptLength;
      if (step === 0) {
        data.fill(-Infinity);
        data[SOAI] = 0;
        continue;
      }
      if (step === 1) {
        data.fill(-Infinity);
        data[SOS] = 0;
        continue;
      }
      const eosLogit = data[EOS];
      data.subarray(0, AUDIO_OFFSET).fill(-Infinity);
      data.subarray(AUDIO_END).fill(-Infinity);
      data[EOS] = eosLogit;
    }
    return logits;
  }
}

function buildLogitsProcessor(promptLength) {
  const list = new LogitsProcessorList();
  list.push(new SvaraLogitsProcessor(promptLength));
  return list;
}

function estimateAudioTokenBudget(text) {
  const spokenText = stripTrailingEmotionTag(text);
  const graphemeCount = Array.from(
    new Intl.Segmenter(undefined, { granularity: "grapheme" }).segment(spokenText),
    ({ segment }) => segment,
  ).filter((segment) => /\S/u.test(segment)).length;
  const punctuationGroups = Array.from(
    spokenText.matchAll(/[.,!?;:।॥…\-—]+/gu),
  ).length;
  const wordCount = spokenText.split(/\s+/u).filter(Boolean).length;
  const roughBudget = graphemeCount * 12 + wordCount * 20 + punctuationGroups * 28 + 84;
  const clampedBudget = Math.max(224, Math.min(1120, roughBudget));
  return Math.ceil(clampedBudget / 7) * 7;
}
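// estimateAudioTokenBudget example: "Hello world." has 11 non-space graphemes,
// 2 words and 1 punctuation group, so 11*12 + 2*20 + 1*28 + 84 = 284, which
// stays inside [224, 1120] and rounds up to 287 tokens (41 SNAC frames).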

function getTrailingEmotionTag(text) {
  return text.match(/\s*(<[^>]+>)\s*$/u)?.[1] ?? "";
}

function stripTrailingEmotionTag(text) {
  return text.replace(/\s*<[^>]+>\s*$/u, "").trim();
}

function normalizeTextForSvara(text) {
  return text
    .replace(/\.{2,}/gu, ",")
    .replace(/…+/gu, ",")
    .replace(/[—–]+/gu, ",")
    .replace(/\s+/gu, " ")
    .replace(/\s*([,.;!?।॥])\s*/gu, "$1 ")
    .trim();
}
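// normalizeTextForSvara example: "Hello …  world" -> "Hello, world"
// (ellipsis collapsed to a comma, whitespace squeezed, punctuation re-spaced,
// ends trimmed).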

function countChunkGraphemes(chunk) {
  return Array.from(
    new Intl.Segmenter(undefined, { granularity: "grapheme" }).segment(chunk),
    ({ segment }) => segment,
  ).filter((segment) => /\S/u.test(segment)).length;
}

function countChunkWords(chunk) {
  return chunk.split(/\s+/u).filter(Boolean).length;
}

function splitLongChunk(chunk) {
  const graphemeCount = countChunkGraphemes(chunk);
  const wordCount = countChunkWords(chunk);
  if (graphemeCount <= 28 || wordCount <= 5) return [chunk];
  const parts = chunk.split(/\s*,\s*/u).map((part) => part.trim()).filter(Boolean);
  return parts.length > 1 ? parts : [chunk];
}

function mergeTinyChunks(chunks) {
  const merged = [];
  for (const chunk of chunks) {
    const graphemeCount = countChunkGraphemes(chunk);
    const wordCount = countChunkWords(chunk);
    const shouldAttach =
      merged.length > 0 &&
      !/[.!?।॥]$/u.test(merged.at(-1)) &&
      (graphemeCount < 10 || wordCount < 3);
    if (shouldAttach) {
      merged[merged.length - 1] = `${merged.at(-1)}, ${chunk}`;
      continue;
    }
    merged.push(chunk);
  }
  return merged;
}

function splitTextForSvara(text) {
  const emotionTag = getTrailingEmotionTag(text);
  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spokenText) return [];
  const chunks = mergeTinyChunks(
    spokenText
      .match(/[^.!?।॥]+[.!?।॥]?/gu)
      ?.map((part) => part.trim())
      .filter(Boolean)
      .flatMap(splitLongChunk) ?? [],
  );
  if (!emotionTag) return chunks;
  return chunks.map((chunk, index) =>
    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
  );
}
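// splitTextForSvara example:
//   splitTextForSvara("Namaste! How are you today? <calm>")
//     -> ["Namaste!", "How are you today? <calm>"]
// Sentences become chunks and the trailing emotion tag re-attaches to the last one.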

function mergeTinyLeadingChunks(chunks) {
  const merged = [];
  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];
    const graphemeCount = countChunkGraphemes(chunk);
    const wordCount = countChunkWords(chunk);
    if (graphemeCount < 10 && wordCount < 3) {
      if (i + 1 < chunks.length) {
        chunks[i + 1] = `${chunk}, ${chunks[i + 1]}`;
        continue;
      }
      if (merged.length > 0) {
        merged[merged.length - 1] = `${merged.at(-1)}, ${chunk}`;
        continue;
      }
    }
    merged.push(chunk);
  }
  return merged;
}

function splitEmotionSafeTextForSvara(text) {
  const emotionTag = getTrailingEmotionTag(text);
  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spokenText) return [];
  const chunks = spokenText
    .match(/[^.!?।॥]+[.!?।॥]?/gu)
    ?.map((part) => part.trim())
    .filter(Boolean)
    .flatMap((sentence) => {
      const commaParts = sentence
        .split(/\s*,\s*/u)
        .map((part) => part.trim())
        .filter(Boolean);
      return mergeTinyLeadingChunks(commaParts);
    }) ?? [];
  if (!emotionTag) return chunks;
  return chunks.map((chunk, index) =>
    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
  );
}

function splitFinalEmotionClauseTextForSvara(text) {
  const emotionTag = getTrailingEmotionTag(text);
  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!spokenText) return [];
  const chunks = mergeTinyLeadingChunks(
    spokenText.split(/\s*,\s*/u).map((part) => part.trim()).filter(Boolean),
  );
  if (!emotionTag) return chunks;
  return chunks.map((chunk, index) =>
    index === chunks.length - 1 ? `${chunk} ${emotionTag}` : chunk,
  );
}

function buildPromptVariants(text) {
  const rawText = text.trim();
  const spokenText = normalizeTextForSvara(stripTrailingEmotionTag(text));
  if (!rawText && !spokenText) return [];
  const variants = rawText ? [[rawText]] : [];
  variants.push(
    splitTextForSvara(text),
    splitEmotionSafeTextForSvara(text),
    splitFinalEmotionClauseTextForSvara(text),
  );
  if (getTrailingEmotionTag(text)) {
    variants.push([spokenText]);
    variants.push(splitEmotionSafeTextForSvara(spokenText));
  }
  const seen = new Set();
  return variants.filter((chunks) => {
    if (chunks.length === 0) return false;
    const key = chunks.join("\u241e");
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
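// Fallback order: the raw text as a single chunk, the sentence split, the
// per-sentence comma split, the comma-only clause split, and, when an emotion
// tag is present, the tag-stripped text (whole and comma-split). Empty and
// duplicate splits are dropped, so short prompts often collapse to one variant.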

function pauseDurationForChunk(chunk, isLast) {
  if (isLast) return 0;
  const trimmed = chunk.trim();
  if (/[!?]$/u.test(trimmed)) return 0.26;
  if (/[.]$/u.test(trimmed)) return 0.3;
  return 0.18;
}

function concatFloat32Arrays(chunks) {
  const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
  const merged = new Float32Array(totalLength);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.length;
  }
  return merged;
}

function pcmStats(samples) {
  let peak = 0;
  let sumSquares = 0;
  for (let i = 0; i < samples.length; i++) {
    const value = Math.abs(samples[i]);
    if (value > peak) peak = value;
    sumSquares += value * value;
  }
  const rms = samples.length > 0 ? Math.sqrt(sumSquares / samples.length) : 0;
  return { peak, rms };
}

function isNearlySilent(samples) {
  const { peak, rms } = pcmStats(samples);
  return peak < 0.006 && rms < 0.0015;
}

function isComplexQ4Prompt(text) {
  const spokenText = stripTrailingEmotionTag(text);
  const wordCount = countChunkWords(spokenText);
  const punctuationGroups = Array.from(
    spokenText.matchAll(/[.,!?;:।॥…\-—]+/gu),
  ).length;
  return punctuationGroups >= 3 || wordCount >= 8 || (
    getTrailingEmotionTag(text) && punctuationGroups >= 1 && wordCount >= 5
  );
}

async function synthesizeChunks(tokenizer, lm, speaker_id, chunks, generation) {
  const pcmChunks = [];
  for (let index = 0; index < chunks.length; index++) {
    const chunk = chunks[index];
    const promptIds = buildPrompt(tokenizer, chunk, speaker_id);
    const inputIds = new Tensor(
      "int64",
      BigInt64Array.from(promptIds.map(BigInt)),
      [1, promptIds.length],
    );
    const maxAudioTokens = estimateAudioTokenBudget(chunk);
    const out = await lm.generate({
      inputs: inputIds,
      max_new_tokens: maxAudioTokens + 3,
      logits_processor: buildLogitsProcessor(promptIds.length),
      ...generation,
      repetition_penalty: 1.0,
      eos_token_id: EOS,
    });
    const allIds = Array.from(out.data, (x) => Number(x));
    const audioIds = extractAudioTokens(allIds, promptIds.length);
    if (audioIds.length === 0) {
      throw new Error(`LM produced no audio tokens for chunk ${index + 1}/${chunks.length}.`);
    }
    const pcm = await decodeSnacStable(audioIds);
    pcmChunks.push(pcm);
    const pauseSeconds = pauseDurationForChunk(chunk, index === chunks.length - 1);
    if (pauseSeconds > 0) {
      pcmChunks.push(new Float32Array(Math.round(SAMPLE_RATE * pauseSeconds)));
    }
  }
  return concatFloat32Arrays(pcmChunks);
}

// --- Token-stream → SNAC code conversion ------------------------------------
// Reference: mlx_audio/tts/models/llama/llama.py:codes_to_layers
//   layer_1 (band 0): [c0] — 1 code per coarse frame
//   layer_2 (bands 1, 4): [c1, c4] — 2 codes per coarse frame
//   layer_3 (bands 2, 3, 5, 6): [c2, c3, c5, c6] — 4 codes per coarse frame
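// Example: the all-zero frame corresponds to the token run
//   [128266, 132362, 136458, 140554, 144650, 148746, 152842]
// since code_k = token_k - AUDIO_OFFSET - k * 4096, yielding
//   l1 = [0], l2 = [0, 0], l3 = [0, 0, 0, 0].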
function codesToLayers(audioTokenIds) {
  const N = Math.floor(audioTokenIds.length / 7);
  const l1 = new BigInt64Array(N);
  const l2 = new BigInt64Array(N * 2);
  const l3 = new BigInt64Array(N * 4);
  for (let i = 0; i < N; i++) {
    const base = i * 7;
    l1[i] = BigInt(audioTokenIds[base] - AUDIO_OFFSET - 0 * 4096);
    l2[2 * i + 0] = BigInt(audioTokenIds[base + 1] - AUDIO_OFFSET - 1 * 4096);
    l3[4 * i + 0] = BigInt(audioTokenIds[base + 2] - AUDIO_OFFSET - 2 * 4096);
    l3[4 * i + 1] = BigInt(audioTokenIds[base + 3] - AUDIO_OFFSET - 3 * 4096);
    l2[2 * i + 1] = BigInt(audioTokenIds[base + 4] - AUDIO_OFFSET - 4 * 4096);
    l3[4 * i + 2] = BigInt(audioTokenIds[base + 5] - AUDIO_OFFSET - 5 * 4096);
    l3[4 * i + 3] = BigInt(audioTokenIds[base + 6] - AUDIO_OFFSET - 6 * 4096);
  }
  return { l1, l2, l3, N };
}

async function decodeSnacWindow(audioTokenIds) {
  const snac = await getSnac();
  const { l1, l2, l3, N } = codesToLayers(audioTokenIds);
  const feeds = {
    [snac.inputNames[0]]: new ort.Tensor("int64", l1, [1, N]),
    [snac.inputNames[1]]: new ort.Tensor("int64", l2, [1, N * 2]),
    [snac.inputNames[2]]: new ort.Tensor("int64", l3, [1, N * 4]),
  };
  const out = await snac.run(feeds);
  return out[snac.outputNames[0]].data;
}

async function decodeSnacStable(audioTokenIds) {
  const numFrames = Math.floor(audioTokenIds.length / 7);
  if (numFrames === 0) return new Float32Array(0);
  if (numFrames < WINDOW_FRAMES) {
    return await decodeSnacWindow(audioTokenIds);
  }
  const chunks = [];
  let totalLength = 0;
  for (let start = 0; start <= numFrames - WINDOW_FRAMES; start++) {
    const windowIds = audioTokenIds.slice(start * 7, (start + WINDOW_FRAMES) * 7);
    const decoded = await decodeSnacWindow(windowIds);
    const stable = decoded.slice(WINDOW_AUDIO_START, WINDOW_AUDIO_END);
    chunks.push(stable);
    totalLength += stable.length;
  }
  const merged = new Float32Array(totalLength);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.length;
  }
  return merged;
}
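// For numFrames >= WINDOW_FRAMES this yields (numFrames - 3) windows of 2048
// kept samples each, i.e. roughly 85 ms of audio per additional SNAC frame
// once the first full window has been decoded.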

// Match the exported ONNX repo README:
//   [SOH, BOS, "<voice>: <text>" tokens, EOT, EOH]
// The model predicts SOAI -> SOS -> audio tokens -> EOS itself.
function buildPrompt(tokenizer, text, voice) {
  const body = tokenizer.encode(`${voice}: ${text}`, { add_special_tokens: false });
  return [SOH, tokenizer.bos_token_id, ...body, EOT, EOH];
}
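// buildPrompt example, assuming the Llama-3-family BOS id 128000 (the actual id
// comes from tokenizer.bos_token_id):
//   buildPrompt(tokenizer, "Namaste duniya", "kavya")
//     -> [128259, 128000, ...ids for "kavya: Namaste duniya", 128009, 128260]
// "kavya" is only an example voice name; the UI supplies the real speaker_id.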

// Keep audio tokens after the first START_OF_SPEECH emitted by the model.
function extractAudioTokens(allTokenIds, promptLength) {
  let sosIdx = -1;
  for (let i = promptLength; i < allTokenIds.length; i++) {
    if (allTokenIds[i] === SOS) {
      sosIdx = i;
      break;
    }
  }
  if (sosIdx === -1) return [];
  const audio = [];
  for (let i = sosIdx + 1; i < allTokenIds.length; i++) {
    const tokenId = allTokenIds[i];
    if (tokenId === EOS) break;
    if (tokenId >= AUDIO_OFFSET && tokenId < AUDIO_END) {
      audio.push(tokenId);
    }
  }
  // Truncate to a whole number of 7-token frames.
  return audio.slice(0, audio.length - (audio.length % 7));
}

// --- WAV encoder (24 kHz, mono, PCM16) --------------------------------------
function pcmFloat32ToWav(samples, sampleRate) {
  const bufLen = 44 + samples.length * 2; // 44-byte RIFF/fmt/data header + 16-bit samples
  const buf = new ArrayBuffer(bufLen);
  const v = new DataView(buf);
  let p = 0;
  const w = (s) => { for (let i = 0; i < s.length; i++) v.setUint8(p++, s.charCodeAt(i)); };
  w("RIFF");
  v.setUint32(p, 36 + samples.length * 2, true); p += 4; // RIFF chunk size
  w("WAVEfmt ");
  v.setUint32(p, 16, true); p += 4; // fmt chunk size
  v.setUint16(p, 1, true); p += 2; // audio format: 1 = PCM
  v.setUint16(p, 1, true); p += 2; // channels: mono
  v.setUint32(p, sampleRate, true); p += 4; // sample rate
  v.setUint32(p, sampleRate * 2, true); p += 4; // byte rate (mono, 16-bit)
  v.setUint16(p, 2, true); p += 2; // block align
  v.setUint16(p, 16, true); p += 2; // bits per sample
  w("data");
  v.setUint32(p, samples.length * 2, true); p += 4; // data chunk size
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    v.setInt16(p, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    p += 2;
  }
  return buf;
}

// --- Sampling defaults per dtype --------------------------------------------
// Transformers.js v4 currently ignores top-k/top-p on this path, so unconstrained
// sampling drifts badly on quantized Svara and turns later words robotic. Use
// greedy decoding by default for stability; q8 can tolerate a little sampling.
function generationFor(dtype) {
  return dtype === "q8"
    ? { do_sample: true, temperature: 0.35, min_new_tokens: 30 }
    : { do_sample: false, min_new_tokens: 30 };
}

function generationPlansFor(dtype, text) {
  const base = generationFor(dtype);
  if (dtype !== "q4f16" || !isComplexQ4Prompt(text)) {
    return [base];
  }
  return [
    {
      do_sample: true,
      temperature: 0.6,
      top_k: 40,
      top_p: 0.9,
      min_new_tokens: 30,
    },
    base,
  ];
}

// --- Message handler --------------------------------------------------------
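// Sketch of the main-thread side of this protocol (the "generate" type name and
// the worker path are illustrative; any type other than "preload" starts synthesis):
//
//   const worker = new Worker(new URL("./worker.js", import.meta.url), { type: "module" });
//   worker.postMessage({ type: "preload", dtype: "q4f16" });
//   worker.postMessage({ type: "generate", text: "Namaste!", speaker_id: "kavya", dtype: "q4f16" });
//   worker.addEventListener("message", ({ data }) => {
//     if (data.status === "complete") new Audio(data.audio).play();
//   });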
self.addEventListener("message", async (e) => {
  const { type, text, speaker_id, dtype: requested } = e.data;
  const dtype = SUPPORTED_DTYPES.has(requested) ? requested : "q4f16";
  try {
    if (type === "preload") {
      // Triggered by the explicit "Load model" action in the UI.
      self.postMessage({ status: "loading", dtype });
      await Promise.all([getTokenizer(), getSnac(), getLM(dtype)]);
      self.postMessage({ status: "ready", dtype });
      return;
    }
    self.postMessage({ status: "loading", dtype });
    const [tokenizer, lm] = await Promise.all([getTokenizer(), getLM(dtype)]);
    await getSnac(); // warm
    const variants = buildPromptVariants(text);
    if (variants.length === 0) {
      throw new Error("No speakable text found after normalization.");
    }
    const generations = generationPlansFor(dtype, text);
    let mergedPcm = null;
    let lastError = null;
    for (const generation of generations) {
      for (const chunks of variants) {
        try {
          const candidate = await synthesizeChunks(
            tokenizer,
            lm,
            speaker_id,
            chunks,
            generation,
          );
          if (isNearlySilent(candidate)) {
            lastError = new Error("Generated near-silent audio.");
            continue;
          }
          mergedPcm = candidate;
          break;
        } catch (err) {
          lastError = err;
        }
      }
      if (mergedPcm) {
        break;
      }
    }
    if (!mergedPcm) {
      throw lastError ?? new Error("Synthesis failed for all prompt variants.");
    }
    const wav = pcmFloat32ToWav(mergedPcm, SAMPLE_RATE);
    const blob = new Blob([wav], { type: "audio/wav" });
    self.postMessage({
      status: "complete",
      audio: URL.createObjectURL(blob),
      text,
      voice: speaker_id,
      dtype,
    });
  } catch (err) {
    self.postMessage({ status: "error", data: String(err), dtype });
    console.error(err);
  }
});