Spaces:
Configuration error
Configuration error
| import { AutoModel, Tensor, env, pipeline } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0"; | |
// transformers.js environment flags, set on the imported `env` object.
env.allowLocalModels = false;
env.useBrowserCache = true;

// Wrap fetch so requests issued from this worker are reported under the "asr" scope.
installFetchTelemetry("asr");

// ---- Audio format ----------------------------------------------------------
const SAMPLE_RATE = 16000; // samples per second used throughout this worker
const SAMPLES_PER_MS = SAMPLE_RATE / 1000;

// ---- Voice-activity thresholds (compared against the VAD output in vad()) ---
const SPEECH_THRESHOLD = 0.3; // above this a chunk counts as speech
const EXIT_THRESHOLD = 0.1; // while recording, chunks at or above this still count as speech

// ---- End-of-utterance silence window (milliseconds) -------------------------
const DEFAULT_SILENCE_DURATION_MS = 480;
const MIN_SILENCE_DURATION_MS = 200;
const MAX_SILENCE_DURATION_MS = 800;

// ---- Buffer geometry (samples unless noted) ---------------------------------
const SPEECH_PAD_SAMPLES = 80 * SAMPLES_PER_MS; // 80 ms of padding
const MIN_SPEECH_DURATION_SAMPLES = 250 * SAMPLES_PER_MS; // shorter recordings are discarded
const MAX_BUFFER_DURATION = 30; // seconds of audio the recording buffer can hold
const NEW_BUFFER_SIZE = 512; // samples handed to the VAD per chunk
const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE);

// Minimum spacing between two partial transcriptions.
const PARTIAL_INTERVAL_MS = 1600;

// ---- Models and execution queues --------------------------------------------
let vadModel = null;
let transcriber = null;
let device = "wasm";
// Samples received but not yet cut into NEW_BUFFER_SIZE chunks.
let inputQueue = new Float32Array(0);
// Promise chains used to run VAD and ASR calls one after another.
let vadChain = Promise.resolve();
let asrChain = Promise.resolve();
// State tensor fed to the VAD model and replaced by its `stateN` output after each call.
let vadState = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
let srTensor = new Tensor("int64", [SAMPLE_RATE], []);

// ---- Utterance state ---------------------------------------------------------
let isRecording = false;
let bufferPointer = 0; // next write position in recordingBuffer
let postSpeechSamples = 0; // non-speech samples seen since the last speech chunk
let previousBuffers = []; // most recent non-speech chunks, prepended as pre-roll
let partialEnabled = true;
let minSilenceDurationSamples = silenceDurationSamples(DEFAULT_SILENCE_DURATION_MS);
let partialBusy = false;
let lastPartialAt = 0;
let utteranceStartedAt = 0;
const recordingBuffer = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE);
/**
 * Replaces `globalThis.fetch` with a wrapper that posts one "network" message
 * per request (method, URL, status, duration, and the error text on failure).
 *
 * Installation happens at most once per global scope; later calls are no-ops.
 * Nothing is installed when `fetch` is unavailable.
 *
 * @param {string} scope - Label copied into every telemetry message.
 * @returns {void}
 */
function installFetchTelemetry(scope) {
  const originalFetch = globalThis.fetch?.bind(globalThis);
  if (!originalFetch || globalThis.__browserSpeakFetchTelemetryInstalled) return;
  globalThis.__browserSpeakFetchTelemetryInstalled = true;

  // Telemetry is deliberately best-effort: a failure to post a message must
  // never change the outcome of the request being observed.
  const report = (details) => {
    try {
      self.postMessage({ type: "network", scope, ...details });
    } catch {
      // Intentionally ignored; see note above.
    }
  };

  globalThis.fetch = async (input, init) => {
    const startedAt = performance.now();
    const url = fetchUrl(input);
    const method = String(init?.method || input?.method || "GET").toUpperCase();

    let response;
    try {
      response = await originalFetch(input, init);
    } catch (error) {
      report({
        method,
        url,
        status: null,
        ok: false,
        durationMs: performance.now() - startedAt,
        // `error` may be any thrown value, including null/undefined.
        error: error?.message ?? String(error),
      });
      throw error;
    }

    report({
      method,
      url,
      responseUrl: response.url || url,
      status: response.status,
      ok: response.ok,
      durationMs: performance.now() - startedAt,
    });
    return response;
  };
}
/**
 * Extracts a printable URL from a `fetch` input argument.
 *
 * @param {*} input - A string, a `URL`, or an object exposing a `url` property.
 * @returns {string} The URL text, or "" when none can be determined.
 */
function fetchUrl(input) {
  const isPlainString = typeof input === "string";
  if (isPlainString) return input;

  const fromObject = input instanceof URL ? input.href : input?.url;
  return fromObject ?? "";
}
/**
 * Worker entry point: routes each incoming message by its `type` field.
 *
 *   "load"      -> load(message)            (awaited)
 *   "configure" -> configure(message)
 *   "audio"     -> ingestAudio(message.buffer, message.sampleRate)
 *   "flush"     -> flushRecording()         (awaited)
 *
 * Messages with any other type are ignored. Anything thrown while handling a
 * message is reported back as `{ type: "error", message }`.
 */
self.onmessage = async (event) => {
  const message = event.data;
  try {
    switch (message.type) {
      case "load":
        await load(message);
        break;
      case "configure":
        configure(message);
        break;
      case "audio":
        ingestAudio(message.buffer, message.sampleRate);
        break;
      case "flush":
        await flushRecording();
        break;
      default:
        break;
    }
  } catch (error) {
    // `error` may be any thrown value (including null/undefined), so the
    // property access must not itself throw inside the handler.
    self.postMessage({ type: "error", message: error?.message ?? String(error) });
  }
};
/**
 * Loads the VAD model and the speech-recognition pipeline, runs one warm-up
 * transcription on a second of silence, then posts `{ type: "ready" }`.
 * Status messages are posted before each stage.
 *
 * @param {object} options
 * @param {string} options.model - Model id passed to the ASR pipeline.
 * @param {string} [options.device] - Execution device; the current value
 *   (initially "wasm") is kept when omitted.
 * @param {boolean} [options.partial] - Forwarded to configure().
 * @param {number} [options.silenceMs] - Forwarded to configure().
 * @returns {Promise<void>}
 * @throws {TypeError} When `model` is not a string.
 */
async function load({ model, device: requestedDevice, partial, silenceMs }) {
  if (typeof model !== "string") {
    throw new TypeError(`load: "model" must be a string, received ${typeof model}`);
  }
  // Keep the existing default instead of overwriting it with undefined.
  device = requestedDevice ?? device;
  configure({ partial, silenceMs });

  self.postMessage({ type: "status", scope: "vad", message: "Loading", mode: "warn" });
  vadModel = await AutoModel.from_pretrained("onnx-community/silero-vad", {
    config: { model_type: "custom" },
    dtype: "fp32",
    progress_callback: reportProgress("VAD"),
  });

  self.postMessage({ type: "status", message: "Loading", mode: "warn" });
  // Every model/device combination previously resolved to this same pair, so
  // the per-model and per-device branching was dead code.
  const dtype = {
    encoder_model: "fp32",
    decoder_model_merged: "q4",
  };
  transcriber = await pipeline("automatic-speech-recognition", model, {
    device,
    dtype,
    progress_callback: reportProgress("STT"),
  });

  self.postMessage({ type: "status", message: "Warming", mode: "warn" });
  await transcribeBuffer(new Float32Array(SAMPLE_RATE), { warmup: true });
  self.postMessage({ type: "ready" });
}
/**
 * Builds a progress callback that forwards download progress as status messages.
 *
 * Only events whose `status` is "progress" are forwarded. The message is the
 * label, followed by a rounded percentage when `progress` is a finite number.
 *
 * @param {string} label - Prefix for the status text (e.g. "VAD", "STT").
 * @returns {(progress: {status: string, progress?: number}) => void}
 */
function reportProgress(label) {
  return ({ status, progress }) => {
    if (status !== "progress") return;

    let suffix = "";
    if (Number.isFinite(progress)) {
      suffix = ` ${progress.toFixed(0)}%`;
    }
    self.postMessage({ type: "status", message: `${label}${suffix}`, mode: "warn" });
  };
}
/**
 * Accepts a block of captured audio, resamples it, and feeds it to the VAD in
 * fixed-size chunks of NEW_BUFFER_SIZE samples. Samples that do not fill a
 * whole chunk stay in `inputQueue` until the next call.
 *
 * Chunk handling is asynchronous and is not awaited here; a failure while
 * handling a chunk is reported as `{ type: "error", message }` rather than
 * being left as an unhandled rejection.
 *
 * @param {Float32Array} buffer - Captured samples.
 * @param {number} sourceRate - Sample rate of `buffer`.
 * @returns {void}
 */
function ingestAudio(buffer, sourceRate) {
  const pending = concat(inputQueue, resampleTo16k(buffer, sourceRate));
  const usable = pending.length - (pending.length % NEW_BUFFER_SIZE);

  // Store the remainder first so the queue stays consistent whatever happens
  // while the chunks are being dispatched.
  inputQueue = pending.slice(usable);

  // Cut chunks by offset instead of re-slicing the whole queue once per chunk.
  for (let offset = 0; offset < usable; offset += NEW_BUFFER_SIZE) {
    const chunk = pending.slice(offset, offset + NEW_BUFFER_SIZE);
    handleVadChunk(chunk).catch((error) => {
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });
  }
}
/**
 * Runs one chunk through the VAD and advances the recording state machine.
 *
 * - Idle and no speech: the chunk is kept as pre-roll (at most
 *   MAX_NUM_PREV_BUFFERS chunks) and nothing else happens.
 * - Otherwise the chunk is appended to `recordingBuffer`. When it does not
 *   fit, the full buffer is dispatched for transcription and the part that did
 *   not fit is carried over as the start of the next recording.
 * - Speech: marks the utterance as recording (posting "speechstart" on the
 *   transition) and may trigger a partial transcription.
 * - Silence while recording: once `minSilenceDurationSamples` of silence has
 *   accumulated, the utterance is either discarded (shorter than
 *   MIN_SPEECH_DURATION_SAMPLES) or announced with "speechend" and dispatched.
 *
 * @param {Float32Array} buffer - One chunk of samples.
 * @returns {Promise<void>}
 */
async function handleVadChunk(buffer) {
  const speech = await vad(buffer);
  // Read the recording flag only after the VAD result is available. Several
  // chunks can be queued in one burst; a value captured before the await would
  // miss a recording started by an earlier chunk of the same burst and
  // misfile this chunk as pre-roll.
  const wasRecording = isRecording;
  if (!wasRecording && !speech) {
    if (previousBuffers.length >= MAX_NUM_PREV_BUFFERS) previousBuffers.shift();
    previousBuffers.push(buffer);
    return;
  }

  const remaining = recordingBuffer.length - bufferPointer;
  if (buffer.length >= remaining) {
    // Buffer is full: fill it, transcribe it, and carry over the rest.
    recordingBuffer.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;
    dispatchForTranscription(buffer.subarray(remaining));
    return;
  }
  recordingBuffer.set(buffer, bufferPointer);
  bufferPointer += buffer.length;

  if (speech) {
    if (!isRecording) {
      utteranceStartedAt = performance.now();
      self.postMessage({ type: "speechstart" });
    }
    isRecording = true;
    postSpeechSamples = 0;
    maybePartial();
    return;
  }

  postSpeechSamples += buffer.length;
  if (postSpeechSamples < minSilenceDurationSamples) return;

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // Too little audio was captured to be worth transcribing.
    reset();
    return;
  }
  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
  });
  dispatchForTranscription();
}
/**
 * Runs the VAD model on one chunk and decides whether it counts as speech.
 *
 * Calls are queued on `vadChain` so model invocations run one at a time, and
 * each call feeds the model the state returned by the previous one. A chunk is
 * speech when its probability exceeds SPEECH_THRESHOLD, or, while a recording
 * is in progress, when it is at least EXIT_THRESHOLD.
 *
 * @param {Float32Array} buffer - One chunk of samples.
 * @returns {Promise<boolean>} True when the chunk is classified as speech.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);
  const inference = vadChain.then(() => vadModel({ input, sr: srTensor, state: vadState }));
  // The queue only needs to know when this call has settled. Chaining on the
  // raw promise would leave the queue rejected after a single failure and make
  // every later call fail without running.
  vadChain = inference.catch(() => {});

  const result = await inference;
  vadState = result.stateN;
  const probability = result.output.data[0];
  return probability > SPEECH_THRESHOLD || (isRecording && probability >= EXIT_THRESHOLD);
}
/**
 * Starts a partial (in-progress) transcription of the current recording when
 * all of the following hold: partials are enabled, none is already running,
 * at least PARTIAL_INTERVAL_MS has passed since the previous one, and at least
 * one second of audio (SAMPLE_RATE samples) has been recorded.
 *
 * A non-blank result is posted as `{ type: "partial", text }`; a failure is
 * posted as `{ type: "error", message }`.
 *
 * @returns {void}
 */
function maybePartial() {
  if (!partialEnabled || partialBusy) return;
  const now = performance.now();
  if (now - lastPartialAt < PARTIAL_INTERVAL_MS || bufferPointer < SAMPLE_RATE) return;

  // Snapshot the audio before raising the busy flag so that a failure here
  // cannot leave the flag stuck.
  const buffer = paddedRecordingBuffer();
  partialBusy = true;
  lastPartialAt = now;

  transcribeBuffer(buffer, { partial: true })
    .then((text) => {
      if (text.trim()) self.postMessage({ type: "partial", text });
    })
    .catch((error) => {
      // Without this handler a failed transcription is an unhandled rejection.
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    })
    .finally(() => {
      partialBusy = false;
    });
}
/**
 * Queues the current recording (with padding) for final transcription and
 * resets the recording state for the next utterance.
 *
 * The result is posted as `{ type: "transcript", text, durationMs }`, where
 * `durationMs` is measured from the start of the utterance being dispatched.
 * A failure is posted as `{ type: "error", message }`.
 *
 * @param {Float32Array} [overflow] - Samples that did not fit into the
 *   recording buffer; they become the beginning of the next recording.
 * @returns {void}
 */
function dispatchForTranscription(overflow) {
  const buffer = paddedRecordingBuffer();
  // Capture the start time now: by the time the transcription finishes a new
  // utterance may already have overwritten `utteranceStartedAt`.
  const startedAt = utteranceStartedAt;

  transcribeBuffer(buffer, { partial: false })
    .then((text) => {
      self.postMessage({
        type: "transcript",
        text,
        durationMs: performance.now() - startedAt,
      });
    })
    .catch((error) => {
      // Without this handler a failed transcription is an unhandled rejection.
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });

  const carried = overflow?.length ?? 0;
  if (carried > 0) {
    recordingBuffer.set(overflow, 0);
  }
  reset(carried);
}
/**
 * Forces the end of the current utterance, e.g. when capture stops.
 *
 * Waits for the VAD calls queued so far, then, if a recording of at least
 * MIN_SPEECH_DURATION_SAMPLES is in progress, posts a forced "speechend" and
 * dispatches the recording for transcription. Otherwise it does nothing.
 *
 * @returns {Promise<void>}
 */
async function flushRecording() {
  // Only completion of the queued VAD work matters here, not its outcome: a
  // failed VAD call must not prevent already-captured speech from being
  // transcribed, nor leave the queue rejected for later callers.
  const settled = () => undefined;
  await (vadChain = vadChain.then(settled, settled));

  if (!isRecording || bufferPointer < MIN_SPEECH_DURATION_SAMPLES) return;
  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
    forced: true,
  });
  dispatchForTranscription();
}
/**
 * Builds the audio that is sent to the transcriber: the retained pre-roll
 * chunks in order, followed by the recorded samples plus up to
 * SPEECH_PAD_SAMPLES further samples from the recording buffer (never reading
 * past its end).
 *
 * @returns {Float32Array} A newly allocated array; module buffers are not modified.
 */
function paddedRecordingBuffer() {
  const recordedEnd = Math.min(bufferPointer + SPEECH_PAD_SAMPLES, recordingBuffer.length);

  let preRollLength = 0;
  for (const chunk of previousBuffers) {
    preRollLength += chunk.length;
  }

  const result = new Float32Array(preRollLength + recordedEnd);
  let cursor = 0;
  previousBuffers.forEach((chunk) => {
    result.set(chunk, cursor);
    cursor += chunk.length;
  });
  result.set(recordingBuffer.subarray(0, recordedEnd), cursor);
  return result;
}
/**
 * Transcribes a buffer of samples. Calls are queued on `asrChain` so the
 * transcriber handles one request at a time, in call order.
 *
 * @param {Float32Array} buffer - Audio samples to transcribe.
 * @param {object} [options]
 * @param {boolean} [options.warmup=false] - When true the result is discarded
 *   and "" is returned.
 * @returns {Promise<string>} The recognised text, or "" when none is produced.
 */
async function transcribeBuffer(buffer, { warmup = false } = {}) {
  const job = asrChain.then(() => transcriber(buffer));
  // The queue only tracks completion. Chaining on the raw promise would leave
  // the queue rejected after one failure and make every later call fail
  // without ever reaching the transcriber.
  asrChain = job.catch(() => {});

  const output = await job;
  if (warmup) return "";
  return output.text ?? "";
}
/**
 * Returns the recording state to idle.
 *
 * Samples before `offset` are kept in the recording buffer and everything from
 * `offset` onwards is zeroed; the write position is set to `offset`. The
 * pre-roll, the silence counter and the partial-transcription timestamp are
 * cleared as well.
 *
 * @param {number} [offset=0] - Number of leading samples to keep.
 * @returns {void}
 */
function reset(offset = 0) {
  // Buffer contents and write position.
  recordingBuffer.fill(0, offset);
  bufferPointer = offset;

  // Utterance bookkeeping.
  previousBuffers = [];
  postSpeechSamples = 0;
  lastPartialAt = 0;
  isRecording = false;
}
/**
 * Applies runtime settings. Each setting is optional; omitted or invalid
 * values leave the current setting untouched.
 *
 * @param {object} [options]
 * @param {boolean} [options.partial] - Enables or disables partial
 *   transcriptions; ignored unless it is a boolean.
 * @param {*} [options.silenceMs] - End-of-utterance silence window; ignored
 *   when null or undefined, otherwise converted by silenceDurationSamples().
 * @returns {void}
 */
function configure({ partial, silenceMs } = {}) {
  const isBooleanFlag = typeof partial === "boolean";
  if (isBooleanFlag) {
    partialEnabled = partial;
  }

  const hasSilenceSetting = silenceMs !== null && silenceMs !== undefined;
  if (hasSilenceSetting) {
    minSilenceDurationSamples = silenceDurationSamples(silenceMs);
  }
}
/**
 * Converts a silence duration in milliseconds to a whole number of samples.
 *
 * The value is coerced with Number(); anything that is not a finite number
 * falls back to DEFAULT_SILENCE_DURATION_MS. The duration is then limited to
 * the range MIN_SILENCE_DURATION_MS..MAX_SILENCE_DURATION_MS.
 *
 * @param {*} value - Requested duration in milliseconds.
 * @returns {number} Duration in samples at SAMPLE_RATE, rounded to the nearest integer.
 */
function silenceDurationSamples(value) {
  let ms = Number(value);
  if (!Number.isFinite(ms)) {
    ms = DEFAULT_SILENCE_DURATION_MS;
  }

  if (ms < MIN_SILENCE_DURATION_MS) {
    ms = MIN_SILENCE_DURATION_MS;
  } else if (ms > MAX_SILENCE_DURATION_MS) {
    ms = MAX_SILENCE_DURATION_MS;
  }

  const samplesPerMs = SAMPLE_RATE / 1000;
  return Math.round(ms * samplesPerMs);
}
/**
 * Converts a sample count at SAMPLE_RATE to a duration in milliseconds.
 *
 * @param {number} samples - Number of samples.
 * @returns {number} Duration in milliseconds.
 */
function sampleDurationMs(samples) {
  const seconds = samples / SAMPLE_RATE;
  return seconds * 1000;
}
/**
 * Resamples audio to SAMPLE_RATE using linear interpolation between the two
 * nearest input samples.
 *
 * Each call is independent: no samples or fractional position are carried
 * over from one call to the next.
 *
 * @param {Float32Array} input - Samples at `sourceRate`.
 * @param {number} sourceRate - Sample rate of `input`.
 * @returns {Float32Array} `input` itself when it is already at SAMPLE_RATE,
 *   otherwise a new array of `floor(input.length * SAMPLE_RATE / sourceRate)` samples.
 * @throws {RangeError} When `sourceRate` is not a positive finite number.
 */
function resampleTo16k(input, sourceRate) {
  if (sourceRate === SAMPLE_RATE) return input;
  // A missing or NaN rate would otherwise produce an empty result and drop
  // every sample without any indication of what went wrong.
  if (!Number.isFinite(sourceRate) || sourceRate <= 0) {
    throw new RangeError(`Invalid source sample rate: ${String(sourceRate)}`);
  }

  const ratio = sourceRate / SAMPLE_RATE;
  const length = Math.floor(input.length / ratio);
  const output = new Float32Array(length);
  const lastIndex = input.length - 1;
  for (let i = 0; i < length; i += 1) {
    const position = i * ratio;
    const left = Math.floor(position);
    // Clamp so the final output samples do not read past the end of the input.
    const right = Math.min(left + 1, lastIndex);
    const weight = position - left;
    output[i] = input[left] * (1 - weight) + input[right] * weight;
  }
  return output;
}
/**
 * Joins two sample arrays.
 *
 * @param {Float32Array} left - Leading samples.
 * @param {Float32Array} right - Trailing samples.
 * @returns {Float32Array} `right` itself when `left` is empty, otherwise a new
 *   array holding `left` followed by `right`.
 */
function concat(left, right) {
  const headLength = left.length;
  if (headLength === 0) return right;

  const joined = new Float32Array(headLength + right.length);
  joined.set(left);
  joined.set(right, headLength);
  return joined;
}