Spaces:

Mike0021
/

browser-speak

Configuration error

App Files Files Community

browser-speak / workers /asr-worker.js

Mike0021's picture

Add worker network telemetry to browser evidence

d2ae80e verified about 1 month ago

History Blame Contribute Delete

9.91 kB

	import { AutoModel, Tensor, env, pipeline } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

	// Configure the shared transformers.js `env` object before any model is requested.
	// NOTE(review): presumably this forces models to be fetched remotely and cached by the
	// browser — confirm against the transformers.js `env` documentation.
	env.allowLocalModels = false;
	env.useBrowserCache = true;
	// Wrap `fetch` so requests issued from this worker are reported to the worker's owner
	// as "network" messages tagged with the "asr" scope (see installFetchTelemetry).
	installFetchTelemetry("asr");

	// Sample rate (samples per second) that incoming audio is resampled to; also used to
	// convert between milliseconds and sample counts below.
	const SAMPLE_RATE = 16000;
	// A chunk whose VAD probability exceeds this value is classified as speech.
	const SPEECH_THRESHOLD = 0.3;
	// While already recording, a chunk still counts as speech at or above this lower value.
	const EXIT_THRESHOLD = 0.1;
	// Trailing non-speech (ms) required to end an utterance: default and clamp bounds
	// applied by silenceDurationSamples.
	const DEFAULT_SILENCE_DURATION_MS = 480;
	const MIN_SILENCE_DURATION_MS = 200;
	const MAX_SILENCE_DURATION_MS = 800;
	// 80 ms expressed in samples; padding added around a recording before transcription.
	const SPEECH_PAD_SAMPLES = 80 * (SAMPLE_RATE / 1000);
	// 250 ms expressed in samples; shorter recordings are discarded instead of transcribed.
	const MIN_SPEECH_DURATION_SAMPLES = 250 * (SAMPLE_RATE / 1000);
	// Capacity of the recording buffer, in seconds (multiplied by SAMPLE_RATE on allocation).
	const MAX_BUFFER_DURATION = 30;
	// Number of samples in each chunk handed to the VAD model.
	const NEW_BUFFER_SIZE = 512;
	// How many non-speech chunks are retained as pre-speech padding (enough to cover SPEECH_PAD_SAMPLES).
	const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE);
	// Minimum time (ms) between two partial transcriptions of the same utterance.
	const PARTIAL_INTERVAL_MS = 1600;

	// Models are assigned by load(); both stay null until then.
	let vadModel = null;
	let transcriber = null;
	// Device passed to the speech-recognition pipeline; overwritten by load().
	let device = "wasm";
	// Resampled samples that have not yet filled a complete NEW_BUFFER_SIZE chunk.
	let inputQueue = new Float32Array(0);
	// Promise chains used to run VAD inferences and transcriptions one after another.
	let vadChain = Promise.resolve();
	let asrChain = Promise.resolve();
	// State tensor passed to the VAD model as `state` and replaced by its `stateN` output.
	let vadState = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
	// Tensor holding SAMPLE_RATE, passed to the VAD model as `sr`.
	let srTensor = new Tensor("int64", [SAMPLE_RATE], []);
	// True from the first speech chunk of an utterance until reset().
	let isRecording = false;
	// Write position (in samples) inside recordingBuffer.
	let bufferPointer = 0;
	// Non-speech samples accumulated since the most recent speech chunk.
	let postSpeechSamples = 0;
	// Most recent non-speech chunks seen while not recording; prepended to the recording
	// by paddedRecordingBuffer().
	let previousBuffers = [];
	// Whether partial transcripts are emitted; set through configure().
	let partialEnabled = true;
	// Trailing non-speech (in samples) that ends an utterance; set through configure().
	let minSilenceDurationSamples = silenceDurationSamples(DEFAULT_SILENCE_DURATION_MS);
	// True while a partial transcription is in flight.
	let partialBusy = false;
	// performance.now() timestamps of the last partial request and of the current utterance start.
	let lastPartialAt = 0;
	let utteranceStartedAt = 0;
	// Fixed-size buffer holding up to MAX_BUFFER_DURATION seconds of the current utterance.
	const recordingBuffer = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE);

	/**
	 * Wraps `globalThis.fetch` so that every request made from this worker is
	 * reported to the worker's owner as a `{ type: "network", ... }` message.
	 *
	 * The wrapper is installed at most once per global scope (guarded by
	 * `globalThis.__browserSpeakFetchTelemetryInstalled`) and does nothing when
	 * `fetch` is unavailable. The wrapped `fetch` resolves or rejects exactly as
	 * the original call does: telemetry never changes the outcome of a request.
	 *
	 * @param {string} scope - Label copied into every telemetry message.
	 * @returns {void}
	 */
	function installFetchTelemetry(scope) {
		const originalFetch = globalThis.fetch?.bind(globalThis);
		if (!originalFetch || globalThis.__browserSpeakFetchTelemetryInstalled) return;
		globalThis.__browserSpeakFetchTelemetryInstalled = true;

		// Telemetry is best-effort. If posting fails, the request result must still
		// reach the caller untouched, so the failure is deliberately ignored here.
		const report = (details) => {
			try {
				self.postMessage({ type: "network", scope, ...details });
			} catch {
				// Intentionally ignored: see comment above.
			}
		};

		globalThis.fetch = async (input, init) => {
			const startedAt = performance.now();
			const url = fetchUrl(input);
			const method = String(init?.method || input?.method || "GET").toUpperCase();

			let response;
			try {
				response = await originalFetch(input, init);
			} catch (error) {
				report({
					method,
					url,
					status: null,
					ok: false,
					durationMs: performance.now() - startedAt,
					// `error` may be any thrown value, including null or undefined.
					error: error?.message ?? String(error),
				});
				throw error;
			}

			report({
				method,
				url,
				// Fall back to the requested URL when the response does not expose one.
				responseUrl: response.url || url,
				status: response.status,
				ok: response.ok,
				durationMs: performance.now() - startedAt,
			});
			return response;
		};
	}

	/**
	 * Extracts a URL string from the first argument of a `fetch` call.
	 *
	 * @param {*} input - A string, a `URL`, or an object exposing a `url` property.
	 * @returns {string} The URL text, or "" when none can be determined.
	 */
	function fetchUrl(input) {
		const isPlainString = typeof input === "string";
		if (isPlainString) {
			return input;
		}
		// URL objects expose their text as `href`; anything else is probed for `url`.
		return input instanceof URL ? input.href : (input?.url ?? "");
	}

	// Entry point for messages sent to this worker. Each message carries a `type`
	// that selects the operation; any failure is reported back as an "error" message.
	self.onmessage = async (event) => {
		const message = event.data;
		try {
			switch (message.type) {
				case "load":
					await load(message);
					break;
				case "configure":
					configure(message);
					break;
				case "audio":
					ingestAudio(message.buffer, message.sampleRate);
					break;
				case "flush":
					await flushRecording();
					break;
				default:
					// Unrecognised message types are ignored.
					break;
			}
		} catch (error) {
			self.postMessage({ type: "error", message: error.message ?? String(error) });
		}
	};

	/**
	 * Loads the VAD model and the speech-recognition pipeline, runs one warm-up
	 * transcription, and posts `{ type: "ready" }` when everything is usable.
	 * Progress is reported through "status" messages along the way.
	 *
	 * @param {object} options
	 * @param {string} options.model - Model id handed to the speech-recognition pipeline.
	 * @param {string} [options.device] - Device for the pipeline; the current value of
	 *   `device` is kept when omitted.
	 * @param {boolean} [options.partial] - Forwarded to configure().
	 * @param {number} [options.silenceMs] - Forwarded to configure().
	 * @returns {Promise<void>}
	 * @throws {TypeError} When `model` is not a non-empty string.
	 */
	async function load({ model, device: requestedDevice, partial, silenceMs }) {
		// Fail before any download starts rather than after the VAD model has loaded.
		if (typeof model !== "string" || model.length === 0) {
			throw new TypeError("load: `model` must be a non-empty string");
		}

		// Keep the existing default instead of overwriting it with undefined.
		device = requestedDevice ?? device;
		configure({ partial, silenceMs });

		self.postMessage({ type: "status", scope: "vad", message: "Loading", mode: "warn" });
		vadModel = await AutoModel.from_pretrained("onnx-community/silero-vad", {
			config: { model_type: "custom" },
			dtype: "fp32",
			progress_callback: reportProgress("VAD"),
		});

		self.postMessage({ type: "status", message: "Loading", mode: "warn" });
		// Every model/device combination previously selected these same precisions,
		// so they are stated once. Branch here if a combination ever needs to differ.
		const dtype = {
			encoder_model: "fp32",
			decoder_model_merged: "q4",
		};

		transcriber = await pipeline("automatic-speech-recognition", model, {
			device,
			dtype,
			progress_callback: reportProgress("STT"),
		});

		self.postMessage({ type: "status", message: "Warming", mode: "warn" });
		// One second of silence; the result is discarded by transcribeBuffer in warm-up mode.
		await transcribeBuffer(new Float32Array(SAMPLE_RATE), { warmup: true });
		self.postMessage({ type: "ready" });
	}

	/**
	 * Builds a progress callback that forwards "progress" events as "status"
	 * messages of the form `<label> <percent>%`.
	 *
	 * @param {string} label - Prefix shown in the status message.
	 * @returns {(progress: object) => void} Callback that ignores every event whose
	 *   `status` is not "progress".
	 */
	function reportProgress(label) {
		return function onProgress(progress) {
			if (progress.status !== "progress") {
				return;
			}
			// The percentage is omitted when the event carries no finite number.
			let suffix = "";
			if (Number.isFinite(progress.progress)) {
				suffix = ` ${progress.progress.toFixed(0)}%`;
			}
			self.postMessage({ type: "status", message: `${label}${suffix}`, mode: "warn" });
		};
	}

	/**
	 * Accepts a block of captured audio, resamples it, and feeds it to the VAD
	 * in chunks of NEW_BUFFER_SIZE samples. Samples that do not fill a whole
	 * chunk are kept in `inputQueue` for the next call.
	 *
	 * Chunk handling is asynchronous and is not awaited here. A failure in a
	 * chunk handler is reported as an "error" message instead of surfacing as an
	 * unhandled promise rejection.
	 *
	 * @param {Float32Array} buffer - Captured samples.
	 * @param {number} sourceRate - Sample rate of `buffer`.
	 * @returns {void}
	 */
	function ingestAudio(buffer, sourceRate) {
		const resampled = resampleTo16k(buffer, sourceRate);
		const queue = concat(inputQueue, resampled);

		// Walk the queue with an offset so the remainder is copied once, not per chunk.
		let offset = 0;
		while (queue.length - offset >= NEW_BUFFER_SIZE) {
			// Each chunk is an independent copy because the handler may retain it.
			const chunk = queue.slice(offset, offset + NEW_BUFFER_SIZE);
			offset += NEW_BUFFER_SIZE;
			handleVadChunk(chunk).catch((error) => {
				self.postMessage({ type: "error", message: error?.message ?? String(error) });
			});
		}
		inputQueue = offset === 0 ? queue : queue.slice(offset);
	}

	/**
	 * Classifies one chunk of audio with the VAD and advances the recording
	 * state accordingly.
	 *
	 * Depending on the outcome the chunk is kept as pre-speech padding, appended
	 * to `recordingBuffer`, or triggers a transcription dispatch (when the
	 * recording buffer is full, or when enough trailing non-speech has been
	 * collected after speech).
	 *
	 * @param {Float32Array} buffer - One chunk of samples (ingestAudio passes
	 *   NEW_BUFFER_SIZE samples per call).
	 * @returns {Promise<void>}
	 */
	async function handleVadChunk(buffer) {
	// Recording state as it was when this chunk was submitted, i.e. before the await.
	// NOTE(review): when several chunks are submitted back-to-back this snapshot can
	// differ from `isRecording` by the time `vad` resolves — confirm that is intended.
	const wasRecording = isRecording;
	const speech = await vad(buffer);

	// Not recording and no speech: keep the chunk only as bounded pre-speech padding.
	if (!wasRecording && !speech) {
	if (previousBuffers.length >= MAX_NUM_PREV_BUFFERS) previousBuffers.shift();
	previousBuffers.push(buffer);
	return;
	}

	// The recording buffer cannot hold the whole chunk: fill it to capacity, dispatch
	// what has been recorded, and hand the leftover samples over as overflow.
	const remaining = recordingBuffer.length - bufferPointer;
	if (buffer.length >= remaining) {
	recordingBuffer.set(buffer.subarray(0, remaining), bufferPointer);
	bufferPointer += remaining;
	dispatchForTranscription(buffer.subarray(remaining));
	return;
	}

	// Normal case: append the chunk to the recording.
	recordingBuffer.set(buffer, bufferPointer);
	bufferPointer += buffer.length;

	if (speech) {
	// First speech chunk of an utterance: remember the start time and announce it.
	if (!isRecording) {
	utteranceStartedAt = performance.now();
	self.postMessage({ type: "speechstart" });
	}
	isRecording = true;
	// Any speech chunk restarts the trailing-silence count.
	postSpeechSamples = 0;
	maybePartial();
	return;
	}

	// Non-speech chunk while recording: wait until enough trailing silence has accumulated.
	postSpeechSamples += buffer.length;
	if (postSpeechSamples < minSilenceDurationSamples) return;

	// Too little audio recorded to be worth transcribing: discard it.
	if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
	reset();
	return;
	}

	self.postMessage({
	type: "speechend",
	trailingSilenceMs: sampleDurationMs(postSpeechSamples),
	});
	dispatchForTranscription();
	}

	/**
	 * Runs the VAD model on one chunk and decides whether it counts as speech.
	 *
	 * Inferences are queued on `vadChain` so they run one after another, each
	 * one reading the `vadState` written by the previous one. A failed inference
	 * rejects only its own call; the chain itself stays usable for later chunks.
	 *
	 * @param {Float32Array} buffer - Samples of one chunk, passed to the model as
	 *   a `[1, buffer.length]` float32 tensor.
	 * @returns {Promise<boolean>} True when the probability exceeds
	 *   SPEECH_THRESHOLD, or when a recording is in progress and the probability
	 *   is at least EXIT_THRESHOLD.
	 */
	async function vad(buffer) {
		const input = new Tensor("float32", buffer, [1, buffer.length]);
		// `vadState` is read inside the callback so it is the latest state at run time.
		const inference = vadChain.then(() => vadModel({ input, sr: srTensor, state: vadState }));
		// Store a never-rejecting tail so one failure cannot block every later chunk.
		vadChain = inference.catch(() => {});
		const result = await inference;
		vadState = result.stateN;
		const probability = result.output.data[0];
		return probability > SPEECH_THRESHOLD || (isRecording && probability >= EXIT_THRESHOLD);
	}

	/**
	 * Starts a partial transcription of the audio recorded so far, unless
	 * partials are disabled, one is already running, the last one was requested
	 * less than PARTIAL_INTERVAL_MS ago, or fewer than SAMPLE_RATE samples have
	 * been recorded.
	 *
	 * A non-blank result is posted as a "partial" message. A failed
	 * transcription is posted as an "error" message. The busy flag is cleared in
	 * both cases.
	 *
	 * @returns {void}
	 */
	function maybePartial() {
		if (!partialEnabled || partialBusy) return;
		const now = performance.now();
		if (now - lastPartialAt < PARTIAL_INTERVAL_MS || bufferPointer < SAMPLE_RATE) return;

		// Snapshot first, so a failure here cannot leave the busy flag set.
		const buffer = paddedRecordingBuffer();
		partialBusy = true;
		lastPartialAt = now;

		transcribeBuffer(buffer, { partial: true })
			.then((text) => {
				if (text.trim()) self.postMessage({ type: "partial", text });
			})
			.catch((error) => {
				self.postMessage({ type: "error", message: error?.message ?? String(error) });
			})
			.finally(() => {
				partialBusy = false;
			});
	}

	/**
	 * Sends the current recording for final transcription and resets the
	 * recording state for the next utterance.
	 *
	 * The result is posted as a "transcript" message whose `durationMs` is the
	 * time elapsed since the start of the utterance being dispatched. A failed
	 * transcription is posted as an "error" message.
	 *
	 * @param {Float32Array} [overflow] - Samples that did not fit into the
	 *   recording buffer; they are copied to its start so the next recording
	 *   continues from them.
	 * @returns {void}
	 */
	function dispatchForTranscription(overflow) {
		const buffer = paddedRecordingBuffer();
		// Capture the start time now: a later utterance may overwrite the shared
		// value before this transcription finishes.
		const startedAt = utteranceStartedAt;

		transcribeBuffer(buffer, { partial: false })
			.then((text) => {
				self.postMessage({
					type: "transcript",
					text,
					durationMs: performance.now() - startedAt,
				});
			})
			.catch((error) => {
				self.postMessage({ type: "error", message: error?.message ?? String(error) });
			});

		if (overflow?.length) {
			recordingBuffer.set(overflow, 0);
		}
		reset(overflow?.length ?? 0);
	}

	/**
	 * Forces the end of the current utterance: waits for the VAD work queued so
	 * far, then, if a recording of at least MIN_SPEECH_DURATION_SAMPLES is in
	 * progress, posts a forced "speechend" and dispatches it for transcription.
	 *
	 * An earlier VAD failure does not prevent the flush; the audio recorded up
	 * to that point is still transcribed.
	 *
	 * @returns {Promise<void>}
	 */
	async function flushRecording() {
		// Queue a barrier behind all pending VAD work. It settles whether or not the
		// chain was rejected, and leaves `vadChain` fulfilled for later chunks.
		const settle = () => Promise.resolve();
		await (vadChain = vadChain.then(settle, settle));

		if (!isRecording || bufferPointer < MIN_SPEECH_DURATION_SAMPLES) return;
		self.postMessage({
			type: "speechend",
			trailingSilenceMs: sampleDurationMs(postSpeechSamples),
			forced: true,
		});
		dispatchForTranscription();
	}

	/**
	 * Builds the audio to transcribe: the retained pre-speech chunks followed by
	 * the recorded samples plus up to SPEECH_PAD_SAMPLES of what lies beyond the
	 * write position (never past the end of the recording buffer).
	 *
	 * @returns {Float32Array} A freshly allocated copy; the shared buffers are not modified.
	 */
	function paddedRecordingBuffer() {
		const recordedEnd = Math.min(bufferPointer + SPEECH_PAD_SAMPLES, recordingBuffer.length);

		let totalLength = recordedEnd;
		for (const chunk of previousBuffers) {
			totalLength += chunk.length;
		}

		const padded = new Float32Array(totalLength);
		let cursor = 0;
		previousBuffers.forEach((chunk) => {
			padded.set(chunk, cursor);
			cursor += chunk.length;
		});
		padded.set(recordingBuffer.subarray(0, recordedEnd), cursor);
		return padded;
	}

	/**
	 * Transcribes a buffer of samples with the loaded pipeline.
	 *
	 * Calls are queued on `asrChain` so they run one at a time. A failed
	 * transcription rejects only its own call; later calls still run.
	 *
	 * @param {Float32Array} buffer - Samples to transcribe.
	 * @param {object} [options]
	 * @param {boolean} [options.warmup=false] - When true the result is discarded
	 *   and "" is returned.
	 * @returns {Promise<string>} The pipeline output's `text`, or "" when absent.
	 */
	async function transcribeBuffer(buffer, { warmup = false } = {}) {
		const job = asrChain.then(() => transcriber(buffer));
		// Store a never-rejecting tail so one failure cannot block every later call.
		asrChain = job.catch(() => {});
		const output = await job;
		if (warmup) return "";
		return output.text ?? "";
	}

	/**
	 * Returns the recording state to idle.
	 *
	 * @param {number} [offset=0] - Number of leading samples in `recordingBuffer`
	 *   to keep; everything from this index on is zeroed and the write position
	 *   is set to it.
	 * @returns {void}
	 */
	function reset(offset = 0) {
		// Flags and counters.
		lastPartialAt = 0;
		postSpeechSamples = 0;
		isRecording = false;
		// A fresh array, so earlier references to the old padding list stay untouched.
		previousBuffers = [];
		// Buffer contents and write position.
		bufferPointer = offset;
		recordingBuffer.fill(0, offset);
	}

	/**
	 * Applies runtime settings. Each setting is optional and left unchanged when
	 * it is not supplied.
	 *
	 * @param {object} [options]
	 * @param {boolean} [options.partial] - Enables or disables partial
	 *   transcripts; non-boolean values are ignored.
	 * @param {*} [options.silenceMs] - Trailing silence that ends an utterance,
	 *   converted by silenceDurationSamples; ignored when null or undefined.
	 * @returns {void}
	 */
	function configure({ partial, silenceMs } = {}) {
		const hasPartial = typeof partial === "boolean";
		const hasSilence = silenceMs !== null && silenceMs !== undefined;

		if (hasPartial) {
			partialEnabled = partial;
		}
		if (hasSilence) {
			minSilenceDurationSamples = silenceDurationSamples(silenceMs);
		}
	}

	/**
	 * Converts a silence duration in milliseconds to a whole number of samples.
	 *
	 * @param {*} value - Duration in ms; values that do not convert to a finite
	 *   number fall back to DEFAULT_SILENCE_DURATION_MS.
	 * @returns {number} Sample count for the duration after clamping it to
	 *   [MIN_SILENCE_DURATION_MS, MAX_SILENCE_DURATION_MS].
	 */
	function silenceDurationSamples(value) {
		let ms = Number(value);
		if (!Number.isFinite(ms)) {
			ms = DEFAULT_SILENCE_DURATION_MS;
		}
		// Lower bound first, then upper bound.
		const atLeastMin = ms < MIN_SILENCE_DURATION_MS ? MIN_SILENCE_DURATION_MS : ms;
		const clamped = atLeastMin > MAX_SILENCE_DURATION_MS ? MAX_SILENCE_DURATION_MS : atLeastMin;
		const samplesPerMs = SAMPLE_RATE / 1000;
		return Math.round(clamped * samplesPerMs);
	}

	/**
	 * Converts a sample count at SAMPLE_RATE to a duration in milliseconds.
	 *
	 * @param {number} samples - Number of samples.
	 * @returns {number} Duration in milliseconds.
	 */
	function sampleDurationMs(samples) {
		const seconds = samples / SAMPLE_RATE;
		return seconds * 1000;
	}

	/**
	 * Resamples audio to SAMPLE_RATE using linear interpolation between
	 * neighbouring input samples.
	 *
	 * @param {Float32Array} input - Samples at `sourceRate`.
	 * @param {number} sourceRate - Sample rate of `input`.
	 * @returns {Float32Array} `input` itself when it is already at SAMPLE_RATE,
	 *   otherwise a new array of `floor(input.length / (sourceRate / SAMPLE_RATE))` samples.
	 * @throws {RangeError} When `sourceRate` is not a positive finite number.
	 *   Without this check a missing rate would silently produce an empty result.
	 */
	function resampleTo16k(input, sourceRate) {
		if (sourceRate === SAMPLE_RATE) return input;

		const rate = Number(sourceRate);
		if (!Number.isFinite(rate) || rate <= 0) {
			throw new RangeError(`Invalid source sample rate: ${String(sourceRate)}`);
		}

		const ratio = rate / SAMPLE_RATE;
		const length = Math.floor(input.length / ratio);
		const output = new Float32Array(length);
		const lastIndex = input.length - 1;
		for (let i = 0; i < length; i += 1) {
			const position = i * ratio;
			const left = Math.floor(position);
			// Clamp so the final output samples never read past the end of the input.
			const right = Math.min(left + 1, lastIndex);
			const weight = position - left;
			output[i] = input[left] * (1 - weight) + input[right] * weight;
		}
		return output;
	}

	/**
	 * Joins two sample arrays into one Float32Array.
	 *
	 * @param {Float32Array} left - Leading samples.
	 * @param {Float32Array} right - Trailing samples.
	 * @returns {Float32Array} `right` itself (not a copy) when `left` is empty,
	 *   otherwise a new array holding `left` followed by `right`.
	 */
	function concat(left, right) {
		const leftLength = left.length;
		if (leftLength === 0) {
			return right;
		}
		const joined = new Float32Array(leftLength + right.length);
		joined.set(left);
		joined.set(right, leftLength);
		return joined;
	}