// browser-speak/workers/asr-worker.js
// Commit d2ae80e (Mike0021): Add worker network telemetry to browser evidence
import { AutoModel, Tensor, env, pipeline } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";
// Transformers.js environment flags: local model loading off, browser cache on.
env.allowLocalModels = false;
env.useBrowserCache = true;
// Wrap fetch up front so requests made by this worker are reported with the
// "asr" scope (see installFetchTelemetry).
installFetchTelemetry("asr");

// ---- Tuning constants ------------------------------------------------------
// Sample rate (Hz) that all audio is converted to before VAD / transcription.
const SAMPLE_RATE = 16000;
// vad() reports speech when the probability is above SPEECH_THRESHOLD, or,
// while an utterance is already being recorded, at or above EXIT_THRESHOLD.
const SPEECH_THRESHOLD = 0.3;
const EXIT_THRESHOLD = 0.1;
// Trailing-silence window (ms) that ends an utterance; silenceDurationSamples()
// clamps requested values into [MIN, MAX] and falls back to DEFAULT.
const DEFAULT_SILENCE_DURATION_MS = 480;
const MIN_SILENCE_DURATION_MS = 200;
const MAX_SILENCE_DURATION_MS = 800;
// 80 ms of padding, expressed in samples.
const SPEECH_PAD_SAMPLES = 80 * (SAMPLE_RATE / 1000);
// Recordings shorter than 250 ms (in samples) are discarded instead of transcribed.
const MIN_SPEECH_DURATION_SAMPLES = 250 * (SAMPLE_RATE / 1000);
// Capacity of the recording buffer; multiplied by SAMPLE_RATE below, i.e. seconds.
const MAX_BUFFER_DURATION = 30;
// Number of samples handed to the VAD per chunk.
const NEW_BUFFER_SIZE = 512;
// How many pre-speech chunks are retained so they cover SPEECH_PAD_SAMPLES.
const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE);
// Minimum spacing (ms) between partial transcriptions of an ongoing utterance.
const PARTIAL_INTERVAL_MS = 1600;

// ---- Mutable worker state --------------------------------------------------
// Model handles, assigned by load().
let vadModel = null;
let transcriber = null;
// Device passed to the ASR pipeline; overwritten by load().
let device = "wasm";
// Resampled audio that has not yet filled a whole VAD chunk.
let inputQueue = new Float32Array(0);
// Promise chains used to run VAD and ASR calls one at a time, in order.
let vadChain = Promise.resolve();
let asrChain = Promise.resolve();
// Passed as `state` to every VAD call and replaced by the returned `stateN`.
let vadState = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
// Sample rate passed to the VAD model as `sr`.
let srTensor = new Tensor("int64", [SAMPLE_RATE], []);
// True from the first speech chunk of an utterance until reset().
let isRecording = false;
// Write position (in samples) inside recordingBuffer.
let bufferPointer = 0;
// Non-speech samples accumulated since the last speech chunk.
let postSpeechSamples = 0;
// Most recent non-speech chunks seen while idle; prepended as leading padding.
let previousBuffers = [];
// Whether partial transcripts are emitted; changed through configure().
let partialEnabled = true;
// Trailing silence (in samples) required to end an utterance.
let minSilenceDurationSamples = silenceDurationSamples(DEFAULT_SILENCE_DURATION_MS);
// Guards against overlapping partial transcriptions.
let partialBusy = false;
// performance.now() timestamps of the last partial request / utterance start.
let lastPartialAt = 0;
let utteranceStartedAt = 0;
// Fixed-size buffer holding the utterance currently being recorded.
const recordingBuffer = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE);
/**
 * Wraps `globalThis.fetch` so that every request made from this worker is
 * reported to the owning thread as a `{ type: "network", ... }` message.
 *
 * Installation is idempotent: a flag on `globalThis` prevents double wrapping,
 * and nothing is installed when `fetch` is unavailable.
 *
 * Reporting is best-effort. A failure while posting telemetry is swallowed on
 * purpose so instrumentation can never change the outcome of the request it
 * observes (a successful fetch stays successful, a failed fetch still rejects
 * with its original error).
 *
 * @param {string} scope - Label copied into every telemetry message.
 * @returns {void}
 */
function installFetchTelemetry(scope) {
  const originalFetch = globalThis.fetch?.bind(globalThis);
  if (!originalFetch || globalThis.__browserSpeakFetchTelemetryInstalled) return;
  globalThis.__browserSpeakFetchTelemetryInstalled = true;

  const report = (details) => {
    try {
      self.postMessage({ type: "network", scope, ...details });
    } catch {
      // Telemetry must never break, or mask the result of, the request itself.
    }
  };

  globalThis.fetch = async (input, init) => {
    const startedAt = performance.now();
    const url = fetchUrl(input);
    const method = String(init?.method || input?.method || "GET").toUpperCase();

    let response;
    try {
      response = await originalFetch(input, init);
    } catch (error) {
      report({
        method,
        url,
        status: null,
        ok: false,
        durationMs: performance.now() - startedAt,
        // `error?.` keeps a nullish rejection from replacing the real failure.
        error: error?.message ?? String(error),
      });
      throw error;
    }

    // Reported outside the try block so a telemetry problem is never
    // mistaken for a network failure.
    report({
      method,
      url,
      responseUrl: response.url || url,
      status: response.status,
      ok: response.ok,
      durationMs: performance.now() - startedAt,
    });
    return response;
  };
}
/**
 * Extracts a printable URL from the first argument of `fetch`.
 *
 * @param {*} input - A string, a `URL`, or an object exposing `url`.
 * @returns {*} The string itself, the `href` of a `URL`, the object's `url`
 *   property, or `""` when that property is null/undefined or input is nullish.
 */
function fetchUrl(input) {
  if (input instanceof URL) {
    return input.href;
  }
  return typeof input === "string" ? input : (input?.url ?? "");
}
/**
 * Worker entry point: routes each incoming message by its `type`.
 *
 *   "load"      -> load(message)            (awaited)
 *   "configure" -> configure(message)
 *   "audio"     -> ingestAudio(message.buffer, message.sampleRate)
 *   "flush"     -> flushRecording()         (awaited)
 *
 * Other types are ignored. Anything thrown by the selected handler (or by
 * reading the message) is posted back as `{ type: "error", message }`.
 */
self.onmessage = async (event) => {
  const message = event.data;
  try {
    switch (message.type) {
      case "load":
        await load(message);
        break;
      case "configure":
        configure(message);
        break;
      case "audio":
        ingestAudio(message.buffer, message.sampleRate);
        break;
      case "flush":
        await flushRecording();
        break;
      default:
        break;
    }
  } catch (error) {
    self.postMessage({ type: "error", message: error.message ?? String(error) });
  }
};
/**
 * Loads the VAD model and the speech-to-text pipeline, runs one warm-up
 * transcription on a zero-filled buffer, and then posts `{ type: "ready" }`.
 * Status messages are posted along the way.
 *
 * @param {object} options
 * @param {string} options.model - Model id handed to the ASR pipeline.
 * @param {string} [options.device] - Device for the ASR pipeline. When
 *   omitted, the current module-level `device` value is kept instead of being
 *   overwritten with `undefined`.
 * @param {boolean} [options.partial] - Forwarded to configure().
 * @param {*} [options.silenceMs] - Forwarded to configure().
 * @returns {Promise<void>}
 * @throws {TypeError} When `model` is not a string (checked before anything
 *   is downloaded).
 */
async function load({ model, device: requestedDevice, partial, silenceMs }) {
  // Fail fast: previously a missing model id only surfaced after the VAD
  // model had already been fetched.
  if (typeof model !== "string") {
    throw new TypeError("load: `model` must be a model id string");
  }
  device = requestedDevice ?? device;
  configure({ partial, silenceMs });

  self.postMessage({ type: "status", scope: "vad", message: "Loading", mode: "warn" });
  vadModel = await AutoModel.from_pretrained("onnx-community/silero-vad", {
    config: { model_type: "custom" },
    dtype: "fp32",
    progress_callback: reportProgress("VAD"),
  });

  self.postMessage({ type: "status", message: "Loading", mode: "warn" });
  // Every model/device combination uses the same precision settings; the
  // former nested conditional had identical branches.
  const dtype = {
    encoder_model: "fp32",
    decoder_model_merged: "q4",
  };
  transcriber = await pipeline("automatic-speech-recognition", model, {
    device,
    dtype,
    progress_callback: reportProgress("STT"),
  });

  self.postMessage({ type: "status", message: "Warming", mode: "warn" });
  // One second of zeros (SAMPLE_RATE samples) as a warm-up input.
  await transcribeBuffer(new Float32Array(SAMPLE_RATE), { warmup: true });
  self.postMessage({ type: "ready" });
}
/**
 * Builds a progress callback that forwards "progress" events as status
 * messages of the form `"<label> <percent>%"` (or just `"<label>"` when the
 * event carries no finite `progress` number). Other event statuses are ignored.
 *
 * @param {string} label - Prefix shown in the status message.
 * @returns {(progress: {status: string, progress?: number}) => void}
 */
function reportProgress(label) {
  return function onProgress(progress) {
    if (progress.status !== "progress") return;

    let message = `${label}`;
    if (Number.isFinite(progress.progress)) {
      message += ` ${progress.progress.toFixed(0)}%`;
    }
    self.postMessage({ type: "status", message, mode: "warn" });
  };
}
/**
 * Accepts a block of captured audio, resamples it, appends it to the pending
 * queue and hands every complete NEW_BUFFER_SIZE-sample chunk to
 * handleVadChunk(). Samples that do not fill a whole chunk stay queued for
 * the next call.
 *
 * Chunk processing is asynchronous and intentionally not awaited here, but a
 * rejection is no longer left floating: it is posted to the owner as
 * `{ type: "error", message }`, matching the message handler's error format.
 *
 * @param {Float32Array} buffer - Audio samples at `sourceRate`.
 * @param {number} sourceRate - Sample rate of `buffer`.
 * @returns {void}
 */
function ingestAudio(buffer, sourceRate) {
  const pending = concat(inputQueue, resampleTo16k(buffer, sourceRate));

  // Walk the queue with an offset instead of re-slicing the remainder on
  // every iteration.
  let offset = 0;
  while (pending.length - offset >= NEW_BUFFER_SIZE) {
    // slice() copies, so each chunk stays valid after `pending` is dropped.
    const chunk = pending.slice(offset, offset + NEW_BUFFER_SIZE);
    offset += NEW_BUFFER_SIZE;
    handleVadChunk(chunk).catch((error) => {
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });
  }

  inputQueue = offset === 0 ? pending : pending.slice(offset);
}
/**
 * Runs voice-activity detection on one chunk and advances the recording
 * state machine:
 *
 *  - idle + no speech: keep the chunk as pre-roll (at most
 *    MAX_NUM_PREV_BUFFERS chunks) and stop;
 *  - recording buffer would become full: fill it, dispatch it for
 *    transcription and carry the overflow into the next recording;
 *  - speech: append, mark the utterance as started (posting "speechstart"
 *    once) and possibly request a partial transcript;
 *  - silence while recording: append and count it; once
 *    minSilenceDurationSamples is reached either discard a too-short
 *    recording or post "speechend" and dispatch it.
 *
 * @param {Float32Array} buffer - One chunk of samples at SAMPLE_RATE.
 * @returns {Promise<void>}
 */
async function handleVadChunk(buffer) {
  const speech = await vad(buffer);

  // The recording flag is read only after the VAD result arrives. Several
  // chunks are started back to back by ingestAudio(); a value captured before
  // the await would predate the handling of the earlier chunks and could send
  // a chunk of an ongoing utterance to the pre-roll (or the reverse).
  if (!isRecording && !speech) {
    if (previousBuffers.length >= MAX_NUM_PREV_BUFFERS) previousBuffers.shift();
    previousBuffers.push(buffer);
    return;
  }

  const remaining = recordingBuffer.length - bufferPointer;
  if (buffer.length >= remaining) {
    // Buffer is full: store what fits, transcribe, and hand the rest over as
    // the beginning of the next recording.
    recordingBuffer.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;
    dispatchForTranscription(buffer.subarray(remaining));
    return;
  }

  recordingBuffer.set(buffer, bufferPointer);
  bufferPointer += buffer.length;

  if (speech) {
    if (!isRecording) {
      utteranceStartedAt = performance.now();
      self.postMessage({ type: "speechstart" });
    }
    isRecording = true;
    postSpeechSamples = 0;
    maybePartial();
    return;
  }

  postSpeechSamples += buffer.length;
  // Not enough trailing silence yet to end the utterance.
  if (postSpeechSamples < minSilenceDurationSamples) return;

  // Too little audio to be worth transcribing: drop it.
  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    reset();
    return;
  }

  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
  });
  dispatchForTranscription();
}
/**
 * Runs the VAD model on one chunk and decides whether it counts as speech.
 *
 * Calls are serialized through `vadChain` so each one sees the state produced
 * by the previous one. The chain continues after a failed call: the failure
 * rejects only the call that caused it, instead of leaving `vadChain`
 * rejected and thereby skipping every later call.
 *
 * @param {Float32Array} buffer - One chunk of samples.
 * @returns {Promise<boolean>} True when the probability exceeds
 *   SPEECH_THRESHOLD, or, while recording, is at least EXIT_THRESHOLD.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);
  // `vadState` is read when the call actually runs, not when it is queued.
  const run = () => vadModel({ input, sr: srTensor, state: vadState });
  // Passing `run` as both handlers makes it execute whether the previous
  // call fulfilled or rejected.
  const result = await (vadChain = vadChain.then(run, run));
  vadState = result.stateN;
  const probability = result.output.data[0];
  return probability > SPEECH_THRESHOLD || (isRecording && probability >= EXIT_THRESHOLD);
}
/**
 * Requests a partial transcript of the utterance recorded so far, when
 * partials are enabled, none is in flight, at least PARTIAL_INTERVAL_MS have
 * passed since the previous request and at least SAMPLE_RATE samples have
 * been recorded.
 *
 * A non-blank result is posted as `{ type: "partial", text }`. A failed
 * transcription is posted as `{ type: "error", message }` instead of being
 * left as an unhandled rejection. The busy flag is cleared in both cases.
 *
 * @returns {void}
 */
function maybePartial() {
  if (!partialEnabled || partialBusy) return;
  const now = performance.now();
  if (now - lastPartialAt < PARTIAL_INTERVAL_MS || bufferPointer < SAMPLE_RATE) return;

  partialBusy = true;
  lastPartialAt = now;
  // Snapshot now: the recording buffer keeps changing while ASR runs.
  const buffer = paddedRecordingBuffer();
  transcribeBuffer(buffer, { partial: true })
    .then((text) => {
      if (text.trim()) self.postMessage({ type: "partial", text });
    })
    .catch((error) => {
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    })
    .finally(() => {
      partialBusy = false;
    });
}
/**
 * Sends the current recording (with padding) for final transcription and
 * resets the recording state for the next utterance.
 *
 * The result is posted as `{ type: "transcript", text, durationMs }`, where
 * `durationMs` is the time from the start of *this* utterance until the
 * transcript is available. A failed transcription is posted as
 * `{ type: "error", message }`.
 *
 * @param {Float32Array} [overflow] - Samples that did not fit into the
 *   recording buffer; they become the beginning of the next recording.
 * @returns {void}
 */
function dispatchForTranscription(overflow) {
  const buffer = paddedRecordingBuffer();
  // Captured now: by the time transcription finishes, a newer utterance may
  // already have overwritten `utteranceStartedAt`.
  const startedAt = utteranceStartedAt;

  transcribeBuffer(buffer, { partial: false })
    .then((text) => {
      self.postMessage({
        type: "transcript",
        text,
        durationMs: performance.now() - startedAt,
      });
    })
    .catch((error) => {
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });

  if (overflow?.length) {
    recordingBuffer.set(overflow, 0);
  }
  reset(overflow?.length ?? 0);
}
/**
 * Forces the end of the current utterance.
 *
 * First waits for the VAD work queued so far by appending a no-op step to
 * `vadChain`. If an utterance is being recorded and holds at least
 * MIN_SPEECH_DURATION_SAMPLES samples, a "speechend" message flagged
 * `forced: true` is posted and the recording is dispatched for transcription;
 * otherwise nothing happens.
 *
 * @returns {Promise<void>}
 */
async function flushRecording() {
  await (vadChain = vadChain.then(() => Promise.resolve()));

  if (!isRecording) return;
  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) return;

  const trailingSilenceMs = sampleDurationMs(postSpeechSamples);
  self.postMessage({ type: "speechend", trailingSilenceMs, forced: true });
  dispatchForTranscription();
}
/**
 * Builds the audio handed to the transcriber: the retained pre-speech chunks
 * (`previousBuffers`, in order) followed by the recorded samples plus up to
 * SPEECH_PAD_SAMPLES extra samples from the recording buffer, capped at the
 * buffer's length.
 *
 * @returns {Float32Array} A newly allocated array; the recording buffer is
 *   not modified.
 */
function paddedRecordingBuffer() {
  const end = Math.min(bufferPointer + SPEECH_PAD_SAMPLES, recordingBuffer.length);
  const recorded = recordingBuffer.subarray(0, end);

  let total = recorded.length;
  for (const chunk of previousBuffers) {
    total += chunk.length;
  }

  const result = new Float32Array(total);
  let cursor = 0;
  previousBuffers.forEach((chunk) => {
    result.set(chunk, cursor);
    cursor += chunk.length;
  });
  result.set(recorded, cursor);
  return result;
}
/**
 * Transcribes one audio buffer. Calls are serialized through `asrChain`, so
 * only one transcription runs at a time and results come back in request
 * order.
 *
 * The chain continues after a failed call: the failure rejects only the call
 * that caused it, instead of leaving `asrChain` rejected and thereby skipping
 * every later transcription.
 *
 * @param {Float32Array} buffer - Audio samples to transcribe.
 * @param {object} [options]
 * @param {boolean} [options.warmup=false] - When true the result is discarded
 *   and `""` is returned.
 * @returns {Promise<string>} The recognized text, or `""` when the output has
 *   no `text`.
 */
async function transcribeBuffer(buffer, { warmup = false } = {}) {
  const run = () => transcriber(buffer);
  // Passing `run` as both handlers makes it execute whether the previous
  // transcription fulfilled or rejected.
  const output = await (asrChain = asrChain.then(run, run));
  if (warmup) return "";
  return output.text ?? "";
}
/**
 * Returns the recorder to its idle state.
 *
 * The first `offset` samples of the recording buffer are kept (they are the
 * start of the next recording); everything after them is zeroed and the write
 * position is moved to `offset`. The recording flag, the trailing-silence
 * counter, the pre-speech chunks and the partial-transcript timer are cleared.
 *
 * @param {number} [offset=0] - Number of leading samples to preserve.
 * @returns {void}
 */
function reset(offset = 0) {
  // Buffer: keep [0, offset), clear the rest.
  recordingBuffer.fill(0, offset);
  bufferPointer = offset;

  // Utterance bookkeeping.
  previousBuffers = [];
  postSpeechSamples = 0;
  lastPartialAt = 0;
  isRecording = false;
}
/**
 * Applies runtime settings. Each setting is optional and left untouched when
 * absent.
 *
 * @param {object} [settings]
 * @param {boolean} [settings.partial] - Enables/disables partial transcripts;
 *   ignored unless it is a boolean.
 * @param {*} [settings.silenceMs] - Trailing silence that ends an utterance,
 *   converted with silenceDurationSamples(); ignored when null or undefined.
 * @returns {void}
 */
function configure({ partial, silenceMs } = {}) {
  if (typeof partial === "boolean") {
    partialEnabled = partial;
  }
  if (silenceMs !== null && silenceMs !== undefined) {
    minSilenceDurationSamples = silenceDurationSamples(silenceMs);
  }
}
/**
 * Converts a requested silence duration in milliseconds into a sample count.
 *
 * The value is coerced with `Number()`; anything non-finite falls back to
 * DEFAULT_SILENCE_DURATION_MS. The result is clamped to
 * [MIN_SILENCE_DURATION_MS, MAX_SILENCE_DURATION_MS] before conversion.
 *
 * @param {*} value - Requested duration in milliseconds.
 * @returns {number} Duration in samples at SAMPLE_RATE, rounded to an integer.
 */
function silenceDurationSamples(value) {
  const requested = Number(value);

  let ms = DEFAULT_SILENCE_DURATION_MS;
  if (Number.isFinite(requested)) {
    ms = requested;
  }

  const atLeastMin = Math.max(MIN_SILENCE_DURATION_MS, ms);
  const clamped = Math.min(MAX_SILENCE_DURATION_MS, atLeastMin);
  const samplesPerMs = SAMPLE_RATE / 1000;
  return Math.round(clamped * samplesPerMs);
}
/**
 * Converts a number of samples at SAMPLE_RATE into milliseconds.
 *
 * @param {number} samples - Sample count.
 * @returns {number} Duration in milliseconds.
 */
function sampleDurationMs(samples) {
  const seconds = samples / SAMPLE_RATE;
  return seconds * 1000;
}
/**
 * Converts audio to SAMPLE_RATE using linear interpolation between
 * neighbouring input samples.
 *
 * Input that is already at SAMPLE_RATE is returned as-is (same array, no
 * copy). No filtering is applied before reducing the rate.
 *
 * @param {Float32Array} input - Samples at `sourceRate`.
 * @param {number} sourceRate - Sample rate of `input`; must be a positive,
 *   finite number.
 * @returns {Float32Array} Samples at SAMPLE_RATE;
 *   `Math.floor(input.length / (sourceRate / SAMPLE_RATE))` of them.
 * @throws {RangeError} When `sourceRate` is missing, non-finite or not
 *   positive. Previously a missing rate produced an empty result, silently
 *   discarding the audio.
 */
function resampleTo16k(input, sourceRate) {
  if (sourceRate === SAMPLE_RATE) return input;

  const rate = Number(sourceRate);
  if (!Number.isFinite(rate) || rate <= 0) {
    throw new RangeError(`resampleTo16k: invalid source sample rate: ${sourceRate}`);
  }

  const ratio = rate / SAMPLE_RATE;
  const length = Math.floor(input.length / ratio);
  const output = new Float32Array(length);
  for (let i = 0; i < length; i += 1) {
    const position = i * ratio;
    const left = Math.floor(position);
    // Clamp so the last output sample never reads past the end of the input.
    const right = Math.min(left + 1, input.length - 1);
    const weight = position - left;
    output[i] = input[left] * (1 - weight) + input[right] * weight;
  }
  return output;
}
/**
 * Joins two sample arrays.
 *
 * @param {Float32Array} left - Leading samples.
 * @param {Float32Array} right - Trailing samples.
 * @returns {Float32Array} `right` itself when `left` is empty (no copy is
 *   made); otherwise a new array holding `left` followed by `right`.
 */
function concat(left, right) {
  const leftLength = left.length;
  if (leftLength === 0) {
    return right;
  }
  const joined = new Float32Array(leftLength + right.length);
  joined.set(left);
  joined.set(right, leftLength);
  return joined;
}