// --- Model artifacts hosted on Hugging Face -------------------------------
const MODEL_REPO = "Reza2kn/visualears-fastconformer-fa-depoisoned-phaseB-onnx-fp16";
// Graph file whose weights live in the separate MODEL_DATA_FILE.
const MODEL_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32.onnx";
const MODEL_DATA_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32.onnx.data";
// Single-file variant (used by the compatibility WASM path).
const MODEL_EMBEDDED_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32_embedded.onnx";

const HF_RESOLVE_ROOT = `https://huggingface.co/${MODEL_REPO}/resolve/main`;
const MODEL_URL = `${HF_RESOLVE_ROOT}/${MODEL_FILE}`;
const MODEL_DATA_URL = `${HF_RESOLVE_ROOT}/${MODEL_DATA_FILE}`;
const MODEL_EMBEDDED_URL = `${HF_RESOLVE_ROOT}/${MODEL_EMBEDDED_FILE}`;

// --- onnxruntime-web versions pulled from the CDN --------------------------
const ORT_WEBGPU_VERSION = "1.26.0";
const ORT_COMPAT_VERSION = "1.18.0";

// --- Feature-extraction geometry -------------------------------------------
const SAMPLE_RATE = 16000; // Hz; capture is resampled to this rate
const N_FFT = 512; // FFT size in samples
const WIN_LENGTH = 400; // analysis window length in samples
const HOP_LENGTH = 160; // samples between consecutive frames
const N_MELS = 80; // mel bands per frame
const FIXED_FRAMES = 2005; // time dimension of the feature tensor fed to the model
const OUTPUT_STRIDE = 8; // feature frames per output step (fallback length estimate)
const PREEMPHASIS = 0.0; // pre-emphasis coefficient; 0 leaves the signal unchanged
const CENTER_PAD = N_FFT / 2; // frames are centred: frame t starts CENTER_PAD samples early
const WINDOW_OFFSET = (N_FFT - WIN_LENGTH) / 2; // centres the window inside the FFT frame
const LOG_ZERO_GUARD = Math.pow(2, -24); // added to mel energy before taking the log
|
|
/** Shorthand for `document.getElementById`. */
const $ = (elementId) => document.getElementById(elementId);

// Lazily populated runtime handles.
let ortRuntime = null; // WebGPU build of onnxruntime-web (set during boot)
let ortCompatRuntime = null; // compatibility WASM build, loaded on demand
let wasmFeaturePromise = null; // cached result of the WASM feature probe
|
|
/**
 * Mutable application state shared by the loader, the microphone pipeline
 * and the decoder.
 */
const state = {
  // Model / runtime
  session: null, // active inference session, or null until a model is loaded
  tokens: null, // CTC vocabulary pieces
  blankId: 1024, // CTC blank id; overwritten from the embedded token table on load
  // Web Audio graph
  audioContext: null,
  source: null,
  processor: null,
  mediaStream: null,
  // Current utterance and transcripts
  utterancePcm: new Float32Array(0),
  finalTranscript: "",
  partialTranscript: "",
  // Voice-activity / scheduling flags
  speechActive: false,
  silenceMs: 0,
  finalizing: false,
  recording: false,
  decoding: false,
  decodeTimer: null,
  lastDecodeAt: 0,
  // Lazily built feature-extraction caches
  fftPlan: null,
  melFilters: null,
  // Diagnostics
  gpuName: "-",
  ort: null, // runtime object that created `session`
  // Label of the execution provider actually in use. It is assigned when a
  // model loads and read when stats are rendered, so declare it up front
  // instead of letting it appear on first write.
  provider: null,
};
|
|
/**
 * Show a status message in the element with id "status".
 * @param {string} message - Text to display.
 */
function setStatus(message) {
  const statusNode = $("status");
  statusNode.textContent = message;
}
|
|
/**
 * Replace the text content of the element with the given id.
 * @param {string} id - Element id.
 * @param {string} value - New text content.
 */
function setText(id, value) {
  const target = $(id);
  target.textContent = value;
}
|
|
/**
 * Format a duration for display.
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} "-" for non-finite input, whole milliseconds below one
 *   second, otherwise seconds with two decimals.
 */
function formatMs(ms) {
  if (!Number.isFinite(ms)) {
    return "-";
  }
  if (ms >= 1000) {
    const seconds = ms / 1000;
    return `${seconds.toFixed(2)} s`;
  }
  return `${ms.toFixed(0)} ms`;
}
|
|
/**
 * Refresh the "stat-heap" readout from `performance.memory`, or show
 * "unavailable" when that object is not exposed.
 */
function updateHeap() {
  const heap = performance.memory;
  const toMegabytes = (bytes) => (bytes / 1048576).toFixed(0);
  const label = heap
    ? `${toMegabytes(heap.usedJSHeapSize)} / ${toMegabytes(heap.jsHeapSizeLimit)} MB`
    : "unavailable";
  setText("stat-heap", label);
}
|
|
/**
 * Encode a number as IEEE 754 binary16 and return the 16 raw bits.
 *
 * Rounding is round-to-nearest, ties-to-even, so results agree with a native
 * `Float16Array` for every value that fits in the finite half range.
 * Magnitudes below the smallest normal are rounded into the subnormal range
 * (or to signed zero) rather than flushed.
 *
 * One deliberate difference is kept from the previous implementation: finite
 * values at or beyond the largest half (65504) saturate to 0x7bff with the
 * appropriate sign instead of overflowing to infinity.
 *
 * @param {number} value - Value to encode.
 * @returns {number} Unsigned 16-bit pattern (0..0xffff).
 */
function float32ToFloat16Bits(value) {
  if (Number.isNaN(value)) return 0x7e00;
  if (value === Infinity) return 0x7c00;
  if (value === -Infinity) return 0xfc00;

  const sign = value < 0 || Object.is(value, -0) ? 0x8000 : 0;
  const abs = Math.abs(value);
  if (abs === 0) return sign;
  if (abs >= 65504) return sign | 0x7bff;

  // Round to nearest integer, resolving exact .5 ties towards the even one.
  const roundTiesToEven = (x) => {
    const lower = Math.floor(x);
    const fraction = x - lower;
    if (fraction !== 0.5) return fraction > 0.5 ? lower + 1 : lower;
    return lower % 2 === 0 ? lower : lower + 1;
  };

  const SMALLEST_NORMAL = 2 ** -14;
  const SUBNORMAL_STEP = 2 ** -24;
  if (abs < SMALLEST_NORMAL) {
    // Subnormals count multiples of 2^-24. A result of 1024 is the bit
    // pattern 0x0400, which is exactly the smallest normal number.
    return sign | roundTiesToEven(abs / SUBNORMAL_STEP);
  }

  let exponent = Math.floor(Math.log2(abs));
  // Math.log2 may be off by one ulp next to a power of two; correct the
  // estimate so that 1 <= abs / 2^exponent < 2 always holds.
  if (2 ** exponent > abs) exponent -= 1;
  else if (2 ** (exponent + 1) <= abs) exponent += 1;

  let halfExponent = exponent + 15;
  let halfMantissa = roundTiesToEven((abs / 2 ** exponent - 1) * 1024);
  if (halfMantissa === 1024) {
    // Mantissa rounded up to the next power of two.
    halfMantissa = 0;
    halfExponent += 1;
  }
  if (halfExponent >= 31) return sign | 0x7bff;
  return sign | (halfExponent << 10) | (halfMantissa & 0x03ff);
}
|
|
/**
 * Convert an array of numbers to half precision.
 * @param {ArrayLike<number>} values - Source values.
 * @returns {Float16Array|Uint16Array} A native `Float16Array` when the
 *   runtime exposes one, otherwise a `Uint16Array` of raw binary16 bits
 *   produced by `float32ToFloat16Bits`.
 */
function float32ArrayToFloat16Bits(values) {
  const NativeFloat16Array = globalThis.Float16Array;
  if (NativeFloat16Array) {
    return new NativeFloat16Array(values);
  }
  return Uint16Array.from(values, (value) => float32ToFloat16Bits(value));
}
|
|
/**
 * Decode a raw IEEE 754 binary16 bit pattern into a JavaScript number.
 * @param {number} bits - 16-bit pattern (sign, 5 exponent bits, 10 mantissa bits).
 * @returns {number} Decoded value; handles signed zero, subnormals,
 *   infinities and NaN.
 */
function float16BitsToFloat32(bits) {
  const signFactor = (bits & 0x8000) ? -1 : 1;
  const exponentField = (bits >> 10) & 0x1f;
  const fraction = bits & 0x03ff;

  switch (exponentField) {
    case 0:
      // Zero or subnormal: no implicit leading one, fixed exponent of -14.
      if (fraction === 0) return signFactor * 0;
      return signFactor * (fraction / 1024) * 2 ** -14;
    case 31:
      // All-ones exponent: NaN when any mantissa bit is set, else infinity.
      return fraction ? NaN : signFactor * Infinity;
    default:
      return signFactor * (1 + fraction / 1024) * 2 ** (exponentField - 15);
  }
}
|
|
/**
 * Read one element of a tensor's data buffer as a JavaScript number.
 * @param {ArrayLike<number>} data - Tensor data buffer.
 * @param {number} index - Flat element index.
 * @param {string} type - Tensor element type; only "float16" needs decoding.
 * @returns {number} The element value. For "float16" data that is not a
 *   native `Float16Array`, the element is treated as raw binary16 bits.
 */
function tensorValue(data, index, type) {
  const raw = data[index];
  if (type !== "float16") return raw;
  const NativeFloat16Array = globalThis.Float16Array;
  const alreadyDecoded = Boolean(NativeFloat16Array) && data instanceof NativeFloat16Array;
  return alreadyDecoded ? raw : float16BitsToFloat32(raw);
}
|
|
/**
 * Write the final and partial transcripts into the "transcript" element,
 * substituting "..." for whichever one is empty.
 */
function renderTranscript() {
  const placeholder = "...";
  const finalText = state.finalTranscript || placeholder;
  const partialText = state.partialTranscript || placeholder;
  $("transcript").textContent = `Final:\n${finalText}\n\nPartial:\n${partialText}`;
}
|
|
/**
 * Append a PCM chunk to `state.utterancePcm`, keeping only the newest
 * samples once the buffer would exceed the configured window.
 *
 * The window length is read from the "window-seconds" input (20 s when the
 * element or its value is missing).
 *
 * @param {Float32Array} chunk - Samples at SAMPLE_RATE to append.
 */
function appendToUtterance(chunk) {
  const windowSeconds = Number($("window-seconds")?.value || 20);
  const capacity = Math.ceil(windowSeconds * SAMPLE_RATE);
  const previous = state.utterancePcm;

  const next = new Float32Array(Math.min(capacity, previous.length + chunk.length));
  // Number of old samples that still fit in front of the new chunk.
  const carried = Math.max(0, next.length - chunk.length);
  if (carried > 0) {
    next.set(previous.subarray(previous.length - carried), 0);
  }
  // If the chunk alone overflows the window, keep only its tail.
  const chunkStart = Math.max(0, chunk.length - next.length);
  next.set(chunk.subarray(chunkStart), carried);

  state.utterancePcm = next;
}
|
|
/**
 * Resample a mono signal with linear interpolation.
 *
 * The first and last input samples map onto the first and last output
 * samples; everything in between is interpolated.
 *
 * @param {Float32Array} input - Source samples.
 * @param {number} fromRate - Sample rate of `input`.
 * @param {number} toRate - Desired sample rate.
 * @returns {Float32Array} A new buffer (a copy when the rates are equal,
 *   empty when `input` is empty).
 */
function resampleLinear(input, fromRate, toRate) {
  // An empty input used to yield a single NaN sample, because the output
  // length is clamped to at least one and input[0] does not exist.
  if (input.length === 0) return new Float32Array(0);
  if (fromRate === toRate) return new Float32Array(input);

  const outLen = Math.max(1, Math.round(input.length * toRate / fromRate));
  const output = new Float32Array(outLen);
  const lastIndex = input.length - 1;
  // Input positions advanced per output sample.
  const ratio = lastIndex / Math.max(1, outLen - 1);
  for (let i = 0; i < outLen; i++) {
    const x = i * ratio;
    const j = Math.floor(x);
    const frac = x - j;
    output[i] = input[j] * (1 - frac) + input[Math.min(j + 1, lastIndex)] * frac;
  }
  return output;
}
|
|
/**
 * Convert a frequency in hertz to mels using 2595 * log10(1 + hz / 700).
 * @param {number} hz - Frequency in Hz.
 * @returns {number} Value on the mel scale.
 */
function hzToMel(hz) {
  const ratio = 1 + hz / 700;
  return 2595 * Math.log10(ratio);
}
|
|
/**
 * Convert a mel value back to hertz; inverse of `hzToMel`.
 * @param {number} mel - Value on the mel scale.
 * @returns {number} Frequency in Hz.
 */
function melToHz(mel) {
  const exponent = mel / 2595;
  return 700 * (10 ** exponent - 1);
}
|
|
/**
 * Build the mel filterbank from the table embedded on the page.
 * @returns {Float32Array[]} One `Float32Array` of weights per mel band.
 * @throws {Error} When `window.VISUALEARS_MEL_FILTERS` is not an array with
 *   exactly N_MELS rows.
 */
function createMelFilters() {
  const embedded = window.VISUALEARS_MEL_FILTERS;
  const usable = Array.isArray(embedded) && embedded.length === N_MELS;
  if (!usable) {
    throw new Error("Missing embedded Slaney mel filterbank");
  }
  return embedded.map((weights) => Float32Array.from(weights));
}
|
|
/**
 * Precompute the twiddle factors for an FFT of size `n`.
 * @param {number} n - FFT size.
 * @returns {{n: number, cos: Float32Array, sin: Float32Array}} Tables of
 *   cos(-2*pi*k/n) and sin(-2*pi*k/n) for k in [0, n/2).
 */
function createFftPlan(n) {
  const tableSize = n / 2;
  const plan = {
    n,
    cos: new Float32Array(tableSize),
    sin: new Float32Array(tableSize),
  };
  for (let k = 0; k < plan.cos.length; k += 1) {
    const angle = -2 * Math.PI * k / n;
    plan.cos[k] = Math.cos(angle);
    plan.sin[k] = Math.sin(angle);
  }
  return plan;
}
|
|
/**
 * Power spectrum of a real-valued frame via an iterative radix-2 FFT.
 *
 * @param {Float32Array} frame - Real input samples (at most `plan.n` of them).
 * @param {{n: number, cos: Float32Array, sin: Float32Array}} plan - Twiddle
 *   tables as produced by `createFftPlan`.
 * @returns {Float32Array} `plan.n / 2 + 1` values of re^2 + im^2 for the
 *   non-negative frequency bins.
 */
function fftRealPower(frame, plan) {
  const size = plan.n;
  const real = new Float32Array(size);
  const imag = new Float32Array(size);
  real.set(frame);

  // Bit-reversal permutation; `reversed` tracks the bit-reversed counter.
  let reversed = 0;
  for (let position = 1; position < size; position++) {
    let mask = size >> 1;
    while (reversed & mask) {
      reversed ^= mask;
      mask >>= 1;
    }
    reversed ^= mask;
    if (position < reversed) {
      const realTmp = real[position];
      real[position] = real[reversed];
      real[reversed] = realTmp;
      const imagTmp = imag[position];
      imag[position] = imag[reversed];
      imag[reversed] = imagTmp;
    }
  }

  // Butterfly passes over spans of 2, 4, ..., size.
  for (let span = 2; span <= size; span <<= 1) {
    const halfSpan = span >> 1;
    const twiddleStride = size / span;
    for (let base = 0; base < size; base += span) {
      for (let k = 0; k < halfSpan; k++) {
        const twiddle = k * twiddleStride;
        const wr = plan.cos[twiddle];
        const wi = plan.sin[twiddle];
        const lo = base + k;
        const hi = lo + halfSpan;
        const loRe = real[lo];
        const loIm = imag[lo];
        const rotRe = real[hi] * wr - imag[hi] * wi;
        const rotIm = real[hi] * wi + imag[hi] * wr;
        real[lo] = loRe + rotRe;
        imag[lo] = loIm + rotIm;
        real[hi] = loRe - rotRe;
        imag[hi] = loIm - rotIm;
      }
    }
  }

  const power = new Float32Array(size / 2 + 1);
  for (let bin = 0; bin < power.length; bin++) {
    power[bin] = real[bin] * real[bin] + imag[bin] * imag[bin];
  }
  return power;
}
|
|
/**
 * Map an out-of-range index back into [0, length) by mirroring it at the
 * array edges without repeating the edge sample (…2 1 0 1 2… at the start).
 * @param {number} index - Possibly out-of-range index.
 * @param {number} length - Array length.
 * @returns {number} In-range index; 0 when `length` is at most 1.
 */
function reflectIndex(index, length) {
  if (length <= 1) return 0;
  let position = index;
  while (position < 0 || position >= length) {
    // Mirror around the first element, then around the last one.
    position = position < 0 ? -position : position;
    position = position >= length ? 2 * length - position - 2 : position;
  }
  return position;
}
|
|
/**
 * Turn PCM audio into per-band normalised log-mel features.
 *
 * Only the newest (FIXED_FRAMES - 1) * HOP_LENGTH samples are used. Frames
 * are centred with reflect padding, Hann-windowed, transformed with the
 * cached FFT plan, projected through the cached mel filterbank and
 * log-compressed. Each mel band is then normalised to zero mean and unit
 * variance over the valid frames only.
 *
 * @param {Float32Array} pcm - Samples at SAMPLE_RATE.
 * @returns {{features: Float32Array, frameCount: number}} `features` is laid
 *   out band-major with FIXED_FRAMES columns per band (columns at or beyond
 *   `frameCount` stay zero); `frameCount` is the number of valid frames.
 */
function pcmToLogMel(pcm) {
  // Build the FFT tables and mel filterbank once, on first use.
  state.fftPlan ||= createFftPlan(N_FFT);
  state.melFilters ||= createMelFilters();

  const sampleCap = (FIXED_FRAMES - 1) * HOP_LENGTH;
  const samples = pcm.length > sampleCap ? pcm.subarray(pcm.length - sampleCap) : pcm;
  const frameCount = Math.max(1, Math.min(FIXED_FRAMES, Math.floor(samples.length / HOP_LENGTH) + 1));
  const features = new Float32Array(N_MELS * FIXED_FRAMES);

  // Symmetric Hann window of WIN_LENGTH samples.
  const hann = new Float32Array(WIN_LENGTH);
  for (let w = 0; w < WIN_LENGTH; w++) {
    hann[w] = 0.5 - 0.5 * Math.cos(2 * Math.PI * w / (WIN_LENGTH - 1));
  }

  // Pre-emphasis; always at least one sample long so reflection has a source.
  const emphasized = new Float32Array(Math.max(1, samples.length));
  if (samples.length > 0) emphasized[0] = samples[0];
  for (let s = 1; s < samples.length; s++) {
    emphasized[s] = samples[s] - PREEMPHASIS * samples[s - 1];
  }

  for (let t = 0; t < frameCount; t++) {
    const frame = new Float32Array(N_FFT);
    const frameStart = t * HOP_LENGTH - CENTER_PAD;
    // The window occupies FFT slots [WINDOW_OFFSET, WINDOW_OFFSET + WIN_LENGTH);
    // the remaining slots stay zero.
    for (let w = 0; w < WIN_LENGTH; w++) {
      const slot = w + WINDOW_OFFSET;
      const sourceIndex = reflectIndex(frameStart + slot, emphasized.length);
      frame[slot] = emphasized[sourceIndex] * hann[w];
    }

    const power = fftRealPower(frame, state.fftPlan);
    for (let m = 0; m < N_MELS; m++) {
      const weights = state.melFilters[m];
      let energy = 0;
      for (let bin = 0; bin < weights.length; bin++) {
        energy += power[bin] * weights[bin];
      }
      features[m * FIXED_FRAMES + t] = Math.log(energy + LOG_ZERO_GUARD);
    }
  }

  // Per-band mean/variance normalisation over the valid frames.
  for (let m = 0; m < N_MELS; m++) {
    const band = features.subarray(m * FIXED_FRAMES, m * FIXED_FRAMES + frameCount);

    let mean = 0;
    for (let t = 0; t < frameCount; t++) mean += band[t];
    mean /= frameCount;

    let variance = 0;
    for (let t = 0; t < frameCount; t++) {
      const deviation = band[t] - mean;
      variance += deviation * deviation;
    }

    const scale = 1 / Math.sqrt(variance / frameCount + 1e-5);
    for (let t = 0; t < frameCount; t++) {
      band[t] = (band[t] - mean) * scale;
    }
  }

  return { features, frameCount };
}
|
|
/**
 * Tell whether a vocabulary piece is wrapped in angle brackets ("<...>").
 * @param {string} piece - Vocabulary piece.
 * @returns {boolean} True when the piece starts with "<" and ends with ">".
 */
function isSpecialToken(piece) {
  if (!piece.startsWith("<")) return false;
  return piece.endsWith(">");
}
|
|
/**
 * Greedy CTC decoding of a [time, vocab] logit matrix.
 *
 * For every time step the highest-scoring id is taken (the first one wins on
 * ties). Blanks, immediate repeats, empty pieces and angle-bracket special
 * tokens are dropped. The remaining pieces are concatenated, "▁" becomes a
 * space, and whitespace is collapsed and trimmed.
 *
 * @param {ArrayLike<number>} logits - Flat logits, row-major by time step.
 * @param {number} timeSteps - Number of time steps to decode.
 * @param {number} vocabSize - Entries per time step.
 * @param {string} [logitsType="float32"] - Element type of `logits`.
 * @returns {string} Decoded text.
 */
function decodeCtc(logits, timeSteps, vocabSize, logitsType = "float32") {
  const { tokens, blankId } = state;

  // Index of the largest logit in the row starting at `rowStart`.
  const argmaxOfRow = (rowStart) => {
    let winner = 0;
    let winnerScore = -Infinity;
    for (let id = 0; id < vocabSize; id++) {
      const score = tensorValue(logits, rowStart + id, logitsType);
      if (score > winnerScore) {
        winnerScore = score;
        winner = id;
      }
    }
    return winner;
  };

  const pieces = [];
  let lastId = -1;
  for (let step = 0; step < timeSteps; step++) {
    const id = argmaxOfRow(step * vocabSize);
    const piece = tokens[id] || "";
    const keep = id !== blankId && id !== lastId && piece && !isSpecialToken(piece);
    if (keep) pieces.push(piece);
    lastId = id;
  }

  return pieces
    .join("")
    .replaceAll("▁", " ")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Probe WebGPU and publish a short adapter description in `state.gpuName`
 * and the "stat-gpu" readout.
 *
 * Possible values: "not exposed" (no `navigator.gpu`), "unavailable" (no
 * adapter), "unavailable: <message>" (the probe threw), the space-joined
 * vendor / architecture / device fields, or "available" when no field is set.
 *
 * @returns {Promise<void>}
 */
async function inspectGpu() {
  const publish = () => setText("stat-gpu", state.gpuName);

  if (!navigator.gpu) {
    state.gpuName = "not exposed";
    publish();
    return;
  }

  const describe = (info) =>
    [info.vendor, info.architecture, info.device].filter(Boolean).join(" ") || "available";

  try {
    const adapter = await navigator.gpu.requestAdapter();
    let label;
    if (!adapter) {
      label = "unavailable";
    } else if (adapter.info) {
      label = describe(adapter.info);
    } else if (adapter.requestAdapterInfo) {
      // Fallback for adapters that expose only the method, not the `info` property.
      label = describe(await adapter.requestAdapterInfo());
    } else {
      label = "available";
    }
    state.gpuName = label;
  } catch (err) {
    state.gpuName = `unavailable: ${err.message}`;
  }
  publish();
}
|
|
/**
 * Load the compatibility (WASM-only) build of onnxruntime-web by injecting a
 * `<script>` tag, and return the `window.ort` object it defines.
 *
 * The result is cached in `ortCompatRuntime`. Concurrent callers share the
 * same script element. A failed download removes its element so that a later
 * call starts a fresh attempt instead of waiting on events that have already
 * fired, which previously made the WASM retry in the loader hang forever.
 *
 * @returns {Promise<object>} The compatibility ONNX Runtime namespace.
 * @throws {Error} When the script fails to load or does not define `window.ort`.
 */
async function loadCompatOrtRuntime() {
  if (ortCompatRuntime) return ortCompatRuntime;

  const scriptId = "ort-compat-wasm-js";
  const scriptFailure = () => new Error("Compatibility ONNX Runtime script failed");

  await new Promise((resolve, reject) => {
    const existing = document.getElementById(scriptId);
    if (existing) {
      // Already finished loading earlier: its "load" event will never fire again.
      if (existing.dataset.loaded === "true") {
        resolve();
        return;
      }
      // Still in flight: piggy-back on the pending request.
      existing.addEventListener("load", resolve, { once: true });
      existing.addEventListener("error", () => reject(scriptFailure()), { once: true });
      return;
    }

    const script = document.createElement("script");
    script.id = scriptId;
    script.async = true;
    script.crossOrigin = "anonymous";
    script.src = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_COMPAT_VERSION}/dist/ort.wasm.min.js`;
    script.onload = () => {
      script.dataset.loaded = "true";
      resolve();
    };
    script.onerror = () => {
      // Drop the dead element so the next call can retry the download.
      script.remove();
      reject(scriptFailure());
    };
    document.head.appendChild(script);
  });

  if (!window.ort) throw new Error("Compatibility ONNX Runtime did not initialize");
  ortCompatRuntime = window.ort;
  return ortCompatRuntime;
}
|
|
/**
 * Detect WebAssembly SIMD and threading support; the result is computed once
 * and cached in `wasmFeaturePromise`.
 *
 * SIMD is probed by validating a tiny module that uses v128 instructions.
 * Threads require `Atomics`, `SharedArrayBuffer` and a cross-origin-isolated
 * page. Any error during probing yields `{ simd: false, threads: false }`.
 *
 * @returns {Promise<{simd: boolean, threads: boolean}>}
 */
async function detectWasmFeatures() {
  if (!wasmFeaturePromise) {
    const probe = () => {
      const simdProbe = new Uint8Array([
        0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00,
        0x01, 0x05, 0x01, 0x60, 0x00, 0x01, 0x7b,
        0x03, 0x02, 0x01, 0x00,
        0x0a, 0x0a, 0x01, 0x08, 0x00, 0x41, 0x00, 0xfd, 0x0f, 0xfd, 0x62, 0x0b,
      ]);
      const simd = typeof WebAssembly !== "undefined" && WebAssembly.validate(simdProbe);
      const threads =
        typeof Atomics !== "undefined" &&
        typeof SharedArrayBuffer !== "undefined" &&
        !!window.crossOriginIsolated;
      return { simd, threads };
    };
    const noFeatures = () => ({ simd: false, threads: false });
    wasmFeaturePromise = Promise.resolve().then(probe).catch(noFeatures);
  }
  return wasmFeaturePromise;
}
|
|
/**
 * Apply environment settings on an ONNX Runtime object before a session is
 * created.
 *
 * @param {object} ort - ONNX Runtime namespace; `ort.env` is mutated.
 * @param {string} provider - Requested execution provider; "wasm" selects
 *   the compatibility build's settings.
 * @param {string} wasmMode - "nosimd" disables SIMD on the compatibility path.
 * @param {{simd: boolean, threads: boolean}|null} features - Detected WASM
 *   features; only consulted on the compatibility path.
 * @returns {string} `provider`, unchanged.
 */
function configureWasm(ort, provider, wasmMode, features) {
  const wasmEnv = ort.env.wasm;
  const usingCompat = provider === "wasm";
  // Up to four threads, bounded by the reported core count.
  const threadBudget = () => Math.max(1, Math.min(4, navigator.hardwareConcurrency || 1));

  const runtimeVersion = usingCompat ? ORT_COMPAT_VERSION : ORT_WEBGPU_VERSION;
  wasmEnv.wasmPaths = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${runtimeVersion}/dist/`;

  if (usingCompat) {
    wasmEnv.simd = wasmMode !== "nosimd" && !!features?.simd;
    wasmEnv.numThreads = features?.threads ? threadBudget() : 1;
  } else {
    wasmEnv.numThreads = threadBudget();
  }

  if (ort.env.webgpu) {
    ort.env.webgpu.profiling = false;
  }
  return provider;
}
|
|
/**
 * Create an inference session for the requested execution provider.
 *
 * For "wasm" the compatibility runtime is loaded on demand and the
 * single-file model is used. Any other provider uses the runtime stored in
 * `ortRuntime` and the graph plus its external data file.
 *
 * @param {string} provider - Execution provider name.
 * @param {string} [wasmMode="optimized"] - "nosimd" forces the non-SIMD WASM path.
 * @returns {Promise<{provider: string, session: object, ort: object}>} The
 *   session, the runtime that created it and a label for the path in use
 *   ("wasm-nosimd", "wasm-simd", "wasm-simd-threaded" or the provider name).
 * @throws {Error} When no runtime is available or session creation fails.
 */
async function createSession(provider, wasmMode = "optimized") {
  const usingCompat = provider === "wasm";

  let features = null;
  if (usingCompat) features = await detectWasmFeatures();
  const simdOff = usingCompat && (!features.simd || wasmMode === "nosimd");
  const effectiveWasmMode = simdOff ? "nosimd" : "optimized";

  const ort = usingCompat ? await loadCompatOrtRuntime() : ortRuntime;
  if (!ort) throw new Error("ONNX Runtime Web did not initialize");

  const effectiveProvider = configureWasm(ort, provider, effectiveWasmMode, features);
  const baseOptions = {
    executionProviders: [effectiveProvider],
    graphOptimizationLevel: "all",
    enableMemPattern: false,
    enableCpuMemArena: true,
  };

  if (!usingCompat) {
    const externalData = [{ path: MODEL_DATA_FILE, data: MODEL_DATA_URL }];
    const session = await ort.InferenceSession.create(MODEL_URL, { ...baseOptions, externalData });
    return { provider: effectiveProvider, session, ort };
  }

  let label = "wasm-simd";
  if (effectiveWasmMode === "nosimd") label = "wasm-nosimd";
  else if (features.threads) label = "wasm-simd-threaded";
  const session = await ort.InferenceSession.create(MODEL_EMBEDDED_URL, baseOptions);
  return { provider: label, session, ort };
}
|
|
/**
 * Load the tokenizer table and open an inference session for the provider
 * chosen in the "provider" select.
 *
 * If WebGPU is selected but `navigator.gpu` is missing, the WASM path is
 * used directly. If the first attempt throws for any reason, one retry is
 * made with CPU-only WASM and SIMD disabled; an error from that retry
 * propagates to the caller. On success the stats are reset and the
 * "start-mic" button is enabled.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the embedded token table is missing or both load
 *   attempts fail.
 */
async function loadModel() {
  $("load-model").disabled = true;
  setStatus("Loading tokenizer and ONNX Runtime WebGPU...");
  updateHeap();
  await inspectGpu();

  const tokenJson = window.VISUALEARS_TOKENS;
  if (!tokenJson?.tokens?.length) throw new Error("Missing embedded CTC tokens");
  state.tokens = tokenJson.tokens;
  state.blankId = tokenJson.blank_id;

  const provider = $("provider").value;
  const start = performance.now();
  let effectiveProvider = provider;

  // Store a freshly created session and remember which path produced it.
  const adopt = (result) => {
    state.session = result.session;
    state.ort = result.ort;
    effectiveProvider = result.provider;
  };

  try {
    const noGpu = provider === "webgpu" && !navigator.gpu;
    if (noGpu) {
      setStatus("WebGPU is unavailable here. Opening the best supported CPU WASM path...");
    } else if (provider === "wasm") {
      setStatus("Opening the best supported CPU WASM path. First load is about 232 MB; cached reloads should be much faster.");
    } else {
      setStatus(`Downloading/opening ${provider.toUpperCase()} PhaseB FP16 session. First load is about 232 MB; cached reloads should be much faster.`);
    }
    adopt(await createSession(noGpu ? "wasm" : provider));
  } catch (err) {
    const message = err && err.message ? err.message : String(err || "unknown error");
    setStatus(`Primary load failed (${message}). Retrying CPU-only WASM without SIMD...`);
    adopt(await createSession("wasm", "nosimd"));
  }

  const elapsed = performance.now() - start;
  state.provider = effectiveProvider;
  setText("stat-provider", effectiveProvider);
  for (const statId of ["stat-decode", "stat-rtf", "stat-speed"]) {
    setText(statId, "-");
  }
  updateHeap();
  setStatus(`Model ready in ${formatMs(elapsed)} (${effectiveProvider}). Start the microphone when you are ready.`);
  $("start-mic").disabled = false;
}
|
|
/**
 * Start capturing microphone audio and feeding it to the recogniser.
 *
 * Builds a media-stream -> script-processor -> destination audio graph,
 * resets the transcript and voice-activity state, installs the audio
 * callback and starts the periodic partial decode. Does nothing when no
 * model is loaded.
 *
 * The audio callback applies a simple energy gate: a chunk whose RMS reaches
 * the "noise-gate" value counts as speech and is appended to the utterance;
 * after speech, up to 350 ms of trailing silence is kept and 900 ms of
 * silence triggers finalisation.
 *
 * @returns {Promise<void>}
 */
async function startMic() {
  if (!state.session) return;

  const constraints = {
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  };
  state.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
  state.audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });
  state.source = state.audioContext.createMediaStreamSource(state.mediaStream);
  state.processor = state.audioContext.createScriptProcessor(4096, 1, 1);
  state.source.connect(state.processor);
  state.processor.connect(state.audioContext.destination);

  Object.assign(state, {
    utterancePcm: new Float32Array(0),
    finalTranscript: "",
    partialTranscript: "",
    speechActive: false,
    silenceMs: 0,
    finalizing: false,
  });
  renderTranscript();
  state.recording = true;
  $("start-mic").disabled = true;
  $("stop-mic").disabled = false;
  setStatus("Listening. Partial text can move while you speak; final text freezes after silence.");

  const handleAudio = (event) => {
    if (!state.recording) return;

    const input = event.inputBuffer.getChannelData(0);
    const energy = input.reduce((sum, sample) => sum + sample * sample, 0);
    const rms = Math.sqrt(energy / input.length);
    $("level-bar").style.width = `${Math.min(100, rms * 900).toFixed(1)}%`;

    const gate = Number($("noise-gate").value);
    const chunk = resampleLinear(input, state.audioContext.sampleRate, SAMPLE_RATE);
    const chunkMs = chunk.length / SAMPLE_RATE * 1000;

    if (rms >= gate) {
      if (!state.speechActive) {
        // Speech onset: start a fresh utterance.
        state.utterancePcm = new Float32Array(0);
        state.partialTranscript = "";
        state.silenceMs = 0;
        state.speechActive = true;
        state.finalizing = false;
        renderTranscript();
      }
      state.silenceMs = 0;
      appendToUtterance(chunk);
      return;
    }

    // Silence only matters while an utterance is open.
    if (!state.speechActive) return;
    state.silenceMs += chunkMs;
    if (state.silenceMs <= 350) appendToUtterance(chunk);
    const shouldFinalize = state.silenceMs >= 900 && !state.finalizing;
    if (shouldFinalize) {
      state.finalizing = true;
      finalizeUtterance();
    }
  };
  state.processor.onaudioprocess = handleAudio;

  scheduleDecode();
}
|
|
/**
 * Stop recording: cancel the decode timer, tear down the audio graph, stop
 * the microphone tracks and reset the related controls.
 * Safe to call when nothing is running.
 */
function stopMic() {
  state.recording = false;
  clearTimeout(state.decodeTimer);
  state.decodeTimer = null;

  const { processor, source, mediaStream, audioContext } = state;
  processor?.disconnect();
  source?.disconnect();
  if (mediaStream) {
    for (const track of mediaStream.getTracks()) track.stop();
  }
  audioContext?.close();
  Object.assign(state, {
    processor: null,
    source: null,
    mediaStream: null,
    audioContext: null,
  });

  $("level-bar").style.width = "0%";
  $("start-mic").disabled = !state.session;
  $("stop-mic").disabled = true;
  setStatus("Stopped.");
}
|
|
/**
 * Arm (or re-arm) the timer that triggers the next partial decode.
 *
 * Any pending timer is cancelled first. While recording, a decode runs after
 * the number of seconds given by the "decode-every" input and the timer is
 * then re-armed; when not recording nothing is scheduled.
 */
function scheduleDecode() {
  clearTimeout(state.decodeTimer);
  if (!state.recording) return;

  const delayMs = Number($("decode-every").value) * 1000;
  const tick = async () => {
    await runDecode();
    scheduleDecode();
  };
  state.decodeTimer = setTimeout(tick, delayMs);
}
|
|
/**
 * Run one partial (non-final) decode if conditions allow it.
 *
 * Skipped when no model is loaded, a decode or finalisation is in progress,
 * no speech is active, or fewer than 0.8 s of audio have been collected.
 *
 * @returns {Promise<void>}
 */
async function runDecode() {
  if (!state.session) return;
  if (state.decoding || state.finalizing) return;
  if (!state.speechActive) return;
  const minSamples = SAMPLE_RATE * 0.8;
  if (state.utterancePcm.length < minSamples) return;
  await decodeUtterance(false);
}
|
|
/**
 * Produce the final transcript for the current utterance.
 *
 * If a decode is already running, the call re-schedules itself 150 ms later.
 * If there is no model or less than 0.25 s of audio, the utterance is
 * discarded by clearing the speech and finalising flags. Otherwise a final
 * decode is run.
 *
 * @returns {Promise<void>}
 */
async function finalizeUtterance() {
  if (state.decoding) {
    setTimeout(finalizeUtterance, 150);
    return;
  }

  const minSamples = SAMPLE_RATE * 0.25;
  const canDecode = Boolean(state.session) && state.utterancePcm.length >= minSamples;
  if (canDecode) {
    await decodeUtterance(true);
    return;
  }

  state.speechActive = false;
  state.finalizing = false;
}
|
|
/**
 * Run the model on the current utterance and publish the transcript.
 *
 * The newest "window-seconds" of `state.utterancePcm` are used; clips shorter
 * than 2.5 s are zero-padded at the end to that length before feature
 * extraction. The logits are decoded greedily for the number of steps the
 * model reports (or ceil(frames / OUTPUT_STRIDE) when it reports none).
 *
 * A final decode appends the text to the final transcript and resets the
 * utterance state; a partial decode only replaces the partial transcript.
 * Errors are logged and shown in the status line rather than thrown, and
 * `state.decoding` is always cleared on exit.
 *
 * @param {boolean} final - True to finalise the utterance, false for a
 *   partial (live) decode.
 * @returns {Promise<void>}
 */
async function decodeUtterance(final) {
  state.decoding = true;
  try {
    const maxSeconds = Number($("window-seconds").value);
    const samples = Math.min(state.utterancePcm.length, Math.floor(maxSeconds * SAMPLE_RATE));
    const sourceAudio = state.utterancePcm.subarray(state.utterancePcm.length - samples);

    // Pad short clips with trailing silence up to the minimum live window.
    const minLiveSamples = Math.floor(2.5 * SAMPLE_RATE);
    let audio = sourceAudio;
    if (sourceAudio.length < minLiveSamples) {
      audio = new Float32Array(minLiveSamples);
      audio.set(sourceAudio, 0);
    }

    const preprocessStarted = performance.now();
    const { features, frameCount } = pcmToLogMel(audio);
    const ort = state.ort || ortRuntime;
    if (!ort) throw new Error("ONNX Runtime Web did not initialize");
    const tensor = new ort.Tensor("float32", features, [1, N_MELS, FIXED_FRAMES]);
    const lengthTensor = new ort.Tensor("int64", BigInt64Array.from([BigInt(frameCount)]), [1]);

    const inferStarted = performance.now();
    const output = await state.session.run({ processed_signal: tensor, processed_signal_length: lengthTensor });
    const inferElapsed = performance.now() - inferStarted;
    const elapsed = performance.now() - preprocessStarted;

    const logits = output.logits.data;
    const dims = output.logits.dims;
    const logitsType = output.logits.type;
    const vocabSize = dims[2] || state.tokens.length;
    const encodedRaw = output.encoded_lengths?.data?.[0];
    const encodedLength = encodedRaw === undefined ? Math.ceil(frameCount / OUTPUT_STRIDE) : Number(encodedRaw);
    const usableSteps = Math.max(1, Math.min(dims[1], encodedLength));
    const transcript = decodeCtc(logits, usableSteps, vocabSize, logitsType);

    // Captured audio versus what the model actually saw. The model window
    // includes the zero padding and is capped by the fixed feature length.
    const audioSeconds = samples / SAMPLE_RATE;
    const modelSeconds = Math.min(audio.length, (FIXED_FRAMES - 1) * HOP_LENGTH) / SAMPLE_RATE;
    const rtf = (elapsed / 1000) / Math.max(0.001, audioSeconds);
    const speed = Math.max(0.001, audioSeconds) / Math.max(0.001, elapsed / 1000);

    if (final) {
      if (transcript) {
        state.finalTranscript = state.finalTranscript ? `${state.finalTranscript}\n${transcript}` : transcript;
      }
      state.partialTranscript = "";
      state.speechActive = false;
      state.finalizing = false;
      state.silenceMs = 0;
      state.utterancePcm = new Float32Array(0);
    } else {
      state.partialTranscript = transcript;
    }

    renderTranscript();
    setText("stat-decode", `${formatMs(elapsed)} total / ${formatMs(inferElapsed)} infer`);
    setText("stat-rtf", `${rtf.toFixed(3)}`);
    setText("stat-speed", `${speed.toFixed(1)}x`);
    setText("stat-audio", `${audioSeconds.toFixed(1)} s`);
    setText("stat-frames", `${frameCount} / ${FIXED_FRAMES}`);
    setText("stat-provider", state.provider || $("provider").value);
    setText("stat-last", new Date().toLocaleTimeString());
    updateHeap();
    // Previously both figures printed the captured duration.
    setStatus(`${final ? "Finalized" : "Decoded partial"} ${modelSeconds.toFixed(1)} s model window (${audioSeconds.toFixed(1)} s captured) with ${usableSteps} CTC steps.`);
  } catch (err) {
    console.error(err);
    // The thrown value may not be an Error instance, so do not assume `.message`.
    const message = err && err.message ? err.message : String(err || "unknown error");
    setStatus(`Decode failed: ${message}`);
    if (final) {
      state.speechActive = false;
      state.finalizing = false;
    }
  } finally {
    state.decoding = false;
  }
}
|
|
/**
 * Release an inference session once no decode is running.
 * Never rejects; failures are only logged.
 * @param {object} session - Session that is no longer referenced by `state`.
 * @returns {Promise<void>}
 */
async function releaseSessionWhenIdle(session) {
  // A decode that started before the switch may still be using the session.
  while (state.decoding) {
    await new Promise((resolve) => setTimeout(resolve, 150));
  }
  try {
    await session.release?.();
  } catch (err) {
    console.warn("Failed to release previous inference session", err);
  }
}

/**
 * Application entry point: wait for the controls to exist, import the WebGPU
 * build of ONNX Runtime Web, and wire up the buttons and provider selector.
 *
 * Waiting for the DOM happens inside this one promise, so a failure at any
 * later step still reaches the caller's `.catch`. The earlier timer-based
 * retry started a detached call whose errors went unhandled.
 *
 * @returns {Promise<void>}
 */
async function boot() {
  while (!$("load-model")) {
    await new Promise((resolve) => setTimeout(resolve, 100));
  }

  ortRuntime = await import(`https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_WEBGPU_VERSION}/dist/ort.webgpu.min.mjs`);

  $("load-model").addEventListener("click", () => loadModel().catch((err) => {
    console.error(err);
    setStatus(`Load failed: ${err.message}`);
    $("load-model").disabled = false;
  }));
  $("start-mic").addEventListener("click", () => startMic().catch((err) => {
    console.error(err);
    setStatus(`Mic failed: ${err.message}`);
    stopMic();
  }));
  $("stop-mic").addEventListener("click", stopMic);
  $("provider").addEventListener("change", () => {
    if (state.session) {
      // Free the old session's resources instead of merely dropping the
      // reference; the model is large and a reload allocates a new one.
      const staleSession = state.session;
      state.session = null;
      void releaseSessionWhenIdle(staleSession);
      $("start-mic").disabled = true;
      $("load-model").disabled = false;
      setStatus("Provider changed. Load the model again.");
    }
  });

  setText("stat-provider", "-");
  setText("stat-gpu", "-");
  updateHeap();
}


boot().catch((err) => {
  console.error(err);
  setStatus(`Startup failed: ${err.message}`);
});
|
|