// ---------------------------------------------------------------------------
// Model, runtime and feature-extraction constants.
// ---------------------------------------------------------------------------
const MODEL_REPO = "Reza2kn/visualears-fastconformer-fa-depoisoned-phaseB-onnx-fp16";
const MODEL_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32.onnx";
const MODEL_DATA_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32.onnx.data";
const MODEL_EMBEDDED_FILE = "fastconformer_phaseB_ctc_fixed2005_len_fp16_iofp32_embedded.onnx";
const MODEL_URL = `https://huggingface.co/${MODEL_REPO}/resolve/main/${MODEL_FILE}`;
const MODEL_DATA_URL = `https://huggingface.co/${MODEL_REPO}/resolve/main/${MODEL_DATA_FILE}`;
const MODEL_EMBEDDED_URL = `https://huggingface.co/${MODEL_REPO}/resolve/main/${MODEL_EMBEDDED_FILE}`;
const ORT_WEBGPU_VERSION = "1.26.0";
const ORT_COMPAT_VERSION = "1.18.0";
const ORT_COMPAT_SCRIPT_ID = "ort-compat-wasm-js";
const SAMPLE_RATE = 16000;
const N_FFT = 512;
const WIN_LENGTH = 400;
const HOP_LENGTH = 160;
const N_MELS = 80;
const FIXED_FRAMES = 2005;
const OUTPUT_STRIDE = 8;
const PREEMPHASIS = 0.0;
const CENTER_PAD = N_FFT / 2;
const WINDOW_OFFSET = (N_FFT - WIN_LENGTH) / 2;
const LOG_ZERO_GUARD = 2 ** -24;
// Fallbacks used when the corresponding form control is missing or holds an unusable value.
const DEFAULT_WINDOW_SECONDS = 20;
const DEFAULT_DECODE_INTERVAL_SECONDS = 1;
// Polling interval used while waiting for an in-flight decode to finish.
const BUSY_RETRY_MS = 150;

/** Shorthand for `document.getElementById`. */
const $ = (id) => document.getElementById(id);

let ortRuntime = null;
let ortCompatRuntime = null;
let ortCompatPromise = null;
let wasmFeaturePromise = null;

/** Mutable application state shared by the capture, feature and decode steps. */
const state = {
  session: null,
  tokens: null,
  blankId: 1024,
  audioContext: null,
  source: null,
  processor: null,
  mediaStream: null,
  utterancePcm: new Float32Array(0),
  finalTranscript: "",
  partialTranscript: "",
  speechActive: false,
  silenceMs: 0,
  finalizing: false,
  recording: false,
  decoding: false,
  decodeTimer: null,
  lastDecodeAt: 0,
  fftPlan: null,
  melFilters: null,
  hannWindow: null,
  gpuName: "-",
  ort: null,
  provider: null,
};

// ---------------------------------------------------------------------------
// Small UI and formatting helpers.
// ---------------------------------------------------------------------------

/** Writes `message` into the `status` element. */
function setStatus(message) {
  $("status").textContent = message;
}

/** Writes `value` into the element with the given id. */
function setText(id, value) {
  $(id).textContent = value;
}

/**
 * Returns a printable message for any thrown value (Error or not).
 * @param {unknown} err
 * @returns {string}
 */
function errorMessage(err) {
  return err?.message || String(err || "unknown error");
}

/**
 * Parses `raw` as a number and returns it only when it is finite and > 0.
 * @param {unknown} raw - Typically the `.value` of an input element.
 * @param {number} fallback - Returned for NaN, infinite, zero or negative input.
 * @returns {number}
 */
function positiveNumberOr(raw, fallback) {
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

/** Reads the rolling audio window length (seconds) from the `window-seconds` control. */
function readWindowSeconds() {
  return positiveNumberOr($("window-seconds")?.value, DEFAULT_WINDOW_SECONDS);
}

/**
 * Formats a duration as whole milliseconds below one second, otherwise as
 * seconds with two decimals. Non-finite input yields "-".
 */
function formatMs(ms) {
  if (!Number.isFinite(ms)) return "-";
  return ms < 1000 ? `${ms.toFixed(0)} ms` : `${(ms / 1000).toFixed(2)} s`;
}

/** Shows JS heap usage in `stat-heap` when `performance.memory` is exposed. */
function updateHeap() {
  const mem = performance.memory;
  if (!mem) {
    setText("stat-heap", "unavailable");
    return;
  }
  setText("stat-heap", `${(mem.usedJSHeapSize / 1048576).toFixed(0)} / ${(mem.jsHeapSizeLimit / 1048576).toFixed(0)} MB`);
}

/** Re-renders the final and partial transcripts into the `transcript` element. */
function renderTranscript() {
  $("transcript").textContent = `Final:\n${state.finalTranscript || "..."}\n\nPartial:\n${state.partialTranscript || "..."}`;
}

// ---------------------------------------------------------------------------
// float16 <-> float32 conversion.
// ---------------------------------------------------------------------------

/**
 * Converts a number to IEEE 754 half-precision bits.
 * Magnitudes at or above 65504 saturate to the largest finite half (0x7bff)
 * rather than becoming infinity; magnitudes below 2^-24 flush to signed zero.
 * @param {number} value
 * @returns {number} 16-bit pattern.
 */
function float32ToFloat16Bits(value) {
  if (Number.isNaN(value)) return 0x7e00;
  if (value === Infinity) return 0x7c00;
  if (value === -Infinity) return 0xfc00;
  const sign = value < 0 || Object.is(value, -0) ? 0x8000 : 0;
  const abs = Math.abs(value);
  if (abs === 0) return sign;
  if (abs >= 65504) return sign | 0x7bff;
  if (abs < 5.960464477539063e-8) return sign;
  if (abs < 0.00006103515625) {
    // Subnormal half: the value is a multiple of 2^-24.
    return sign | Math.round(abs / 5.960464477539063e-8);
  }
  const exponent = Math.floor(Math.log2(abs));
  const mantissa = abs / (2 ** exponent) - 1;
  let halfExponent = exponent + 15;
  let halfMantissa = Math.round(mantissa * 1024);
  if (halfMantissa === 1024) {
    // Rounding carried out of the mantissa; bump the exponent instead.
    halfMantissa = 0;
    halfExponent += 1;
  }
  if (halfExponent >= 31) return sign | 0x7bff;
  return sign | (halfExponent << 10) | (halfMantissa & 0x03ff);
}

/**
 * Converts an array of numbers to half precision, using the native
 * `Float16Array` when the host provides one and raw bits in a `Uint16Array` otherwise.
 */
function float32ArrayToFloat16Bits(values) {
  if (globalThis.Float16Array) return new globalThis.Float16Array(values);
  const out = new Uint16Array(values.length);
  for (let i = 0; i < values.length; i++) out[i] = float32ToFloat16Bits(values[i]);
  return out;
}

/**
 * Decodes IEEE 754 half-precision bits into a JS number.
 * @param {number} bits - 16-bit pattern.
 * @returns {number}
 */
function float16BitsToFloat32(bits) {
  const sign = (bits & 0x8000) ? -1 : 1;
  const exponent = (bits >> 10) & 0x1f;
  const mantissa = bits & 0x03ff;
  if (exponent === 0) {
    return mantissa === 0 ? sign * 0 : sign * (mantissa / 1024) * 2 ** -14;
  }
  if (exponent === 31) {
    return mantissa ? NaN : sign * Infinity;
  }
  return sign * (1 + mantissa / 1024) * 2 ** (exponent - 15);
}

/**
 * Reads element `index` of tensor `data`, decoding raw half-precision bits
 * when `type` is "float16" and the buffer is not a native `Float16Array`.
 */
function tensorValue(data, index, type) {
  if (type !== "float16") return data[index];
  if (globalThis.Float16Array && data instanceof globalThis.Float16Array) return data[index];
  return float16BitsToFloat32(data[index]);
}

// ---------------------------------------------------------------------------
// Audio buffering and resampling.
// ---------------------------------------------------------------------------

/**
 * Appends `chunk` to `state.utterancePcm`, keeping only the most recent
 * samples that fit in the configured window length.
 * @param {Float32Array} chunk
 */
function appendToUtterance(chunk) {
  const maxSamples = Math.ceil(readWindowSeconds() * SAMPLE_RATE);
  const merged = new Float32Array(Math.min(maxSamples, state.utterancePcm.length + chunk.length));
  const keep = Math.max(0, merged.length - chunk.length);
  if (keep > 0) merged.set(state.utterancePcm.subarray(state.utterancePcm.length - keep), 0);
  merged.set(chunk.subarray(Math.max(0, chunk.length - merged.length)), keep);
  state.utterancePcm = merged;
}

/**
 * Linearly resamples `input` from `fromRate` to `toRate`, mapping the first
 * and last input samples onto the first and last output samples.
 * Always returns a new array. Empty input yields an empty array.
 * @param {Float32Array} input
 * @param {number} fromRate
 * @param {number} toRate
 * @returns {Float32Array}
 */
function resampleLinear(input, fromRate, toRate) {
  // Without this guard an empty input would index input[0] and emit one NaN sample.
  if (input.length === 0) return new Float32Array(0);
  if (fromRate === toRate) return new Float32Array(input);
  const outLen = Math.max(1, Math.round(input.length * toRate / fromRate));
  const output = new Float32Array(outLen);
  const ratio = (input.length - 1) / Math.max(1, outLen - 1);
  for (let i = 0; i < outLen; i++) {
    const x = i * ratio;
    const j = Math.floor(x);
    const frac = x - j;
    output[i] = input[j] * (1 - frac) + input[Math.min(j + 1, input.length - 1)] * frac;
  }
  return output;
}

// ---------------------------------------------------------------------------
// Log-mel feature extraction.
// ---------------------------------------------------------------------------

/** Converts a frequency in Hz to mels (2595 * log10(1 + hz / 700)). */
function hzToMel(hz) {
  return 2595 * Math.log10(1 + hz / 700);
}

/** Inverse of `hzToMel`. */
function melToHz(mel) {
  return 700 * (10 ** (mel / 2595) - 1);
}

/**
 * Copies the filterbank embedded in `window.VISUALEARS_MEL_FILTERS` into
 * Float32Array rows.
 * @throws {Error} When the global is missing or does not have N_MELS rows.
 */
function createMelFilters() {
  if (!Array.isArray(window.VISUALEARS_MEL_FILTERS) || window.VISUALEARS_MEL_FILTERS.length !== N_MELS) {
    throw new Error("Missing embedded Slaney mel filterbank");
  }
  return window.VISUALEARS_MEL_FILTERS.map((row) => Float32Array.from(row));
}

/**
 * Precomputes the twiddle factors for a radix-2 FFT of size `n`.
 * @param {number} n - FFT size; the radix-2 loops in `fftRealPower` need a power of two.
 */
function createFftPlan(n) {
  const cos = new Float32Array(n / 2);
  const sin = new Float32Array(n / 2);
  for (let i = 0; i < n / 2; i++) {
    cos[i] = Math.cos(-2 * Math.PI * i / n);
    sin[i] = Math.sin(-2 * Math.PI * i / n);
  }
  return { n, cos, sin };
}

/**
 * Computes the power spectrum (bins 0..n/2) of a real frame with an
 * in-place iterative radix-2 FFT. `frame` itself is not modified.
 * @param {Float32Array} frame - At most `plan.n` samples.
 * @param {{n: number, cos: Float32Array, sin: Float32Array}} plan
 * @returns {Float32Array} `n / 2 + 1` power values.
 */
function fftRealPower(frame, plan) {
  const n = plan.n;
  const re = new Float32Array(n);
  const im = new Float32Array(n);
  re.set(frame);
  // Bit-reversal permutation.
  let j = 0;
  for (let i = 1; i < n; i++) {
    let bit = n >> 1;
    for (; j & bit; bit >>= 1) j ^= bit;
    j ^= bit;
    if (i < j) {
      const tr = re[i];
      re[i] = re[j];
      re[j] = tr;
      const ti = im[i];
      im[i] = im[j];
      im[j] = ti;
    }
  }
  // Butterfly passes.
  for (let len = 2; len <= n; len <<= 1) {
    const half = len >> 1;
    const step = n / len;
    for (let i = 0; i < n; i += len) {
      for (let k = 0; k < half; k++) {
        const idx = k * step;
        const wr = plan.cos[idx];
        const wi = plan.sin[idx];
        const ur = re[i + k];
        const ui = im[i + k];
        const vr = re[i + k + half] * wr - im[i + k + half] * wi;
        const vi = re[i + k + half] * wi + im[i + k + half] * wr;
        re[i + k] = ur + vr;
        im[i + k] = ui + vi;
        re[i + k + half] = ur - vr;
        im[i + k + half] = ui - vi;
      }
    }
  }
  const power = new Float32Array(n / 2 + 1);
  for (let i = 0; i < power.length; i++) power[i] = re[i] * re[i] + im[i] * im[i];
  return power;
}

/**
 * Maps an out-of-range index back into [0, length) by mirroring around the
 * first and last elements (edge samples are not repeated).
 */
function reflectIndex(index, length) {
  if (length <= 1) return 0;
  while (index < 0 || index >= length) {
    if (index < 0) index = -index;
    if (index >= length) index = 2 * length - index - 2;
  }
  return index;
}

/** Returns the cached symmetric Hann window of WIN_LENGTH samples. */
function getHannWindow() {
  if (!state.hannWindow) {
    const hann = new Float32Array(WIN_LENGTH);
    for (let i = 0; i < WIN_LENGTH; i++) hann[i] = 0.5 - 0.5 * Math.cos(2 * Math.PI * i / (WIN_LENGTH - 1));
    state.hannWindow = hann;
  }
  return state.hannWindow;
}

/**
 * Turns PCM into a per-mel-normalised log-mel feature matrix laid out as
 * [N_MELS][FIXED_FRAMES]; columns past `frameCount` stay zero.
 * Only the most recent `(FIXED_FRAMES - 1) * HOP_LENGTH` samples are used.
 * @param {Float32Array} pcm - Mono samples at SAMPLE_RATE.
 * @returns {{features: Float32Array, frameCount: number}}
 */
function pcmToLogMel(pcm) {
  if (!state.fftPlan) state.fftPlan = createFftPlan(N_FFT);
  if (!state.melFilters) state.melFilters = createMelFilters();
  const maxSamples = (FIXED_FRAMES - 1) * HOP_LENGTH;
  if (pcm.length > maxSamples) pcm = pcm.subarray(pcm.length - maxSamples);
  const frameCount = Math.max(1, Math.min(FIXED_FRAMES, Math.floor(pcm.length / HOP_LENGTH) + 1));
  const features = new Float32Array(N_MELS * FIXED_FRAMES);
  const hann = getHannWindow();

  const emphasized = new Float32Array(Math.max(1, pcm.length));
  if (pcm.length > 0) emphasized[0] = pcm[0];
  for (let i = 1; i < pcm.length; i++) emphasized[i] = pcm[i] - PREEMPHASIS * pcm[i - 1];

  // The window always lands on the same N_FFT slots, so one buffer can be
  // reused: slots outside the window stay zero and the rest are overwritten.
  const frame = new Float32Array(N_FFT);
  for (let t = 0; t < frameCount; t++) {
    // Frames are centred on t * HOP_LENGTH; the signal is reflect-padded at both ends.
    const frameStart = t * HOP_LENGTH - CENTER_PAD;
    for (let w = 0; w < WIN_LENGTH; w++) {
      const j = w + WINDOW_OFFSET;
      const src = reflectIndex(frameStart + j, emphasized.length);
      frame[j] = emphasized[src] * hann[w];
    }
    const power = fftRealPower(frame, state.fftPlan);
    for (let m = 0; m < N_MELS; m++) {
      const filter = state.melFilters[m];
      let energy = 0;
      for (let k = 0; k < filter.length; k++) energy += power[k] * filter[k];
      features[m * FIXED_FRAMES + t] = Math.log(energy + LOG_ZERO_GUARD);
    }
  }

  // Normalise each mel band to zero mean / unit variance over the valid frames.
  for (let m = 0; m < N_MELS; m++) {
    const offset = m * FIXED_FRAMES;
    let mean = 0;
    for (let t = 0; t < frameCount; t++) mean += features[offset + t];
    mean /= frameCount;
    let variance = 0;
    for (let t = 0; t < frameCount; t++) {
      const d = features[offset + t] - mean;
      variance += d * d;
    }
    const scale = 1 / Math.sqrt(variance / frameCount + 1e-5);
    for (let t = 0; t < frameCount; t++) features[offset + t] = (features[offset + t] - mean) * scale;
  }
  return { features, frameCount };
}

// ---------------------------------------------------------------------------
// CTC decoding.
// ---------------------------------------------------------------------------

/** True for pieces of the form `<...>`, which are never emitted as text. */
function isSpecialToken(piece) {
  return piece.startsWith("<") && piece.endsWith(">");
}

/**
 * Greedy CTC decode: arg-max per time step, collapse repeats, drop blanks and
 * special tokens, then turn "▁" markers into spaces.
 * Reads the vocabulary from `state.tokens` and the blank id from `state.blankId`.
 * @param {ArrayLike<number>} logits - Row-major [timeSteps][vocabSize] scores.
 * @param {number} timeSteps
 * @param {number} vocabSize
 * @param {string} [logitsType="float32"] - Tensor element type of `logits`.
 * @returns {string}
 */
function decodeCtc(logits, timeSteps, vocabSize, logitsType = "float32") {
  const tokens = state.tokens;
  const blank = state.blankId;
  let previous = -1;
  const pieces = [];
  for (let t = 0; t < timeSteps; t++) {
    let best = 0;
    let bestValue = -Infinity;
    const base = t * vocabSize;
    for (let i = 0; i < vocabSize; i++) {
      const v = tensorValue(logits, base + i, logitsType);
      if (v > bestValue) {
        bestValue = v;
        best = i;
      }
    }
    const piece = tokens[best] || "";
    if (best !== blank && best !== previous && piece && !isSpecialToken(piece)) pieces.push(piece);
    previous = best;
  }
  return pieces.join("").replaceAll("▁", " ").replace(/\s+/g, " ").trim();
}

// ---------------------------------------------------------------------------
// Runtime discovery and session creation.
// ---------------------------------------------------------------------------

/** Queries the WebGPU adapter (if any) and shows a description in `stat-gpu`. */
async function inspectGpu() {
  if (!navigator.gpu) {
    state.gpuName = "not exposed";
    setText("stat-gpu", state.gpuName);
    return;
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      state.gpuName = "unavailable";
    } else if (adapter.info) {
      state.gpuName = [adapter.info.vendor, adapter.info.architecture, adapter.info.device].filter(Boolean).join(" ") || "available";
    } else if (adapter.requestAdapterInfo) {
      const info = await adapter.requestAdapterInfo();
      state.gpuName = [info.vendor, info.architecture, info.device].filter(Boolean).join(" ") || "available";
    } else {
      state.gpuName = "available";
    }
  } catch (err) {
    state.gpuName = `unavailable: ${err.message}`;
  }
  setText("stat-gpu", state.gpuName);
}

/**
 * Adds the compatibility ONNX Runtime `<script>` tag (or attaches to one that
 * is already present) and resolves once it has loaded.
 * @returns {Promise<void>}
 */
function injectCompatOrtScript() {
  return new Promise((resolve, reject) => {
    const fail = () => reject(new Error("Compatibility ONNX Runtime script failed"));
    const existing = document.getElementById(ORT_COMPAT_SCRIPT_ID);
    if (existing) {
      if (window.ort) {
        // Already loaded: its "load" event will not fire again, so do not wait for it.
        resolve();
        return;
      }
      existing.addEventListener("load", resolve, { once: true });
      existing.addEventListener("error", fail, { once: true });
      return;
    }
    const script = document.createElement("script");
    script.id = ORT_COMPAT_SCRIPT_ID;
    script.async = true;
    script.crossOrigin = "anonymous";
    script.src = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_COMPAT_VERSION}/dist/ort.wasm.min.js`;
    script.onload = resolve;
    script.onerror = fail;
    document.head.appendChild(script);
  });
}

/**
 * Loads the compatibility (WASM-only) ONNX Runtime build once and returns `window.ort`.
 * Concurrent callers share one in-flight load. After a failure the script tag
 * is removed so that a later call starts a fresh download instead of waiting
 * on events that have already fired.
 * @returns {Promise<object>}
 * @throws {Error} When the script fails to load or does not define `window.ort`.
 */
async function loadCompatOrtRuntime() {
  if (ortCompatRuntime) return ortCompatRuntime;
  if (!ortCompatPromise) ortCompatPromise = injectCompatOrtScript();
  try {
    await ortCompatPromise;
    if (!window.ort) throw new Error("Compatibility ONNX Runtime did not initialize");
  } catch (err) {
    ortCompatPromise = null;
    document.getElementById(ORT_COMPAT_SCRIPT_ID)?.remove();
    throw err;
  }
  ortCompatRuntime = window.ort;
  return ortCompatRuntime;
}

/**
 * Detects WebAssembly SIMD (by validating a tiny SIMD module) and thread
 * support (Atomics + SharedArrayBuffer + cross-origin isolation). Cached.
 * @returns {Promise<{simd: boolean, threads: boolean}>}
 */
async function detectWasmFeatures() {
  if (wasmFeaturePromise) return wasmFeaturePromise;
  wasmFeaturePromise = Promise.resolve().then(() => {
    const simdProbe = new Uint8Array([
      0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00,
      0x01, 0x05, 0x01, 0x60, 0x00, 0x01, 0x7b,
      0x03, 0x02, 0x01, 0x00,
      0x0a, 0x0a, 0x01, 0x08, 0x00, 0x41, 0x00, 0xfd, 0x0f, 0xfd, 0x62, 0x0b,
    ]);
    return {
      simd: typeof WebAssembly !== "undefined" && WebAssembly.validate(simdProbe),
      threads: typeof Atomics !== "undefined" && typeof SharedArrayBuffer !== "undefined" && !!window.crossOriginIsolated,
    };
  }).catch(() => ({ simd: false, threads: false }));
  return wasmFeaturePromise;
}

/**
 * Applies WASM/WebGPU environment settings on `ort` for the chosen provider.
 * @param {object} ort - ONNX Runtime namespace.
 * @param {string} provider - "wasm" selects the compatibility build settings.
 * @param {string} wasmMode - "nosimd" disables SIMD for the compatibility build.
 * @param {{simd: boolean, threads: boolean}|null} features
 * @returns {string} `provider`, unchanged.
 */
function configureWasm(ort, provider, wasmMode, features) {
  const compatWasm = provider === "wasm";
  const version = compatWasm ? ORT_COMPAT_VERSION : ORT_WEBGPU_VERSION;
  const threadBudget = Math.max(1, Math.min(4, navigator.hardwareConcurrency || 1));
  ort.env.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/`;
  if (compatWasm) {
    ort.env.wasm.simd = wasmMode !== "nosimd" && !!features?.simd;
    ort.env.wasm.numThreads = features?.threads ? threadBudget : 1;
  } else {
    ort.env.wasm.numThreads = threadBudget;
  }
  if (ort.env.webgpu) {
    ort.env.webgpu.profiling = false;
  }
  return provider;
}

/**
 * Creates an inference session for `provider`.
 * "wasm" uses the compatibility runtime with the single-file embedded model;
 * any other provider uses the main runtime with the external-data model.
 * @param {string} provider
 * @param {string} [wasmMode="optimized"] - Pass "nosimd" to force the non-SIMD path.
 * @returns {Promise<{provider: string, session: object, ort: object}>}
 * @throws {Error} When the required runtime is not available.
 */
async function createSession(provider, wasmMode = "optimized") {
  const compatWasm = provider === "wasm";
  const features = compatWasm ? await detectWasmFeatures() : null;
  const effectiveWasmMode = compatWasm && (!features.simd || wasmMode === "nosimd") ? "nosimd" : "optimized";
  const ort = compatWasm ? await loadCompatOrtRuntime() : ortRuntime;
  if (!ort) throw new Error("ONNX Runtime Web did not initialize");
  const effectiveProvider = configureWasm(ort, provider, effectiveWasmMode, features);
  const sessionOptions = {
    executionProviders: [effectiveProvider],
    graphOptimizationLevel: "all",
    enableMemPattern: false,
    enableCpuMemArena: true,
  };
  if (compatWasm) {
    let label = "wasm-simd";
    if (effectiveWasmMode === "nosimd") label = "wasm-nosimd";
    else if (features.threads) label = "wasm-simd-threaded";
    return {
      provider: label,
      session: await ort.InferenceSession.create(MODEL_EMBEDDED_URL, sessionOptions),
      ort,
    };
  }
  return {
    provider: effectiveProvider,
    session: await ort.InferenceSession.create(MODEL_URL, {
      ...sessionOptions,
      externalData: [
        {
          path: MODEL_DATA_FILE,
          data: MODEL_DATA_URL,
        },
      ],
    }),
    ort,
  };
}

/**
 * Releases an inference session that is no longer referenced by `state`,
 * waiting until no decode is in flight so a running inference is not torn down.
 * Failures are logged; they never propagate.
 * @param {object|null} session
 */
function releaseSession(session) {
  if (typeof session?.release !== "function") return;
  if (state.decoding) {
    setTimeout(() => releaseSession(session), BUSY_RETRY_MS);
    return;
  }
  Promise.resolve()
    .then(() => session.release())
    .catch((err) => console.warn(`Session release failed: ${errorMessage(err)}`));
}

/**
 * Loads the tokenizer and opens a session for the selected provider, falling
 * back to CPU WASM when WebGPU is absent and to non-SIMD WASM when the first
 * attempt fails. Enables the microphone button on success.
 * @throws {Error} When tokens are missing or the fallback session also fails.
 */
async function loadModel() {
  $("load-model").disabled = true;
  setStatus("Loading tokenizer and ONNX Runtime WebGPU...");
  updateHeap();
  await inspectGpu();
  const tokenJson = window.VISUALEARS_TOKENS;
  if (!tokenJson?.tokens?.length) throw new Error("Missing embedded CTC tokens");
  state.tokens = tokenJson.tokens;
  state.blankId = tokenJson.blank_id;
  const provider = $("provider").value;
  const start = performance.now();
  let result;
  try {
    const noGpu = provider === "webgpu" && !navigator.gpu;
    if (noGpu) setStatus("WebGPU is unavailable here. Opening the best supported CPU WASM path...");
    else if (provider === "wasm") setStatus("Opening the best supported CPU WASM path. First load is about 232 MB; cached reloads should be much faster.");
    else setStatus(`Downloading/opening ${provider.toUpperCase()} PhaseB FP16 session. First load is about 232 MB; cached reloads should be much faster.`);
    result = await createSession(noGpu ? "wasm" : provider);
  } catch (err) {
    setStatus(`Primary load failed (${errorMessage(err)}). Retrying CPU-only WASM without SIMD...`);
    result = await createSession("wasm", "nosimd");
  }
  state.session = result.session;
  state.ort = result.ort;
  const effectiveProvider = result.provider;
  const elapsed = performance.now() - start;
  state.provider = effectiveProvider;
  setText("stat-provider", effectiveProvider);
  setText("stat-decode", "-");
  setText("stat-rtf", "-");
  setText("stat-speed", "-");
  updateHeap();
  setStatus(`Model ready in ${formatMs(elapsed)} (${effectiveProvider}). Start the microphone when you are ready.`);
  $("start-mic").disabled = false;
}

// ---------------------------------------------------------------------------
// Microphone capture.
// ---------------------------------------------------------------------------

/** Closes an AudioContext, logging (not throwing) if the close fails. */
function closeAudioContext(context) {
  if (!context || context.state === "closed") return;
  Promise.resolve()
    .then(() => context.close())
    .catch((err) => console.warn(`AudioContext close failed: ${errorMessage(err)}`));
}

/**
 * Opens an AudioContext for `stream`, preferring SAMPLE_RATE and falling back
 * to the default device rate when a 16 kHz context cannot be created or
 * connected. Captured chunks are resampled in `handleAudioChunk` either way.
 * @param {MediaStream} stream
 * @returns {{context: AudioContext, source: MediaStreamAudioSourceNode}}
 */
function createCaptureSource(stream) {
  let context = null;
  try {
    context = new AudioContext({ sampleRate: SAMPLE_RATE });
    return { context, source: context.createMediaStreamSource(stream) };
  } catch (err) {
    console.warn(`16 kHz capture unavailable (${errorMessage(err)}); using the device sample rate.`);
    closeAudioContext(context);
  }
  context = new AudioContext();
  try {
    return { context, source: context.createMediaStreamSource(stream) };
  } catch (err) {
    closeAudioContext(context);
    throw err;
  }
}

/**
 * ScriptProcessor callback: updates the level meter, applies the noise gate
 * and drives the utterance state machine (start, trailing silence, finalize).
 */
function handleAudioChunk(event) {
  if (!state.recording) return;
  const input = event.inputBuffer.getChannelData(0);
  let rms = 0;
  for (let i = 0; i < input.length; i++) rms += input[i] * input[i];
  rms = Math.sqrt(rms / input.length);
  $("level-bar").style.width = `${Math.min(100, rms * 900).toFixed(1)}%`;
  const gate = Number($("noise-gate").value);
  const speechy = rms >= gate;
  const chunk = resampleLinear(input, state.audioContext.sampleRate, SAMPLE_RATE);
  const chunkMs = chunk.length / SAMPLE_RATE * 1000;

  if (speechy) {
    if (!state.speechActive) {
      // First loud chunk after silence starts a new utterance.
      state.utterancePcm = new Float32Array(0);
      state.partialTranscript = "";
      state.silenceMs = 0;
      state.speechActive = true;
      state.finalizing = false;
      renderTranscript();
    }
    state.silenceMs = 0;
    appendToUtterance(chunk);
    return;
  }

  if (!state.speechActive) return;
  state.silenceMs += chunkMs;
  // Keep a short tail of silence, then finalize once the pause is long enough.
  if (state.silenceMs <= 350) appendToUtterance(chunk);
  if (state.silenceMs >= 900 && !state.finalizing) {
    state.finalizing = true;
    finalizeUtterance();
  }
}

/**
 * Requests the microphone, wires up capture and starts the periodic decode loop.
 * Does nothing when no model session is loaded.
 */
async function startMic() {
  if (!state.session) return;
  state.mediaStream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  });
  const { context, source } = createCaptureSource(state.mediaStream);
  state.audioContext = context;
  state.source = source;
  state.processor = state.audioContext.createScriptProcessor(4096, 1, 1);
  state.source.connect(state.processor);
  state.processor.connect(state.audioContext.destination);
  state.utterancePcm = new Float32Array(0);
  state.finalTranscript = "";
  state.partialTranscript = "";
  state.speechActive = false;
  state.silenceMs = 0;
  state.finalizing = false;
  renderTranscript();
  state.recording = true;
  $("start-mic").disabled = true;
  $("stop-mic").disabled = false;
  setStatus("Listening. Partial text can move while you speak; final text freezes after silence.");
  state.processor.onaudioprocess = handleAudioChunk;
  scheduleDecode();
}

/** Stops capture, cancels the decode timer and releases all audio resources. */
function stopMic() {
  state.recording = false;
  clearTimeout(state.decodeTimer);
  state.decodeTimer = null;
  if (state.processor) state.processor.disconnect();
  if (state.source) state.source.disconnect();
  if (state.mediaStream) state.mediaStream.getTracks().forEach((track) => track.stop());
  closeAudioContext(state.audioContext);
  state.processor = null;
  state.source = null;
  state.mediaStream = null;
  state.audioContext = null;
  $("level-bar").style.width = "0%";
  $("start-mic").disabled = !state.session;
  $("stop-mic").disabled = true;
  setStatus("Stopped.");
}

// ---------------------------------------------------------------------------
// Decode scheduling and inference.
// ---------------------------------------------------------------------------

/**
 * (Re)arms the partial-decode timer using the `decode-every` control (seconds).
 * The next run is scheduled only after the previous one has finished.
 */
function scheduleDecode() {
  clearTimeout(state.decodeTimer);
  if (!state.recording) return;
  // An empty or invalid field would otherwise give a 0 ms timer and a busy loop.
  const delay = positiveNumberOr($("decode-every")?.value, DEFAULT_DECODE_INTERVAL_SECONDS) * 1000;
  state.decodeTimer = setTimeout(async () => {
    try {
      await runDecode();
    } catch (err) {
      console.error(err);
    } finally {
      scheduleDecode();
    }
  }, delay);
}

/** Runs a partial decode when speech is active and at least 0.8 s is buffered. */
async function runDecode() {
  if (!state.session || state.decoding || state.finalizing || !state.speechActive || state.utterancePcm.length < SAMPLE_RATE * 0.8) return;
  await decodeUtterance(false);
}

/**
 * Produces the final transcript for the current utterance, retrying shortly
 * while another decode is in flight. Utterances shorter than 0.25 s are dropped.
 */
async function finalizeUtterance() {
  if (state.decoding) {
    setTimeout(finalizeUtterance, BUSY_RETRY_MS);
    return;
  }
  if (!state.session || state.utterancePcm.length < SAMPLE_RATE * 0.25) {
    state.speechActive = false;
    state.finalizing = false;
    return;
  }
  await decodeUtterance(true);
}

/**
 * Runs feature extraction, inference and CTC decoding on the buffered audio,
 * then updates the transcript and statistics. Errors are reported via the
 * status line and never thrown.
 * @param {boolean} final - True commits the text and resets the utterance;
 *   false only replaces the partial transcript.
 */
async function decodeUtterance(final) {
  state.decoding = true;
  try {
    const maxSeconds = readWindowSeconds();
    const samples = Math.min(state.utterancePcm.length, Math.floor(maxSeconds * SAMPLE_RATE));
    const sourceAudio = state.utterancePcm.subarray(state.utterancePcm.length - samples);
    // Short clips are zero-padded at the end to a 2.5 s minimum before featurisation.
    const minLiveSamples = Math.floor(2.5 * SAMPLE_RATE);
    let audio = sourceAudio;
    if (sourceAudio.length < minLiveSamples) {
      audio = new Float32Array(minLiveSamples);
      audio.set(sourceAudio, 0);
    }
    const preprocessStarted = performance.now();
    const { features, frameCount } = pcmToLogMel(audio);
    const ort = state.ort || ortRuntime;
    if (!ort) throw new Error("ONNX Runtime Web did not initialize");
    const tensor = new ort.Tensor("float32", features, [1, N_MELS, FIXED_FRAMES]);
    const lengthTensor = new ort.Tensor("int64", BigInt64Array.from([BigInt(frameCount)]), [1]);
    const inferStarted = performance.now();
    const output = await state.session.run({ processed_signal: tensor, processed_signal_length: lengthTensor });
    const inferElapsed = performance.now() - inferStarted;
    const elapsed = performance.now() - preprocessStarted;
    const logits = output.logits.data;
    const dims = output.logits.dims;
    const logitsType = output.logits.type;
    const vocabSize = dims[2] || state.tokens.length;
    const encodedRaw = output.encoded_lengths?.data?.[0];
    const encodedLength = encodedRaw === undefined ? Math.ceil(frameCount / OUTPUT_STRIDE) : Number(encodedRaw);
    const usableSteps = Math.max(1, Math.min(dims[1], encodedLength));
    const transcript = decodeCtc(logits, usableSteps, vocabSize, logitsType);
    const audioSeconds = samples / SAMPLE_RATE;
    // Duration actually fed to the model: padded up to the minimum and capped
    // at what pcmToLogMel keeps.
    const modelSeconds = Math.min(audio.length, (FIXED_FRAMES - 1) * HOP_LENGTH) / SAMPLE_RATE;
    const rtf = (elapsed / 1000) / Math.max(0.001, audioSeconds);
    const speed = Math.max(0.001, audioSeconds) / Math.max(0.001, elapsed / 1000);
    if (final) {
      if (transcript) {
        state.finalTranscript = state.finalTranscript ? `${state.finalTranscript}\n${transcript}` : transcript;
      }
      state.partialTranscript = "";
      state.speechActive = false;
      state.finalizing = false;
      state.silenceMs = 0;
      state.utterancePcm = new Float32Array(0);
    } else {
      state.partialTranscript = transcript;
    }
    renderTranscript();
    setText("stat-decode", `${formatMs(elapsed)} total / ${formatMs(inferElapsed)} infer`);
    setText("stat-rtf", `${rtf.toFixed(3)}`);
    setText("stat-speed", `${speed.toFixed(1)}x`);
    setText("stat-audio", `${audioSeconds.toFixed(1)} s`);
    setText("stat-frames", `${frameCount} / ${FIXED_FRAMES}`);
    setText("stat-provider", state.provider || $("provider").value);
    setText("stat-last", new Date().toLocaleTimeString());
    updateHeap();
    setStatus(`${final ? "Finalized" : "Decoded partial"} ${modelSeconds.toFixed(1)} s model window (${audioSeconds.toFixed(1)} s captured) with ${usableSteps} CTC steps.`);
  } catch (err) {
    console.error(err);
    setStatus(`Decode failed: ${errorMessage(err)}`);
    if (final) {
      state.speechActive = false;
      state.finalizing = false;
    }
  } finally {
    state.decoding = false;
  }
}

// ---------------------------------------------------------------------------
// Start-up.
// ---------------------------------------------------------------------------

/**
 * Imports the main ONNX Runtime build and wires up the UI controls. Retries
 * every 100 ms until the `load-model` button exists in the document.
 */
async function boot() {
  if (!$("load-model")) {
    setTimeout(startBoot, 100);
    return;
  }
  ortRuntime = await import(`https://cdn.jsdelivr.net/npm/onnxruntime-web@${ORT_WEBGPU_VERSION}/dist/ort.webgpu.min.mjs`);
  $("load-model").addEventListener("click", () => loadModel().catch((err) => {
    console.error(err);
    setStatus(`Load failed: ${errorMessage(err)}`);
    $("load-model").disabled = false;
  }));
  $("start-mic").addEventListener("click", () => startMic().catch((err) => {
    console.error(err);
    setStatus(`Mic failed: ${errorMessage(err)}`);
    stopMic();
  }));
  $("stop-mic").addEventListener("click", stopMic);
  $("provider").addEventListener("change", () => {
    if (!state.session) return;
    // Free the old session instead of just dropping the reference.
    releaseSession(state.session);
    state.session = null;
    $("start-mic").disabled = true;
    $("load-model").disabled = false;
    setStatus("Provider changed. Load the model again.");
  });
  setText("stat-provider", "-");
  setText("stat-gpu", "-");
  updateHeap();
}

/** Runs `boot` and reports any failure, including failures of delayed retries. */
function startBoot() {
  boot().catch((err) => {
    console.error(err);
    setStatus(`Startup failed: ${errorMessage(err)}`);
  });
}

// Without a DOM there is nothing to wire up; skip start-up so the pure
// helpers above can still be loaded.
if (typeof document !== "undefined") startBoot();