| <!DOCTYPE html> |
<html lang="en">
| <head> |
| <meta charset="UTF-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>WASM Streaming Speech Recognition</title> |
| <style> |
| @import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap"); |
| html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; } |
| </style> |
| <script src="css/tailwind-3.4.17.js"></script> |
| <script type="module"> |
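// Model assets, fetched by the worker on first initialization: 4-bit ("q4k")
// quantized GGUF weights, the Mimi audio codec checkpoint, the joint
// English/French tokenizer, and the model configuration.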
| const MODEL_ID = "moshi_1b_en_fr_q4k"; |
| const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf"; |
| const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors"; |
| const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json"; |
| const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json"; |
| |
// Inference runs in a module worker so decoding never blocks the UI thread.
const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });
let isRecording = false;
let audioStream = null;
let audioContext = null;
let processor = null;
let source = null;
let modelInitialized = false;
// Set when the user clicks Start before the model has finished loading.
let pendingStart = false;
| |
| |
// Diagnostics: chunks the worker has processed, and when the session began.
let audioChunksProcessed = 0;
let sessionStartTime = 0;
| |
| function updateStatusDiv(message) { |
| document.querySelector("#status-div").textContent = message; |
| } |
| |
| function updateDiagnostics() { |
| const diagnostics = document.querySelector("#diagnostics"); |
| if (!diagnostics) return; |
| |
const cpuCount = navigator.hardwareConcurrency || 'unknown';

if (isRecording && sessionStartTime) {
// Each chunk is 1024 samples at 24 kHz (~42.7 ms of audio). The real-time
// factor is audio seconds processed per wall-clock second; a value of 1.0
// or more means decoding keeps up with the microphone.
const audioProcessed = audioChunksProcessed * (1024 / 24000);
const audioSessionDuration = (Date.now() - sessionStartTime) / 1000;
const realTimeFactor = audioSessionDuration > 0 ? (audioProcessed / audioSessionDuration) : 0;

// Colour-code the factor: green keeps up, yellow is marginal, red falls behind.
let factorColor = '';
if (realTimeFactor >= 0.95) {
factorColor = 'text-green-600';
} else if (realTimeFactor >= 0.8) {
factorColor = 'text-yellow-600';
} else {
factorColor = 'text-red-600';
}
| |
| diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${audioSessionDuration.toFixed(1)}s`; |
} else if (!sessionStartTime) {
// Nothing recorded yet. After a session ends, the last readings stay frozen on screen.
diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
}
| } |
| |
// Render the diagnostics once on load, then refresh five times a second.
window.addEventListener('load', updateDiagnostics);
setInterval(updateDiagnostics, 200);
| |
| function initializeModel() { |
| if (modelInitialized) return; |
| |
| const button = document.querySelector("#speech-button"); |
// Disable the button while the worker downloads and loads the model.
button.disabled = true;
button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";
| |
| moshiWorker.postMessage({ |
| command: "initialize", |
| weightsURL: WEIGHTS_URL, |
| modelID: MODEL_ID, |
| mimiURL: MIMI_URL, |
| tokenizerURL: TOKENIZER_URL, |
| configURL: CONFIG_URL, |
| }); |
| } |
| |
| |
| moshiWorker.addEventListener("message", async (event) => { |
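// Worker message shapes (inferred from the handlers below):
//   { status: "loading", message }  progress while fetching/loading the model
//   { status: "model_ready" }       initialization finished
//   { status: "streaming", word }   one decoded word of the transcript
//   { status: "chunk_processed" }   one 1024-sample audio chunk consumed
//   { error }                       anything that went wrong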
| const data = event.data; |
| if (data.status === "model_ready") { |
| modelInitialized = true; |
| updateStatusDiv("Model loaded - Ready to start"); |
| |
| const button = document.querySelector("#speech-button"); |
| button.disabled = false; |
| button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded"; |
| |
// Start was clicked while the model was still loading; begin recording now.
if (pendingStart) {
| pendingStart = false; |
| await startRecording(); |
| } |
| } else if (data.status === "streaming") { |
| |
| const outputDiv = document.querySelector("#output-generation"); |
| const placeholder = document.querySelector("#output-placeholder"); |
| |
| if (placeholder) placeholder.hidden = true; |
| |
| if (outputDiv.textContent) { |
| outputDiv.textContent += " " + data.word; |
| } else { |
| outputDiv.textContent = data.word; |
| } |
| outputDiv.hidden = false; |
| } else if (data.status === "chunk_processed") { |
| audioChunksProcessed++; |
| } else if (data.status === "loading") { |
| updateStatusDiv(data.message); |
| } else if (data.error) { |
| updateStatusDiv("Error: " + data.error); |
| pendingStart = false; |
| } |
| }); |
| |
| |
| async function startMicrophone() { |
| try { |
| audioStream = await navigator.mediaDevices.getUserMedia({ audio: true }); |
| updateStatusDiv("Microphone access granted"); |
| |
// 24 kHz is the rate the model consumes; the browser resamples the mic input to match.
audioContext = new AudioContext({ sampleRate: 24000 });
| source = audioContext.createMediaStreamSource(audioStream); |
| |
| processor = audioContext.createScriptProcessor(1024, 1, 1); |
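// Note: ScriptProcessorNode is deprecated in the Web Audio spec but still
// widely supported; an AudioWorklet-based alternative is sketched in the
// comment after this function.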
| |
| processor.onaudioprocess = function(event) { |
| if (!isRecording || !modelInitialized) return; |
| |
| const inputBuffer = event.inputBuffer; |
const inputData = inputBuffer.getChannelData(0);

// Copy the samples: the AudioBuffer's channel storage is reused by the audio graph.
const audioChunk = new Float32Array(inputData);
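// Transferring audioChunk.buffer hands the samples to the worker without a
// second copy; audioChunk must not be reused on this thread afterwards.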
| moshiWorker.postMessage({ |
| command: "process_audio", |
| audioData: audioChunk |
| }, [audioChunk.buffer]); |
| }; |
| |
| source.connect(processor); |
| processor.connect(audioContext.destination); |
| |
| } catch (error) { |
| updateStatusDiv("Microphone access denied: " + error.message); |
| throw error; |
| } |
| } |
| |
function stopMicrophone() {
// Tear down the audio graph.
if (processor) {
| processor.disconnect(); |
| processor = null; |
| } |
| if (source) { |
| source.disconnect(); |
| source = null; |
| } |
| if (audioContext) { |
| audioContext.close(); |
| audioContext = null; |
| } |
| |
| |
| if (audioStream) { |
| audioStream.getTracks().forEach(track => track.stop()); |
| audioStream = null; |
| } |
| |
| updateStatusDiv("Microphone stopped"); |
| } |
| |
| async function startRecording() { |
| const button = document.querySelector("#speech-button"); |
| |
| try { |
| updateStatusDiv("Requesting microphone access..."); |
await startMicrophone();

// Reset the diagnostics counters for this session.
audioChunksProcessed = 0;
sessionStartTime = Date.now();

// Ask the worker to open a fresh decoding stream.
moshiWorker.postMessage({ command: "start_stream" });
| |
| isRecording = true; |
| button.textContent = "Stop Speech"; |
| button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded"; |
| updateStatusDiv("Listening..."); |
| |
| |
| document.querySelector("#output-generation").textContent = ""; |
| document.querySelector("#output-generation").hidden = true; |
| document.querySelector("#output-placeholder").hidden = true; |
| |
| } catch (error) { |
| console.error('Error starting microphone:', error); |
| updateStatusDiv("Error: " + error.message); |
| pendingStart = false; |
| } |
| } |
| |
| document.querySelector("#speech-button").addEventListener("click", async () => { |
| const button = document.querySelector("#speech-button"); |
| |
| if (!isRecording) { |
| |
// First click: load the model; pendingStart resumes recording once it's ready.
if (!modelInitialized) {
| pendingStart = true; |
| initializeModel(); |
| return; |
| } |
| |
| await startRecording(); |
| } else { |
stopMicrophone();

// Flush and close the worker's decoding stream.
moshiWorker.postMessage({ command: "stop_stream" });
| |
| isRecording = false; |
| button.textContent = "Start Speech"; |
| button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded"; |
| updateStatusDiv("Ready to start"); |
| } |
| }); |
| </script> |
| </head> |
| <body class="container max-w-4xl mx-auto p-4"> |
| <main class="grid grid-cols-1 gap-8 relative"> |
| <div> |
| <h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1> |
<p class="text-gray-700">
Transcribe audio from your microphone in real time, directly in the browser, using Rust compiled to WebAssembly.
After downloading a ~950 MB model, this demo runs entirely offline on your CPU.
It understands English and French, using a quantized version of the
<a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
together with a WASM runtime built with
<a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
</p>
| </div> |
|
|
| <div> |
| <button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded"> |
| Start Speech |
| </button> |
| <div class="mt-2 text-gray-600 text-sm space-y-1"> |
| <div>Status: <span id="status-div">Click "Start Speech" to begin</span></div> |
| <div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div> |
| </div> |
| </div> |
|
|
| <div> |
| <h3 class="font-medium">Transcription:</h3> |
| <div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md"> |
| <p id="output-generation" hidden></p> |
| <span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span> |
| </div> |
| </div> |
|
|
| <div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md"> |
| 💡 This demo shows offline transcription in your browser. |
| For more accurate cloud transcription and real-time LLM grammar correction, check out |
| <a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>. |
| </div> |
| </main> |
| </body> |
| </html> |