<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WASM Streaming Speech Recognition</title>
    <style>
      @import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
      html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
    </style>
    <script src="css/tailwind-3.4.17.js"></script>
    <script type="module">
      const MODEL_ID = "moshi_1b_en_fr_q4k";
      const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";
      const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
      const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
      const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";
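
      // These assets are downloaded by moshiWorker.js; the quantized weights
      // alone are ~950 MB, so the first load can take a while.

      // Run inference in a module worker so the UI thread stays responsive.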
      const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });

      let isRecording = false;
      let audioStream = null;
      let audioContext = null;
      let processor = null;
      let source = null;
      let modelInitialized = false;
      let pendingStart = false;

      // Performance tracking
      let audioChunksProcessed = 0;
      let sessionStartTime = 0;

      function updateStatusDiv(message) {
        document.querySelector("#status-div").textContent = message;
      }

      function updateDiagnostics() {
        const diagnostics = document.querySelector("#diagnostics");
        if (!diagnostics) return;
        const cpuCount = navigator.hardwareConcurrency || "unknown";
        // Only update metrics while recording; otherwise keep the final values.
        if (isRecording && sessionStartTime) {
          // Calculate the real-time factor (audio processed / wall-clock time):
          // >1 means faster than real time, <1 means slower than real time.
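          // Each chunk is 1024 samples at 24 kHz, i.e. ~42.7 ms of audio.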
          const audioProcessed = audioChunksProcessed * (1024 / 24000);
          const audioSessionDuration = (Date.now() - sessionStartTime) / 1000;
          const realTimeFactor = audioSessionDuration > 0 ? (audioProcessed / audioSessionDuration) : 0;
          // Color-code based on performance
          let factorColor = "";
          if (realTimeFactor >= 0.95) {
            factorColor = "text-green-600";
          } else if (realTimeFactor >= 0.8) {
            factorColor = "text-yellow-600";
          } else {
            factorColor = "text-red-600";
          }
          diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${audioSessionDuration.toFixed(1)}s`;
        } else if (!sessionStartTime) {
          diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
        }
      }

      window.addEventListener("load", updateDiagnostics);
      setInterval(updateDiagnostics, 200);

      function initializeModel() {
        if (modelInitialized) return;
        const button = document.querySelector("#speech-button");
        button.disabled = true;
        button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";
        moshiWorker.postMessage({
          command: "initialize",
          weightsURL: WEIGHTS_URL,
          modelID: MODEL_ID,
          mimiURL: MIMI_URL,
          tokenizerURL: TOKENIZER_URL,
          configURL: CONFIG_URL,
        });
      }
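
      // Expected worker replies: { status: "loading", message },
      // { status: "model_ready" }, { status: "streaming", word },
      // { status: "chunk_processed" }, or { error } on failure.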
      // Handle messages from the worker
      moshiWorker.addEventListener("message", async (event) => {
        const data = event.data;
        if (data.status === "model_ready") {
          modelInitialized = true;
          updateStatusDiv("Model loaded - Ready to start");
          const button = document.querySelector("#speech-button");
          button.disabled = false;
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          if (pendingStart) {
            pendingStart = false;
            await startRecording();
          }
        } else if (data.status === "streaming") {
          // Append the new word to the transcription in real time
          const outputDiv = document.querySelector("#output-generation");
          const placeholder = document.querySelector("#output-placeholder");
          if (placeholder) placeholder.hidden = true;
          if (outputDiv.textContent) {
            outputDiv.textContent += " " + data.word;
          } else {
            outputDiv.textContent = data.word;
          }
          outputDiv.hidden = false;
        } else if (data.status === "chunk_processed") {
          audioChunksProcessed++;
        } else if (data.status === "loading") {
          updateStatusDiv(data.message);
        } else if (data.error) {
          updateStatusDiv("Error: " + data.error);
          pendingStart = false;
        }
      });

      async function startMicrophone() {
        try {
          audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
          updateStatusDiv("Microphone access granted");
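          // 24 kHz matches the sample rate the Mimi audio codec expects.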
          audioContext = new AudioContext({ sampleRate: 24000 });
          source = audioContext.createMediaStreamSource(audioStream);
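          // ScriptProcessorNode is deprecated in favor of AudioWorkletNode,
          // but it keeps this demo simple: 1024-sample buffers, mono in/out.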
          processor = audioContext.createScriptProcessor(1024, 1, 1);
          processor.onaudioprocess = function (event) {
            if (!isRecording || !modelInitialized) return;
            const inputBuffer = event.inputBuffer;
            const inputData = inputBuffer.getChannelData(0);
            // Send the audio chunk to the worker.
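            // Copy the samples (the audio engine reuses the input buffer), then
            // transfer the copy's buffer, which hands over ownership instead of
            // cloning the data a second time.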
            const audioChunk = new Float32Array(inputData);
            moshiWorker.postMessage({
              command: "process_audio",
              audioData: audioChunk,
            }, [audioChunk.buffer]);
          };
          source.connect(processor);
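          // In most browsers, onaudioprocess only fires while the node is
          // connected to a destination; it writes no output, so nothing is audible.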
          processor.connect(audioContext.destination);
        } catch (error) {
          updateStatusDiv("Microphone access denied: " + error.message);
          throw error;
        }
      }

      function stopMicrophone() {
        // Disconnect the audio nodes
        if (processor) {
          processor.disconnect();
          processor = null;
        }
        if (source) {
          source.disconnect();
          source = null;
        }
        if (audioContext) {
          audioContext.close();
          audioContext = null;
        }
        // Stop the media stream
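        // (stopping the tracks releases the microphone and turns off the
        // browser's recording indicator)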
        if (audioStream) {
          audioStream.getTracks().forEach((track) => track.stop());
          audioStream = null;
        }
        updateStatusDiv("Microphone stopped");
      }

      async function startRecording() {
        const button = document.querySelector("#speech-button");
        try {
          updateStatusDiv("Requesting microphone access...");
          await startMicrophone();
          // Reset the performance counters
          audioChunksProcessed = 0;
          sessionStartTime = Date.now();
          // Start the streaming session
          moshiWorker.postMessage({ command: "start_stream" });
          isRecording = true;
          button.textContent = "Stop Speech";
          button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Listening...");
          // Clear the previous transcription
          document.querySelector("#output-generation").textContent = "";
          document.querySelector("#output-generation").hidden = true;
          document.querySelector("#output-placeholder").hidden = true;
        } catch (error) {
          console.error("Error starting microphone:", error);
          updateStatusDiv("Error: " + error.message);
          pendingStart = false;
        }
      }

      document.querySelector("#speech-button").addEventListener("click", async () => {
        const button = document.querySelector("#speech-button");
        if (!isRecording) {
          // Check whether the model is ready
          if (!modelInitialized) {
            pendingStart = true;
            initializeModel();
            return;
          }
          await startRecording();
        } else {
          stopMicrophone();
          // End the streaming session
          moshiWorker.postMessage({ command: "stop_stream" });
          isRecording = false;
          button.textContent = "Start Speech";
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Ready to start");
        }
      });
    </script>
  </head>
  <body class="container max-w-4xl mx-auto p-4">
    <main class="grid grid-cols-1 gap-8 relative">
      <div>
        <h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
        <p class="text-gray-700">
          Transcribe audio from your microphone in real time, directly in the browser, using Rust/WASM.
          This demo runs entirely offline on your CPU after downloading a ~950 MB model.
          It understands English and French, and uses the
          <a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
          together with a WASM runtime built with
          <a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
        </p>
      </div>
      <div>
        <button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
          Start Speech
        </button>
        <div class="mt-2 text-gray-600 text-sm space-y-1">
          <div>Status: <span id="status-div">Click "Start Speech" to begin</span></div>
          <div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
        </div>
      </div>
      <div>
        <h3 class="font-medium">Transcription:</h3>
        <div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
          <p id="output-generation" hidden></p>
          <span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
        </div>
      </div>
      <div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
        💡 This demo shows offline transcription in your browser.
        For more accurate cloud transcription and real-time LLM grammar correction, check out
        <a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>.
      </div>
    </main>
  </body>
</html>