/**
 * Granite Speech WebGPU Demo
 * Uses Transformers.js v4 for in-browser speech recognition
 */

import {
  AutoProcessor,
  GraniteSpeechForConditionalGeneration,
  TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7';
import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';

// NOTE(review): loadVAD, loadPunctuator, getSpeechSegments, applyPunctuation and
// formatTimestamp are called below but are neither defined nor imported in this
// file. They are presumably globals provided by companion scripts (the original
// comment names punctuator.js for applyPunctuation) — confirm.

// Model
const MODEL_ID = 'onnx-community/granite-4.0-1b-speech-ONNX';

// Audio config
const SAMPLE_RATE = 16000;
const MAX_NEW_TOKENS = 256;

// Task prompts — <|audio|> is expanded by the processor's chat template
const TASK_PROMPTS = {
  'transcribe': '<|audio|>Transcribe the speech to text',
  'translate_en': '<|audio|>Translate the speech to English',
  'translate_fr': '<|audio|>Translate the speech to French',
  'translate_de': '<|audio|>Translate the speech to German',
  'translate_es': '<|audio|>Translate the speech to Spanish',
  'translate_pt': '<|audio|>Translate the speech to Portuguese',
  'translate_ja': '<|audio|>Translate the speech to Japanese',
};

// State
let model = null;
let processor = null;
let isModelLoading = false;
let currentAudioData = null;
// Object URL currently assigned to the audio player; tracked so it can be revoked.
let currentAudioUrl = null;

// DOM Elements
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');
const inputCard = document.querySelector('.input-card');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const vadCheckbox = document.getElementById('vadCheckbox');
const gpuInfo = document.getElementById('gpuInfo');

// Recording state
let mediaRecorder = null;
let audioChunks = [];
let isRecording = false;

// Transcription state
let transcriptionAborted = false;
// True while transcribe() is running; prevents overlapping runs.
let isTranscribing = false;

// Utility functions

/**
 * Update the status indicator.
 * @param {string} status - Suffix appended to the `status-dot` class name.
 * @param {string} message - Text shown next to the dot.
 */
function setStatus(status, message) {
  statusDot.className = `status-dot ${status}`;
  statusText.textContent = message;
}

// Punctuation is handled by punctuator.js (applyPunctuation function)

/**
 * Show or hide the progress section.
 * @param {boolean} show
 */
function showProgress(show) {
  progressSection.style.display = show ? 'block' : 'none';
}

/**
 * Set the progress bar width and its label.
 * @param {number} progress - Percentage in the range 0-100.
 * @param {string} text - Label shown with the bar.
 */
function updateProgress(progress, text) {
  progressFill.style.width = `${progress}%`;
  progressText.textContent = text;
}

/**
 * Escape text for insertion into innerHTML.
 * `&` is escaped first so the entities produced for `<` and `>` are not re-escaped.
 * @param {string} text
 * @returns {string}
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Reverse escapeHtml (plus `&nbsp;`, which innerHTML serialization can emit).
 * `&amp;` is decoded last so `&amp;lt;` becomes the literal text `&lt;`.
 * @param {string} html
 * @returns {string}
 */
function unescapeHtml(html) {
  return html
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&');
}

/**
 * Build the HTML for one transcript row. Both values are escaped here.
 *
 * NOTE(review): the original row markup was lost when this file was extracted
 * (only the `${ts}${text}` placeholders survived). The element and class names
 * below are a reconstruction — restore the real ones if the stylesheet expects
 * different names.
 * @param {string} ts - Timestamp label.
 * @param {string} text - Plain transcript text.
 * @returns {string}
 */
function makeTranscriptRow(ts, text) {
  return `<div class="transcript-row"><span class="timestamp">${escapeHtml(ts)}</span> <span class="text">${escapeHtml(text)}</span></div>`;
}

/**
 * Switch the play button between its play and pause icons.
 * @param {boolean} isPlaying - True to show the pause icon.
 */
function setPlayingIcon(isPlaying) {
  playBtn.querySelector('.play-icon').style.display = isPlaying ? 'none' : 'block';
  playBtn.querySelector('.pause-icon').style.display = isPlaying ? 'block' : 'none';
}

/**
 * Start playback. play() returns a promise that rejects when playback cannot
 * start, so the rejection is handled and the play icon restored.
 */
function startPlayback() {
  setPlayingIcon(true);
  audioPlayer.play().catch((error) => {
    console.warn('Audio playback could not start:', error);
    setPlayingIcon(false);
  });
}

/**
 * Point the audio player at a new blob and revoke the previous object URL so
 * it is not leaked.
 * @param {Blob} blob
 */
function setAudioSource(blob) {
  const previousUrl = currentAudioUrl;
  currentAudioUrl = URL.createObjectURL(blob);
  audioPlayer.src = currentAudioUrl;
  if (previousUrl) {
    URL.revokeObjectURL(previousUrl);
  }
}

/**
 * Duration usable for the time display and seeking.
 * Returns 0 while the element reports NaN (metadata not loaded). When the
 * element reports Infinity, which recorded blobs can do, falls back to the
 * length of the decoded samples.
 * @returns {number} Duration in seconds.
 */
function getAudioDuration() {
  const { duration } = audioPlayer;
  if (Number.isFinite(duration)) {
    return duration;
  }
  if (duration === Infinity && currentAudioData) {
    return currentAudioData.length / SAMPLE_RATE;
  }
  return 0;
}

/**
 * Check WebGPU support and report problems in the gpuInfo element.
 * @returns {Promise<boolean>} True when a WebGPU adapter is available.
 */
async function checkWebGPU() {
  if (!navigator.gpu) {
    gpuInfo.textContent = 'WebGPU not supported. Use Chrome 113+ or Edge 113+';
    gpuInfo.style.color = '#e74c3c';
    return false;
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      gpuInfo.textContent = 'No WebGPU adapter available';
      gpuInfo.style.color = '#f39c12';
      return false;
    }
    return true;
  } catch (e) {
    console.error('WebGPU error:', e);
    gpuInfo.textContent = `WebGPU error: ${e.message || e}`;
    gpuInfo.style.color = '#e74c3c';
    return false;
  }
}

/**
 * Initialize models using Transformers.js v4, then load the VAD and
 * punctuation helpers and enable the input controls.
 * isModelLoading stays true after a successful load, so later calls are no-ops;
 * it is reset only on failure so a retry is possible.
 * @returns {Promise<void>}
 */
async function initModels() {
  if (isModelLoading) return;
  isModelLoading = true;
  setStatus('loading', 'Loading processor...');

  try {
    // The model is requested on the 'webgpu' device, so stop early with a
    // clear message instead of failing later inside the model load.
    const hasWebGPU = await checkWebGPU();
    if (!hasWebGPU) {
      setStatus('error', 'Error: WebGPU is not available');
      isModelLoading = false;
      return;
    }

    processor = await AutoProcessor.from_pretrained(MODEL_ID);

    setStatus('loading', 'Downloading models...');
    progressFill.style.width = '0%';

    let lastProgressUpdate = 0;
    const fileProgress = {};

    model = await GraniteSpeechForConditionalGeneration.from_pretrained(MODEL_ID, {
      dtype: {
        audio_encoder: 'q4',
        embed_tokens: 'q4f16',
        decoder_model_merged: 'q4f16',
      },
      device: 'webgpu',
      progress_callback: (progress) => {
        if (progress.status === 'progress' && progress.total) {
          fileProgress[progress.file] = { loaded: progress.loaded, total: progress.total };

          // Throttle UI updates to at most one every 100 ms.
          const now = performance.now();
          if (now - lastProgressUpdate < 100) return;
          lastProgressUpdate = now;

          // Aggregate over every file reported so far.
          let totalLoaded = 0;
          let totalSize = 0;
          for (const f of Object.values(fileProgress)) {
            totalLoaded += f.loaded;
            totalSize += f.total;
          }
          const pct = totalSize > 0 ? (totalLoaded / totalSize) * 100 : 0;
          progressFill.style.width = `${pct}%`;
          const mb = (totalLoaded / 1e6).toFixed(0);
          const totalMb = (totalSize / 1e6).toFixed(0);
          setStatus('loading', `Downloading models... ${mb} / ${totalMb} MB`);
        }
      },
    });

    setStatus('loading', 'Loading VAD and punctuation models...');
    await Promise.all([loadVAD(), loadPunctuator()]);

    progressFill.style.width = '0%';
    setStatus('ready', 'Ready - Record or upload audio');
    enableControls(true);
  } catch (error) {
    console.error('Model loading failed:', error);
    console.error('Error stack:', error?.stack);
    const errorMsg = error?.message || error?.toString() || 'Unknown error';
    setStatus('error', `Error: ${errorMsg}`);
    progressFill.style.width = '0%';
    isModelLoading = false;
  }
}

/**
 * Enable or disable the record button and the file input.
 * @param {boolean} enabled
 */
function enableControls(enabled) {
  recordBtn.disabled = !enabled;
  audioFile.disabled = !enabled;
}

/**
 * Transcribe a single audio segment and return the text.
 * @param {Float32Array} audioSegment - Samples passed to the processor with SAMPLE_RATE.
 * @param {(partial: string) => void} [onPartialResult] - Called with the text
 *   accumulated so far each time the streamer emits a chunk.
 * @returns {Promise<string>} The full streamed text.
 */
async function transcribeSegment(audioSegment, onPartialResult) {
  // Build prompt using chat template
  const taskKey = promptSelect.value;
  const content = TASK_PROMPTS[taskKey] ?? TASK_PROMPTS['transcribe'];
  const messages = [{ role: 'user', content }];
  const text = processor.tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    tokenize: false,
  });

  // Process text + audio into model inputs
  const inputs = await processor(text, audioSegment, { sampling_rate: SAMPLE_RATE });

  // Streaming via TextStreamer
  let accumulated = '';
  const streamer = new TextStreamer(processor.tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: (chunk) => {
      accumulated += chunk;
      if (onPartialResult) {
        onPartialResult(accumulated);
      }
    },
  });

  // Generate
  await model.generate({
    ...inputs,
    max_new_tokens: MAX_NEW_TOKENS,
    streamer,
  });

  return accumulated;
}

/**
 * Wait until audio playback reaches a specific time.
 * Also resolves as soon as the player is paused.
 * @param {number} targetTime - Playback position in seconds.
 * @returns {Promise<void>}
 */
function waitForPlaybackTime(targetTime) {
  return new Promise((resolve) => {
    const check = () => {
      if (audioPlayer.paused || audioPlayer.currentTime >= targetTime) {
        resolve();
      } else {
        requestAnimationFrame(check);
      }
    };
    check();
  });
}

/**
 * Run inference with segmentation and audio sync.
 * Splits the loaded audio into segments (VAD when the checkbox is ticked,
 * otherwise one segment covering everything), starts playback, and transcribes
 * each segment once the wall clock reaches its start time. Only one run may be
 * active at a time.
 * @returns {Promise<void>}
 */
async function transcribe() {
  if (!model || !processor || !currentAudioData) {
    setStatus('error', 'Model or audio not ready');
    return;
  }
  // A stopped run may still be finishing its in-flight segment.
  if (isTranscribing) return;

  // Work on the audio that was loaded when the run started, so clearing or
  // replacing it mid-run cannot mix audio or dereference null.
  const audioData = currentAudioData;
  const isStale = () => currentAudioData !== audioData;

  isTranscribing = true;
  transcriptionAborted = false;
  setStatus('processing', 'Processing audio...');
  transcribeBtn.disabled = true;
  outputText.textContent = '';
  transcriptCard.style.display = 'block';
  showProgress(true);

  try {
    // Get speech segments using VAD, or treat entire audio as one segment
    let segments;
    if (vadCheckbox.checked) {
      updateProgress(5, 'Detecting speech segments...');
      segments = await getSpeechSegments(audioData, SAMPLE_RATE);
      console.log(`VAD found ${segments.length} segment(s)`);
    } else {
      segments = [{ start: 0, end: audioData.length / SAMPLE_RATE }];
    }

    // Start audio playback immediately
    audioPlayer.currentTime = 0;
    startPlayback();
    const playbackStartTime = performance.now() / 1000;

    // Process and display segments in sync with audio
    const displayedResults = [];
    const totalSegments = segments.length;

    for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
      if (transcriptionAborted || isStale()) break;
      const seg = segments[segIdx];

      // Update progress bar
      const segProgress = ((segIdx + 1) / totalSegments) * 100;
      updateProgress(segProgress, '');

      // Wait for audio to reach this segment's start time
      const elapsed = (performance.now() / 1000) - playbackStartTime;
      const waitTime = seg.start - elapsed;
      if (waitTime > 0) {
        await new Promise((resolve) => setTimeout(resolve, waitTime * 1000));
        // The user may have stopped the run while we were waiting.
        if (transcriptionAborted || isStale()) break;
      }

      setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);

      // Extract and transcribe this segment
      const startSample = Math.floor(seg.start * SAMPLE_RATE);
      const endSample = Math.floor(seg.end * SAMPLE_RATE);
      const audioSegment = audioData.slice(startSample, endSample);
      if (audioSegment.length === 0) continue;
      const timestamp = formatTimestamp(seg.start);

      // Transcribe with streaming display
      const segmentText = await transcribeSegment(audioSegment, (partial) => {
        if (isStale()) return;
        const rows = [...displayedResults, makeTranscriptRow(timestamp, partial)];
        outputText.innerHTML = rows.join('');
        outputText.scrollTop = outputText.scrollHeight;
      });
      if (isStale()) break;

      if (segmentText.trim()) {
        let finalSegmentText = segmentText.trim();

        // Apply punctuation/capitalization for English only
        if (punctuationCheckbox.checked) {
          const detectedLang = detect(finalSegmentText);
          if (detectedLang === 'en') {
            const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
            finalSegmentText = await applyPunctuation(stripped, 'en');
            // NOTE(review): the pattern stripped here was lost in extraction
            // (only the `gi` flags and the ' ' replacement survived); `<br>`
            // tags are assumed — confirm against the punctuator output.
            finalSegmentText = finalSegmentText.replace(/<br\s*\/?>/gi, ' ').replace(/\s+/g, ' ').trim();
            if (isStale()) break;
          }
        }

        displayedResults.push(makeTranscriptRow(timestamp, finalSegmentText));
        outputText.innerHTML = displayedResults.join('');
        outputText.scrollTop = outputText.scrollHeight;
      }
    }

    if (transcriptionAborted || isStale()) {
      // Whoever stopped the run already set the status. Keep whatever was
      // transcribed so far available for copying.
      if (!isStale() && displayedResults.length > 0) {
        copyBtn.disabled = false;
      }
      showProgress(false);
      return;
    }

    // Final output
    if (displayedResults.length === 0) {
      outputText.textContent = '(No speech detected)';
    }

    copyBtn.disabled = false;
    showProgress(false);
    setStatus('ready', 'Transcription complete');
  } catch (error) {
    console.error('Transcription failed:', error);
    setStatus('error', `Error: ${error?.message ?? error}`);
    showProgress(false);
  } finally {
    isTranscribing = false;
    // Stay disabled if the audio was cleared while the run was in flight.
    transcribeBtn.disabled = !currentAudioData;
  }
}

// Audio recording

/** Start recording if idle, otherwise stop the active recording. */
function toggleRecording() {
  if (isRecording) {
    stopRecording();
  } else {
    startRecording();
  }
}

/**
 * Ask for microphone access and start a MediaRecorder. When the recorder
 * stops, the recording is loaded into the player and decoded.
 * @returns {Promise<void>}
 */
async function startRecording() {
  let stream = null;
  const releaseStream = () => {
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
  };

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    audioChunks = [];

    recorder.ondataavailable = (event) => {
      if (event.data && event.data.size > 0) {
        audioChunks.push(event.data);
      }
    };

    recorder.onstop = async () => {
      // Release the microphone first so it is freed even if decoding fails.
      releaseStream();

      // Label the blob with the container the recorder actually produced
      // rather than a hard-coded 'audio/wav'.
      const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/webm' });
      setAudioSource(audioBlob);
      audioPreview.style.display = 'flex';
      transcribeSection.style.display = 'flex';

      await processAudioBlob(audioBlob);
      drawWaveform();
      updateAudioTime();
    };

    recorder.start();
    isRecording = true;
    setStatus('recording', 'Recording...');

    // Update button UI
    recordBtn.querySelector('.mic-icon').style.display = 'none';
    recordBtn.querySelector('.stop-icon').style.display = 'block';
    recordBtn.querySelector('span').textContent = 'Stop';
    recordBtn.classList.add('recording');
  } catch (error) {
    // Do not leave the microphone open if setup failed after access was granted.
    releaseStream();
    console.error('Recording failed:', error);
    setStatus('error', 'Microphone access denied');
  }
}

/** Stop the active recording (if any) and restore the record button. */
function stopRecording() {
  if (mediaRecorder && mediaRecorder.state !== 'inactive') {
    mediaRecorder.stop();
    isRecording = false;
    setStatus('ready', 'Recording stopped - Click Transcribe');

    // Update button UI
    recordBtn.querySelector('.mic-icon').style.display = 'block';
    recordBtn.querySelector('.stop-icon').style.display = 'none';
    recordBtn.querySelector('span').textContent = 'Record';
    recordBtn.classList.remove('recording');
  }
}

/**
 * Decode an audio file/blob into a mono Float32Array at SAMPLE_RATE and store
 * it in currentAudioData.
 * @param {Blob} blob
 * @returns {Promise<boolean>} True when decoding succeeded. On failure the
 *   status is set to an error, currentAudioData is cleared and false is returned.
 */
async function processAudioBlob(blob) {
  let audioCtx = null;
  try {
    const arrayBuffer = await blob.arrayBuffer();
    audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

    // Convert to mono Float32Array by averaging every channel
    const channelCount = audioBuffer.numberOfChannels;
    let audioData;
    if (channelCount > 1) {
      audioData = new Float32Array(audioBuffer.length);
      for (let ch = 0; ch < channelCount; ch++) {
        const channel = audioBuffer.getChannelData(ch);
        for (let i = 0; i < channel.length; i++) {
          audioData[i] += channel[i] / channelCount;
        }
      }
    } else {
      audioData = audioBuffer.getChannelData(0);
    }

    // Resample if needed
    if (audioBuffer.sampleRate !== SAMPLE_RATE) {
      audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
    }

    currentAudioData = audioData;
    transcribeBtn.disabled = false;
    return true;
  } catch (error) {
    console.error('Audio processing failed:', error);
    setStatus('error', 'Failed to process audio');
    // The player now points at audio that could not be decoded; drop the
    // previous samples so they are not transcribed by mistake.
    currentAudioData = null;
    transcribeBtn.disabled = true;
    return false;
  } finally {
    // Close the context after use so contexts do not pile up across loads.
    if (audioCtx && audioCtx.state !== 'closed') {
      try {
        await audioCtx.close();
      } catch (closeError) {
        console.warn('Failed to close AudioContext:', closeError);
      }
    }
  }
}

/**
 * Simple linear resampling.
 * @param {Float32Array} audioData - Input samples.
 * @param {number} fromRate - Sample rate of audioData.
 * @param {number} toRate - Desired sample rate.
 * @returns {Float32Array} Resampled copy.
 */
function resample(audioData, fromRate, toRate) {
  const ratio = fromRate / toRate;
  const newLength = Math.round(audioData.length / ratio);
  const result = new Float32Array(newLength);

  for (let i = 0; i < newLength; i++) {
    const srcIndex = i * ratio;
    const srcIndexFloor = Math.floor(srcIndex);
    // Clamp so the last output sample does not read past the input.
    const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
    const t = srcIndex - srcIndexFloor;
    result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
  }

  return result;
}

/**
 * Handle file upload.
 * @param {Event} event - `change` event from the file input.
 * @returns {Promise<void>}
 */
async function handleFileUpload(event) {
  const file = event.target.files[0];
  if (!file) return;
  await loadAudioFile(file);
}

/**
 * Handle dropped files; only files with an `audio/` MIME type are accepted.
 * @param {DragEvent} event
 * @returns {Promise<void>}
 */
async function handleFileDrop(event) {
  event.preventDefault();
  inputCard.classList.remove('drag-over');

  const file = event.dataTransfer.files[0];
  if (!file || !file.type.startsWith('audio/')) {
    setStatus('error', 'Please drop an audio file');
    return;
  }
  await loadAudioFile(file);
}

/**
 * Common file loading logic: show the player, decode the audio and draw it.
 * @param {File} file
 * @returns {Promise<void>}
 */
async function loadAudioFile(file) {
  setStatus('processing', 'Processing audio file...');

  setAudioSource(file);
  audioPreview.style.display = 'flex';
  transcribeSection.style.display = 'flex';

  const decoded = await processAudioBlob(file);
  drawWaveform();
  updateAudioTime();

  // Keep the error status set by processAudioBlob when decoding failed.
  if (decoded) {
    setStatus('ready', 'Audio loaded - Click Transcribe');
  }
}

/**
 * Draw waveform visualization: one 2px bar every 3px, each showing the mean
 * absolute amplitude of its slice, normalized to the loudest bar.
 */
function drawWaveform() {
  if (!currentAudioData) return;

  const canvas = waveformCanvas;
  const ctx = canvas.getContext('2d');
  const dpr = window.devicePixelRatio || 1;

  // Set canvas size
  const rect = canvas.getBoundingClientRect();
  canvas.width = rect.width * dpr;
  canvas.height = rect.height * dpr;
  ctx.scale(dpr, dpr);

  const width = rect.width;
  const height = rect.height;
  const centerY = height / 2;

  // Downsample audio data for visualization
  const samples = currentAudioData;
  const barCount = Math.floor(width / 3);
  if (barCount <= 0) return;
  // At least one sample per bar, so clips shorter than the bar count do not
  // divide by zero and turn every bar into NaN.
  const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount));

  // Calculate bar amplitudes
  const barAmplitudes = [];
  for (let i = 0; i < barCount; i++) {
    let sum = 0;
    const start = i * samplesPerBar;
    for (let j = 0; j < samplesPerBar; j++) {
      sum += Math.abs(samples[start + j] || 0);
    }
    barAmplitudes.push(sum / samplesPerBar);
  }

  // Find max amplitude for normalization
  const maxAmp = Math.max(...barAmplitudes, 0.01);

  // Get color based on color scheme
  const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
  ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';

  // Draw bars normalized to fill height
  for (let i = 0; i < barCount; i++) {
    const normalized = barAmplitudes[i] / maxAmp;
    const barHeight = Math.max(2, normalized * height * 0.9);
    ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
  }
}

/**
 * Format time as M:SS.
 * @param {number} seconds
 * @returns {string}
 */
function formatTime(seconds) {
  const mins = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${mins}:${secs.toString().padStart(2, '0')}`;
}

/**
 * Update audio time display and the waveform progress overlay.
 * Shows only the decoded length when no playable duration is known yet.
 */
function updateAudioTime() {
  const current = audioPlayer.currentTime || 0;
  const duration = getAudioDuration();
  if (duration > 0) {
    audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
    const percent = Math.min(100, (current / duration) * 100);
    waveformProgress.style.width = `${percent}%`;
  } else {
    audioTime.textContent = formatTime(currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0);
  }
}

/**
 * Toggle play/pause. Pausing while a transcription is running stops it; the
 * Transcribe button is re-enabled by transcribe() once the in-flight segment
 * has finished, so two runs can never overlap.
 */
function togglePlayback() {
  if (audioPlayer.paused) {
    startPlayback();
  } else {
    audioPlayer.pause();
    setPlayingIcon(false);

    // Stop transcription if running
    if (isTranscribing && !transcriptionAborted) {
      transcriptionAborted = true;
      showProgress(false);
      setStatus('ready', 'Transcription stopped');
    }
  }
}

/**
 * Seek in audio to the clicked position on the waveform.
 * @param {MouseEvent} event
 */
function seekAudio(event) {
  const duration = getAudioDuration();
  const rect = waveformCanvas.getBoundingClientRect();
  // Assigning a non-finite currentTime throws, so bail out when the duration
  // is not known or the canvas has no width.
  if (duration <= 0 || rect.width <= 0) return;

  const x = event.clientX - rect.left;
  const percent = Math.min(1, Math.max(0, x / rect.width));
  audioPlayer.currentTime = percent * duration;
  updateAudioTime();
}

/**
 * Copy the transcript text to the clipboard.
 * @returns {Promise<void>}
 */
async function copyToClipboard() {
  try {
    await navigator.clipboard.writeText(outputText.textContent);
    // Brief visual feedback via title attribute
    const originalTitle = copyBtn.title;
    copyBtn.title = 'Copied!';
    setTimeout(() => {
      copyBtn.title = originalTitle;
    }, 2000);
  } catch (error) {
    console.error('Copy failed:', error);
  }
}

/**
 * Download the transcript as transcript.txt.
 */
function downloadTranscript() {
  // Convert <br> (and row ends) to newlines, strip other HTML, then decode the
  // entities so the file contains the plain text.
  const text = unescapeHtml(
    outputText.innerHTML
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/div>/gi, '\n')
      .replace(/<[^>]+>/g, ''),
  ).trim();

  if (!text) return;

  const blob = new Blob([text], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = 'transcript.txt';
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);

  // Brief visual feedback
  const originalTitle = downloadBtn.title;
  downloadBtn.title = 'Downloaded!';
  setTimeout(() => {
    downloadBtn.title = originalTitle;
  }, 2000);
}

/**
 * Reset the page to its "no audio loaded" state.
 */
function clearAudio() {
  // Stop any running transcription so it does not write into the cleared view
  transcriptionAborted = true;

  // Stop any playback. Removing the attribute (instead of assigning '')
  // avoids the element trying to load the page URL as media.
  audioPlayer.pause();
  audioPlayer.removeAttribute('src');
  audioPlayer.load();
  setPlayingIcon(false);
  if (currentAudioUrl) {
    URL.revokeObjectURL(currentAudioUrl);
    currentAudioUrl = null;
  }

  // Reset audio state
  currentAudioData = null;

  // Hide audio player and transcribe section
  audioPreview.style.display = 'none';
  transcribeSection.style.display = 'none';

  // Clear transcript
  transcriptCard.style.display = 'none';
  outputText.textContent = '';
  showProgress(false);

  // Reset waveform
  waveformProgress.style.width = '0%';
  const ctx = waveformCanvas.getContext('2d');
  ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);

  // Reset time display
  audioTime.textContent = '0:00';

  // Reset buttons
  transcribeBtn.disabled = true;

  // Reset file input
  audioFile.value = '';

  // Update status
  setStatus('ready', 'Ready');
}

// Event listeners
recordBtn.addEventListener('click', toggleRecording);
audioFile.addEventListener('change', handleFileUpload);

// Audio player controls
playBtn.addEventListener('click', togglePlayback);
waveformCanvas.addEventListener('click', seekAudio);
audioPlayer.addEventListener('timeupdate', updateAudioTime);
audioPlayer.addEventListener('ended', () => {
  setPlayingIcon(false);
  waveformProgress.style.width = '0%';
});

// Redraw waveform on resize
window.addEventListener('resize', drawWaveform);

transcribeBtn.addEventListener('click', transcribe);
copyBtn.addEventListener('click', copyToClipboard);
// Transcript download and reset buttons
downloadBtn.addEventListener('click', downloadTranscript);
clearBtn.addEventListener('click', clearAudio);

// Drag and drop on input card: each entry maps a drag event to whether the
// card should carry the 'drag-over' highlight class after that event fires.
const dragHighlightStates = [
  ['dragover', true],
  ['dragleave', false],
];
for (const [eventName, highlighted] of dragHighlightStates) {
  inputCard.addEventListener(eventName, (dragEvent) => {
    dragEvent.preventDefault();
    inputCard.classList.toggle('drag-over', highlighted);
  });
}
inputCard.addEventListener('drop', handleFileDrop);

// Initialize on load
window.addEventListener('load', initModels);