/**
 * Main Application Component
 *
 * Parakeet STT Progressive Transcription Demo with WebGPU
 */
import { useState, useEffect, useRef } from 'react';
import TranscriptionDisplay from './components/TranscriptionDisplay';
import PerformanceMetrics from './components/PerformanceMetrics';
import Progress from './components/Progress';
import { AudioRecorder, AudioProcessor } from './utils/audio';
import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming';

// Import worker
import WorkerUrl from './worker.js?worker&url';

function App() {
  // Model state
  const [modelStatus, setModelStatus] = useState('not_loaded');
  const [modelMessage, setModelMessage] = useState('');
  const [device, setDevice] = useState(null);

  // Microphone device selection
  const [audioDevices, setAudioDevices] = useState([]);
  const [selectedDeviceId, setSelectedDeviceId] = useState(null);

  // Recording state
  const [isRecording, setIsRecording] = useState(false);
  const [fixedText, setFixedText] = useState('');
  const [activeText, setActiveText] = useState('');
  const [timestamp, setTimestamp] = useState(0);
  const [audioLevel, setAudioLevel] = useState(0);

  // Performance metrics
  const [latency, setLatency] = useState(null);
  const [rtf, setRtf] = useState(null);
  const [audioDuration, setAudioDuration] = useState(null);
  const [windowState, setWindowState] = useState(null);
  const [isProcessingFile, setIsProcessingFile] = useState(false);
  const [fileDuration, setFileDuration] = useState(null);

  // File upload
  const [uploadedFileUrl, setUploadedFileUrl] = useState(null);
  const [autoScroll, setAutoScroll] = useState(true);

  // Progress tracking
  const [progressItems, setProgressItems] = useState([]);

  // Refs
  const workerRef = useRef(null);
  const recorderRef = useRef(null);
  const audioProcessorRef = useRef(null);
  const streamingHandlerRef = useRef(null);
  const progressiveIntervalRef = useRef(null);

  // Enumerate audio input devices
  useEffect(() => {
    async function getDevices() {
      try {
        const devices = await navigator.mediaDevices.enumerateDevices();
        const audioInputs = devices.filter(device => device.kind === 'audioinput');
        setAudioDevices(audioInputs);

        // Auto-select the default device (the entry whose deviceId is exactly 'default')
        const defaultDevice = audioInputs.find(d => d.deviceId === 'default');
        if (defaultDevice && !selectedDeviceId) {
          setSelectedDeviceId(defaultDevice.deviceId);
          console.log('[App] Auto-selected default device:', defaultDevice.label);
        }

        console.log('[App] Available audio devices:',
          audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`));
      } catch (error) {
        console.error('[App] Failed to enumerate devices:', error);
      }
    }
    getDevices();
  }, []);

  // Initialize worker
  useEffect(() => {
    workerRef.current = new Worker(WorkerUrl, { type: 'module' });

    workerRef.current.onmessage = (event) => {
      const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data;

      if (status === 'loading') {
        setModelStatus('loading');
        setModelMessage(message);
      } else if (status === 'ready') {
        setModelStatus('ready');
        setModelMessage(message);
        setDevice(deviceType);
      } else if (status === 'error') {
        setModelStatus('error');
        setModelMessage(message);
        console.error('Worker error:', event.data);
      } else if (status === 'transcription' && result) {
        // Update performance metrics
        if (result.metadata) {
          setLatency(result.metadata.latency);
          setRtf(result.metadata.rtf);
          setAudioDuration(result.metadata.audioDuration);
        }
      } else if (status === 'initiate') {
        // New file download initiated
        setProgressItems(prev => [...prev, { file, progress: 0, total }]);
      } else if (status === 'progress') {
        // Update progress for existing file
        setProgressItems(prev =>
          prev.map(item =>
            item.file === file ? { ...item, progress, total, loaded } : item
          )
        );
      } else if (status === 'done') {
        // File download complete: pin it at 100%
        setProgressItems(prev =>
          prev.map(item =>
            item.file === file ? { ...item, progress: 100 } : item
          )
        );
      }
    };

    return () => {
      if (workerRef.current) {
        workerRef.current.terminate();
      }
    };
  }, []);
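  // Worker message protocol, as consumed above (worker.js itself is not shown
  // in this file, so this summary is inferred from the handlers, not from the
  // worker source):
  //   main -> worker: { type: 'load', data: { modelVersion, options } }
  //                   { type: 'transcribe', data: { audio, sampleRate? } }
  //   worker -> main: { status: 'loading' | 'ready' | 'error', message, device? }
  //                   { status: 'initiate' | 'progress' | 'done', file, progress, total, loaded }
  //                   { status: 'transcription', result: { metadata?, ... } }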
  const loadModel = async () => {
    if (modelStatus === 'loading' || modelStatus === 'ready') return;

    setModelStatus('loading');
    setModelMessage('Initializing model...');

    workerRef.current.postMessage({
      type: 'load',
      data: {
        modelVersion: "parakeet-tdt-0.6b-v3", // Multilingual Parakeet
        options: {
          device: 'webgpu', // Hybrid: GPU encoder + WASM decoder for optimal performance
        },
      },
    });
  };

  const clearCache = async () => {
    if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) {
      return;
    }

    try {
      const dbs = await indexedDB.databases();
      for (const db of dbs) {
        indexedDB.deleteDatabase(db.name);
        console.log('Deleted IndexedDB:', db.name);
      }
      alert('Cache cleared! Reload the page to start fresh.');
      window.location.reload();
    } catch (error) {
      console.error('Failed to clear cache:', error);
      alert('Failed to clear cache. Try clearing browser data manually.');
    }
  };

  const startRecording = async () => {
    if (modelStatus !== 'ready') {
      alert('Please load the model first');
      return;
    }

    try {
      // Reset state
      setFixedText('');
      setActiveText('');
      setTimestamp(0);
      setLatency(null);
      setRtf(null);
      setAudioDuration(null);

      // Initialize audio processor
      audioProcessorRef.current = new AudioProcessor();

      // Create model wrapper for progressive streaming
      const modelWrapper = {
        transcribe: async (audio) => {
          return new Promise((resolve) => {
            const messageHandler = (event) => {
              if (event.data.status === 'transcription') {
                workerRef.current.removeEventListener('message', messageHandler);
                resolve(event.data.result);
              }
            };
            workerRef.current.addEventListener('message', messageHandler);
            workerRef.current.postMessage({
              type: 'transcribe',
              data: { audio },
            });
          });
        },
      };

      // Initialize progressive streaming handler
      streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, {
        emissionInterval: 0.5, // 500ms
        maxWindowSize: 15.0,   // 15 seconds
        sentenceBuffer: 2.0,   // 2 seconds
      });

      // Start recording with callback for audio chunks
      let quietWarningCount = 0;
      recorderRef.current = new AudioRecorder((audioChunk) => {
        // Append PCM audio chunk directly (Float32Array)
        const maxAmp = Math.max(...Array.from(audioChunk).map(Math.abs));

        // Update audio level meter (scale to 0-100%)
        setAudioLevel(Math.min(100, maxAmp * 300)); // Scale up for visibility

        // Only warn about quiet audio once every 20 chunks (~3 seconds)
        if (maxAmp < 0.001) {
          quietWarningCount++;
          if (quietWarningCount === 1 || quietWarningCount % 20 === 0) {
            console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.');
          }
        } else {
          quietWarningCount = 0;
        }

        audioProcessorRef.current.appendChunk(audioChunk);
      });

      await recorderRef.current.start(selectedDeviceId);
      setIsRecording(true);
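      // Transcript tiers (inferred from the handler's output shape below):
      // fixedText is committed text that has scrolled out of the ~15s analysis
      // window and will not change; activeText is the provisional tail that
      // may still be revised as more audio arrives.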
      // Start progressive transcription updates
      let transcriptionInProgress = false;
      progressiveIntervalRef.current = setInterval(async () => {
        // Stop if recording stopped
        if (!recorderRef.current || !recorderRef.current.isRecording) {
          if (progressiveIntervalRef.current) {
            clearInterval(progressiveIntervalRef.current);
            progressiveIntervalRef.current = null;
          }
          return;
        }

        const audioBuffer = audioProcessorRef.current.getBuffer();
        const duration = audioBuffer.length / 16000;

        // Update timestamp even if not transcribing yet
        setTimestamp(duration);

        // Skip if previous transcription still in progress (matches Python MLX lock behavior)
        if (transcriptionInProgress) {
          console.debug('Skipping progressive update (previous transcription still running)');
          return;
        }

        // Simple VAD: Check if there's voice activity in the last 2 seconds
        // This prevents wasting compute on silence
        const vadWindowSize = Math.min(32000, audioBuffer.length); // Last 2 seconds or less
        const recentAudio = audioBuffer.slice(-vadWindowSize);
        let maxAmp = 0;
        for (let i = 0; i < recentAudio.length; i++) {
          const abs = Math.abs(recentAudio[i]);
          if (abs > maxAmp) maxAmp = abs;
        }
        const hasVoiceActivity = maxAmp > 0.01; // Threshold for voice activity

        // Only transcribe if we have enough audio (at least 1 second) AND voice activity detected
        if (audioBuffer.length >= 16000 && hasVoiceActivity) {
          try {
            transcriptionInProgress = true;
            const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer);
            setFixedText(result.fixedText);
            setActiveText(result.activeText);

            // Update window state
            setWindowState(duration >= 15 ? 'sliding' : 'growing');
          } catch (error) {
            console.error('Progressive transcription error:', error);
            // Show error in UI
            setActiveText(`Error: ${error.message}`);
          } finally {
            transcriptionInProgress = false;
          }
        } else {
          // Not enough audio yet
          setWindowState('growing');
        }
      }, 250); // 250ms updates
    } catch (error) {
      console.error('Failed to start recording:', error);
      alert('Failed to start recording: ' + error.message);
      setIsRecording(false);
    }
  };
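  // Timing note: the interval above ticks every 250ms, but the handler is
  // configured with a 500ms emissionInterval and in-flight transcriptions are
  // skipped, so the effective update rate adapts to how fast the model runs.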
  const handleFileUpload = async (file) => {
    try {
      setFixedText('');
      setActiveText('Loading file...');
      setTimestamp(0);
      setIsProcessingFile(true);
      setLatency(null);
      setRtf(null);

      // Create audio URL for playback
      const fileUrl = URL.createObjectURL(file);
      setUploadedFileUrl(fileUrl);

      // Read audio file
      const audioContext = new AudioContext({ sampleRate: 16000 });
      const arrayBuffer = await file.arrayBuffer();
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

      // Convert to Float32Array at 16kHz
      const audioData = audioBuffer.getChannelData(0);
      const duration = audioData.length / 16000;

      // Set file duration for metrics display
      setFileDuration(duration);

      setActiveText('Processing with progressive streaming...');

      // Create a fresh streaming handler for this file
      const fileStreamingHandler = new SmartProgressiveStreamingHandler(
        {
          transcribe: async (audio) => {
            return new Promise((resolve, reject) => {
              const handleResult = (event) => {
                if (event.data.status === 'transcription') {
                  workerRef.current.removeEventListener('message', handleResult);
                  resolve(event.data.result);
                } else if (event.data.status === 'error') {
                  workerRef.current.removeEventListener('message', handleResult);
                  reject(new Error(event.data.message));
                }
              };
              workerRef.current.addEventListener('message', handleResult);
              workerRef.current.postMessage({
                type: 'transcribe',
                data: { audio, sampleRate: 16000 },
              });
            });
          },
        },
        {
          emissionInterval: 0.5, // 500ms updates
          maxWindowSize: 15.0,   // 15 seconds
          sentenceBuffer: 2.0,   // 2 seconds
        }
      );

      // Use batch streaming (fast processing with full windows)
      const startTime = performance.now();
      let updateCount = 0;

      for await (const result of fileStreamingHandler.transcribeBatch(audioData)) {
        updateCount++;
        setFixedText(result.fixedText);
        setActiveText(result.activeText);
        setTimestamp(result.timestamp);

        // Update window state
        setWindowState('sliding'); // Batch mode always uses full windows

        // Update metrics continuously during processing
        const currentTime = performance.now();
        const elapsedTime = (currentTime - startTime) / 1000;
        // RTF = how much audio transcribed / time spent processing
        const currentRTF = result.timestamp / elapsedTime;
        setLatency(elapsedTime);
        setRtf(currentRTF);

        // Final cleanup
        if (result.isFinal) {
          setWindowState(null);
          setIsProcessingFile(false);
          console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`);
        }
      }
    } catch (error) {
      console.error('Failed to process file:', error);
      alert('Failed to process file: ' + error.message);
      setActiveText(`Error: ${error.message}`);
      setWindowState(null);
      setIsProcessingFile(false);
    }
  };
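  // RTF reading: values above 1 mean faster than real time. For example, an
  // RTF of 2.0x means 60s of audio was transcribed in 30s of wall-clock time.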
  const stopRecording = async () => {
    if (!isRecording) return;

    // Stop progressive updates first
    if (progressiveIntervalRef.current) {
      clearInterval(progressiveIntervalRef.current);
      progressiveIntervalRef.current = null;
    }

    // Set recording to false immediately to stop the interval loop
    setIsRecording(false);

    // Wait a bit for any in-flight transcription to complete
    await new Promise(resolve => setTimeout(resolve, 100));

    // Stop recorder
    if (recorderRef.current) {
      try {
        await recorderRef.current.stop();

        // Final transcription
        const audioBuffer = audioProcessorRef.current.getBuffer();
        if (audioBuffer.length > 0 && streamingHandlerRef.current) {
          try {
            const finalText = await streamingHandlerRef.current.finalize(audioBuffer);
            setFixedText(finalText);
            setActiveText('');
          } catch (error) {
            // Ignore ONNX session errors during cleanup
            if (!error.message.includes('Session')) {
              console.error('Error in final transcription:', error);
            }
          }
        }
      } catch (error) {
        console.error('Error stopping recording:', error);
      }
    }

    setWindowState(null);
  };
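  // Render: minimal UI sketch wired to the state above. The visible strings
  // are original; the element structure, class names, and child component
  // props are assumptions, labeled as such, not recovered markup.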
  return (
    <div className="app">
      <header>
        <h1>Parakeet STT</h1>
        <p>Real-time speech recognition with smart progressive streaming • WebGPU accelerated</p>
      </header>

      <section>
        <h2>Model Status</h2>
        <p>{modelMessage || 'Ready to load model'}</p>
        {device && <p>Running on: {device}</p>}
        <button onClick={loadModel} disabled={modelStatus === 'loading' || modelStatus === 'ready'}>
          Load Model
        </button>
        <button onClick={clearCache}>Clear Cache</button>
        <p>💾 Model files (~2.5GB) are cached locally for faster loading on future visits</p>
        {/* Prop names below are assumed to mirror the progress item fields */}
        {progressItems.map(item => (
          <Progress key={item.file} file={item.file} progress={item.progress} total={item.total} loaded={item.loaded} />
        ))}
      </section>

      <section>
        <h2>Controls</h2>
        {/* Microphone Selection */}
        <select
          value={selectedDeviceId || ''}
          onChange={(e) => setSelectedDeviceId(e.target.value)}
          disabled={isRecording}
        >
          {audioDevices.map(d => (
            <option key={d.deviceId} value={d.deviceId}>{d.label || 'Unnamed'}</option>
          ))}
        </select>
        <button
          onClick={isRecording ? stopRecording : startRecording}
          disabled={modelStatus !== 'ready' || isProcessingFile}
        >
          {isRecording ? 'Stop Recording' : 'Start Recording'}
        </button>
        <input
          type="file"
          accept="audio/*"
          disabled={isRecording || isProcessingFile || modelStatus !== 'ready'}
          onChange={(e) => e.target.files[0] && handleFileUpload(e.target.files[0])}
        />
        {uploadedFileUrl && <audio src={uploadedFileUrl} controls />}
      </section>

      {/* Prop names for the two components below are assumed from the state names */}
      <TranscriptionDisplay
        fixedText={fixedText}
        activeText={activeText}
        timestamp={timestamp}
        audioLevel={audioLevel}
        autoScroll={autoScroll}
        onAutoScrollChange={setAutoScroll}
      />

      <PerformanceMetrics
        latency={latency}
        rtf={rtf}
        audioDuration={audioDuration ?? fileDuration}
        windowState={windowState}
      />
    </div>
  );
}

export default App;