Spaces:
Running
Running
/**
 * Main Application Component
 *
 * Parakeet STT Progressive Transcription Demo with WebGPU
 */
import { useState, useEffect, useRef } from 'react';

import TranscriptionDisplay from './components/TranscriptionDisplay';
import PerformanceMetrics from './components/PerformanceMetrics';
import Progress from './components/Progress';
import { AudioRecorder, AudioProcessor } from './utils/audio';
import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming';

// Import the transcription worker as a URL (Vite `?worker&url` syntax)
import WorkerUrl from './worker.js?worker&url';
| function App() { | |
| // Model state | |
| const [modelStatus, setModelStatus] = useState('not_loaded'); | |
| const [modelMessage, setModelMessage] = useState(''); | |
| const [device, setDevice] = useState(null); | |
| // Microphone device selection | |
| const [audioDevices, setAudioDevices] = useState([]); | |
| const [selectedDeviceId, setSelectedDeviceId] = useState(null); | |
| // Recording state | |
| const [isRecording, setIsRecording] = useState(false); | |
| const [fixedText, setFixedText] = useState(''); | |
| const [activeText, setActiveText] = useState(''); | |
| const [timestamp, setTimestamp] = useState(0); | |
| const [audioLevel, setAudioLevel] = useState(0); | |
| // Performance metrics | |
| const [latency, setLatency] = useState(null); | |
| const [rtf, setRtf] = useState(null); | |
| const [audioDuration, setAudioDuration] = useState(null); | |
| const [windowState, setWindowState] = useState(null); | |
| const [isProcessingFile, setIsProcessingFile] = useState(false); | |
| const [fileDuration, setFileDuration] = useState(null); | |
| // File upload | |
| const [uploadedFileUrl, setUploadedFileUrl] = useState(null); | |
| const [autoScroll, setAutoScroll] = useState(true); | |
| // Progress tracking | |
| const [progressItems, setProgressItems] = useState([]); | |
| // Refs | |
| const workerRef = useRef(null); | |
| const recorderRef = useRef(null); | |
| const audioProcessorRef = useRef(null); | |
| const streamingHandlerRef = useRef(null); | |
| const progressiveIntervalRef = useRef(null); | |
| // Enumerate audio input devices | |
| useEffect(() => { | |
| async function getDevices() { | |
| try { | |
| const devices = await navigator.mediaDevices.enumerateDevices(); | |
| const audioInputs = devices.filter(device => device.kind === 'audioinput'); | |
| setAudioDevices(audioInputs); | |
| // Auto-select the default device (first one with "default" in deviceId) | |
| const defaultDevice = audioInputs.find(d => d.deviceId === 'default'); | |
| if (defaultDevice && !selectedDeviceId) { | |
| setSelectedDeviceId(defaultDevice.deviceId); | |
| console.log('[App] Auto-selected default device:', defaultDevice.label); | |
| } | |
| console.log('[App] Available audio devices:', audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`)); | |
| } catch (error) { | |
| console.error('[App] Failed to enumerate devices:', error); | |
| } | |
| } | |
| getDevices(); | |
| }, []); | |
| // Initialize worker | |
| useEffect(() => { | |
| workerRef.current = new Worker(WorkerUrl, { type: 'module' }); | |
| workerRef.current.onmessage = (event) => { | |
| const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data; | |
| if (status === 'loading') { | |
| setModelStatus('loading'); | |
| setModelMessage(message); | |
| } else if (status === 'ready') { | |
| setModelStatus('ready'); | |
| setModelMessage(message); | |
| setDevice(deviceType); | |
| } else if (status === 'error') { | |
| setModelStatus('error'); | |
| setModelMessage(message); | |
| console.error('Worker error:', event.data); | |
| } else if (status === 'transcription' && result) { | |
| // Update performance metrics | |
| if (result.metadata) { | |
| setLatency(result.metadata.latency); | |
| setRtf(result.metadata.rtf); | |
| setAudioDuration(result.metadata.audioDuration); | |
| } | |
| } else if (status === 'initiate') { | |
| // New file download initiated | |
| setProgressItems(prev => [...prev, { file, progress: 0, total }]); | |
| } else if (status === 'progress') { | |
| // Update progress for existing file | |
| setProgressItems(prev => | |
| prev.map(item => | |
| item.file === file ? { ...item, progress, total, loaded } : item | |
| ) | |
| ); | |
| } else if (status === 'done') { | |
| // File download complete - keep it at 100% briefly then remove | |
| setProgressItems(prev => | |
| prev.map(item => | |
| item.file === file ? { ...item, progress: 100 } : item | |
| ) | |
| ); | |
| } | |
| }; | |
| return () => { | |
| if (workerRef.current) { | |
| workerRef.current.terminate(); | |
| } | |
| }; | |
| }, []); | |
| const loadModel = async () => { | |
| if (modelStatus === 'loading' || modelStatus === 'ready') return; | |
| setModelStatus('loading'); | |
| setModelMessage('Initializing model...'); | |
| workerRef.current.postMessage({ | |
| type: 'load', | |
| data: { | |
| modelVersion: "parakeet-tdt-0.6b-v3", // Multilingual Parakeet | |
| options: { | |
| device: 'webgpu', // Hybrid: GPU encoder + WASM decoder for optimal performance | |
| }, | |
| }, | |
| }); | |
| }; | |
| const clearCache = async () => { | |
| if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) { | |
| return; | |
| } | |
| try { | |
| const dbs = await indexedDB.databases(); | |
| for (const db of dbs) { | |
| indexedDB.deleteDatabase(db.name); | |
| console.log('Deleted IndexedDB:', db.name); | |
| } | |
| alert('Cache cleared! Reload the page to start fresh.'); | |
| window.location.reload(); | |
| } catch (error) { | |
| console.error('Failed to clear cache:', error); | |
| alert('Failed to clear cache. Try clearing browser data manually.'); | |
| } | |
| }; | |
| const startRecording = async () => { | |
| if (modelStatus !== 'ready') { | |
| alert('Please load the model first'); | |
| return; | |
| } | |
| try { | |
| // Reset state | |
| setFixedText(''); | |
| setActiveText(''); | |
| setTimestamp(0); | |
| setLatency(null); | |
| setRtf(null); | |
| setAudioDuration(null); | |
| // Initialize audio processor | |
| audioProcessorRef.current = new AudioProcessor(); | |
| // Create model wrapper for progressive streaming | |
| const modelWrapper = { | |
| transcribe: async (audio) => { | |
| return new Promise((resolve) => { | |
| const messageHandler = (event) => { | |
| if (event.data.status === 'transcription') { | |
| workerRef.current.removeEventListener('message', messageHandler); | |
| resolve(event.data.result); | |
| } | |
| }; | |
| workerRef.current.addEventListener('message', messageHandler); | |
| workerRef.current.postMessage({ | |
| type: 'transcribe', | |
| data: { audio }, | |
| }); | |
| }); | |
| }, | |
| }; | |
| // Initialize progressive streaming handler | |
| streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, { | |
| emissionInterval: 0.5, // 500ms | |
| maxWindowSize: 15.0, // 15 seconds | |
| sentenceBuffer: 2.0, // 2 seconds | |
| }); | |
| // Start recording with callback for audio chunks | |
| let quietWarningCount = 0; | |
| recorderRef.current = new AudioRecorder((audioChunk) => { | |
| // Append PCM audio chunk directly (Float32Array) | |
| const maxAmp = Math.max(...Array.from(audioChunk).map(Math.abs)); | |
| // Update audio level meter (scale to 0-100%) | |
| setAudioLevel(Math.min(100, maxAmp * 300)); // Scale up for visibility | |
| // Only warn about quiet audio once every 20 chunks (~3 seconds) | |
| if (maxAmp < 0.001) { | |
| quietWarningCount++; | |
| if (quietWarningCount === 1 || quietWarningCount % 20 === 0) { | |
| console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.'); | |
| } | |
| } else { | |
| quietWarningCount = 0; | |
| } | |
| audioProcessorRef.current.appendChunk(audioChunk); | |
| }); | |
| await recorderRef.current.start(selectedDeviceId); | |
| setIsRecording(true); | |
| // Start progressive transcription updates | |
| let transcriptionInProgress = false; | |
| progressiveIntervalRef.current = setInterval(async () => { | |
| // Stop if recording stopped | |
| if (!recorderRef.current || !recorderRef.current.isRecording) { | |
| if (progressiveIntervalRef.current) { | |
| clearInterval(progressiveIntervalRef.current); | |
| progressiveIntervalRef.current = null; | |
| } | |
| return; | |
| } | |
| const audioBuffer = audioProcessorRef.current.getBuffer(); | |
| const duration = audioBuffer.length / 16000; | |
| // Update timestamp even if not transcribing yet | |
| setTimestamp(duration); | |
| // Skip if previous transcription still in progress (matches Python MLX lock behavior) | |
| if (transcriptionInProgress) { | |
| console.debug('Skipping progressive update (previous transcription still running)'); | |
| return; | |
| } | |
| // Simple VAD: Check if there's voice activity in the last 2 seconds | |
| // This prevents wasting compute on silence | |
| const vadWindowSize = Math.min(32000, audioBuffer.length); // Last 2 seconds or less | |
| const recentAudio = audioBuffer.slice(-vadWindowSize); | |
| let maxAmp = 0; | |
| for (let i = 0; i < recentAudio.length; i++) { | |
| const abs = Math.abs(recentAudio[i]); | |
| if (abs > maxAmp) maxAmp = abs; | |
| } | |
| const hasVoiceActivity = maxAmp > 0.01; // Threshold for voice activity | |
| // Only transcribe if we have enough audio (at least 1 second) AND voice activity detected | |
| if (audioBuffer.length >= 16000 && hasVoiceActivity) { | |
| try { | |
| transcriptionInProgress = true; | |
| const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer); | |
| setFixedText(result.fixedText); | |
| setActiveText(result.activeText); | |
| // Update window state | |
| setWindowState(duration >= 15 ? 'sliding' : 'growing'); | |
| } catch (error) { | |
| console.error('Progressive transcription error:', error); | |
| // Show error in UI | |
| setActiveText(`Error: ${error.message}`); | |
| } finally { | |
| transcriptionInProgress = false; | |
| } | |
| } else { | |
| // Not enough audio yet | |
| setWindowState('growing'); | |
| } | |
| }, 250); // 250ms updates | |
| } catch (error) { | |
| console.error('Failed to start recording:', error); | |
| alert('Failed to start recording: ' + error.message); | |
| setIsRecording(false); | |
| } | |
| }; | |
| const handleFileUpload = async (file) => { | |
| try { | |
| setFixedText(''); | |
| setActiveText('Loading file...'); | |
| setTimestamp(0); | |
| setIsProcessingFile(true); | |
| setLatency(null); | |
| setRtf(null); | |
| // Create audio URL for playback | |
| const fileUrl = URL.createObjectURL(file); | |
| setUploadedFileUrl(fileUrl); | |
| // Read audio file | |
| const audioContext = new AudioContext({ sampleRate: 16000 }); | |
| const arrayBuffer = await file.arrayBuffer(); | |
| const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); | |
| // Convert to Float32Array at 16kHz | |
| const audioData = audioBuffer.getChannelData(0); | |
| const duration = audioData.length / 16000; | |
| // Set file duration for metrics display | |
| setFileDuration(duration); | |
| setActiveText('Processing with progressive streaming...'); | |
| // Create a fresh streaming handler for this file | |
| const fileStreamingHandler = new SmartProgressiveStreamingHandler( | |
| { transcribe: async (audio) => { | |
| return new Promise((resolve, reject) => { | |
| const handleResult = (event) => { | |
| if (event.data.status === 'transcription') { | |
| workerRef.current.removeEventListener('message', handleResult); | |
| resolve(event.data.result); | |
| } else if (event.data.status === 'error') { | |
| workerRef.current.removeEventListener('message', handleResult); | |
| reject(new Error(event.data.message)); | |
| } | |
| }; | |
| workerRef.current.addEventListener('message', handleResult); | |
| workerRef.current.postMessage({ | |
| type: 'transcribe', | |
| data: { audio, sampleRate: 16000 }, | |
| }); | |
| }); | |
| }}, | |
| { | |
| emissionInterval: 0.5, // 500ms updates | |
| maxWindowSize: 15.0, // 15 seconds | |
| sentenceBuffer: 2.0, // 2 seconds | |
| } | |
| ); | |
| // Use batch streaming (fast processing with full windows) | |
| const startTime = performance.now(); | |
| let updateCount = 0; | |
| for await (const result of fileStreamingHandler.transcribeBatch(audioData)) { | |
| updateCount++; | |
| setFixedText(result.fixedText); | |
| setActiveText(result.activeText); | |
| setTimestamp(result.timestamp); | |
| // Update window state | |
| setWindowState('sliding'); // Batch mode always uses full windows | |
| // Update metrics continuously during processing | |
| const currentTime = performance.now(); | |
| const elapsedTime = (currentTime - startTime) / 1000; | |
| // RTF = how much audio transcribed / time spent processing | |
| const currentRTF = result.timestamp / elapsedTime; | |
| setLatency(elapsedTime); | |
| setRtf(currentRTF); | |
| // Final cleanup | |
| if (result.isFinal) { | |
| setWindowState(null); | |
| setIsProcessingFile(false); | |
| console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('Failed to process file:', error); | |
| alert('Failed to process file: ' + error.message); | |
| setActiveText(`Error: ${error.message}`); | |
| setWindowState(null); | |
| setIsProcessingFile(false); | |
| } | |
| }; | |
| const stopRecording = async () => { | |
| if (!isRecording) return; | |
| // Stop progressive updates first | |
| if (progressiveIntervalRef.current) { | |
| clearInterval(progressiveIntervalRef.current); | |
| progressiveIntervalRef.current = null; | |
| } | |
| // Set recording to false immediately to stop the interval loop | |
| setIsRecording(false); | |
| // Wait a bit for any in-flight transcription to complete | |
| await new Promise(resolve => setTimeout(resolve, 100)); | |
| // Stop recorder | |
| if (recorderRef.current) { | |
| try { | |
| await recorderRef.current.stop(); | |
| // Final transcription | |
| const audioBuffer = audioProcessorRef.current.getBuffer(); | |
| if (audioBuffer.length > 0 && streamingHandlerRef.current) { | |
| try { | |
| const finalText = await streamingHandlerRef.current.finalize(audioBuffer); | |
| setFixedText(finalText); | |
| setActiveText(''); | |
| } catch (error) { | |
| // Ignore ONNX session errors during cleanup | |
| if (!error.message.includes('Session')) { | |
| console.error('Error in final transcription:', error); | |
| } | |
| } | |
| } | |
| } catch (error) { | |
| console.error('Error stopping recording:', error); | |
| } | |
| } | |
| setWindowState(null); | |
| }; | |
| return ( | |
| <div className="min-h-screen bg-gradient-to-b from-gray-950 to-gray-900 text-white"> | |
| {/* Header */} | |
| <header className="border-b border-gray-800 bg-gray-950/50 backdrop-blur"> | |
| <div className="max-w-6xl mx-auto px-6 py-6"> | |
| <h1 className="text-3xl font-bold bg-gradient-to-r from-cyan-400 to-blue-500 bg-clip-text text-transparent"> | |
| 🎤 Parakeet STT Progressive Transcription | |
| </h1> | |
| <p className="text-gray-400 mt-2"> | |
| Real-time speech recognition with smart progressive streaming • WebGPU accelerated | |
| </p> | |
| <p className="text-gray-500 text-xs mt-2"> | |
| 💾 Model files (~2.5GB) are cached locally for faster loading on future visits | |
| </p> | |
| </div> | |
| </header> | |
| {/* Main Content */} | |
| <main className="max-w-6xl mx-auto px-6 py-8 space-y-8"> | |
| {/* Controls */} | |
| <div className="w-full max-w-4xl mx-auto bg-gray-900 rounded-lg border border-gray-700 p-4"> | |
| <h2 className="text-lg font-semibold mb-3">Controls</h2> | |
| {/* Microphone Selection */} | |
| <div className="mb-3"> | |
| <label className="block text-xs font-medium text-gray-400 mb-1">Microphone</label> | |
| <select | |
| value={selectedDeviceId || ''} | |
| onChange={(e) => setSelectedDeviceId(e.target.value)} | |
| className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white" | |
| disabled={isRecording} | |
| > | |
| {audioDevices.length === 0 && <option value="">Loading devices...</option>} | |
| {audioDevices.map((device) => ( | |
| <option key={device.deviceId} value={device.deviceId}> | |
| {device.label || `Microphone ${device.deviceId.slice(0, 8)}...`} | |
| </option> | |
| ))} | |
| </select> | |
| </div> | |
| {/* Audio Level Meter */} | |
| {isRecording && ( | |
| <div className="mb-3"> | |
| <label className="block text-xs font-medium text-gray-400 mb-1">Audio Level</label> | |
| <div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden"> | |
| <div | |
| className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75" | |
| style={{ width: `${audioLevel}%` }} | |
| ></div> | |
| </div> | |
| </div> | |
| )} | |
| {/* Model Status and Actions */} | |
| <div className="flex items-center justify-between"> | |
| <div> | |
| <h3 className="text-xs font-medium text-gray-400">Model Status</h3> | |
| <p className="text-xs text-gray-300 mt-0.5">{modelMessage || 'Ready to load model'}</p> | |
| </div> | |
| <div className="flex items-center gap-3"> | |
| {modelStatus === 'not_loaded' && ( | |
| <> | |
| <button | |
| onClick={loadModel} | |
| className="px-6 py-3 bg-gradient-to-r from-cyan-500 to-blue-500 hover:from-cyan-600 hover:to-blue-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl" | |
| > | |
| Load Model (~2.5GB) | |
| </button> | |
| <button | |
| onClick={clearCache} | |
| className="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-sm font-medium transition-all duration-200" | |
| title="Clear cached model files" | |
| > | |
| Clear Cache | |
| </button> | |
| </> | |
| )} | |
| {modelStatus === 'loading' && ( | |
| <div className="w-full max-w-md"> | |
| <div className="mb-4 text-gray-300 text-sm"> | |
| {modelMessage} | |
| </div> | |
| {progressItems.length > 0 ? ( | |
| <div className="bg-gray-800/50 rounded-lg p-4"> | |
| {progressItems.map((item, i) => ( | |
| <Progress key={i} text={item.file} percentage={item.progress} total={item.total} /> | |
| ))} | |
| </div> | |
| ) : ( | |
| <div className="flex items-center gap-3 text-gray-300"> | |
| <div className="w-5 h-5 border-2 border-cyan-400 border-t-transparent rounded-full animate-spin"></div> | |
| <span>Initializing...</span> | |
| </div> | |
| )} | |
| </div> | |
| )} | |
| {modelStatus === 'ready' && ( | |
| <div className="flex items-center gap-4"> | |
| <div className="px-4 py-2 bg-green-900/30 border border-green-700 rounded-lg text-green-400 text-sm font-semibold"> | |
| ✓ Ready | |
| </div> | |
| {!isRecording ? ( | |
| <> | |
| <button | |
| onClick={startRecording} | |
| className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl" | |
| > | |
| Start Recording | |
| </button> | |
| <label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer"> | |
| Upload Audio | |
| <input | |
| type="file" | |
| accept="audio/*" | |
| className="hidden" | |
| onChange={(e) => { | |
| const file = e.target.files?.[0]; | |
| if (file) handleFileUpload(file); | |
| }} | |
| /> | |
| </label> | |
| </> | |
| ) : ( | |
| <button | |
| onClick={stopRecording} | |
| className="px-6 py-3 bg-gradient-to-r from-red-500 to-pink-500 hover:from-red-600 hover:to-pink-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl" | |
| > | |
| Stop Recording | |
| </button> | |
| )} | |
| </div> | |
| )} | |
| {modelStatus === 'error' && ( | |
| <button | |
| onClick={loadModel} | |
| className="px-6 py-3 bg-red-900/30 border border-red-700 hover:bg-red-900/50 rounded-lg font-semibold transition-all duration-200" | |
| > | |
| Retry | |
| </button> | |
| )} | |
| </div> | |
| </div> | |
| {/* Audio Player - only shown for uploaded files */} | |
| {uploadedFileUrl && ( | |
| <div className="mt-4 pt-4 border-t border-gray-700"> | |
| <label className="block text-sm font-medium text-gray-400 mb-2">Audio Playback</label> | |
| <audio | |
| src={uploadedFileUrl} | |
| controls | |
| className="w-full" | |
| style={{ height: '40px' }} | |
| /> | |
| </div> | |
| )} | |
| </div> | |
| {/* Transcription Display */} | |
| <TranscriptionDisplay | |
| fixedText={fixedText} | |
| activeText={activeText} | |
| timestamp={timestamp} | |
| isRecording={isRecording} | |
| autoScroll={autoScroll} | |
| onAutoScrollToggle={() => setAutoScroll(!autoScroll)} | |
| /> | |
| {/* Performance Metrics */} | |
| <PerformanceMetrics | |
| latency={latency} | |
| rtf={rtf} | |
| audioDuration={audioDuration} | |
| windowState={windowState} | |
| device={device} | |
| updateInterval={250} | |
| isProcessingFile={isProcessingFile} | |
| fileDuration={fileDuration} | |
| transcribedDuration={timestamp} | |
| /> | |
| </main> | |
| {/* Footer */} | |
| <footer className="border-t border-gray-800 mt-12 py-6"> | |
| <div className="max-w-6xl mx-auto px-6 text-center text-sm text-gray-500"> | |
| <p> | |
| Built with parakeet.js, ONNX Runtime Web, React, and Vite •{' '} | |
| <a | |
| href="https://huggingface.co/spaces/andito/parakeet-v3-streaming/tree/main/source" | |
| className="text-cyan-400 hover:text-cyan-300" | |
| target="_blank" | |
| rel="noopener noreferrer" | |
| > | |
| View Source | |
| </a> | |
| </p> | |
| </div> | |
| </footer> | |
| </div> | |
| ); | |
| } | |
| export default App; | |