| | |
| | |
| | |
| | |
| | |
| |
|
| | import { useState, useEffect, useRef } from 'react'; |
| | import TranscriptionDisplay from './components/TranscriptionDisplay'; |
| | import PerformanceMetrics from './components/PerformanceMetrics'; |
| | import Progress from './components/Progress'; |
| | import { AudioRecorder, AudioProcessor } from './utils/audio'; |
| | import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming'; |
| |
|
| | |
| | import WorkerUrl from './worker.js?worker&url'; |
| |
|
function App() {
  // --- Model state ---
  // 'not_loaded' | 'loading' | 'ready' | 'error'
  const [modelStatus, setModelStatus] = useState('not_loaded');
  const [modelMessage, setModelMessage] = useState('');
  // Backend the worker reports it is running on (e.g. 'webgpu').
  const [device, setDevice] = useState(null);

  // --- Microphone selection ---
  const [audioDevices, setAudioDevices] = useState([]);
  const [selectedDeviceId, setSelectedDeviceId] = useState(null);

  // --- Recording / transcript state ---
  const [isRecording, setIsRecording] = useState(false);
  // Committed (finalized) transcript text.
  const [fixedText, setFixedText] = useState('');
  // Still-changing tail of the transcript (may be revised by later windows).
  const [activeText, setActiveText] = useState('');
  // Seconds of audio captured / transcribed so far.
  const [timestamp, setTimestamp] = useState(0);
  // 0-100 value driving the level meter.
  const [audioLevel, setAudioLevel] = useState(0);

  // --- Performance metrics ---
  const [latency, setLatency] = useState(null);
  // Real-time factor (audio seconds processed per wall-clock second).
  const [rtf, setRtf] = useState(null);
  const [audioDuration, setAudioDuration] = useState(null);
  // 'growing' | 'sliding' | null — streaming-window mode shown in the UI.
  const [windowState, setWindowState] = useState(null);
  const [isProcessingFile, setIsProcessingFile] = useState(false);
  const [fileDuration, setFileDuration] = useState(null);

  // --- Uploaded-file playback & display options ---
  const [uploadedFileUrl, setUploadedFileUrl] = useState(null);
  const [autoScroll, setAutoScroll] = useState(true);

  // --- Model download progress ({ file, progress, total, loaded } per file) ---
  const [progressItems, setProgressItems] = useState([]);

  // --- Mutable refs that survive re-renders ---
  const workerRef = useRef(null);              // inference Web Worker
  const recorderRef = useRef(null);            // AudioRecorder instance
  const audioProcessorRef = useRef(null);      // accumulates raw PCM chunks
  const streamingHandlerRef = useRef(null);    // progressive streaming handler
  const progressiveIntervalRef = useRef(null); // id of the periodic transcription interval
| |
|
| | |
| | useEffect(() => { |
| | async function getDevices() { |
| | try { |
| | const devices = await navigator.mediaDevices.enumerateDevices(); |
| | const audioInputs = devices.filter(device => device.kind === 'audioinput'); |
| | setAudioDevices(audioInputs); |
| |
|
| | |
| | const defaultDevice = audioInputs.find(d => d.deviceId === 'default'); |
| | if (defaultDevice && !selectedDeviceId) { |
| | setSelectedDeviceId(defaultDevice.deviceId); |
| | console.log('[App] Auto-selected default device:', defaultDevice.label); |
| | } |
| |
|
| | console.log('[App] Available audio devices:', audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`)); |
| | } catch (error) { |
| | console.error('[App] Failed to enumerate devices:', error); |
| | } |
| | } |
| | getDevices(); |
| | }, []); |
| |
|
| | |
| | useEffect(() => { |
| | workerRef.current = new Worker(WorkerUrl, { type: 'module' }); |
| |
|
| | workerRef.current.onmessage = (event) => { |
| | const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data; |
| |
|
| | if (status === 'loading') { |
| | setModelStatus('loading'); |
| | setModelMessage(message); |
| | } else if (status === 'ready') { |
| | setModelStatus('ready'); |
| | setModelMessage(message); |
| | setDevice(deviceType); |
| | } else if (status === 'error') { |
| | setModelStatus('error'); |
| | setModelMessage(message); |
| | console.error('Worker error:', event.data); |
| | } else if (status === 'transcription' && result) { |
| | |
| | if (result.metadata) { |
| | setLatency(result.metadata.latency); |
| | setRtf(result.metadata.rtf); |
| | setAudioDuration(result.metadata.audioDuration); |
| | } |
| | } else if (status === 'initiate') { |
| | |
| | setProgressItems(prev => [...prev, { file, progress: 0, total }]); |
| | } else if (status === 'progress') { |
| | |
| | setProgressItems(prev => |
| | prev.map(item => |
| | item.file === file ? { ...item, progress, total, loaded } : item |
| | ) |
| | ); |
| | } else if (status === 'done') { |
| | |
| | setProgressItems(prev => |
| | prev.map(item => |
| | item.file === file ? { ...item, progress: 100 } : item |
| | ) |
| | ); |
| | } |
| | }; |
| |
|
| | return () => { |
| | if (workerRef.current) { |
| | workerRef.current.terminate(); |
| | } |
| | }; |
| | }, []); |
| |
|
| | const loadModel = async () => { |
| | if (modelStatus === 'loading' || modelStatus === 'ready') return; |
| |
|
| | setModelStatus('loading'); |
| | setModelMessage('Initializing model...'); |
| |
|
| | workerRef.current.postMessage({ |
| | type: 'load', |
| | data: { |
| | modelVersion: "parakeet-tdt-0.6b-v3", |
| | options: { |
| | device: 'webgpu', |
| | }, |
| | }, |
| | }); |
| | }; |
| |
|
| | const clearCache = async () => { |
| | if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) { |
| | return; |
| | } |
| |
|
| | try { |
| | const dbs = await indexedDB.databases(); |
| | for (const db of dbs) { |
| | indexedDB.deleteDatabase(db.name); |
| | console.log('Deleted IndexedDB:', db.name); |
| | } |
| | alert('Cache cleared! Reload the page to start fresh.'); |
| | window.location.reload(); |
| | } catch (error) { |
| | console.error('Failed to clear cache:', error); |
| | alert('Failed to clear cache. Try clearing browser data manually.'); |
| | } |
| | }; |
| |
|
| | const startRecording = async () => { |
| | if (modelStatus !== 'ready') { |
| | alert('Please load the model first'); |
| | return; |
| | } |
| |
|
| | try { |
| | |
| | setFixedText(''); |
| | setActiveText(''); |
| | setTimestamp(0); |
| | setLatency(null); |
| | setRtf(null); |
| | setAudioDuration(null); |
| |
|
| | |
| | audioProcessorRef.current = new AudioProcessor(); |
| |
|
| | |
| | const modelWrapper = { |
| | transcribe: async (audio) => { |
| | return new Promise((resolve) => { |
| | const messageHandler = (event) => { |
| | if (event.data.status === 'transcription') { |
| | workerRef.current.removeEventListener('message', messageHandler); |
| | resolve(event.data.result); |
| | } |
| | }; |
| |
|
| | workerRef.current.addEventListener('message', messageHandler); |
| | workerRef.current.postMessage({ |
| | type: 'transcribe', |
| | data: { audio }, |
| | }); |
| | }); |
| | }, |
| | }; |
| |
|
| | |
| | streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, { |
| | emissionInterval: 0.5, |
| | maxWindowSize: 15.0, |
| | sentenceBuffer: 2.0, |
| | }); |
| |
|
| | |
| | let quietWarningCount = 0; |
| | recorderRef.current = new AudioRecorder((audioChunk) => { |
| | |
| | const maxAmp = Math.max(...Array.from(audioChunk).map(Math.abs)); |
| |
|
| | |
| | setAudioLevel(Math.min(100, maxAmp * 300)); |
| |
|
| | |
| | if (maxAmp < 0.001) { |
| | quietWarningCount++; |
| | if (quietWarningCount === 1 || quietWarningCount % 20 === 0) { |
| | console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.'); |
| | } |
| | } else { |
| | quietWarningCount = 0; |
| | } |
| |
|
| | audioProcessorRef.current.appendChunk(audioChunk); |
| | }); |
| |
|
| | await recorderRef.current.start(selectedDeviceId); |
| | setIsRecording(true); |
| |
|
| | |
| | let transcriptionInProgress = false; |
| | progressiveIntervalRef.current = setInterval(async () => { |
| | |
| | if (!recorderRef.current || !recorderRef.current.isRecording) { |
| | if (progressiveIntervalRef.current) { |
| | clearInterval(progressiveIntervalRef.current); |
| | progressiveIntervalRef.current = null; |
| | } |
| | return; |
| | } |
| |
|
| | const audioBuffer = audioProcessorRef.current.getBuffer(); |
| | const duration = audioBuffer.length / 16000; |
| |
|
| | |
| | setTimestamp(duration); |
| |
|
| | |
| | if (transcriptionInProgress) { |
| | console.debug('Skipping progressive update (previous transcription still running)'); |
| | return; |
| | } |
| |
|
| | |
| | |
| | const vadWindowSize = Math.min(32000, audioBuffer.length); |
| | const recentAudio = audioBuffer.slice(-vadWindowSize); |
| | let maxAmp = 0; |
| | for (let i = 0; i < recentAudio.length; i++) { |
| | const abs = Math.abs(recentAudio[i]); |
| | if (abs > maxAmp) maxAmp = abs; |
| | } |
| | const hasVoiceActivity = maxAmp > 0.01; |
| |
|
| | |
| | if (audioBuffer.length >= 16000 && hasVoiceActivity) { |
| | try { |
| | transcriptionInProgress = true; |
| | const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer); |
| |
|
| | setFixedText(result.fixedText); |
| | setActiveText(result.activeText); |
| |
|
| | |
| | setWindowState(duration >= 15 ? 'sliding' : 'growing'); |
| | } catch (error) { |
| | console.error('Progressive transcription error:', error); |
| | |
| | setActiveText(`Error: ${error.message}`); |
| | } finally { |
| | transcriptionInProgress = false; |
| | } |
| | } else { |
| | |
| | setWindowState('growing'); |
| | } |
| | }, 250); |
| | } catch (error) { |
| | console.error('Failed to start recording:', error); |
| | alert('Failed to start recording: ' + error.message); |
| | setIsRecording(false); |
| | } |
| | }; |
| |
|
| | const handleFileUpload = async (file) => { |
| | try { |
| | setFixedText(''); |
| | setActiveText('Loading file...'); |
| | setTimestamp(0); |
| | setIsProcessingFile(true); |
| | setLatency(null); |
| | setRtf(null); |
| |
|
| | |
| | const fileUrl = URL.createObjectURL(file); |
| | setUploadedFileUrl(fileUrl); |
| |
|
| | |
| | const audioContext = new AudioContext({ sampleRate: 16000 }); |
| | const arrayBuffer = await file.arrayBuffer(); |
| | const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); |
| |
|
| | |
| | const audioData = audioBuffer.getChannelData(0); |
| | const duration = audioData.length / 16000; |
| |
|
| | |
| | setFileDuration(duration); |
| | setActiveText('Processing with progressive streaming...'); |
| |
|
| | |
| | const fileStreamingHandler = new SmartProgressiveStreamingHandler( |
| | { transcribe: async (audio) => { |
| | return new Promise((resolve, reject) => { |
| | const handleResult = (event) => { |
| | if (event.data.status === 'transcription') { |
| | workerRef.current.removeEventListener('message', handleResult); |
| | resolve(event.data.result); |
| | } else if (event.data.status === 'error') { |
| | workerRef.current.removeEventListener('message', handleResult); |
| | reject(new Error(event.data.message)); |
| | } |
| | }; |
| |
|
| | workerRef.current.addEventListener('message', handleResult); |
| | workerRef.current.postMessage({ |
| | type: 'transcribe', |
| | data: { audio, sampleRate: 16000 }, |
| | }); |
| | }); |
| | }}, |
| | { |
| | emissionInterval: 0.5, |
| | maxWindowSize: 15.0, |
| | sentenceBuffer: 2.0, |
| | } |
| | ); |
| |
|
| | |
| | const startTime = performance.now(); |
| | let updateCount = 0; |
| |
|
| | for await (const result of fileStreamingHandler.transcribeBatch(audioData)) { |
| | updateCount++; |
| | setFixedText(result.fixedText); |
| | setActiveText(result.activeText); |
| | setTimestamp(result.timestamp); |
| |
|
| | |
| | setWindowState('sliding'); |
| |
|
| | |
| | const currentTime = performance.now(); |
| | const elapsedTime = (currentTime - startTime) / 1000; |
| | |
| | const currentRTF = result.timestamp / elapsedTime; |
| |
|
| | setLatency(elapsedTime); |
| | setRtf(currentRTF); |
| |
|
| | |
| | if (result.isFinal) { |
| | setWindowState(null); |
| | setIsProcessingFile(false); |
| |
|
| | console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`); |
| | } |
| | } |
| | } catch (error) { |
| | console.error('Failed to process file:', error); |
| | alert('Failed to process file: ' + error.message); |
| | setActiveText(`Error: ${error.message}`); |
| | setWindowState(null); |
| | setIsProcessingFile(false); |
| | } |
| | }; |
| |
|
  // Stop microphone capture, let in-flight work drain, then run one final
  // transcription pass over the full buffer to commit the transcript.
  // NOTE: the statement order here is deliberate and timing-sensitive.
  const stopRecording = async () => {
    if (!isRecording) return;

    // 1. Kill the progressive-update loop first so no new inference starts.
    if (progressiveIntervalRef.current) {
      clearInterval(progressiveIntervalRef.current);
      progressiveIntervalRef.current = null;
    }

    // 2. Flip the UI state.
    setIsRecording(false);

    // 3. Brief pause lets any already-running interval tick observe the
    //    stopped state before we tear the recorder down.
    await new Promise(resolve => setTimeout(resolve, 100));

    // 4. Stop capture and finalize over the complete audio buffer.
    if (recorderRef.current) {
      try {
        await recorderRef.current.stop();

        const audioBuffer = audioProcessorRef.current.getBuffer();
        if (audioBuffer.length > 0 && streamingHandlerRef.current) {
          try {
            const finalText = await streamingHandlerRef.current.finalize(audioBuffer);
            setFixedText(finalText);
            setActiveText('');
          } catch (error) {
            // 'Session' errors presumably mean the worker/session is mid-
            // teardown and are deliberately ignored — TODO confirm upstream.
            if (!error.message.includes('Session')) {
              console.error('Error in final transcription:', error);
            }
          }
        }
      } catch (error) {
        console.error('Error stopping recording:', error);
      }
    }

    setWindowState(null);
  };
| |
|
  return (
    <div className="min-h-screen bg-gradient-to-b from-gray-950 to-gray-900 text-white">
      {/* Header */}
      <header className="border-b border-gray-800 bg-gray-950/50 backdrop-blur">
        <div className="max-w-6xl mx-auto px-6 py-6">
          <h1 className="text-3xl font-bold bg-gradient-to-r from-cyan-400 to-blue-500 bg-clip-text text-transparent">
            🎤 Parakeet STT Progressive Transcription
          </h1>
          <p className="text-gray-400 mt-2">
            Real-time speech recognition with smart progressive streaming • WebGPU accelerated
          </p>
          <p className="text-gray-500 text-xs mt-2">
            💾 Model files (~2.5GB) are cached locally for faster loading on future visits
          </p>
        </div>
      </header>

      {/* Main Content */}
      <main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
        {/* Controls */}
        <div className="w-full max-w-4xl mx-auto bg-gray-900 rounded-lg border border-gray-700 p-4">
          <h2 className="text-lg font-semibold mb-3">Controls</h2>

          {/* Microphone Selection — disabled while recording */}
          <div className="mb-3">
            <label className="block text-xs font-medium text-gray-400 mb-1">Microphone</label>
            <select
              value={selectedDeviceId || ''}
              onChange={(e) => setSelectedDeviceId(e.target.value)}
              className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white"
              disabled={isRecording}
            >
              {audioDevices.length === 0 && <option value="">Loading devices...</option>}
              {audioDevices.map((device) => (
                <option key={device.deviceId} value={device.deviceId}>
                  {device.label || `Microphone ${device.deviceId.slice(0, 8)}...`}
                </option>
              ))}
            </select>
          </div>

          {/* Audio Level Meter — driven by audioLevel (0-100), only while recording */}
          {isRecording && (
            <div className="mb-3">
              <label className="block text-xs font-medium text-gray-400 mb-1">Audio Level</label>
              <div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden">
                <div
                  className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75"
                  style={{ width: `${audioLevel}%` }}
                ></div>
              </div>
            </div>
          )}

          {/* Model Status and Actions — branches on modelStatus */}
          <div className="flex items-center justify-between">
            <div>
              <h3 className="text-xs font-medium text-gray-400">Model Status</h3>
              <p className="text-xs text-gray-300 mt-0.5">{modelMessage || 'Ready to load model'}</p>
            </div>
            <div className="flex items-center gap-3">
              {modelStatus === 'not_loaded' && (
                <>
                  <button
                    onClick={loadModel}
                    className="px-6 py-3 bg-gradient-to-r from-cyan-500 to-blue-500 hover:from-cyan-600 hover:to-blue-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                  >
                    Load Model (~2.5GB)
                  </button>
                  <button
                    onClick={clearCache}
                    className="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-sm font-medium transition-all duration-200"
                    title="Clear cached model files"
                  >
                    Clear Cache
                  </button>
                </>
              )}
              {modelStatus === 'loading' && (
                <div className="w-full max-w-md">
                  <div className="mb-4 text-gray-300 text-sm">
                    {modelMessage}
                  </div>
                  {/* Per-file download progress bars, or a spinner before any file starts */}
                  {progressItems.length > 0 ? (
                    <div className="bg-gray-800/50 rounded-lg p-4">
                      {progressItems.map((item, i) => (
                        <Progress key={i} text={item.file} percentage={item.progress} total={item.total} />
                      ))}
                    </div>
                  ) : (
                    <div className="flex items-center gap-3 text-gray-300">
                      <div className="w-5 h-5 border-2 border-cyan-400 border-t-transparent rounded-full animate-spin"></div>
                      <span>Initializing...</span>
                    </div>
                  )}
                </div>
              )}
              {modelStatus === 'ready' && (
                <div className="flex items-center gap-4">
                  <div className="px-4 py-2 bg-green-900/30 border border-green-700 rounded-lg text-green-400 text-sm font-semibold">
                    ✓ Ready
                  </div>
                  {!isRecording ? (
                    <>
                      <button
                        onClick={startRecording}
                        className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                      >
                        Start Recording
                      </button>
                      {/* Hidden file input wrapped in a styled label */}
                      <label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer">
                        Upload Audio
                        <input
                          type="file"
                          accept="audio/*"
                          className="hidden"
                          onChange={(e) => {
                            const file = e.target.files?.[0];
                            if (file) handleFileUpload(file);
                          }}
                        />
                      </label>
                    </>
                  ) : (
                    <button
                      onClick={stopRecording}
                      className="px-6 py-3 bg-gradient-to-r from-red-500 to-pink-500 hover:from-red-600 hover:to-pink-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                    >
                      Stop Recording
                    </button>
                  )}
                </div>
              )}
              {modelStatus === 'error' && (
                <button
                  onClick={loadModel}
                  className="px-6 py-3 bg-red-900/30 border border-red-700 hover:bg-red-900/50 rounded-lg font-semibold transition-all duration-200"
                >
                  Retry
                </button>
              )}
            </div>
          </div>

          {/* Audio Player - only shown for uploaded files */}
          {uploadedFileUrl && (
            <div className="mt-4 pt-4 border-t border-gray-700">
              <label className="block text-sm font-medium text-gray-400 mb-2">Audio Playback</label>
              <audio
                src={uploadedFileUrl}
                controls
                className="w-full"
                style={{ height: '40px' }}
              />
            </div>
          )}
        </div>

        {/* Transcript (fixed + active text) */}
        <TranscriptionDisplay
          fixedText={fixedText}
          activeText={activeText}
          timestamp={timestamp}
          isRecording={isRecording}
          autoScroll={autoScroll}
          onAutoScrollToggle={() => setAutoScroll(!autoScroll)}
        />

        {/* Latency / RTF / window-state metrics panel */}
        <PerformanceMetrics
          latency={latency}
          rtf={rtf}
          audioDuration={audioDuration}
          windowState={windowState}
          device={device}
          updateInterval={250}
          isProcessingFile={isProcessingFile}
          fileDuration={fileDuration}
          transcribedDuration={timestamp}
        />
      </main>

      {/* Footer */}
      <footer className="border-t border-gray-800 mt-12 py-6">
        <div className="max-w-6xl mx-auto px-6 text-center text-sm text-gray-500">
          <p>
            Built with parakeet.js, ONNX Runtime Web, React, and Vite •{' '}
            <a
              href="https://huggingface.co/spaces/andito/parakeet-v3-streaming/tree/main/source"
              className="text-cyan-400 hover:text-cyan-300"
              target="_blank"
              rel="noopener noreferrer"
            >
              View Source
            </a>
          </p>
        </div>
      </footer>
    </div>
  );
}

export default App;
| |
|