// Provenance: Hugging Face Space commit bebb6f7 by andito (HF Staff) — "fixing session mismatch".
/**
* Main Application Component
*
* Parakeet STT Progressive Transcription Demo with WebGPU
*/
import { useState, useEffect, useRef } from 'react';
import TranscriptionDisplay from './components/TranscriptionDisplay';
import PerformanceMetrics from './components/PerformanceMetrics';
import Progress from './components/Progress';
import { AudioRecorder, AudioProcessor } from './utils/audio';
import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming';
// Import worker
import WorkerUrl from './worker.js?worker&url';
/**
 * App — top-level component for the Parakeet progressive-transcription demo.
 * Owns all UI state (model lifecycle, recording, file processing, metrics)
 * and wires the inference Web Worker to the audio-capture and progressive
 * streaming utilities.
 */
function App() {
// Model state: 'not_loaded' | 'loading' | 'ready' | 'error' (driven by worker messages)
const [modelStatus, setModelStatus] = useState('not_loaded');
const [modelMessage, setModelMessage] = useState('');
// Execution backend reported by the worker on 'ready' (e.g. 'webgpu')
const [device, setDevice] = useState(null);
// Microphone device selection
const [audioDevices, setAudioDevices] = useState([]);
const [selectedDeviceId, setSelectedDeviceId] = useState(null);
// Recording state: fixedText is the committed transcript prefix, activeText
// is the still-revisable tail of the current transcription window
const [isRecording, setIsRecording] = useState(false);
const [fixedText, setFixedText] = useState('');
const [activeText, setActiveText] = useState('');
const [timestamp, setTimestamp] = useState(0);
const [audioLevel, setAudioLevel] = useState(0);
// Performance metrics: latency in seconds; RTF = transcribed audio / compute time
const [latency, setLatency] = useState(null);
const [rtf, setRtf] = useState(null);
const [audioDuration, setAudioDuration] = useState(null);
const [windowState, setWindowState] = useState(null);
const [isProcessingFile, setIsProcessingFile] = useState(false);
const [fileDuration, setFileDuration] = useState(null);
// File upload: object URL for the playback <audio> element
const [uploadedFileUrl, setUploadedFileUrl] = useState(null);
const [autoScroll, setAutoScroll] = useState(true);
// Progress tracking: one { file, progress, total } row per model file download
const [progressItems, setProgressItems] = useState([]);
// Refs — mutable handles that must survive re-renders without retriggering effects
const workerRef = useRef(null);
const recorderRef = useRef(null);
const audioProcessorRef = useRef(null);
const streamingHandlerRef = useRef(null);
const progressiveIntervalRef = useRef(null);
useEffect(() => {
  /** Enumerate audio inputs once on mount and auto-select the default one. */
  async function getDevices() {
    try {
      const devices = await navigator.mediaDevices.enumerateDevices();
      const audioInputs = devices.filter((device) => device.kind === 'audioinput');
      setAudioDevices(audioInputs);
      // Auto-select the browser's default input. Use the functional updater so
      // we don't read `selectedDeviceId` from a stale closure — this effect
      // runs once ([]), so the captured value would always be the initial null.
      const defaultDevice = audioInputs.find((d) => d.deviceId === 'default');
      if (defaultDevice) {
        setSelectedDeviceId((prev) => prev ?? defaultDevice.deviceId);
        console.log('[App] Auto-selected default device:', defaultDevice.label);
      }
      // NOTE(review): labels are typically empty until mic permission is
      // granted, so entries may log as "Unnamed" before the first recording.
      console.log('[App] Available audio devices:', audioInputs.map((d) => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`));
    } catch (error) {
      console.error('[App] Failed to enumerate devices:', error);
    }
  }
  getDevices();
}, []);
// Initialize worker
useEffect(() => {
  workerRef.current = new Worker(WorkerUrl, { type: 'module' });
  // Timeout ids for delayed progress-row removal; cleared on unmount so we
  // never call a state setter after the component is gone.
  const pendingRemovals = [];
  workerRef.current.onmessage = (event) => {
    const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data;
    if (status === 'loading') {
      setModelStatus('loading');
      setModelMessage(message);
    } else if (status === 'ready') {
      setModelStatus('ready');
      setModelMessage(message);
      setDevice(deviceType);
    } else if (status === 'error') {
      setModelStatus('error');
      setModelMessage(message);
      console.error('Worker error:', event.data);
    } else if (status === 'transcription' && result) {
      // Update performance metrics reported by the worker
      if (result.metadata) {
        setLatency(result.metadata.latency);
        setRtf(result.metadata.rtf);
        setAudioDuration(result.metadata.audioDuration);
      }
    } else if (status === 'initiate') {
      // New file download initiated — add a progress row
      setProgressItems((prev) => [...prev, { file, progress: 0, total }]);
    } else if (status === 'progress') {
      // Update progress for the matching file's row
      setProgressItems((prev) =>
        prev.map((item) =>
          item.file === file ? { ...item, progress, total, loaded } : item
        )
      );
    } else if (status === 'done') {
      // File download complete — pin the bar at 100% briefly, then actually
      // remove the row. (Previously nothing removed it, so finished files
      // lingered in the progress list forever.)
      setProgressItems((prev) =>
        prev.map((item) =>
          item.file === file ? { ...item, progress: 100 } : item
        )
      );
      pendingRemovals.push(
        setTimeout(() => {
          setProgressItems((prev) => prev.filter((item) => item.file !== file));
        }, 1000)
      );
    }
  };
  return () => {
    pendingRemovals.forEach(clearTimeout);
    if (workerRef.current) {
      workerRef.current.terminate();
    }
  };
}, []);
/** Kick off model loading in the worker; no-op if already loading or loaded. */
const loadModel = async () => {
  const alreadyBusy = modelStatus === 'loading' || modelStatus === 'ready';
  if (alreadyBusy) {
    return;
  }
  setModelStatus('loading');
  setModelMessage('Initializing model...');
  const loadRequest = {
    type: 'load',
    data: {
      modelVersion: 'parakeet-tdt-0.6b-v3', // Multilingual Parakeet
      options: {
        device: 'webgpu', // Hybrid: GPU encoder + WASM decoder for optimal performance
      },
    },
  };
  workerRef.current.postMessage(loadRequest);
};
/**
 * Delete every IndexedDB database (where model files are cached), then reload.
 * Waits for each deletion to settle before reloading — deleteDatabase is
 * asynchronous, so reloading immediately could interrupt the deletions.
 */
const clearCache = async () => {
  if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) {
    return;
  }
  try {
    // indexedDB.databases() is not implemented in every browser.
    if (typeof indexedDB.databases !== 'function') {
      throw new Error('indexedDB.databases() is not supported in this browser');
    }
    const dbs = await indexedDB.databases();
    await Promise.all(
      dbs.map(
        (db) =>
          new Promise((resolve, reject) => {
            const req = indexedDB.deleteDatabase(db.name);
            req.onsuccess = () => {
              console.log('Deleted IndexedDB:', db.name);
              resolve();
            };
            req.onerror = () => reject(req.error);
            // Blocked by another open tab — proceed anyway; reload may free it.
            req.onblocked = () => {
              console.warn('IndexedDB deletion blocked:', db.name);
              resolve();
            };
          })
      )
    );
    alert('Cache cleared! Reload the page to start fresh.');
    window.location.reload();
  } catch (error) {
    console.error('Failed to clear cache:', error);
    alert('Failed to clear cache. Try clearing browser data manually.');
  }
};
/**
 * Start microphone capture and progressive transcription.
 * Resets transcript/metric state, builds a worker-backed model wrapper for the
 * streaming handler, starts the recorder on the selected device, and runs a
 * 250ms interval that feeds the accumulated PCM buffer to the handler.
 */
const startRecording = async () => {
  if (modelStatus !== 'ready') {
    alert('Please load the model first');
    return;
  }
  try {
    // Reset transcript + metrics state
    setFixedText('');
    setActiveText('');
    setTimestamp(0);
    setLatency(null);
    setRtf(null);
    setAudioDuration(null);
    // Initialize audio processor (accumulates 16kHz PCM chunks)
    audioProcessorRef.current = new AudioProcessor();
    // Model wrapper: bridges the streaming handler to the worker with a
    // one-shot message listener per transcribe call.
    const modelWrapper = {
      transcribe: async (audio) => {
        return new Promise((resolve) => {
          const messageHandler = (event) => {
            if (event.data.status === 'transcription') {
              workerRef.current.removeEventListener('message', messageHandler);
              resolve(event.data.result);
            }
          };
          workerRef.current.addEventListener('message', messageHandler);
          workerRef.current.postMessage({
            type: 'transcribe',
            data: { audio },
          });
        });
      },
    };
    // Initialize progressive streaming handler
    streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, {
      emissionInterval: 0.5, // 500ms
      maxWindowSize: 15.0, // 15 seconds
      sentenceBuffer: 2.0, // 2 seconds
    });
    // Start recording with callback for audio chunks
    let quietWarningCount = 0;
    recorderRef.current = new AudioRecorder((audioChunk) => {
      // Peak amplitude via a plain loop: spreading a large Float32Array into
      // Math.max(...) can overflow the call stack, and .map allocates two
      // temporary arrays per chunk on the hot audio path.
      let maxAmp = 0;
      for (let i = 0; i < audioChunk.length; i++) {
        const abs = Math.abs(audioChunk[i]);
        if (abs > maxAmp) maxAmp = abs;
      }
      // Update audio level meter (scale to 0-100%)
      setAudioLevel(Math.min(100, maxAmp * 300)); // Scale up for visibility
      // Only warn about quiet audio once every 20 chunks (~3 seconds)
      if (maxAmp < 0.001) {
        quietWarningCount++;
        if (quietWarningCount === 1 || quietWarningCount % 20 === 0) {
          console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.');
        }
      } else {
        quietWarningCount = 0;
      }
      // Append PCM audio chunk directly (Float32Array)
      audioProcessorRef.current.appendChunk(audioChunk);
    });
    await recorderRef.current.start(selectedDeviceId);
    setIsRecording(true);
    // Start progressive transcription updates
    let transcriptionInProgress = false;
    progressiveIntervalRef.current = setInterval(async () => {
      // Stop if recording stopped
      if (!recorderRef.current || !recorderRef.current.isRecording) {
        if (progressiveIntervalRef.current) {
          clearInterval(progressiveIntervalRef.current);
          progressiveIntervalRef.current = null;
        }
        return;
      }
      const audioBuffer = audioProcessorRef.current.getBuffer();
      const duration = audioBuffer.length / 16000;
      // Update timestamp even if not transcribing yet
      setTimestamp(duration);
      // Skip if previous transcription still in progress (matches Python MLX lock behavior)
      if (transcriptionInProgress) {
        console.debug('Skipping progressive update (previous transcription still running)');
        return;
      }
      // Simple VAD: Check if there's voice activity in the last 2 seconds
      // This prevents wasting compute on silence
      const vadWindowSize = Math.min(32000, audioBuffer.length); // Last 2 seconds or less
      const recentAudio = audioBuffer.slice(-vadWindowSize);
      let maxAmp = 0;
      for (let i = 0; i < recentAudio.length; i++) {
        const abs = Math.abs(recentAudio[i]);
        if (abs > maxAmp) maxAmp = abs;
      }
      const hasVoiceActivity = maxAmp > 0.01; // Threshold for voice activity
      // Only transcribe if we have enough audio (at least 1 second) AND voice activity detected
      if (audioBuffer.length >= 16000 && hasVoiceActivity) {
        try {
          transcriptionInProgress = true;
          const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer);
          setFixedText(result.fixedText);
          setActiveText(result.activeText);
          // Update window state
          setWindowState(duration >= 15 ? 'sliding' : 'growing');
        } catch (error) {
          console.error('Progressive transcription error:', error);
          // Show error in UI
          setActiveText(`Error: ${error.message}`);
        } finally {
          transcriptionInProgress = false;
        }
      } else {
        // Not enough audio yet
        setWindowState('growing');
      }
    }, 250); // 250ms updates
  } catch (error) {
    console.error('Failed to start recording:', error);
    alert('Failed to start recording: ' + error.message);
    setIsRecording(false);
  }
};
/**
 * Decode an uploaded audio file to 16kHz mono PCM and transcribe it with
 * batch progressive streaming, updating transcript and metrics live.
 * @param {File} file - audio file chosen by the user.
 */
const handleFileUpload = async (file) => {
  try {
    setFixedText('');
    setActiveText('Loading file...');
    setTimestamp(0);
    setIsProcessingFile(true);
    setLatency(null);
    setRtf(null);
    // Revoke the previous object URL (if any) before minting a new one;
    // otherwise each upload leaks a blob reference for the page's lifetime.
    if (uploadedFileUrl) {
      URL.revokeObjectURL(uploadedFileUrl);
    }
    // Create audio URL for playback
    const fileUrl = URL.createObjectURL(file);
    setUploadedFileUrl(fileUrl);
    // Decode the file at 16kHz, then close the context to release the audio
    // hardware handle (browsers cap the number of live AudioContexts).
    const audioContext = new AudioContext({ sampleRate: 16000 });
    let audioBuffer;
    try {
      const arrayBuffer = await file.arrayBuffer();
      audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    } finally {
      await audioContext.close();
    }
    // Convert to Float32Array at 16kHz (first channel only)
    const audioData = audioBuffer.getChannelData(0);
    const duration = audioData.length / 16000;
    // Set file duration for metrics display
    setFileDuration(duration);
    setActiveText('Processing with progressive streaming...');
    // Create a fresh streaming handler for this file
    const fileStreamingHandler = new SmartProgressiveStreamingHandler(
      {
        transcribe: async (audio) => {
          return new Promise((resolve, reject) => {
            const handleResult = (event) => {
              if (event.data.status === 'transcription') {
                workerRef.current.removeEventListener('message', handleResult);
                resolve(event.data.result);
              } else if (event.data.status === 'error') {
                workerRef.current.removeEventListener('message', handleResult);
                reject(new Error(event.data.message));
              }
            };
            workerRef.current.addEventListener('message', handleResult);
            workerRef.current.postMessage({
              type: 'transcribe',
              data: { audio, sampleRate: 16000 },
            });
          });
        },
      },
      {
        emissionInterval: 0.5, // 500ms updates
        maxWindowSize: 15.0, // 15 seconds
        sentenceBuffer: 2.0, // 2 seconds
      }
    );
    // Use batch streaming (fast processing with full windows)
    const startTime = performance.now();
    let updateCount = 0;
    for await (const result of fileStreamingHandler.transcribeBatch(audioData)) {
      updateCount++;
      setFixedText(result.fixedText);
      setActiveText(result.activeText);
      setTimestamp(result.timestamp);
      // Update window state
      setWindowState('sliding'); // Batch mode always uses full windows
      // Update metrics continuously during processing
      const currentTime = performance.now();
      const elapsedTime = (currentTime - startTime) / 1000;
      // RTF = how much audio transcribed / time spent processing
      const currentRTF = result.timestamp / elapsedTime;
      setLatency(elapsedTime);
      setRtf(currentRTF);
      // Final cleanup
      if (result.isFinal) {
        setWindowState(null);
        console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`);
      }
    }
  } catch (error) {
    console.error('Failed to process file:', error);
    alert('Failed to process file: ' + error.message);
    setActiveText(`Error: ${error.message}`);
    setWindowState(null);
  } finally {
    // Always clear the busy flag, even if the batch stream ends without an
    // isFinal result or throws — previously it could stay stuck on true.
    setIsProcessingFile(false);
  }
};
/**
 * Stop recording: cancel the progressive-update interval, stop the recorder,
 * and run one final full-buffer transcription to commit the transcript.
 */
const stopRecording = async () => {
  if (!isRecording) return;
  // Stop progressive updates first
  if (progressiveIntervalRef.current) {
    clearInterval(progressiveIntervalRef.current);
    progressiveIntervalRef.current = null;
  }
  // Set recording to false immediately to stop the interval loop
  setIsRecording(false);
  // Wait a bit for any in-flight transcription to complete
  await new Promise((resolve) => setTimeout(resolve, 100));
  // Stop recorder
  if (recorderRef.current) {
    try {
      await recorderRef.current.stop();
      // Final transcription over everything captured (null-safe in case the
      // processor was never created, e.g. start() failed mid-way).
      const audioBuffer = audioProcessorRef.current?.getBuffer();
      if (audioBuffer && audioBuffer.length > 0 && streamingHandlerRef.current) {
        try {
          const finalText = await streamingHandlerRef.current.finalize(audioBuffer);
          setFixedText(finalText);
          setActiveText('');
        } catch (error) {
          // Ignore ONNX session errors during cleanup. Optional-chain the
          // message: a thrown value without .message must not crash here.
          if (!error?.message?.includes('Session')) {
            console.error('Error in final transcription:', error);
          }
        }
      }
    } catch (error) {
      console.error('Error stopping recording:', error);
    }
  }
  setWindowState(null);
};
// Render: header → controls card (mic picker, live level meter, model
// lifecycle UI) → transcription display → performance metrics → footer.
return (
<div className="min-h-screen bg-gradient-to-b from-gray-950 to-gray-900 text-white">
{/* Header */}
<header className="border-b border-gray-800 bg-gray-950/50 backdrop-blur">
<div className="max-w-6xl mx-auto px-6 py-6">
<h1 className="text-3xl font-bold bg-gradient-to-r from-cyan-400 to-blue-500 bg-clip-text text-transparent">
🎤 Parakeet STT Progressive Transcription
</h1>
<p className="text-gray-400 mt-2">
Real-time speech recognition with smart progressive streaming • WebGPU accelerated
</p>
<p className="text-gray-500 text-xs mt-2">
💾 Model files (~2.5GB) are cached locally for faster loading on future visits
</p>
</div>
</header>
{/* Main Content */}
<main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
{/* Controls */}
<div className="w-full max-w-4xl mx-auto bg-gray-900 rounded-lg border border-gray-700 p-4">
<h2 className="text-lg font-semibold mb-3">Controls</h2>
{/* Microphone Selection — disabled mid-recording to avoid device switches */}
<div className="mb-3">
<label className="block text-xs font-medium text-gray-400 mb-1">Microphone</label>
<select
value={selectedDeviceId || ''}
onChange={(e) => setSelectedDeviceId(e.target.value)}
className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white"
disabled={isRecording}
>
{audioDevices.length === 0 && <option value="">Loading devices...</option>}
{audioDevices.map((device) => (
<option key={device.deviceId} value={device.deviceId}>
{device.label || `Microphone ${device.deviceId.slice(0, 8)}...`}
</option>
))}
</select>
</div>
{/* Audio Level Meter — audioLevel is 0-100, fed by the recorder callback */}
{isRecording && (
<div className="mb-3">
<label className="block text-xs font-medium text-gray-400 mb-1">Audio Level</label>
<div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden">
<div
className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75"
style={{ width: `${audioLevel}%` }}
></div>
</div>
</div>
)}
{/* Model Status and Actions — branches on modelStatus lifecycle */}
<div className="flex items-center justify-between">
<div>
<h3 className="text-xs font-medium text-gray-400">Model Status</h3>
<p className="text-xs text-gray-300 mt-0.5">{modelMessage || 'Ready to load model'}</p>
</div>
<div className="flex items-center gap-3">
{modelStatus === 'not_loaded' && (
<>
<button
onClick={loadModel}
className="px-6 py-3 bg-gradient-to-r from-cyan-500 to-blue-500 hover:from-cyan-600 hover:to-blue-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
>
Load Model (~2.5GB)
</button>
<button
onClick={clearCache}
className="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-sm font-medium transition-all duration-200"
title="Clear cached model files"
>
Clear Cache
</button>
</>
)}
{/* Loading: per-file download progress, or a spinner before downloads start */}
{modelStatus === 'loading' && (
<div className="w-full max-w-md">
<div className="mb-4 text-gray-300 text-sm">
{modelMessage}
</div>
{progressItems.length > 0 ? (
<div className="bg-gray-800/50 rounded-lg p-4">
{progressItems.map((item, i) => (
<Progress key={i} text={item.file} percentage={item.progress} total={item.total} />
))}
</div>
) : (
<div className="flex items-center gap-3 text-gray-300">
<div className="w-5 h-5 border-2 border-cyan-400 border-t-transparent rounded-full animate-spin"></div>
<span>Initializing...</span>
</div>
)}
</div>
)}
{modelStatus === 'ready' && (
<div className="flex items-center gap-4">
<div className="px-4 py-2 bg-green-900/30 border border-green-700 rounded-lg text-green-400 text-sm font-semibold">
✓ Ready
</div>
{!isRecording ? (
<>
<button
onClick={startRecording}
className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
>
Start Recording
</button>
<label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer">
Upload Audio
<input
type="file"
accept="audio/*"
className="hidden"
onChange={(e) => {
const file = e.target.files?.[0];
if (file) handleFileUpload(file);
}}
/>
</label>
</>
) : (
<button
onClick={stopRecording}
className="px-6 py-3 bg-gradient-to-r from-red-500 to-pink-500 hover:from-red-600 hover:to-pink-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
>
Stop Recording
</button>
)}
</div>
)}
{modelStatus === 'error' && (
<button
onClick={loadModel}
className="px-6 py-3 bg-red-900/30 border border-red-700 hover:bg-red-900/50 rounded-lg font-semibold transition-all duration-200"
>
Retry
</button>
)}
</div>
</div>
{/* Audio Player - only shown for uploaded files */}
{uploadedFileUrl && (
<div className="mt-4 pt-4 border-t border-gray-700">
<label className="block text-sm font-medium text-gray-400 mb-2">Audio Playback</label>
<audio
src={uploadedFileUrl}
controls
className="w-full"
style={{ height: '40px' }}
/>
</div>
)}
</div>
{/* Transcription Display */}
<TranscriptionDisplay
fixedText={fixedText}
activeText={activeText}
timestamp={timestamp}
isRecording={isRecording}
autoScroll={autoScroll}
onAutoScrollToggle={() => setAutoScroll(!autoScroll)}
/>
{/* Performance Metrics */}
<PerformanceMetrics
latency={latency}
rtf={rtf}
audioDuration={audioDuration}
windowState={windowState}
device={device}
updateInterval={250}
isProcessingFile={isProcessingFile}
fileDuration={fileDuration}
transcribedDuration={timestamp}
/>
</main>
{/* Footer */}
<footer className="border-t border-gray-800 mt-12 py-6">
<div className="max-w-6xl mx-auto px-6 text-center text-sm text-gray-500">
<p>
Built with parakeet.js, ONNX Runtime Web, React, and Vite •{' '}
<a
href="https://huggingface.co/spaces/andito/parakeet-v3-streaming/tree/main/source"
className="text-cyan-400 hover:text-cyan-300"
target="_blank"
rel="noopener noreferrer"
>
View Source
</a>
</p>
</div>
</footer>
</div>
);
}
export default App;