| | |
| | |
| | |
| | |
| | |
| |
|
| | import { useState, useEffect, useRef } from 'react'; |
| | import TranscriptionDisplay from './components/TranscriptionDisplay'; |
| | import PerformanceMetrics from './components/PerformanceMetrics'; |
| | import Progress from './components/Progress'; |
| | import { AudioRecorder, AudioProcessor } from './utils/audio'; |
| | import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming'; |
| |
|
| | |
| | import WorkerUrl from './worker.js?worker&url'; |
| |
|
function App() {
  // --- Model state ---
  // 'not_loaded' | 'loading' | 'ready' | 'error'
  const [modelStatus, setModelStatus] = useState('not_loaded');
  const [modelMessage, setModelMessage] = useState('');
  // Backend the worker reports it is running on (e.g. 'webgpu').
  const [device, setDevice] = useState(null);

  // --- Microphone selection ---
  const [audioDevices, setAudioDevices] = useState([]);
  const [selectedDeviceId, setSelectedDeviceId] = useState(null);

  // --- Recording / transcript state ---
  const [isRecording, setIsRecording] = useState(false);
  // Committed (finalized) transcript text.
  const [fixedText, setFixedText] = useState('');
  // Still-changing tail of the transcript (may be revised by later windows).
  const [activeText, setActiveText] = useState('');
  // Seconds of audio captured / transcribed so far.
  const [timestamp, setTimestamp] = useState(0);
  // 0-100 value driving the level meter.
  const [audioLevel, setAudioLevel] = useState(0);

  // --- Performance metrics ---
  const [latency, setLatency] = useState(null);
  // Real-time factor (audio seconds processed per wall-clock second).
  const [rtf, setRtf] = useState(null);
  const [audioDuration, setAudioDuration] = useState(null);
  // 'growing' | 'sliding' | null — streaming-window mode shown in the UI.
  const [windowState, setWindowState] = useState(null);
  const [isProcessingFile, setIsProcessingFile] = useState(false);
  const [fileDuration, setFileDuration] = useState(null);

  // --- Uploaded-file playback & display options ---
  const [uploadedFileUrl, setUploadedFileUrl] = useState(null);
  const [autoScroll, setAutoScroll] = useState(true);

  // --- Model download progress ({ file, progress, total, loaded } per file) ---
  const [progressItems, setProgressItems] = useState([]);

  // --- Mutable refs that survive re-renders ---
  const workerRef = useRef(null);              // inference Web Worker
  const recorderRef = useRef(null);            // AudioRecorder instance
  const audioProcessorRef = useRef(null);      // accumulates raw PCM chunks
  const streamingHandlerRef = useRef(null);    // progressive streaming handler
  const progressiveIntervalRef = useRef(null); // id of the periodic transcription interval
| |
|
| | |
| | useEffect(() => { |
| | async function getDevices() { |
| | try { |
| | const devices = await navigator.mediaDevices.enumerateDevices(); |
| | const audioInputs = devices.filter(device => device.kind === 'audioinput'); |
| | setAudioDevices(audioInputs); |
| |
|
| | |
| | const defaultDevice = audioInputs.find(d => d.deviceId === 'default'); |
| | if (defaultDevice && !selectedDeviceId) { |
| | setSelectedDeviceId(defaultDevice.deviceId); |
| | console.log('[App] Auto-selected default device:', defaultDevice.label); |
| | } |
| |
|
| | console.log('[App] Available audio devices:', audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`)); |
| | } catch (error) { |
| | console.error('[App] Failed to enumerate devices:', error); |
| | } |
| | } |
| | getDevices(); |
| | }, []); |
| |
|
| | |
| | useEffect(() => { |
| | workerRef.current = new Worker(WorkerUrl, { type: 'module' }); |
| |
|
| | workerRef.current.onmessage = (event) => { |
| | const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data; |
| |
|
| | if (status === 'loading') { |
| | setModelStatus('loading'); |
| | setModelMessage(message); |
| | } else if (status === 'ready') { |
| | setModelStatus('ready'); |
| | setModelMessage(message); |
| | setDevice(deviceType); |
| | } else if (status === 'error') { |
| | setModelStatus('error'); |
| | setModelMessage(message); |
| | console.error('Worker error:', event.data); |
| | } else if (status === 'transcription' && result) { |
| | |
| | if (result.metadata) { |
| | setLatency(result.metadata.latency); |
| | setRtf(result.metadata.rtf); |
| | setAudioDuration(result.metadata.audioDuration); |
| | } |
| | } else if (status === 'initiate') { |
| | |
| | setProgressItems(prev => [...prev, { file, progress: 0, total }]); |
| | } else if (status === 'progress') { |
| | |
| | setProgressItems(prev => |
| | prev.map(item => |
| | item.file === file ? { ...item, progress, total, loaded } : item |
| | ) |
| | ); |
| | } else if (status === 'done') { |
| | |
| | setProgressItems(prev => |
| | prev.map(item => |
| | item.file === file ? { ...item, progress: 100 } : item |
| | ) |
| | ); |
| | } |
| | }; |
| |
|
| | return () => { |
| | if (workerRef.current) { |
| | workerRef.current.terminate(); |
| | } |
| | }; |
| | }, []); |
| |
|
| | const loadModel = async () => { |
| | if (modelStatus === 'loading' || modelStatus === 'ready') return; |
| |
|
| | setModelStatus('loading'); |
| | setModelMessage('Initializing model...'); |
| |
|
| | workerRef.current.postMessage({ |
| | type: 'load', |
| | data: { |
| | modelVersion: "parakeet-tdt-0.6b-v3", |
| | options: { |
| | device: 'webgpu', |
| | }, |
| | }, |
| | }); |
| | }; |
| |
|
| | const clearCache = async () => { |
| | if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) { |
| | return; |
| | } |
| |
|
| | try { |
| | const dbs = await indexedDB.databases(); |
| | for (const db of dbs) { |
| | indexedDB.deleteDatabase(db.name); |
| | console.log('Deleted IndexedDB:', db.name); |
| | } |
| | alert('Cache cleared! Reload the page to start fresh.'); |
| | window.location.reload(); |
| | } catch (error) { |
| | console.error('Failed to clear cache:', error); |
| | alert('Failed to clear cache. Try clearing browser data manually.'); |
| | } |
| | }; |
| |
|
| | const startRecording = async () => { |
| | if (modelStatus !== 'ready') { |
| | alert('Please load the model first'); |
| | return; |
| | } |
| |
|
| | try { |
| | |
| | setFixedText(''); |
| | setActiveText(''); |
| | setTimestamp(0); |
| | setLatency(null); |
| | setRtf(null); |
| | setAudioDuration(null); |
| |
|
| | |
| | audioProcessorRef.current = new AudioProcessor(); |
| |
|
| | |
| | const modelWrapper = { |
| | transcribe: async (audio) => { |
| | return new Promise((resolve) => { |
| | const messageHandler = (event) => { |
| | if (event.data.status === 'transcription') { |
| | workerRef.current.removeEventListener('message', messageHandler); |
| | resolve(event.data.result); |
| | } |
| | }; |
| |
|
| | workerRef.current.addEventListener('message', messageHandler); |
| | workerRef.current.postMessage({ |
| | type: 'transcribe', |
| | data: { audio }, |
| | }); |
| | }); |
| | }, |
| | }; |
| |
|
| | |
| | streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, { |
| | emissionInterval: 0.5, |
| | maxWindowSize: 15.0, |
| | sentenceBuffer: 2.0, |
| | }); |
| |
|
| | |
| | let quietWarningCount = 0; |
| | recorderRef.current = new AudioRecorder((audioChunk) => { |
| | |
| | const maxAmp = Math.max(...Array.from(audioChunk).map(Math.abs)); |
| |
|
| | |
| | setAudioLevel(Math.min(100, maxAmp * 300)); |
| |
|
| | |
| | if (maxAmp < 0.001) { |
| | quietWarningCount++; |
| | if (quietWarningCount === 1 || quietWarningCount % 20 === 0) { |
| | console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.'); |
| | } |
| | } else { |
| | quietWarningCount = 0; |
| | } |
| |
|
| | audioProcessorRef.current.appendChunk(audioChunk); |
| | }); |
| |
|
| | await recorderRef.current.start(selectedDeviceId); |
| | setIsRecording(true); |
| |
|
| | |
| | let transcriptionInProgress = false; |
| | progressiveIntervalRef.current = setInterval(async () => { |
| | |
| | if (!recorderRef.current || !recorderRef.current.isRecording) { |
| | if (progressiveIntervalRef.current) { |
| | clearInterval(progressiveIntervalRef.current); |
| | progressiveIntervalRef.current = null; |
| | } |
| | return; |
| | } |
| |
|
| | const audioBuffer = audioProcessorRef.current.getBuffer(); |
| | const duration = audioBuffer.length / 16000; |
| |
|
| | |
| | setTimestamp(duration); |
| |
|
| | |
| | if (transcriptionInProgress) { |
| | console.debug('Skipping progressive update (previous transcription still running)'); |
| | return; |
| | } |
| |
|
| | |
| | |
| | const vadWindowSize = Math.min(32000, audioBuffer.length); |
| | const recentAudio = audioBuffer.slice(-vadWindowSize); |
| | let maxAmp = 0; |
| | for (let i = 0; i < recentAudio.length; i++) { |
| | const abs = Math.abs(recentAudio[i]); |
| | if (abs > maxAmp) maxAmp = abs; |
| | } |
| | const hasVoiceActivity = maxAmp > 0.01; |
| |
|
| | |
| | if (audioBuffer.length >= 16000 && hasVoiceActivity) { |
| | try { |
| | transcriptionInProgress = true; |
| | const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer); |
| |
|
| | setFixedText(result.fixedText); |
| | setActiveText(result.activeText); |
| |
|
| | |
| | setWindowState(duration >= 15 ? 'sliding' : 'growing'); |
| | } catch (error) { |
| | console.error('Progressive transcription error:', error); |
| | |
| | setActiveText(`Error: ${error.message}`); |
| | } finally { |
| | transcriptionInProgress = false; |
| | } |
| | } else { |
| | |
| | setWindowState('growing'); |
| | } |
| | }, 250); |
| | } catch (error) { |
| | console.error('Failed to start recording:', error); |
| | alert('Failed to start recording: ' + error.message); |
| | setIsRecording(false); |
| | } |
| | }; |
| |
|
| | const handleFileUpload = async (file) => { |
| | try { |
| | setFixedText(''); |
| | setActiveText('Loading file...'); |
| | setTimestamp(0); |
| | setIsProcessingFile(true); |
| | setLatency(null); |
| | setRtf(null); |
| |
|
| | |
| | const fileUrl = URL.createObjectURL(file); |
| | setUploadedFileUrl(fileUrl); |
| |
|
| | |
| | const audioContext = new AudioContext({ sampleRate: 16000 }); |
| | const arrayBuffer = await file.arrayBuffer(); |
| | const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); |
| |
|
| | |
| | const audioData = audioBuffer.getChannelData(0); |
| | const duration = audioData.length / 16000; |
| |
|
| | |
| | setFileDuration(duration); |
| | setActiveText('Processing with progressive streaming...'); |
| |
|
| | |
| | const fileStreamingHandler = new SmartProgressiveStreamingHandler( |
| | { transcribe: async (audio) => { |
| | return new Promise((resolve, reject) => { |
| | const handleResult = (event) => { |
| | if (event.data.status === 'transcription') { |
| | workerRef.current.removeEventListener('message', handleResult); |
| | resolve(event.data.result); |
| | } else if (event.data.status === 'error') { |
| | workerRef.current.removeEventListener('message', handleResult); |
| | reject(new Error(event.data.message)); |
| | } |
| | }; |
| |
|
| | workerRef.current.addEventListener('message', handleResult); |
| | workerRef.current.postMessage({ |
| | type: 'transcribe', |
| | data: { audio, sampleRate: 16000 }, |
| | }); |
| | }); |
| | }}, |
| | { |
| | emissionInterval: 0.5, |
| | maxWindowSize: 15.0, |
| | sentenceBuffer: 2.0, |
| | } |
| | ); |
| |
|
| | |
| | const startTime = performance.now(); |
| | let updateCount = 0; |
| |
|
| | for await (const result of fileStreamingHandler.transcribeBatch(audioData)) { |
| | updateCount++; |
| | setFixedText(result.fixedText); |
| | setActiveText(result.activeText); |
| | setTimestamp(result.timestamp); |
| |
|
| | |
| | setWindowState('sliding'); |
| |
|
| | |
| | const currentTime = performance.now(); |
| | const elapsedTime = (currentTime - startTime) / 1000; |
| | |
| | const currentRTF = result.timestamp / elapsedTime; |
| |
|
| | setLatency(elapsedTime); |
| | setRtf(currentRTF); |
| |
|
| | |
| | if (result.isFinal) { |
| | setWindowState(null); |
| | setIsProcessingFile(false); |
| |
|
| | console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`); |
| | } |
| | } |
| | } catch (error) { |
| | console.error('Failed to process file:', error); |
| | alert('Failed to process file: ' + error.message); |
| | setActiveText(`Error: ${error.message}`); |
| | setWindowState(null); |
| | setIsProcessingFile(false); |
| | } |
| | }; |
| |
|
  // Stop microphone capture, let in-flight work drain, then run one final
  // transcription pass over the full buffer to commit the transcript.
  // NOTE: the statement order here is deliberate and timing-sensitive.
  const stopRecording = async () => {
    if (!isRecording) return;

    // 1. Kill the progressive-update loop first so no new inference starts.
    if (progressiveIntervalRef.current) {
      clearInterval(progressiveIntervalRef.current);
      progressiveIntervalRef.current = null;
    }

    // 2. Flip the UI state.
    setIsRecording(false);

    // 3. Brief pause lets any already-running interval tick observe the
    //    stopped state before we tear the recorder down.
    await new Promise(resolve => setTimeout(resolve, 100));

    // 4. Stop capture and finalize over the complete audio buffer.
    if (recorderRef.current) {
      try {
        await recorderRef.current.stop();

        const audioBuffer = audioProcessorRef.current.getBuffer();
        if (audioBuffer.length > 0 && streamingHandlerRef.current) {
          try {
            const finalText = await streamingHandlerRef.current.finalize(audioBuffer);
            setFixedText(finalText);
            setActiveText('');
          } catch (error) {
            // 'Session' errors presumably mean the worker/session is mid-
            // teardown and are deliberately ignored — TODO confirm upstream.
            if (!error.message.includes('Session')) {
              console.error('Error in final transcription:', error);
            }
          }
        }
      } catch (error) {
        console.error('Error stopping recording:', error);
      }
    }

    setWindowState(null);
  };
| |
|
  return (
    <div className="min-h-screen bg-gradient-to-b from-gray-950 to-gray-900 text-white">
      {/* Header */}
      <header className="border-b border-gray-800 bg-gray-950/50 backdrop-blur">
        <div className="max-w-6xl mx-auto px-6 py-6">
          <h1 className="text-3xl font-bold bg-gradient-to-r from-cyan-400 to-blue-500 bg-clip-text text-transparent">
            🎤 Parakeet STT Progressive Transcription
          </h1>
          <p className="text-gray-400 mt-2">
            Real-time speech recognition with smart progressive streaming • WebGPU accelerated
          </p>
          <p className="text-gray-500 text-xs mt-2">
            💾 Model files (~2.5GB) are cached locally for faster loading on future visits
          </p>
        </div>
      </header>

      {/* Main Content */}
      <main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
        {/* Controls */}
        <div className="w-full max-w-4xl mx-auto bg-gray-900 rounded-lg border border-gray-700 p-4">
          <h2 className="text-lg font-semibold mb-3">Controls</h2>

          {/* Microphone Selection — disabled while recording */}
          <div className="mb-3">
            <label className="block text-xs font-medium text-gray-400 mb-1">Microphone</label>
            <select
              value={selectedDeviceId || ''}
              onChange={(e) => setSelectedDeviceId(e.target.value)}
              className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white"
              disabled={isRecording}
            >
              {audioDevices.length === 0 && <option value="">Loading devices...</option>}
              {audioDevices.map((device) => (
                <option key={device.deviceId} value={device.deviceId}>
                  {device.label || `Microphone ${device.deviceId.slice(0, 8)}...`}
                </option>
              ))}
            </select>
          </div>

          {/* Audio Level Meter — driven by audioLevel (0-100), only while recording */}
          {isRecording && (
            <div className="mb-3">
              <label className="block text-xs font-medium text-gray-400 mb-1">Audio Level</label>
              <div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden">
                <div
                  className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75"
                  style={{ width: `${audioLevel}%` }}
                ></div>
              </div>
            </div>
          )}

          {/* Model Status and Actions — branches on modelStatus */}
          <div className="flex items-center justify-between">
            <div>
              <h3 className="text-xs font-medium text-gray-400">Model Status</h3>
              <p className="text-xs text-gray-300 mt-0.5">{modelMessage || 'Ready to load model'}</p>
            </div>
            <div className="flex items-center gap-3">
              {modelStatus === 'not_loaded' && (
                <>
                  <button
                    onClick={loadModel}
                    className="px-6 py-3 bg-gradient-to-r from-cyan-500 to-blue-500 hover:from-cyan-600 hover:to-blue-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                  >
                    Load Model (~2.5GB)
                  </button>
                  <button
                    onClick={clearCache}
                    className="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-sm font-medium transition-all duration-200"
                    title="Clear cached model files"
                  >
                    Clear Cache
                  </button>
                </>
              )}
              {modelStatus === 'loading' && (
                <div className="w-full max-w-md">
                  <div className="mb-4 text-gray-300 text-sm">
                    {modelMessage}
                  </div>
                  {/* Per-file download progress bars, or a spinner before any file starts */}
                  {progressItems.length > 0 ? (
                    <div className="bg-gray-800/50 rounded-lg p-4">
                      {progressItems.map((item, i) => (
                        <Progress key={i} text={item.file} percentage={item.progress} total={item.total} />
                      ))}
                    </div>
                  ) : (
                    <div className="flex items-center gap-3 text-gray-300">
                      <div className="w-5 h-5 border-2 border-cyan-400 border-t-transparent rounded-full animate-spin"></div>
                      <span>Initializing...</span>
                    </div>
                  )}
                </div>
              )}
              {modelStatus === 'ready' && (
                <div className="flex items-center gap-4">
                  <div className="px-4 py-2 bg-green-900/30 border border-green-700 rounded-lg text-green-400 text-sm font-semibold">
                    ✓ Ready
                  </div>
                  {!isRecording ? (
                    <>
                      <button
                        onClick={startRecording}
                        className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                      >
                        Start Recording
                      </button>
                      {/* Hidden file input wrapped in a styled label */}
                      <label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer">
                        Upload Audio
                        <input
                          type="file"
                          accept="audio/*"
                          className="hidden"
                          onChange={(e) => {
                            const file = e.target.files?.[0];
                            if (file) handleFileUpload(file);
                          }}
                        />
                      </label>
                    </>
                  ) : (
                    <button
                      onClick={stopRecording}
                      className="px-6 py-3 bg-gradient-to-r from-red-500 to-pink-500 hover:from-red-600 hover:to-pink-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                    >
                      Stop Recording
                    </button>
                  )}
                </div>
              )}
              {modelStatus === 'error' && (
                <button
                  onClick={loadModel}
                  className="px-6 py-3 bg-red-900/30 border border-red-700 hover:bg-red-900/50 rounded-lg font-semibold transition-all duration-200"
                >
                  Retry
                </button>
              )}
            </div>
          </div>

          {/* Audio Player - only shown for uploaded files */}
          {uploadedFileUrl && (
            <div className="mt-4 pt-4 border-t border-gray-700">
              <label className="block text-sm font-medium text-gray-400 mb-2">Audio Playback</label>
              <audio
                src={uploadedFileUrl}
                controls
                className="w-full"
                style={{ height: '40px' }}
              />
            </div>
          )}
        </div>

        {/* Transcript (fixed + active text) */}
        <TranscriptionDisplay
          fixedText={fixedText}
          activeText={activeText}
          timestamp={timestamp}
          isRecording={isRecording}
          autoScroll={autoScroll}
          onAutoScrollToggle={() => setAutoScroll(!autoScroll)}
        />

        {/* Latency / RTF / window-state metrics panel */}
        <PerformanceMetrics
          latency={latency}
          rtf={rtf}
          audioDuration={audioDuration}
          windowState={windowState}
          device={device}
          updateInterval={250}
          isProcessingFile={isProcessingFile}
          fileDuration={fileDuration}
          transcribedDuration={timestamp}
        />
      </main>

      {/* Footer */}
      <footer className="border-t border-gray-800 mt-12 py-6">
        <div className="max-w-6xl mx-auto px-6 text-center text-sm text-gray-500">
          <p>
            Built with parakeet.js, ONNX Runtime Web, React, and Vite •{' '}
            <a
              href="https://huggingface.co/spaces/andito/parakeet-v3-streaming/tree/main/source"
              className="text-cyan-400 hover:text-cyan-300"
              target="_blank"
              rel="noopener noreferrer"
            >
              View Source
            </a>
          </p>
        </div>
      </footer>
    </div>
  );
}

export default App;
| |
|