Spaces:

andito
/

parakeet-v3-streaming

Running

File size: 23,900 Bytes

/**
 * Main Application Component
 *
 * Parakeet STT Progressive Transcription Demo with WebGPU
 */

import { useState, useEffect, useRef } from 'react';
import TranscriptionDisplay from './components/TranscriptionDisplay';
import PerformanceMetrics from './components/PerformanceMetrics';
import Progress from './components/Progress';
import { AudioRecorder, AudioProcessor } from './utils/audio';
import { SmartProgressiveStreamingHandler } from './utils/progressive-streaming';

// Import worker
import WorkerUrl from './worker.js?worker&url';

function App() {
  // Model state
  // modelStatus is one of 'not_loaded' | 'loading' | 'ready' | 'error' (the only
  // values assigned in this file); it drives which controls are rendered.
  const [modelStatus, setModelStatus] = useState('not_loaded');
  // Human-readable status text; set by loadModel and from worker messages.
  const [modelMessage, setModelMessage] = useState('');
  // `device` field of the worker's 'ready' message; forwarded to PerformanceMetrics.
  const [device, setDevice] = useState(null);

  // Microphone device selection
  // audioDevices holds the 'audioinput' entries returned by enumerateDevices().
  const [audioDevices, setAudioDevices] = useState([]);
  // deviceId handed to the recorder's start(); null until one is auto- or user-selected.
  const [selectedDeviceId, setSelectedDeviceId] = useState(null);

  // Recording state
  const [isRecording, setIsRecording] = useState(false);
  // fixedText / activeText mirror the `fixedText` / `activeText` fields of the
  // streaming handler's results (presumably committed vs. still-changing text —
  // confirm against utils/progressive-streaming).
  const [fixedText, setFixedText] = useState('');
  const [activeText, setActiveText] = useState('');
  // Seconds of audio captured (recording) or transcribed so far (file upload).
  const [timestamp, setTimestamp] = useState(0);
  // Peak amplitude of the latest chunk scaled to 0-100; used as the meter width in %.
  const [audioLevel, setAudioLevel] = useState(0);

  // Performance metrics
  // latency / rtf / audioDuration come from `result.metadata` of worker
  // transcription messages; latency and rtf are also computed in handleFileUpload.
  const [latency, setLatency] = useState(null);
  const [rtf, setRtf] = useState(null);
  const [audioDuration, setAudioDuration] = useState(null);
  // 'growing' | 'sliding' | null (null when nothing is being transcribed).
  const [windowState, setWindowState] = useState(null);
  // True while handleFileUpload is working through an uploaded file.
  const [isProcessingFile, setIsProcessingFile] = useState(false);
  // Length of the uploaded file in seconds (samples / 16000).
  const [fileDuration, setFileDuration] = useState(null);

  // File upload
  // Object URL created for the uploaded file; feeds the <audio> player.
  const [uploadedFileUrl, setUploadedFileUrl] = useState(null);
  // Passed to TranscriptionDisplay together with a toggle callback.
  const [autoScroll, setAutoScroll] = useState(true);

  // Progress tracking
  // One entry per file the worker reports via 'initiate'; each entry carries
  // { file, progress, total } and, after a 'progress' message, `loaded`.
  const [progressItems, setProgressItems] = useState([]);

  // Refs
  // Mutable handles that must survive re-renders without triggering them.
  const workerRef = useRef(null);              // Worker created in the init effect
  const recorderRef = useRef(null);            // AudioRecorder for the current recording
  const audioProcessorRef = useRef(null);      // AudioProcessor accumulating recorded chunks
  const streamingHandlerRef = useRef(null);    // SmartProgressiveStreamingHandler for the recording
  const progressiveIntervalRef = useRef(null); // setInterval id of the progressive update loop

  // Enumerate audio input devices on mount and again whenever the browser
  // reports a hardware change (microphone plugged in or removed).
  useEffect(() => {
    // Flipped by the cleanup so a late enumerateDevices() result is never
    // written into state after the component has unmounted.
    let isDisposed = false;

    async function getDevices() {
      try {
        const devices = await navigator.mediaDevices.enumerateDevices();
        if (isDisposed) return;

        const audioInputs = devices.filter(device => device.kind === 'audioinput');
        setAudioDevices(audioInputs);

        // The entry whose deviceId is exactly "default", if the browser exposes one.
        const defaultDevice = audioInputs.find(d => d.deviceId === 'default');

        // Functional update: this effect runs with an empty dependency list, so
        // reading `selectedDeviceId` directly would always see its initial value.
        // Keep the current choice while that device is still present; otherwise
        // fall back to the default device (or to no selection).
        setSelectedDeviceId((current) => {
          const stillAvailable = audioInputs.some(d => d.deviceId === current);
          if (current && stillAvailable) return current;
          return defaultDevice ? defaultDevice.deviceId : null;
        });

        if (defaultDevice) {
          console.log('[App] Default audio device:', defaultDevice.label);
        }
        console.log('[App] Available audio devices:', audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`));
      } catch (error) {
        console.error('[App] Failed to enumerate devices:', error);
      }
    }
    getDevices();

    // navigator.mediaDevices is undefined in insecure contexts; the optional
    // chaining keeps the subscription (and its cleanup) from throwing there.
    const mediaDevices = navigator.mediaDevices;
    mediaDevices?.addEventListener?.('devicechange', getDevices);

    return () => {
      isDisposed = true;
      mediaDevices?.removeEventListener?.('devicechange', getDevices);
    };
  }, []);

  // Initialize worker
  // Creates the transcription worker once on mount, routes its status messages
  // into React state, and terminates it on unmount.
  useEffect(() => {
    // Local handle: the cleanup must terminate the worker created by *this*
    // effect run, whatever workerRef.current points at by then.
    const worker = new Worker(WorkerUrl, { type: 'module' });
    workerRef.current = worker;

    worker.onmessage = (event) => {
      const { status, message, result, device: deviceType, file, progress, total, loaded } = event.data;

      switch (status) {
        case 'loading':
          setModelStatus('loading');
          setModelMessage(message);
          break;

        case 'ready':
          setModelStatus('ready');
          setModelMessage(message);
          setDevice(deviceType);
          break;

        case 'error':
          setModelStatus('error');
          setModelMessage(message);
          console.error('Worker error:', event.data);
          break;

        case 'transcription':
          // Update performance metrics (the text itself is consumed by the
          // per-request listeners registered in startRecording / handleFileUpload)
          if (result && result.metadata) {
            setLatency(result.metadata.latency);
            setRtf(result.metadata.rtf);
            setAudioDuration(result.metadata.audioDuration);
          }
          break;

        case 'initiate':
          // New file download initiated
          setProgressItems(prev => [...prev, { file, progress: 0, total }]);
          break;

        case 'progress':
          // Update progress for existing file
          setProgressItems(prev =>
            prev.map(item =>
              item.file === file ? { ...item, progress, total, loaded } : item
            )
          );
          break;

        case 'done':
          // File download complete - pin it at 100% (entries are not removed here)
          setProgressItems(prev =>
            prev.map(item =>
              item.file === file ? { ...item, progress: 100 } : item
            )
          );
          break;

        default:
          // Unknown statuses are ignored.
          break;
      }
    };

    // Errors that never reach onmessage (script failed to load, uncaught
    // exception inside the worker) would otherwise leave the UI stuck on
    // "loading"; surface them through the same error state.
    worker.onerror = (event) => {
      console.error('Worker error:', event);
      setModelStatus('error');
      setModelMessage(event.message || 'Worker failed unexpectedly');
    };

    return () => {
      worker.terminate();
    };
  }, []);

  /**
   * Ask the worker to download and initialise the Parakeet model.
   *
   * Does nothing while a load is already running or once the model is ready;
   * from the 'not_loaded' and 'error' states it flips the UI to "loading" and
   * posts a single 'load' request to the worker.
   */
  const loadModel = async () => {
    const alreadyStarted = ['loading', 'ready'].includes(modelStatus);
    if (alreadyStarted) {
      return;
    }

    setModelStatus('loading');
    setModelMessage('Initializing model...');

    const loadRequest = {
      type: 'load',
      data: {
        // Multilingual Parakeet
        modelVersion: "parakeet-tdt-0.6b-v3",
        // Hybrid: GPU encoder + WASM decoder for optimal performance
        options: { device: 'webgpu' },
      },
    };
    workerRef.current.postMessage(loadRequest);
  };

  /**
   * Delete the locally cached model files and reload the page.
   *
   * After user confirmation, every IndexedDB database visible to this origin is
   * deleted (NOTE(review): this is not limited to the model cache — confirm no
   * other data lives in IndexedDB on this origin). The success alert and the
   * reload only happen once each deletion has reported an outcome; any failure
   * shows the manual-cleanup alert instead and leaves the page as it is.
   */
  const clearCache = async () => {
    if (!confirm('Clear cached model files (~2.5GB)? You will need to re-download the model.')) {
      return;
    }

    // indexedDB.deleteDatabase() is event based: it returns a request object
    // immediately and reports the outcome later. Wrap it so it can be awaited.
    const deleteDatabase = (name) =>
      new Promise((resolve, reject) => {
        const request = indexedDB.deleteDatabase(name);
        request.onsuccess = () => {
          console.log('Deleted IndexedDB:', name);
          resolve();
        };
        request.onerror = () => {
          reject(request.error ?? new Error(`Failed to delete IndexedDB: ${name}`));
        };
        // A blocked request stays pending until every open connection to the
        // database is closed. Do not wait for that here (it could hang the
        // UI indefinitely); carry on to the reload as before.
        request.onblocked = () => {
          console.warn('IndexedDB deletion blocked by an open connection:', name);
          resolve();
        };
      });

    try {
      const dbs = await indexedDB.databases();
      await Promise.all(dbs.map((db) => deleteDatabase(db.name)));
      alert('Cache cleared! Reload the page to start fresh.');
      window.location.reload();
    } catch (error) {
      console.error('Failed to clear cache:', error);
      alert('Failed to clear cache. Try clearing browser data manually.');
    }
  };

  /**
   * Start microphone capture and progressive (live) transcription.
   *
   * Requires the model to be ready. Resets the transcript and metrics, wires a
   * fresh AudioProcessor / SmartProgressiveStreamingHandler / AudioRecorder into
   * their refs, starts recording on `selectedDeviceId`, and then polls every
   * 250 ms: each tick updates the elapsed time and, when at least one second of
   * audio with voice activity is buffered and no transcription is in flight,
   * asks the streaming handler for an incremental result.
   *
   * Failures while starting are logged, shown via alert(), and leave
   * `isRecording` false.
   */
  const startRecording = async () => {
    if (modelStatus !== 'ready') {
      alert('Please load the model first');
      return;
    }

    // The buffer is treated as 16 kHz mono throughout this function.
    const sampleRate = 16000;
    // Must match the handler's maxWindowSize; also used for the window-state label.
    const maxWindowSeconds = 15.0;
    // Voice-activity check looks at the last 2 seconds (or less).
    const vadWindowSamples = 2 * sampleRate;

    // Largest absolute sample value in samples[start..]; 0 for an empty range.
    // A plain loop avoids the per-chunk array copies and the argument-count
    // limit of Math.max(...values).
    const peakAmplitude = (samples, start = 0) => {
      let peak = 0;
      for (let i = start; i < samples.length; i++) {
        const amplitude = Math.abs(samples[i]);
        if (amplitude > peak) peak = amplitude;
      }
      return peak;
    };

    // Post one 'transcribe' request and settle with the worker's next
    // 'transcription' (resolve) or 'error' (reject) message.
    const requestTranscription = (audio) =>
      new Promise((resolve, reject) => {
        const worker = workerRef.current;
        const messageHandler = (event) => {
          const { status } = event.data;
          if (status === 'transcription') {
            worker.removeEventListener('message', messageHandler);
            resolve(event.data.result);
          } else if (status === 'error') {
            // Without this branch a worker failure would leave the promise
            // pending forever and block every later progressive update.
            worker.removeEventListener('message', messageHandler);
            reject(new Error(event.data.message));
          }
        };

        worker.addEventListener('message', messageHandler);
        worker.postMessage({
          type: 'transcribe',
          data: { audio },
        });
      });

    try {
      // Reset state
      setFixedText('');
      setActiveText('');
      setTimestamp(0);
      setLatency(null);
      setRtf(null);
      setAudioDuration(null);

      // Initialize audio processor
      audioProcessorRef.current = new AudioProcessor();

      // Create model wrapper for progressive streaming.
      // Responses carry no request id, so two overlapping requests (e.g. an
      // in-flight incremental update and the final transcription on stop) would
      // both be resolved by whichever response arrives first. Requests are
      // therefore chained: the next one is posted only after the previous one
      // has settled. Assumes the worker answers every 'transcribe' request with
      // exactly one 'transcription' or 'error' message — confirm against worker.js.
      let workerQueue = Promise.resolve();
      const modelWrapper = {
        transcribe: (audio) => {
          const response = workerQueue.then(() => requestTranscription(audio));
          // Keep the chain alive after a failure; the caller still sees the rejection.
          workerQueue = response.catch(() => {});
          return response;
        },
      };

      // Initialize progressive streaming handler
      streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, {
        emissionInterval: 0.5,  // 500ms
        maxWindowSize: maxWindowSeconds,  // 15 seconds
        sentenceBuffer: 2.0,  // 2 seconds
      });

      // Start recording with callback for audio chunks
      let quietWarningCount = 0;
      recorderRef.current = new AudioRecorder((audioChunk) => {
        // Peak of this PCM chunk (Float32Array)
        const maxAmp = peakAmplitude(audioChunk);

        // Update audio level meter (scale to 0-100%)
        setAudioLevel(Math.min(100, maxAmp * 300)); // Scale up for visibility

        // Warn about quiet audio on the first quiet chunk and then every 20th
        if (maxAmp < 0.001) {
          quietWarningCount++;
          if (quietWarningCount === 1 || quietWarningCount % 20 === 0) {
            console.warn('⚠️ Very quiet audio detected. Try speaking louder or check your microphone selection.');
          }
        } else {
          quietWarningCount = 0;
        }

        audioProcessorRef.current.appendChunk(audioChunk);
      });

      await recorderRef.current.start(selectedDeviceId);
      setIsRecording(true);

      // Start progressive transcription updates
      let transcriptionInProgress = false;
      progressiveIntervalRef.current = setInterval(async () => {
        // Stop if recording stopped
        if (!recorderRef.current || !recorderRef.current.isRecording) {
          if (progressiveIntervalRef.current) {
            clearInterval(progressiveIntervalRef.current);
            progressiveIntervalRef.current = null;
          }
          return;
        }

        const audioBuffer = audioProcessorRef.current.getBuffer();
        const duration = audioBuffer.length / sampleRate;

        // Update timestamp even if not transcribing yet
        setTimestamp(duration);

        // Skip if previous transcription still in progress (matches Python MLX lock behavior)
        if (transcriptionInProgress) {
          console.debug('Skipping progressive update (previous transcription still running)');
          return;
        }

        // Simple VAD: check for voice activity in the last 2 seconds so no
        // compute is spent on silence. Scans the tail in place (no copy).
        const vadStart = Math.max(0, audioBuffer.length - vadWindowSamples);
        const hasVoiceActivity = peakAmplitude(audioBuffer, vadStart) > 0.01; // Threshold for voice activity

        // Only transcribe if we have enough audio (at least 1 second) AND voice activity detected
        if (audioBuffer.length >= sampleRate && hasVoiceActivity) {
          try {
            transcriptionInProgress = true;
            const result = await streamingHandlerRef.current.transcribeIncremental(audioBuffer);

            setFixedText(result.fixedText);
            setActiveText(result.activeText);

            // Update window state
            setWindowState(duration >= maxWindowSeconds ? 'sliding' : 'growing');
          } catch (error) {
            console.error('Progressive transcription error:', error);
            // Show error in UI
            setActiveText(`Error: ${error.message}`);
          } finally {
            transcriptionInProgress = false;
          }
        } else {
          // Not enough audio yet (or only silence)
          setWindowState('growing');
        }
      }, 250);  // 250ms updates
    } catch (error) {
      console.error('Failed to start recording:', error);
      alert('Failed to start recording: ' + error.message);
      setIsRecording(false);
    }
  };

  /**
   * Transcribe an uploaded audio file with batch progressive streaming.
   *
   * Decodes `file` through a 16 kHz AudioContext, takes channel 0 only, and
   * feeds it to a fresh SmartProgressiveStreamingHandler whose model wrapper
   * talks to the worker. Transcript, timestamp, latency and RTF state are
   * updated after every yielded window. Errors are logged, shown via alert()
   * and written into the active text.
   *
   * @param {File|Blob} file - Audio file chosen by the user; must provide arrayBuffer().
   */
  const handleFileUpload = async (file) => {
    // Two concurrent runs would share the worker's message channel and resolve
    // each other's requests, so refuse to start a second one.
    if (isProcessingFile) {
      alert('A file is already being processed. Please wait for it to finish.');
      return;
    }

    try {
      setFixedText('');
      setActiveText('Loading file...');
      setTimestamp(0);
      setIsProcessingFile(true);
      setLatency(null);
      setRtf(null);

      // Create audio URL for playback, releasing the previous upload's URL so
      // repeated uploads do not keep every earlier file alive in memory.
      if (uploadedFileUrl) {
        URL.revokeObjectURL(uploadedFileUrl);
      }
      const fileUrl = URL.createObjectURL(file);
      setUploadedFileUrl(fileUrl);

      // Read audio file. The context exists only for decoding, so it is closed
      // as soon as decoding finishes or fails (browsers cap open AudioContexts).
      const audioContext = new AudioContext({ sampleRate: 16000 });
      let audioBuffer;
      try {
        const arrayBuffer = await file.arrayBuffer();
        audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
      } finally {
        audioContext.close().catch((closeError) => {
          console.warn('Failed to close AudioContext:', closeError);
        });
      }

      // Float32Array of the first channel (other channels are ignored)
      const audioData = audioBuffer.getChannelData(0);
      const duration = audioData.length / 16000;

      // Set file duration for metrics display
      setFileDuration(duration);
      setActiveText('Processing with progressive streaming...');

      // Create a fresh streaming handler for this file
      const fileStreamingHandler = new SmartProgressiveStreamingHandler(
        { transcribe: async (audio) => {
          return new Promise((resolve, reject) => {
            const handleResult = (event) => {
              if (event.data.status === 'transcription') {
                workerRef.current.removeEventListener('message', handleResult);
                resolve(event.data.result);
              } else if (event.data.status === 'error') {
                workerRef.current.removeEventListener('message', handleResult);
                reject(new Error(event.data.message));
              }
            };

            workerRef.current.addEventListener('message', handleResult);
            workerRef.current.postMessage({
              type: 'transcribe',
              data: { audio, sampleRate: 16000 },
            });
          });
        }},
        {
          emissionInterval: 0.5,  // 500ms updates
          maxWindowSize: 15.0,  // 15 seconds
          sentenceBuffer: 2.0,  // 2 seconds
        }
      );

      // Use batch streaming (fast processing with full windows)
      const startTime = performance.now();
      let updateCount = 0;

      for await (const result of fileStreamingHandler.transcribeBatch(audioData)) {
        updateCount++;
        setFixedText(result.fixedText);
        setActiveText(result.activeText);
        setTimestamp(result.timestamp);

        // Update window state
        setWindowState('sliding');  // Batch mode always uses full windows

        // Update metrics continuously during processing
        const currentTime = performance.now();
        const elapsedTime = (currentTime - startTime) / 1000;
        // "RTF" here = seconds of audio transcribed per second of processing
        const currentRTF = result.timestamp / elapsedTime;

        setLatency(elapsedTime);
        setRtf(currentRTF);

        if (result.isFinal) {
          console.log(`[File] Processed ${duration.toFixed(1)}s audio in ${elapsedTime.toFixed(1)}s (${updateCount} windows, RTF: ${currentRTF.toFixed(2)}x)`);
        }
      }
    } catch (error) {
      console.error('Failed to process file:', error);
      alert('Failed to process file: ' + error.message);
      setActiveText(`Error: ${error.message}`);
    } finally {
      // Final cleanup on every exit path, including a generator that ends
      // without ever yielding an `isFinal` result.
      setWindowState(null);
      setIsProcessingFile(false);
    }
  };

  /**
   * Stop the current recording and produce the final transcript.
   *
   * Cancels the progressive update loop, flips `isRecording` off, waits 100 ms,
   * stops the recorder and asks the streaming handler to finalize the buffered
   * audio. On success the result replaces the fixed text and clears the active
   * text. Errors are logged, never thrown; errors whose message mentions
   * "Session" are deliberately ignored as cleanup noise.
   */
  const stopRecording = async () => {
    if (!isRecording) return;

    // Stop progressive updates first
    if (progressiveIntervalRef.current) {
      clearInterval(progressiveIntervalRef.current);
      progressiveIntervalRef.current = null;
    }

    // Set recording to false immediately to stop the interval loop
    setIsRecording(false);
    // Drop the last meter reading so the next recording starts from an empty meter
    setAudioLevel(0);

    // Give an in-flight transcription a short grace period (not a guarantee
    // that it has finished)
    await new Promise(resolve => setTimeout(resolve, 100));

    try {
      if (recorderRef.current) {
        await recorderRef.current.stop();

        // Final transcription
        const audioBuffer = audioProcessorRef.current.getBuffer();
        if (audioBuffer.length > 0 && streamingHandlerRef.current) {
          try {
            const finalText = await streamingHandlerRef.current.finalize(audioBuffer);
            setFixedText(finalText);
            setActiveText('');
          } catch (error) {
            // Ignore ONNX session errors during cleanup. The rejection value is
            // not guaranteed to be an Error, so derive the text defensively
            // instead of calling .includes on a possibly undefined message.
            const errorText = String(error?.message ?? error);
            if (!errorText.includes('Session')) {
              console.error('Error in final transcription:', error);
            }
          }
        }
      }
    } catch (error) {
      console.error('Error stopping recording:', error);
    } finally {
      setWindowState(null);
    }
  };

  return (
    <div className="min-h-screen bg-gradient-to-b from-gray-950 to-gray-900 text-white">
      {/* Header */}
      <header className="border-b border-gray-800 bg-gray-950/50 backdrop-blur">
        <div className="max-w-6xl mx-auto px-6 py-6">
          <h1 className="text-3xl font-bold bg-gradient-to-r from-cyan-400 to-blue-500 bg-clip-text text-transparent">
            🎤 Parakeet STT Progressive Transcription
          </h1>
          <p className="text-gray-400 mt-2">
            Real-time speech recognition with smart progressive streaming • WebGPU accelerated
          </p>
          <p className="text-gray-500 text-xs mt-2">
            💾 Model files (~2.5GB) are cached locally for faster loading on future visits
          </p>
        </div>
      </header>

      {/* Main Content */}
      <main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
        {/* Controls */}
        <div className="w-full max-w-4xl mx-auto bg-gray-900 rounded-lg border border-gray-700 p-4">
          <h2 className="text-lg font-semibold mb-3">Controls</h2>

          {/* Microphone Selection */}
          <div className="mb-3">
            <label className="block text-xs font-medium text-gray-400 mb-1">Microphone</label>
            <select
              value={selectedDeviceId || ''}
              onChange={(e) => setSelectedDeviceId(e.target.value)}
              className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white"
              disabled={isRecording}
            >
              {audioDevices.length === 0 && <option value="">Loading devices...</option>}
              {audioDevices.map((device) => (
                <option key={device.deviceId} value={device.deviceId}>
                  {device.label || `Microphone ${device.deviceId.slice(0, 8)}...`}
                </option>
              ))}
            </select>
          </div>

          {/* Audio Level Meter */}
          {isRecording && (
            <div className="mb-3">
              <label className="block text-xs font-medium text-gray-400 mb-1">Audio Level</label>
              <div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden">
                <div
                  className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75"
                  style={{ width: `${audioLevel}%` }}
                ></div>
              </div>
            </div>
          )}

          {/* Model Status and Actions */}
          <div className="flex items-center justify-between">
            <div>
              <h3 className="text-xs font-medium text-gray-400">Model Status</h3>
              <p className="text-xs text-gray-300 mt-0.5">{modelMessage || 'Ready to load model'}</p>
            </div>
            <div className="flex items-center gap-3">
              {modelStatus === 'not_loaded' && (
                <>
                  <button
                    onClick={loadModel}
                    className="px-6 py-3 bg-gradient-to-r from-cyan-500 to-blue-500 hover:from-cyan-600 hover:to-blue-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                  >
                    Load Model (~2.5GB)
                  </button>
                  <button
                    onClick={clearCache}
                    className="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-sm font-medium transition-all duration-200"
                    title="Clear cached model files"
                  >
                    Clear Cache
                  </button>
                </>
              )}
              {modelStatus === 'loading' && (
                <div className="w-full max-w-md">
                  <div className="mb-4 text-gray-300 text-sm">
                    {modelMessage}
                  </div>
                  {progressItems.length > 0 ? (
                    <div className="bg-gray-800/50 rounded-lg p-4">
                      {progressItems.map((item, i) => (
                        <Progress key={i} text={item.file} percentage={item.progress} total={item.total} />
                      ))}
                    </div>
                  ) : (
                    <div className="flex items-center gap-3 text-gray-300">
                      <div className="w-5 h-5 border-2 border-cyan-400 border-t-transparent rounded-full animate-spin"></div>
                      <span>Initializing...</span>
                    </div>
                  )}
                </div>
              )}
              {modelStatus === 'ready' && (
                <div className="flex items-center gap-4">
                  <div className="px-4 py-2 bg-green-900/30 border border-green-700 rounded-lg text-green-400 text-sm font-semibold">
                    ✓ Ready
                  </div>
                  {!isRecording ? (
                    <>
                      <button
                        onClick={startRecording}
                        className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                      >
                        Start Recording
                      </button>
                      <label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer">
                        Upload Audio
                        <input
                          type="file"
                          accept="audio/*"
                          className="hidden"
                          onChange={(e) => {
                            const file = e.target.files?.[0];
                            if (file) handleFileUpload(file);
                          }}
                        />
                      </label>
                    </>
                  ) : (
                    <button
                      onClick={stopRecording}
                      className="px-6 py-3 bg-gradient-to-r from-red-500 to-pink-500 hover:from-red-600 hover:to-pink-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
                    >
                      Stop Recording
                    </button>
                  )}
                </div>
              )}
              {modelStatus === 'error' && (
                <button
                  onClick={loadModel}
                  className="px-6 py-3 bg-red-900/30 border border-red-700 hover:bg-red-900/50 rounded-lg font-semibold transition-all duration-200"
                >
                  Retry
                </button>
              )}
            </div>
          </div>

          {/* Audio Player - only shown for uploaded files */}
          {uploadedFileUrl && (
            <div className="mt-4 pt-4 border-t border-gray-700">
              <label className="block text-sm font-medium text-gray-400 mb-2">Audio Playback</label>
              <audio
                src={uploadedFileUrl}
                controls
                className="w-full"
                style={{ height: '40px' }}
              />
            </div>
          )}
        </div>

        {/* Transcription Display */}
        <TranscriptionDisplay
          fixedText={fixedText}
          activeText={activeText}
          timestamp={timestamp}
          isRecording={isRecording}
          autoScroll={autoScroll}
          onAutoScrollToggle={() => setAutoScroll(!autoScroll)}
        />

        {/* Performance Metrics */}
        <PerformanceMetrics
          latency={latency}
          rtf={rtf}
          audioDuration={audioDuration}
          windowState={windowState}
          device={device}
          updateInterval={250}
          isProcessingFile={isProcessingFile}
          fileDuration={fileDuration}
          transcribedDuration={timestamp}
        />
      </main>

      {/* Footer */}
      <footer className="border-t border-gray-800 mt-12 py-6">
        <div className="max-w-6xl mx-auto px-6 text-center text-sm text-gray-500">
          <p>
            Built with parakeet.js, ONNX Runtime Web, React, and Vite •{' '}
            <a
              href="https://huggingface.co/spaces/andito/parakeet-v3-streaming/tree/main/source"
              className="text-cyan-400 hover:text-cyan-300"
              target="_blank"
              rel="noopener noreferrer"
            >
              View Source
            </a>
          </p>
        </div>
      </footer>
    </div>
  );
}

export default App;