parakeet-v3-streaming

Running

andito HF Staff Claude Sonnet 4.5 committed on Feb 10

Commit

5151e54

1 Parent(s): 0f739b8

Add microphone selector, audio level meter, and fix Chrome compatibility

- Add microphone device selector with auto-selection of default device
- Add real-time audio level meter during recording
- Fix stack overflow error in worker.js for large audio buffers
- Reorganize UI to combine controls in one panel
- Disable echo cancellation/noise suppression for Chrome compatibility
- Add audio resampling from native rate to 16kHz
- Include microphone test page for debugging

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (5) hide show

source/index.html +14 -0
source/microphone-test.html +238 -0
source/src/App.jsx +76 -5
source/src/utils/audio.js +96 -14
source/src/worker.js +19 -0

source/index.html ADDED Viewed

	@@ -0,0 +1,14 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta name="description" content="Real-time speech recognition with Parakeet STT and WebGPU acceleration. Progressive transcription demo." />
+    <title>Parakeet STT Progressive Transcription | WebGPU Demo</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.jsx"></script>
+  </body>
+</html>

source/microphone-test.html ADDED Viewed

	@@ -0,0 +1,238 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Microphone Test</title>
+  <style>
+    body {
+      font-family: monospace;
+      padding: 20px;
+      max-width: 800px;
+      margin: 0 auto;
+    }
+    button {
+      padding: 10px 20px;
+      font-size: 16px;
+      margin: 10px 0;
+    }
+    #log {
+      background: #000;
+      color: #0f0;
+      padding: 10px;
+      font-size: 12px;
+      height: 400px;
+      overflow-y: scroll;
+      margin-top: 20px;
+    }
+    #meter {
+      width: 100%;
+      height: 40px;
+      background: #222;
+      margin-top: 10px;
+      position: relative;
+    }
+    #meter-bar {
+      height: 100%;
+      background: linear-gradient(to right, green, yellow, red);
+      width: 0%;
+      transition: width 0.05s;
+    }
+  </style>
+</head>
+<body>
+  <h1>Microphone Test</h1>
+  <p>This tests if your microphone is working with Web Audio API</p>
+  <label for="deviceSelect">Select Microphone:</label>
+  <select id="deviceSelect" style="width: 100%; padding: 8px; margin: 10px 0; background: #222; color: #0f0; border: 1px solid #0f0; font-family: monospace;">
+    <option value="">Default Microphone</option>
+  </select>
+  <button id="start">Start Microphone Test</button>
+  <button id="stop" disabled>Stop Test</button>
+  <div id="meter">
+    <div id="meter-bar"></div>
+  </div>
+  <div id="log"></div>
+  <script>
+    const logEl = document.getElementById('log');
+    const meterBar = document.getElementById('meter-bar');
+    const startBtn = document.getElementById('start');
+    const stopBtn = document.getElementById('stop');
+    const deviceSelect = document.getElementById('deviceSelect');
+    let audioContext = null;
+    let source = null;
+    let analyser = null;
+    let processor = null;
+    let animationId = null;
+    // Enumerate devices on page load
+    async function loadDevices() {
+      try {
+        const devices = await navigator.mediaDevices.enumerateDevices();
+        const audioInputs = devices.filter(d => d.kind === 'audioinput');
+        deviceSelect.innerHTML = '<option value="">Default Microphone</option>';
+        audioInputs.forEach(device => {
+          const option = document.createElement('option');
+          option.value = device.deviceId;
+          option.textContent = device.label || `Microphone ${device.deviceId.slice(0, 8)}...`;
+          deviceSelect.appendChild(option);
+        });
+        console.log('Available devices:', audioInputs);
+      } catch (error) {
+        console.error('Failed to enumerate devices:', error);
+      }
+    }
+    loadDevices();
+    function log(msg) {
+      const line = document.createElement('div');
+      line.textContent = `[${new Date().toLocaleTimeString()}] ${msg}`;
+      logEl.appendChild(line);
+      logEl.scrollTop = logEl.scrollHeight;
+      console.log(msg);
+    }
+    async function startTest() {
+      try {
+        const selectedDeviceId = deviceSelect.value;
+        log(`Requesting microphone access... ${selectedDeviceId ? `(Device: ${deviceSelect.options[deviceSelect.selectedIndex].text})` : '(Default)'}`);
+        const audioConstraints = {
+          channelCount: 1,
+          echoCancellation: false,
+          noiseSuppression: false,
+          autoGainControl: false,
+        };
+        if (selectedDeviceId) {
+          audioConstraints.deviceId = { exact: selectedDeviceId };
+        }
+        const stream = await navigator.mediaDevices.getUserMedia({
+          audio: audioConstraints
+        });
+        log('✓ Microphone access granted');
+        // Refresh device list now that we have permission
+        await loadDevices();
+        const tracks = stream.getAudioTracks();
+        log(`Stream has ${tracks.length} audio tracks`);
+        if (tracks.length > 0) {
+          const track = tracks[0];
+          const settings = track.getSettings();
+          log(`Track: ${track.label}`);
+          log(`Settings: ${JSON.stringify(settings, null, 2)}`);
+          log(`Enabled: ${track.enabled}, Muted: ${track.muted}, State: ${track.readyState}`);
+        }
+        audioContext = new AudioContext();
+        log(`AudioContext created: ${audioContext.sampleRate}Hz, state: ${audioContext.state}`);
+        if (audioContext.state === 'suspended') {
+          await audioContext.resume();
+          log(`AudioContext resumed to: ${audioContext.state}`);
+        }
+        source = audioContext.createMediaStreamSource(stream);
+        log('MediaStreamSource created');
+        // Test 1: AnalyserNode
+        analyser = audioContext.createAnalyser();
+        analyser.fftSize = 2048;
+        source.connect(analyser);
+        log('AnalyserNode connected');
+        const dataArray = new Uint8Array(analyser.frequencyBinCount);
+        function checkAnalyser() {
+          analyser.getByteTimeDomainData(dataArray);
+          let sum = 0;
+          let max = 0;
+          for (let i = 0; i < dataArray.length; i++) {
+            const val = Math.abs(dataArray[i] - 128);
+            sum += val;
+            max = Math.max(max, val);
+          }
+          const avg = sum / dataArray.length;
+          const percent = (max / 128) * 100;
+          meterBar.style.width = percent + '%';
+          log(`Analyser - Avg: ${avg.toFixed(2)}, Max: ${max}, Samples: ${dataArray.length}`);
+          if (avg < 0.1) {
+            log('⚠️ WARNING: Audio level is 0 or very quiet!');
+          }
+        }
+        setTimeout(checkAnalyser, 500);
+        setTimeout(checkAnalyser, 1000);
+        setTimeout(checkAnalyser, 2000);
+        // Test 2: ScriptProcessorNode
+        processor = audioContext.createScriptProcessor(4096, 1, 1);
+        source.connect(processor);
+        processor.connect(audioContext.destination);
+        let chunkCount = 0;
+        processor.onaudioprocess = (event) => {
+          const inputData = event.inputBuffer.getChannelData(0);
+          const max = Math.max(...Array.from(inputData).map(Math.abs));
+          const avg = Array.from(inputData).reduce((sum, val) => sum + Math.abs(val), 0) / inputData.length;
+          chunkCount++;
+          if (chunkCount % 10 === 0) {
+            log(`ScriptProcessor - Chunk ${chunkCount}, Avg: ${avg.toFixed(6)}, Max: ${max.toFixed(6)}, Length: ${inputData.length}`);
+            if (max < 0.0001) {
+              log('⚠️ WARNING: ScriptProcessor getting all zeros!');
+            }
+          }
+          // Update meter
+          const percent = (max * 100);
+          meterBar.style.width = Math.min(100, percent) + '%';
+        };
+        log('ScriptProcessorNode connected and listening...');
+        log('✓ Test running - speak into your microphone!');
+        startBtn.disabled = true;
+        stopBtn.disabled = false;
+        deviceSelect.disabled = true;
+      } catch (error) {
+        log(`❌ ERROR: ${error.message}`);
+        console.error(error);
+      }
+    }
+    function stopTest() {
+      if (processor) processor.disconnect();
+      if (analyser) analyser.disconnect();
+      if (source) source.disconnect();
+      if (audioContext) audioContext.close();
+      if (animationId) cancelAnimationFrame(animationId);
+      log('Test stopped');
+      startBtn.disabled = false;
+      stopBtn.disabled = true;
+      deviceSelect.disabled = false;
+      meterBar.style.width = '0%';
+    }
+    startBtn.addEventListener('click', startTest);
+    stopBtn.addEventListener('click', stopTest);
+  </script>
+</body>
+</html>

source/src/App.jsx CHANGED Viewed

@@ -19,11 +19,16 @@ function App() {
   const [modelMessage, setModelMessage] = useState('');
   const [device, setDevice] = useState(null);
   // Recording state
   const [isRecording, setIsRecording] = useState(false);
   const [fixedText, setFixedText] = useState('');
   const [activeText, setActiveText] = useState('');
   const [timestamp, setTimestamp] = useState(0);
   // Performance metrics
   const [latency, setLatency] = useState(null);
@@ -38,6 +43,29 @@ function App() {
   const streamingHandlerRef = useRef(null);
   const progressiveIntervalRef = useRef(null);
   // Initialize worker
   useEffect(() => {
     workerRef.current = new Worker(WorkerUrl, { type: 'module' });
@@ -138,12 +166,21 @@ function App() {
       // Start recording with callback for audio chunks
       recorderRef.current = new AudioRecorder((audioChunk) => {
         // Append PCM audio chunk directly (Float32Array)
-        console.log('Audio chunk received:', audioChunk.length, 'samples (~' + (audioChunk.length / 16000 * 1000).toFixed(0) + 'ms)');
         audioProcessorRef.current.appendChunk(audioChunk);
         console.log('Total buffer:', audioProcessorRef.current.getBuffer().length, 'samples');
       });
-      await recorderRef.current.start();
       setIsRecording(true);
       // Start progressive transcription updates
@@ -246,12 +283,46 @@ function App() {
       {/* Main Content */}
       <main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
-        {/* Model Status */}
         <div className="bg-gray-900 rounded-lg border border-gray-700 p-6">
           <div className="flex items-center justify-between">
             <div>
-              <h2 className="text-lg font-semibold mb-2">Model Status</h2>
-              <p className="text-sm text-gray-400">{modelMessage || 'Ready to load model'}</p>
             </div>
             <div>
               {modelStatus === 'not_loaded' && (

   const [modelMessage, setModelMessage] = useState('');
   const [device, setDevice] = useState(null);
+  // Microphone device selection
+  const [audioDevices, setAudioDevices] = useState([]);
+  const [selectedDeviceId, setSelectedDeviceId] = useState(null);
   // Recording state
   const [isRecording, setIsRecording] = useState(false);
   const [fixedText, setFixedText] = useState('');
   const [activeText, setActiveText] = useState('');
   const [timestamp, setTimestamp] = useState(0);
+  const [audioLevel, setAudioLevel] = useState(0);
   // Performance metrics
   const [latency, setLatency] = useState(null);
   const streamingHandlerRef = useRef(null);
   const progressiveIntervalRef = useRef(null);
+  // Enumerate audio input devices
+  useEffect(() => {
+    async function getDevices() {
+      try {
+        const devices = await navigator.mediaDevices.enumerateDevices();
+        const audioInputs = devices.filter(device => device.kind === 'audioinput');
+        setAudioDevices(audioInputs);
+        // Auto-select the default device (first one with "default" in deviceId)
+        const defaultDevice = audioInputs.find(d => d.deviceId === 'default');
+        if (defaultDevice && !selectedDeviceId) {
+          setSelectedDeviceId(defaultDevice.deviceId);
+          console.log('[App] Auto-selected default device:', defaultDevice.label);
+        }
+        console.log('[App] Available audio devices:', audioInputs.map(d => `${d.label || 'Unnamed'} (${d.deviceId.slice(0, 8)}...)`));
+      } catch (error) {
+        console.error('[App] Failed to enumerate devices:', error);
+      }
+    }
+    getDevices();
+  }, []);
   // Initialize worker
   useEffect(() => {
     workerRef.current = new Worker(WorkerUrl, { type: 'module' });
       // Start recording with callback for audio chunks
       recorderRef.current = new AudioRecorder((audioChunk) => {
         // Append PCM audio chunk directly (Float32Array)
+        const maxAmp = Math.max(...Array.from(audioChunk).map(Math.abs));
+        console.log('Audio chunk received:', audioChunk.length, 'samples (~' + (audioChunk.length / 16000 * 1000).toFixed(0) + 'ms), max amplitude:', maxAmp.toFixed(4));
+        // Update audio level meter (scale to 0-100%)
+        setAudioLevel(Math.min(100, maxAmp * 300)); // Scale up for visibility
+        if (maxAmp < 0.001) {
+          console.warn('⚠️ Very quiet audio - if using AirPods, they may need time to activate. Try speaking louder or tapping the mic.');
+        }
         audioProcessorRef.current.appendChunk(audioChunk);
         console.log('Total buffer:', audioProcessorRef.current.getBuffer().length, 'samples');
       });
+      await recorderRef.current.start(selectedDeviceId);
       setIsRecording(true);
       // Start progressive transcription updates
       {/* Main Content */}
       <main className="max-w-6xl mx-auto px-6 py-8 space-y-8">
+        {/* Controls */}
         <div className="bg-gray-900 rounded-lg border border-gray-700 p-6">
+          <h2 className="text-lg font-semibold mb-4">Controls</h2>
+          {/* Microphone Selection */}
+          <div className="mb-4">
+            <label className="block text-sm font-medium text-gray-400 mb-2">Microphone</label>
+            <select
+              value={selectedDeviceId || ''}
+              onChange={(e) => setSelectedDeviceId(e.target.value)}
+              className="w-full bg-gray-800 border border-gray-600 rounded px-4 py-2 text-white"
+              disabled={isRecording}
+            >
+              {audioDevices.length === 0 && <option value="">Loading devices...</option>}
+              {audioDevices.map((device) => (
+                <option key={device.deviceId} value={device.deviceId}>
+                  {device.label || `Microphone ${device.deviceId.slice(0, 8)}...`}
+                </option>
+              ))}
+            </select>
+          </div>
+          {/* Audio Level Meter */}
+          {isRecording && (
+            <div className="mb-4">
+              <label className="block text-sm font-medium text-gray-400 mb-2">Audio Level</label>
+              <div className="w-full h-3 bg-gray-800 rounded-full overflow-hidden">
+                <div
+                  className="h-full bg-gradient-to-r from-green-500 via-yellow-500 to-red-500 transition-all duration-75"
+                  style={{ width: `${audioLevel}%` }}
+                ></div>
+              </div>
+            </div>
+          )}
+          {/* Model Status and Actions */}
           <div className="flex items-center justify-between">
             <div>
+              <h3 className="text-sm font-medium text-gray-400">Model Status</h3>
+              <p className="text-sm text-gray-300 mt-1">{modelMessage || 'Ready to load model'}</p>
             </div>
             <div>
               {modelStatus === 'not_loaded' && (

source/src/utils/audio.js CHANGED Viewed

@@ -17,29 +17,78 @@ export class AudioRecorder {
     this.audioChunks = [];
   }
-  async start() {
     /**
      * Start recording audio from microphone using Web Audio API
      */
     try {
       // Request microphone access
       this.stream = await navigator.mediaDevices.getUserMedia({
-        audio: {
-          channelCount: 1,
-          sampleRate: WHISPER_SAMPLING_RATE,
-          echoCancellation: true,
-          noiseSuppression: true,
-        }
       });
-      // Create AudioContext with 16kHz sample rate
-      this.audioContext = new AudioContext({ sampleRate: WHISPER_SAMPLING_RATE });
       // Create source from stream
       this.source = this.audioContext.createMediaStreamSource(this.stream);
       // Create ScriptProcessorNode (deprecated but works everywhere)
-      // 4096 samples = ~256ms at 16kHz
       const bufferSize = 4096;
       this.processor = this.audioContext.createScriptProcessor(bufferSize, 1, 1);
@@ -47,13 +96,20 @@ export class AudioRecorder {
         if (!this.isRecording) return;
         const inputData = event.inputBuffer.getChannelData(0);
-        // Copy the data (important! buffer is reused)
-        const audioChunk = new Float32Array(inputData);
-        this.audioChunks.push(audioChunk);
         if (this.onDataAvailable) {
-          this.onDataAvailable(audioChunk);
         }
       };
@@ -71,6 +127,32 @@ export class AudioRecorder {
     }
   }
   requestData() {
     /**
      * No-op for ScriptProcessor (data comes automatically)

     this.audioChunks = [];
   }
+  async start(deviceId = null) {
     /**
      * Start recording audio from microphone using Web Audio API
+     * @param {string} deviceId - Optional specific device ID to use
      */
     try {
       // Request microphone access
+      // Note: Disable echo cancellation and noise suppression in Chrome
+      // as they can conflict with cross-origin isolation headers
+      // Chrome + AirPods known issue: AirPods may send silence initially due to power saving.
+      // Recommendation: Use built-in/wired microphone for best results in Chrome.
+      const audioConstraints = {
+        channelCount: 1,
+        echoCancellation: false,
+        noiseSuppression: false,
+        autoGainControl: false,
+      };
+      // If specific device requested, add deviceId constraint
+      if (deviceId) {
+        audioConstraints.deviceId = { exact: deviceId };
+      }
       this.stream = await navigator.mediaDevices.getUserMedia({
+        audio: audioConstraints
       });
+      // Create AudioContext at native sample rate (browser will choose optimal rate)
+      this.audioContext = new AudioContext();
+      const nativeSampleRate = this.audioContext.sampleRate;
+      console.log(`[Audio] Native sample rate: ${nativeSampleRate}Hz, target: ${WHISPER_SAMPLING_RATE}Hz`);
+      console.log(`[Audio] AudioContext state: ${this.audioContext.state}`);
+      // Resume AudioContext if suspended (required by some browsers)
+      if (this.audioContext.state === 'suspended') {
+        await this.audioContext.resume();
+        console.log(`[Audio] AudioContext resumed to: ${this.audioContext.state}`);
+      }
       // Create source from stream
       this.source = this.audioContext.createMediaStreamSource(this.stream);
+      // Debug: Check stream and track status
+      const audioTracks = this.stream.getAudioTracks();
+      console.log(`[Audio] MediaStreamSource created, stream active: ${this.stream.active}, tracks: ${audioTracks.length}`);
+      if (audioTracks.length > 0) {
+        const track = audioTracks[0];
+        console.log(`[Audio] Track settings:`, track.getSettings());
+        console.log(`[Audio] Track enabled: ${track.enabled}, muted: ${track.muted}, readyState: ${track.readyState}`);
+      }
+      // Create AnalyserNode to verify microphone input
+      const analyser = this.audioContext.createAnalyser();
+      analyser.fftSize = 2048;
+      const bufferLength = analyser.frequencyBinCount;
+      const dataArray = new Uint8Array(bufferLength);
+      this.source.connect(analyser);
+      // Test microphone input with analyser
+      setTimeout(() => {
+        analyser.getByteTimeDomainData(dataArray);
+        let sum = 0;
+        for (let i = 0; i < bufferLength; i++) {
+          sum += Math.abs(dataArray[i] - 128);
+        }
+        const avgLevel = sum / bufferLength;
+        console.log(`[Audio] Analyser test - Average level: ${avgLevel.toFixed(2)} (should be >0 if mic working)`);
+      }, 500);
       // Create ScriptProcessorNode (deprecated but works everywhere)
+      // 4096-sample buffer at the native rate; each chunk is resampled to 16kHz before use
       const bufferSize = 4096;
       this.processor = this.audioContext.createScriptProcessor(bufferSize, 1, 1);
         if (!this.isRecording) return;
         const inputData = event.inputBuffer.getChannelData(0);
+        // Debug: Check raw input levels BEFORE resampling
+        const rawMax = Math.max(...Array.from(inputData).map(Math.abs));
+        if (this.audioChunks.length % 10 === 0) {  // Log every 10th chunk
+          console.log(`[Audio] Raw input max amplitude: ${rawMax.toFixed(4)} (${inputData.length} samples at ${nativeSampleRate}Hz)`);
+        }
+        // Resample from native rate to 16kHz
+        const resampled = this.resample(inputData, nativeSampleRate, WHISPER_SAMPLING_RATE);
+        this.audioChunks.push(resampled);
         if (this.onDataAvailable) {
+          this.onDataAvailable(resampled);
         }
       };
     }
   }
+  resample(audioData, sourceSampleRate, targetSampleRate) {
+    /**
+     * Simple linear interpolation resampler
+     * Converts audio from sourceSampleRate to targetSampleRate
+     *
+     * @param {Float32Array} audioData - Input samples at sourceSampleRate
+     * @param {number} sourceSampleRate - Rate of audioData, in Hz
+     * @param {number} targetSampleRate - Desired output rate, in Hz
+     * @returns {Float32Array} A new array; a plain copy of the input when the
+     *   two rates are equal, otherwise round(length * target / source) samples.
+     *
+     * NOTE(review): no low-pass filtering happens before downsampling, so
+     * content above half the target rate may alias - confirm this is acceptable.
+     */
+    if (sourceSampleRate === targetSampleRate) {
+      return new Float32Array(audioData);
+    }
+    const ratio = sourceSampleRate / targetSampleRate;
+    const newLength = Math.round(audioData.length / ratio);
+    const result = new Float32Array(newLength);
+    for (let i = 0; i < newLength; i++) {
+      const srcIndex = i * ratio;
+      const srcIndexFloor = Math.floor(srcIndex);
+      const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
+      const t = srcIndex - srcIndexFloor;
+      // Linear interpolation between the two neighbouring input samples (upper index clamped to the last sample)
+      result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
+    }
+    return result;
+  }
   requestData() {
     /**
      * No-op for ScriptProcessor (data comes automatically)

source/src/worker.js CHANGED Viewed

@@ -83,6 +83,21 @@ async function transcribe(audio, language = null) {
   try {
     const startTime = performance.now();
     // Transcribe with parakeet.js
     const result = await model.transcribe(audio, 16000, {
       returnTimestamps: true,  // Get word-level timestamps
@@ -95,6 +110,10 @@ async function transcribe(audio, language = null) {
     const audioDuration = audio.length / 16000;
     const rtf = latency / audioDuration;  // Real-time factor
     // Convert parakeet.js word format to our sentence format
     console.log('[Worker] Parakeet words:', result.words?.length || 0, 'words');
     if (result.words && result.words.length > 0) {

   try {
     const startTime = performance.now();
+    // Debug: Check audio levels (single-pass loop rather than spreading into Math.max, which can overflow the stack with large arrays)
+    let maxAmplitude = 0;
+    let avgAmplitude = 0;
+    for (let i = 0; i < audio.length; i++) {
+      const abs = Math.abs(audio[i]);
+      if (abs > maxAmplitude) maxAmplitude = abs;
+      avgAmplitude += abs;
+    }
+    avgAmplitude /= audio.length;
+    console.log('[Worker] Audio stats - Max:', maxAmplitude.toFixed(4), 'Avg:', avgAmplitude.toFixed(4), 'Length:', audio.length);
+    if (maxAmplitude < 0.01) {
+      console.warn('[Worker] WARNING: Audio is very quiet! Microphone may not be working.');
+    }
     // Transcribe with parakeet.js
     const result = await model.transcribe(audio, 16000, {
       returnTimestamps: true,  // Get word-level timestamps
     const audioDuration = audio.length / 16000;
     const rtf = latency / audioDuration;  // Real-time factor
+    // Debug: log full result to see what parakeet.js returns
+    console.log('[Worker] Full parakeet result:', result);
+    console.log('[Worker] utterance_text:', result.utterance_text);
     // Convert parakeet.js word format to our sentence format
     console.log('[Worker] Parakeet words:', result.words?.length || 0, 'words');
     if (result.words && result.words.length > 0) {