Spaces:
Running
Running
/**
 * Audio capture and processing utilities
 *
 * Uses Web Audio API with ScriptProcessorNode for real-time PCM audio capture
 */
// Sample rate expected by Whisper speech models: 16 kHz mono PCM.
// All captured audio is resampled to this rate before delivery.
const WHISPER_SAMPLING_RATE = 16000;
/**
 * Records microphone audio via the Web Audio API and delivers mono
 * Float32 PCM chunks, resampled to WHISPER_SAMPLING_RATE (16 kHz),
 * through the onDataAvailable callback.
 *
 * Uses ScriptProcessorNode (deprecated but universally supported) for
 * real-time capture.
 */
export class AudioRecorder {
  /**
   * @param {(chunk: Float32Array) => void} onDataAvailable - Invoked with
   *   each resampled 16 kHz chunk as it is captured; may be null/undefined.
   */
  constructor(onDataAvailable) {
    this.onDataAvailable = onDataAvailable;
    this.audioContext = null;
    this.stream = null;
    this.source = null;
    this.processor = null;
    this.isRecording = false;
    this.audioChunks = [];
  }

  /**
   * Start recording audio from the microphone using the Web Audio API.
   * @param {string|null} deviceId - Optional specific input device ID to use.
   * @returns {Promise<boolean>} Resolves true once capture is running.
   * @throws If microphone access is denied or audio graph setup fails.
   */
  async start(deviceId = null) {
    try {
      // Disable browser-side DSP: echo cancellation and noise suppression
      // in Chrome can conflict with cross-origin isolation headers, and we
      // want the raw signal for transcription anyway.
      const audioConstraints = {
        channelCount: 1,
        echoCancellation: false,
        noiseSuppression: false,
        autoGainControl: false,
      };
      // If a specific device was requested, pin it with an exact constraint.
      if (deviceId) {
        audioConstraints.deviceId = { exact: deviceId };
      }
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: audioConstraints,
      });
      // Create the AudioContext at the browser's native rate; we resample
      // to 16 kHz ourselves in the processing callback.
      this.audioContext = new AudioContext();
      const nativeSampleRate = this.audioContext.sampleRate;
      // Some browsers create contexts suspended until a user gesture.
      if (this.audioContext.state === 'suspended') {
        await this.audioContext.resume();
      }
      this.source = this.audioContext.createMediaStreamSource(this.stream);
      // ScriptProcessorNode is deprecated but works everywhere; use a
      // larger buffer since we process at the native rate.
      const bufferSize = 4096;
      this.processor = this.audioContext.createScriptProcessor(bufferSize, 1, 1);
      this.processor.onaudioprocess = (event) => {
        if (!this.isRecording) return;
        const inputData = event.inputBuffer.getChannelData(0);
        // Resample from native rate to 16 kHz before buffering/emitting.
        const resampled = this.resample(inputData, nativeSampleRate, WHISPER_SAMPLING_RATE);
        this.audioChunks.push(resampled);
        if (this.onDataAvailable) {
          this.onDataAvailable(resampled);
        }
      };
      // Connect: source -> processor -> destination (the destination
      // connection keeps onaudioprocess firing).
      this.source.connect(this.processor);
      this.processor.connect(this.audioContext.destination);
      this.isRecording = true;
      return true;
    } catch (error) {
      // Release anything acquired before the failure (mic stream, context)
      // so a failed start does not leave the microphone indicator on.
      this.cleanup();
      console.error('Failed to start recording:', error);
      throw error;
    }
  }

  /**
   * Linear-interpolation resampler from sourceSampleRate to targetSampleRate.
   * NOTE(review): no low-pass filter is applied, so downsampling can alias;
   * acceptable for speech capture but not general-purpose audio.
   * @param {Float32Array} audioData - Input samples.
   * @param {number} sourceSampleRate
   * @param {number} targetSampleRate
   * @returns {Float32Array} Resampled copy (always a new array).
   */
  resample(audioData, sourceSampleRate, targetSampleRate) {
    if (sourceSampleRate === targetSampleRate) {
      return new Float32Array(audioData);
    }
    const ratio = sourceSampleRate / targetSampleRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const srcIndex = i * ratio;
      const srcIndexFloor = Math.floor(srcIndex);
      const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
      const t = srcIndex - srcIndexFloor;
      // Linear interpolation between the two neighboring source samples.
      result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
    }
    return result;
  }

  /**
   * No-op for ScriptProcessor: data is emitted automatically via
   * onaudioprocess. Kept for interface compatibility with recorder APIs
   * that require an explicit flush.
   */
  requestData() {
    // Data is emitted automatically via onaudioprocess.
  }

  /**
   * Stop recording, release resources, and return the full recording.
   * @returns {Promise<Float32Array>} All captured 16 kHz samples, concatenated.
   */
  async stop() {
    // Note: no `new Promise` wrapper needed — everything here is synchronous,
    // and an async method already returns a Promise.
    this.isRecording = false;
    // Disconnect audio graph nodes.
    if (this.processor) {
      this.processor.disconnect();
      this.processor = null;
    }
    if (this.source) {
      this.source.disconnect();
      this.source = null;
    }
    // Concatenate all captured chunks into one contiguous buffer.
    let totalLength = 0;
    for (const chunk of this.audioChunks) {
      totalLength += chunk.length;
    }
    const completeAudio = new Float32Array(totalLength);
    let offset = 0;
    for (const chunk of this.audioChunks) {
      completeAudio.set(chunk, offset);
      offset += chunk.length;
    }
    this.cleanup();
    return completeAudio;
  }

  /**
   * Release the microphone stream and AudioContext and reset state.
   * Safe to call multiple times and on a recorder that never started.
   */
  cleanup() {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
    if (this.audioContext && this.audioContext.state !== 'closed') {
      // close() returns a Promise; attach a handler so a rejection does
      // not surface as an unhandled rejection.
      this.audioContext.close().catch((err) => {
        console.warn('Failed to close AudioContext:', err);
      });
      this.audioContext = null;
    }
    this.audioChunks = [];
    this.isRecording = false;
  }
}
/**
 * Accumulates PCM audio chunks into one rolling Float32 buffer for
 * real-time transcription.
 */
export class AudioProcessor {
  /**
   * @param {number} sampleRate - Samples per second; defaults to
   *   Whisper's expected 16 kHz rate.
   */
  constructor(sampleRate = WHISPER_SAMPLING_RATE) {
    this.sampleRate = sampleRate;
    this.audioBuffer = new Float32Array(0);
  }

  /**
   * Append a chunk of samples to the end of the buffer.
   * @param {Float32Array} chunk - Samples to append.
   */
  appendChunk(chunk) {
    const combined = new Float32Array(this.audioBuffer.length + chunk.length);
    combined.set(this.audioBuffer, 0);
    combined.set(chunk, this.audioBuffer.length);
    this.audioBuffer = combined;
  }

  /** @returns {Float32Array} The accumulated samples. */
  getBuffer() {
    return this.audioBuffer;
  }

  /** @returns {number} Buffered audio length in seconds. */
  getDuration() {
    return this.audioBuffer.length / this.sampleRate;
  }

  /** Discard all buffered audio. */
  reset() {
    this.audioBuffer = new Float32Array(0);
  }

  /**
   * Keep only the most recent maxDuration seconds of audio.
   * @param {number} maxDuration - Maximum duration to retain, in seconds.
   */
  trimToSize(maxDuration) {
    const keepSamples = Math.floor(maxDuration * this.sampleRate);
    if (this.audioBuffer.length <= keepSamples) return;
    this.audioBuffer = this.audioBuffer.slice(-keepSamples);
  }
}
| export { WHISPER_SAMPLING_RATE }; | |