import { AudioEngine as IAudioEngine, AudioEngineConfig, AudioSegment, IRingBuffer, AudioMetrics } from './types'; import { RingBuffer } from './RingBuffer'; import { AudioSegmentProcessor, ProcessedSegment } from './AudioSegmentProcessor'; import { resampleLinear } from './utils'; /** Duration of the visualization buffer in seconds */ const VISUALIZATION_BUFFER_DURATION = 30; /** * AudioEngine implementation for capturing audio, buffering it, and performing VAD. * Uses AudioSegmentProcessor for robust speech detection (incl. lookback). */ export class AudioEngine implements IAudioEngine { private config: AudioEngineConfig; private ringBuffer: IRingBuffer; private audioProcessor: AudioSegmentProcessor; // Replaces EnergyVAD private deviceId: string | null = null; private audioContext: AudioContext | null = null; private mediaStream: MediaStream | null = null; private workletNode: AudioWorkletNode | null = null; private sourceNode: MediaStreamAudioSourceNode | null = null; // AnalyserNode for oscilloscope waveform (native getByteTimeDomainData) private analyserNode: AnalyserNode | null = null; private analyserSourceNode: MediaStreamAudioSourceNode | null = null; private analyserGainNode: GainNode | null = null; private analyserTimeBuffer: Uint8Array | null = null; private waveformOut: Float32Array | null = null; private readonly ANALYSER_FFT_SIZE = 256; private readonly ANALYSER_SMOOTHING = 0.3; // Low = fast oscilloscope response // Track device vs target sample rates private deviceSampleRate: number = 48000; private targetSampleRate: number = 16000; private currentEnergy: number = 0; private segmentCallbacks: Array<(segment: AudioSegment) => void> = []; // Fixed-window streaming state (v3 token streaming mode) private windowCallbacks: Array<{ windowDuration: number; overlapDuration: number; triggerInterval: number; callback: (audio: Float32Array, startTime: number) => void; lastWindowEnd: number; // Frame offset of last window end }> = []; // Resampled audio chunk callbacks (for mel worker, etc.) private audioChunkCallbacks: Array<(chunk: Float32Array) => void> = []; // SMA buffer for energy calculation private energyHistory: number[] = []; // Last N energy values for bar visualizer (oldest first when read) private energyBarHistory: number[] = []; private readonly BAR_LEVELS_SIZE = 64; // Visualization Summary Buffer (Low-Res Min/Max pairs) private visualizationSummary: Float32Array | null = null; private visualizationSummaryPosition: number = 0; private readonly VIS_SUMMARY_SIZE = 2000; // 2000 min/max pairs for 30 seconds = 15ms resolution // Raw visualization buffer (still kept for higher-res requests if needed, but summary is preferred) private visualizationBuffer: Float32Array | null = null; private visualizationBufferPosition: number = 0; private visualizationBufferSize: number = 0; // Metrics for UI components private metrics: AudioMetrics = { currentEnergy: 0, averageEnergy: 0, peakEnergy: 0, noiseFloor: 0.01, currentSNR: 0, isSpeaking: false, }; // Subscribers for visualization updates private visualizationCallbacks: Array<(data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void> = []; private lastVisualizationNotifyTime: number = 0; private readonly VISUALIZATION_NOTIFY_INTERVAL_MS = 16; // ~60fps for responsive oscilloscope // Recent segments for visualization (stores timing info only) private recentSegments: Array<{ startTime: number; endTime: number; isProcessed: boolean }> = []; private readonly MAX_SEGMENTS_FOR_VISUALIZATION = 50; constructor(config: Partial = {}) { this.config = { sampleRate: 16000, bufferDuration: 120, energyThreshold: 0.08, // Match legacy UI project 'medium' minSpeechDuration: 240, // Match legacy UI project minSilenceDuration: 400, // Match legacy UI project maxSegmentDuration: 4.8, // Match legacy UI project // Advanced VAD defaults lookbackDuration: 0.120, speechHangover: 0.16, minEnergyIntegral: 22, minEnergyPerSecond: 5, useAdaptiveEnergyThresholds: true, adaptiveEnergyIntegralFactor: 25.0, adaptiveEnergyPerSecondFactor: 10.0, minAdaptiveEnergyIntegral: 3, minAdaptiveEnergyPerSecond: 1, maxSilenceWithinSpeech: 0.160, endingSpeechTolerance: 0.240, ...config, }; this.deviceId = this.config.deviceId || null; this.targetSampleRate = this.config.sampleRate; // RingBuffer operates at TARGET sample rate (16kHz) this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration); // Initialize AudioSegmentProcessor this.audioProcessor = new AudioSegmentProcessor({ sampleRate: this.targetSampleRate, energyThreshold: this.config.energyThreshold, minSpeechDuration: this.config.minSpeechDuration, silenceThreshold: this.config.minSilenceDuration, maxSegmentDuration: this.config.maxSegmentDuration, lookbackDuration: this.config.lookbackDuration, maxSilenceWithinSpeech: this.config.maxSilenceWithinSpeech, endingSpeechTolerance: this.config.endingSpeechTolerance, snrThreshold: 3.0, minSnrThreshold: 1.0, noiseFloorAdaptationRate: 0.05, fastAdaptationRate: 0.15, minBackgroundDuration: 1.0, energyRiseThreshold: 0.08 }); // Initialize visualization buffer (30 seconds at target sample rate) this.visualizationBufferSize = Math.round(this.targetSampleRate * VISUALIZATION_BUFFER_DURATION); this.visualizationBuffer = new Float32Array(this.visualizationBufferSize); this.visualizationBufferPosition = 0; // Initialize visualization summary (2000 points for 30s) this.visualizationSummary = new Float32Array(this.VIS_SUMMARY_SIZE * 2); this.visualizationSummaryPosition = 0; console.log('[AudioEngine] Initialized with config:', this.config); } private isWorkletInitialized = false; async init(): Promise { // Request microphone permission with optional deviceId try { if (this.mediaStream) { this.mediaStream.getTracks().forEach(t => t.stop()); } const constraints: MediaStreamConstraints = { audio: { deviceId: this.deviceId ? { exact: this.deviceId } : undefined, channelCount: 1, echoCancellation: false, noiseSuppression: false, autoGainControl: false, }, }; console.log('[AudioEngine] Requesting microphone:', constraints); this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints); console.log('[AudioEngine] Microphone stream acquired:', this.mediaStream.id); } catch (err) { console.error('[AudioEngine] Failed to get media stream:', err); throw err; } const track = this.mediaStream!.getAudioTracks()[0]; const trackSettings = track?.getSettings?.(); // Device sample rate (what the mic gives us) this.deviceSampleRate = trackSettings?.sampleRate ?? 48000; console.log('[AudioEngine] Device sample rate:', this.deviceSampleRate, '-> Target:', this.targetSampleRate); if (this.audioContext && this.audioContext.sampleRate !== this.deviceSampleRate) { await this.audioContext.close(); this.audioContext = null; } if (!this.audioContext) { this.audioContext = new AudioContext({ sampleRate: this.deviceSampleRate, latencyHint: 'interactive', }); console.log('[AudioEngine] Created AudioContext:', this.audioContext.state, 'sampleRate:', this.audioContext.sampleRate); } // Re-initialize components with correct rates this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration); // Update processor config this.audioProcessor = new AudioSegmentProcessor({ sampleRate: this.targetSampleRate, energyThreshold: this.config.energyThreshold, minSpeechDuration: this.config.minSpeechDuration, silenceThreshold: this.config.minSilenceDuration, maxSegmentDuration: this.config.maxSegmentDuration, }); if (!this.isWorkletInitialized) { const windowDuration = 0.080; const processorCode = ` class CaptureProcessor extends AudioWorkletProcessor { constructor(options) { super(options); const opts = options?.processorOptions || {}; this.inputSampleRate = opts.inputSampleRate || 16000; this.targetSampleRate = opts.targetSampleRate || this.inputSampleRate; this.ratio = this.inputSampleRate / this.targetSampleRate; this.bufferSize = Math.round(${windowDuration} * this.inputSampleRate); this.buffer = new Float32Array(this.bufferSize); this.index = 0; this._lastLog = 0; } _emitChunk() { let out; let maxAbs = 0; if (this.targetSampleRate === this.inputSampleRate) { out = new Float32Array(this.bufferSize); for (let i = 0; i < this.bufferSize; i++) { const v = this.buffer[i]; out[i] = v; const a = v < 0 ? -v : v; if (a > maxAbs) maxAbs = a; } } else { const outLength = Math.floor(this.bufferSize / this.ratio); out = new Float32Array(outLength); for (let i = 0; i < outLength; i++) { const srcIndex = i * this.ratio; const srcIndexFloor = Math.floor(srcIndex); const srcIndexCeil = Math.min(srcIndexFloor + 1, this.bufferSize - 1); const t = srcIndex - srcIndexFloor; const v = this.buffer[srcIndexFloor] * (1 - t) + this.buffer[srcIndexCeil] * t; out[i] = v; const a = v < 0 ? -v : v; if (a > maxAbs) maxAbs = a; } } this.port.postMessage( { type: 'audio', samples: out, sampleRate: this.targetSampleRate, maxAbs }, [out.buffer] ); } process(inputs) { const input = inputs[0]; if (!input || !input[0]) return true; const channelData = input[0]; // Buffer the data for (let i = 0; i < channelData.length; i++) { this.buffer[this.index++] = channelData[i]; if (this.index >= this.bufferSize) { this._emitChunk(); this.index = 0; // Debug log every ~5 seconds const now = Date.now(); if (now - this._lastLog > 5000) { this.port.postMessage({ type: 'log', message: '[AudioWorklet] Active' }); this._lastLog = now; } } } return true; } } registerProcessor('capture-processor', CaptureProcessor); `; const blob = new Blob([processorCode], { type: 'application/javascript' }); const url = URL.createObjectURL(blob); try { await this.audioContext.audioWorklet.addModule(url); this.isWorkletInitialized = true; console.log('[AudioEngine] AudioWorklet module loaded'); } catch (err) { console.error('[AudioEngine] Failed to load worklet:', err); if (err instanceof Error && err.name === 'InvalidStateError') { // Ignore if already registered this.isWorkletInitialized = true; } } } // Re-create worklet node if needed (it might handle dispose differently, but safe to new) if (this.workletNode) this.workletNode.disconnect(); this.workletNode = new AudioWorkletNode(this.audioContext, 'capture-processor', { processorOptions: { inputSampleRate: this.deviceSampleRate, targetSampleRate: this.targetSampleRate }, }); this.workletNode.port.onmessage = (event: MessageEvent) => { if (event.data?.type === 'audio' && event.data.samples instanceof Float32Array) { this.handleAudioChunk(event.data.samples, event.data.maxAbs, event.data.sampleRate); } else if (event.data instanceof Float32Array) { this.handleAudioChunk(event.data, undefined, this.deviceSampleRate); } else if (event.data?.type === 'log') { console.log(event.data.message); } }; this.workletNode.onprocessorerror = (e) => { console.error('[AudioEngine] Worklet processor error:', e); }; // Reconnect source node this.sourceNode?.disconnect(); this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); this.sourceNode.connect(this.workletNode); // AnalyserNode branch for lightweight preview bars (native FFT, no mel worker) this.disposeAnalyser(); this.analyserSourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); this.analyserNode = this.audioContext.createAnalyser(); this.analyserNode.fftSize = this.ANALYSER_FFT_SIZE; this.analyserNode.smoothingTimeConstant = this.ANALYSER_SMOOTHING; this.analyserTimeBuffer = new Uint8Array(this.analyserNode.fftSize); this.waveformOut = new Float32Array(this.analyserNode.fftSize); this.analyserGainNode = this.audioContext.createGain(); this.analyserGainNode.gain.value = 0; this.analyserSourceNode.connect(this.analyserNode); this.analyserNode.connect(this.analyserGainNode); this.analyserGainNode.connect(this.audioContext.destination); // Keep graph alive this.workletNode.connect(this.audioContext.destination); console.log('[AudioEngine] Graph connected: Source -> Worklet, AnalyserNode for oscilloscope'); } async start(): Promise { if (!this.mediaStream || !this.audioContext || !this.workletNode) { await this.init(); } if (this.audioContext?.state === 'suspended') { await this.audioContext.resume(); } } stop(): void { if (this.audioContext?.state === 'running') { this.audioContext.suspend(); } } /** * Reset buffers and VAD state for a new session while keeping the audio graph. * Aligns visualization + segment timebase to 0, matching legacy UI project behavior. */ reset(): void { // Reset audio/VAD state this.ringBuffer.reset(); this.audioProcessor.reset(); this.currentEnergy = 0; // Reset metrics this.metrics = { currentEnergy: 0, averageEnergy: 0, peakEnergy: 0, noiseFloor: 0.01, currentSNR: 0, isSpeaking: false, }; // Clear segment history used by the visualizer this.recentSegments = []; this.energyBarHistory = []; // Reset visualization buffer if (this.visualizationBuffer) { this.visualizationBuffer.fill(0); } this.visualizationBufferPosition = 0; // Reset windowed streaming cursors for (const entry of this.windowCallbacks) { entry.lastWindowEnd = 0; } // Push a blank update so UI clears stale waveform/segments this.notifyVisualizationUpdate(); } getCurrentEnergy(): number { return this.currentEnergy; } /** Oscilloscope waveform from AnalyserNode.getByteTimeDomainData (native, fast). Values -1..1. */ getBarLevels(): Float32Array { if (this.analyserNode && this.analyserTimeBuffer && this.waveformOut) { (this.analyserNode as { getByteTimeDomainData(array: Uint8Array): void }).getByteTimeDomainData(this.analyserTimeBuffer); for (let i = 0; i < this.analyserTimeBuffer.length; i++) { this.waveformOut[i] = (this.analyserTimeBuffer[i] - 128) / 128; // 0..255 -> -1..1 } return this.waveformOut; } const out = new Float32Array(this.BAR_LEVELS_SIZE); const h = this.energyBarHistory; const start = h.length <= this.BAR_LEVELS_SIZE ? 0 : h.length - this.BAR_LEVELS_SIZE; for (let i = 0; i < this.BAR_LEVELS_SIZE; i++) { const idx = start + i; out[i] = idx < h.length ? Math.min(1, Math.max(0, h[idx])) : 0; } return out; } getSignalMetrics(): { noiseFloor: number; snr: number; threshold: number; snrThreshold: number } { const stats = this.audioProcessor.getStats(); return { noiseFloor: stats.noiseFloor ?? 0.0001, snr: stats.snr ?? 0, threshold: this.config.energyThreshold, snrThreshold: stats.snrThreshold ?? 3.0 }; } isSpeechActive(): boolean { return this.audioProcessor.getStateInfo().inSpeech; } getRingBuffer(): IRingBuffer { return this.ringBuffer; } onSpeechSegment(callback: (segment: AudioSegment) => void): () => void { this.segmentCallbacks.push(callback); return () => { this.segmentCallbacks = this.segmentCallbacks.filter((cb) => cb !== callback); }; } /** * Subscribe to fixed-window chunks for token streaming mode. * Fires every triggerInterval seconds with windowDuration of audio. */ onWindowChunk( windowDuration: number, overlapDuration: number, triggerInterval: number, callback: (audio: Float32Array, startTime: number) => void ): () => void { const entry = { windowDuration, overlapDuration, triggerInterval, callback, lastWindowEnd: 0, // Will be set on first chunk }; this.windowCallbacks.push(entry); return () => { this.windowCallbacks = this.windowCallbacks.filter((e) => e !== entry); }; } /** * Subscribe to every resampled audio chunk (16kHz). * Used to feed the continuous mel producer worker. * Returns an unsubscribe function. */ onAudioChunk(callback: (chunk: Float32Array) => void): () => void { this.audioChunkCallbacks.push(callback); return () => { this.audioChunkCallbacks = this.audioChunkCallbacks.filter((cb) => cb !== callback); }; } updateConfig(config: Partial): void { this.config = { ...this.config, ...config }; // Update processor config if (config.energyThreshold !== undefined) this.audioProcessor.setThreshold(config.energyThreshold); if (config.minSpeechDuration !== undefined) this.audioProcessor.setMinSpeechDuration(config.minSpeechDuration); if (config.minSilenceDuration !== undefined) this.audioProcessor.setSilenceLength(config.minSilenceDuration); if (config.maxSegmentDuration !== undefined) this.audioProcessor.setMaxSegmentDuration(config.maxSegmentDuration); // Advanced VAD updates if (config.lookbackDuration !== undefined) this.audioProcessor.setLookbackDuration(config.lookbackDuration); if (config.overlapDuration !== undefined) this.audioProcessor.setOverlapDuration(config.overlapDuration); if (config.maxSilenceWithinSpeech !== undefined) this.audioProcessor.setMaxSilenceWithinSpeech(config.maxSilenceWithinSpeech); if (config.endingSpeechTolerance !== undefined) this.audioProcessor.setEndingSpeechTolerance(config.endingSpeechTolerance); if (config.snrThreshold !== undefined) this.audioProcessor.setSnrThreshold(config.snrThreshold); if (config.minSnrThreshold !== undefined) this.audioProcessor.setMinSnrThreshold(config.minSnrThreshold); } async setDevice(deviceId: string): Promise { this.deviceId = deviceId; await this.init(); // Reconnect if running if (this.audioContext && this.workletNode) { this.sourceNode?.disconnect(); this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream!); this.sourceNode.connect(this.workletNode); } } private disposeAnalyser(): void { this.analyserSourceNode?.disconnect(); this.analyserNode?.disconnect(); this.analyserGainNode?.disconnect(); this.analyserSourceNode = null; this.analyserNode = null; this.analyserGainNode = null; this.analyserTimeBuffer = null; this.waveformOut = null; } dispose(): void { this.stop(); this.disposeAnalyser(); this.mediaStream?.getTracks().forEach(track => track.stop()); this.audioContext?.close(); this.audioContext = null; this.mediaStream = null; this.workletNode = null; this.sourceNode = null; } private handleAudioChunk(rawChunk: Float32Array, precomputedMaxAbs?: number, chunkSampleRate?: number): void { // 0. Ensure chunk is at target sample rate (resample only if needed) const sampleRate = chunkSampleRate ?? this.targetSampleRate; const needsResample = sampleRate !== this.targetSampleRate; const chunk = needsResample ? resampleLinear(rawChunk, sampleRate, this.targetSampleRate) : rawChunk; // Calculate chunk energy (Peak Amplitude) + SMA for VAD compatibility let maxAbs = (!needsResample && precomputedMaxAbs !== undefined) ? precomputedMaxAbs : 0; if (precomputedMaxAbs === undefined || needsResample) { for (let i = 0; i < chunk.length; i++) { const abs = Math.abs(chunk[i]); if (abs > maxAbs) maxAbs = abs; } } // SMA Smoothing (matching legacy UI project logic) this.energyHistory.push(maxAbs); if (this.energyHistory.length > 6) { this.energyHistory.shift(); } const energy = this.energyHistory.reduce((a: number, b: number) => a + b, 0) / this.energyHistory.length; this.currentEnergy = energy; this.energyBarHistory.push(energy); if (this.energyBarHistory.length > this.BAR_LEVELS_SIZE) { this.energyBarHistory.shift(); } // Log when energy crosses threshold if state is close to changing const isSpeech = energy > this.config.energyThreshold; const wasSpeaking = this.metrics.isSpeaking; if (isSpeech !== wasSpeaking) { console.debug(`[AudioEngine] Energy threshold crossed: ${energy.toFixed(6)} > ${this.config.energyThreshold} = ${isSpeech}`); } // 1. Write to ring buffer before any callbacks can transfer the chunk. this.ringBuffer.write(chunk); const endFrame = this.ringBuffer.getCurrentFrame(); // 2. Process VAD on resampled audio // The processor uses its own internal history for lookback, but we pull full audio from ring buffer later. const currentTime = this.ringBuffer.getCurrentTime(); const segments = this.audioProcessor.processAudioData(chunk, currentTime, energy); // 2.5 Update visualization buffer this.updateVisualizationBuffer(chunk); // 2.6 Update metrics const stats = this.audioProcessor.getStats(); const stateInfo = this.audioProcessor.getStateInfo(); this.metrics.currentEnergy = energy; this.metrics.averageEnergy = this.metrics.averageEnergy * 0.95 + energy * 0.05; this.metrics.peakEnergy = Math.max(this.metrics.peakEnergy * 0.99, energy); this.metrics.noiseFloor = stats.noiseFloor ?? 0.01; this.metrics.currentSNR = stats.snr ?? 0; this.metrics.isSpeaking = stateInfo.inSpeech; // Periodic debug log if (Math.random() < 0.05) { console.debug(`[AudioEngine] Metrics: E=${energy.toFixed(6)}, NF=${this.metrics.noiseFloor.toFixed(6)}, SNR=${this.metrics.currentSNR.toFixed(2)}, Speaking=${this.metrics.isSpeaking}`); } // 3. Handle segments if (segments.length > 0) { for (const seg of segments) { // Apply lookback and overlap adjustments matching legacy UI project const lookbackDuration = this.config.lookbackDuration ?? 0.120; const startTime = Math.max(0, seg.startTime - lookbackDuration); // Calculate the sample positions for audio extraction const startFrame = Math.round(startTime * this.targetSampleRate); const endFrame = Math.round(seg.endTime * this.targetSampleRate); // Retrieval with padding (hangover) const speechHangover = this.config.speechHangover ?? 0.16; const paddedEndFrame = Math.min( this.ringBuffer.getCurrentFrame(), endFrame + Math.round(speechHangover * this.targetSampleRate) ); try { const audioData = this.ringBuffer.read(startFrame, paddedEndFrame); // Calculate precise energy metrics for filtering const metrics = this.calculateSegmentEnergyMetrics(audioData, this.targetSampleRate); // Normalize power to 16kHz equivalent const normalizedPowerAt16k = metrics.averagePower * 16000; const normalizedEnergyIntegralAt16k = normalizedPowerAt16k * metrics.duration; // Adaptive threshold calculation let minEnergyIntegralThreshold = this.config.minEnergyIntegral ?? 22; let minEnergyPerSecondThreshold = this.config.minEnergyPerSecond ?? 5; if (this.config.useAdaptiveEnergyThresholds) { const windowSize = this.config.windowSize ?? Math.round(0.080 * this.targetSampleRate); const normalizedNoiseFloor = windowSize > 0 ? this.metrics.noiseFloor / windowSize : 0; const noiseFloorAt16k = normalizedNoiseFloor * 16000; const adaptiveMinEnergyIntegral = noiseFloorAt16k * (this.config.adaptiveEnergyIntegralFactor ?? 25.0); minEnergyIntegralThreshold = Math.max(this.config.minAdaptiveEnergyIntegral ?? 3, adaptiveMinEnergyIntegral); const adaptiveMinEnergyPerSecond = noiseFloorAt16k * (this.config.adaptiveEnergyPerSecondFactor ?? 10.0); minEnergyPerSecondThreshold = Math.max(this.config.minAdaptiveEnergyPerSecond ?? 1, adaptiveMinEnergyPerSecond); } const isValidSpeech = metrics.duration >= (this.config.minSpeechDuration / 1000) && normalizedPowerAt16k >= minEnergyPerSecondThreshold && normalizedEnergyIntegralAt16k >= minEnergyIntegralThreshold; if (isValidSpeech) { const audioSegment: AudioSegment = { startFrame: startFrame, endFrame: paddedEndFrame, duration: metrics.duration, averageEnergy: metrics.averagePower, timestamp: Date.now(), }; this.notifySegment(audioSegment); } else { console.log('[AudioEngine] Filtered out noise segment:', { duration: metrics.duration, power: normalizedPowerAt16k, integral: normalizedEnergyIntegralAt16k }); } } catch (err) { console.warn('[AudioEngine] Failed to extract audio for validation:', err); } } } // 6. Fixed-window streaming (v3 token streaming mode) this.processWindowCallbacks(endFrame); // 7. Notify audio chunk subscribers AFTER internal processing. // Callbacks may transfer the chunk's buffer; do not use `chunk` after this. for (const cb of this.audioChunkCallbacks) { cb(chunk); } // 8. Notify visualization subscribers this.notifyVisualizationUpdate(); } /** * Helper to read audio from ring buffer and calculate energy metrics for a detected segment. */ private calculateSegmentEnergyMetrics(audioData: Float32Array, sampleRate: number): { averagePower: number; duration: number; numSamples: number } { if (!audioData || audioData.length === 0) { return { averagePower: 0, duration: 0, numSamples: 0 }; } const numSamples = audioData.length; let sumOfSquares = 0; for (let i = 0; i < numSamples; i++) { sumOfSquares += audioData[i] * audioData[i]; } const duration = numSamples / sampleRate; const averagePower = numSamples > 0 ? sumOfSquares / numSamples : 0; return { averagePower, duration, numSamples }; } /** * Process fixed-window callbacks for token streaming mode. * Fires when enough audio has accumulated for a new window. */ private processWindowCallbacks(currentFrame: number): void { for (const entry of this.windowCallbacks) { const windowFrames = Math.floor(entry.windowDuration * this.targetSampleRate); const stepFrames = Math.floor(entry.triggerInterval * this.targetSampleRate); // Initialize lastWindowEnd on first call if (entry.lastWindowEnd === 0) { entry.lastWindowEnd = currentFrame; continue; } // Check if we have enough new audio for the next window const framesSinceLastWindow = currentFrame - entry.lastWindowEnd; if (framesSinceLastWindow >= stepFrames) { // Calculate window boundaries const windowEnd = currentFrame; const windowStart = windowEnd - windowFrames; // Ensure we have enough data in the ring buffer const baseOffset = this.ringBuffer.getBaseFrameOffset(); if (windowStart >= baseOffset) { try { const audio = this.ringBuffer.read(windowStart, windowEnd); const startTime = windowStart / this.targetSampleRate; entry.callback(audio, startTime); entry.lastWindowEnd = windowEnd; } catch (e) { console.warn('[AudioEngine] Window read failed:', e); } } } } } private notifySegment(segment: AudioSegment): void { // Track segment for visualization this.recentSegments.push({ startTime: segment.startFrame / this.targetSampleRate, endTime: segment.endFrame / this.targetSampleRate, isProcessed: false }); // Limit segments count if (this.recentSegments.length > this.MAX_SEGMENTS_FOR_VISUALIZATION) { this.recentSegments.shift(); } this.segmentCallbacks.forEach((cb) => cb(segment)); } /** * Get recent segments for visualization. */ getSegmentsForVisualization(): Array<{ startTime: number; endTime: number; isProcessed: boolean }> { const segments = [...this.recentSegments]; // Add pending segment if speech is currently active const vadState = this.audioProcessor.getStateInfo(); if (vadState.inSpeech && vadState.speechStartTime !== null) { segments.push({ startTime: vadState.speechStartTime, endTime: this.ringBuffer.getCurrentTime(), isProcessed: false // Pending }); } return segments; } /** * Mark a segment as processed (for visualization color coding). */ markSegmentProcessed(startTime: number): void { const segment = this.recentSegments.find(s => Math.abs(s.startTime - startTime) < 0.1); if (segment) { segment.isProcessed = true; } } /** * Update the visualization buffer and summary with new audio data. */ private updateVisualizationBuffer(chunk: Float32Array): void { if (!this.visualizationBuffer || !this.visualizationSummary) return; const chunkLength = chunk.length; const bufferLength = this.visualizationBufferSize; // 1. Update raw circular buffer if (chunkLength >= bufferLength) { this.visualizationBuffer.set(chunk.subarray(chunkLength - bufferLength)); this.visualizationBufferPosition = 0; } else { const endPosition = this.visualizationBufferPosition + chunkLength; if (endPosition <= bufferLength) { this.visualizationBuffer.set(chunk, this.visualizationBufferPosition); this.visualizationBufferPosition = endPosition % bufferLength; } else { const firstPart = bufferLength - this.visualizationBufferPosition; this.visualizationBuffer.set(chunk.subarray(0, firstPart), this.visualizationBufferPosition); this.visualizationBuffer.set(chunk.subarray(firstPart), 0); this.visualizationBufferPosition = (chunkLength - firstPart) % bufferLength; } } // 2. Update summary buffer (Low-res min/max pairs) // Each point in VIS_SUMMARY_SIZE represents bufferLength / VIS_SUMMARY_SIZE samples const samplesPerPoint = bufferLength / this.VIS_SUMMARY_SIZE; const numNewPoints = Math.round(chunkLength / samplesPerPoint); for (let i = 0; i < numNewPoints; i++) { const start = Math.floor(i * samplesPerPoint); const end = Math.min(chunkLength, Math.floor((i + 1) * samplesPerPoint)); if (start >= end) continue; let min = chunk[start]; let max = chunk[start]; for (let s = start + 1; s < end; s++) { const v = chunk[s]; if (v < min) min = v; if (v > max) max = v; } // Write to circular summary const targetIdx = this.visualizationSummaryPosition * 2; this.visualizationSummary[targetIdx] = min; this.visualizationSummary[targetIdx + 1] = max; this.visualizationSummaryPosition = (this.visualizationSummaryPosition + 1) % this.VIS_SUMMARY_SIZE; } } /** * Get visualization data subsampled to fit the target width. * Returns min/max pairs for each pixel to preserve peaks in the waveform. * Zero-allocation except for the returned result. * @param targetWidth - The desired number of data points (e.g., canvas width). * @returns Float32Array containing alternating min/max values, length targetWidth * 2. */ getVisualizationData(targetWidth: number): Float32Array { if (!this.visualizationSummary || !targetWidth || targetWidth <= 0) { return new Float32Array(0); } // If targetWidth is close to or less than our summary size, use the summary (MUCH faster) if (targetWidth <= this.VIS_SUMMARY_SIZE) { const subsampledBuffer = new Float32Array(targetWidth * 2); const samplesPerTarget = this.VIS_SUMMARY_SIZE / targetWidth; for (let i = 0; i < targetWidth; i++) { const rangeStart = i * samplesPerTarget; const rangeEnd = (i + 1) * samplesPerTarget; let minVal = 0; let maxVal = 0; let first = true; for (let s = Math.floor(rangeStart); s < Math.floor(rangeEnd); s++) { const idx = ((this.visualizationSummaryPosition + s) % this.VIS_SUMMARY_SIZE) * 2; const vMin = this.visualizationSummary[idx]; const vMax = this.visualizationSummary[idx + 1]; if (first) { minVal = vMin; maxVal = vMax; first = false; } else { if (vMin < minVal) minVal = vMin; if (vMax > maxVal) maxVal = vMax; } } subsampledBuffer[i * 2] = minVal; subsampledBuffer[i * 2 + 1] = maxVal; } return subsampledBuffer; } return this.getVisualizationDataFromRaw(targetWidth); } private getVisualizationDataFromRaw(targetWidth: number): Float32Array { if (!this.visualizationBuffer) return new Float32Array(0); const buffer = this.visualizationBuffer; const bufferLength = this.visualizationBufferSize; const pos = this.visualizationBufferPosition; const samplesPerPoint = bufferLength / targetWidth; const subsampledBuffer = new Float32Array(targetWidth * 2); // Logical index s maps to physical index: // if s < wrapS: pos + s // else: s - wrapS (which is s - (bufferLength - pos) = s + pos - bufferLength) const wrapS = bufferLength - pos; for (let i = 0; i < targetWidth; i++) { const startS = Math.floor(i * samplesPerPoint); const endS = Math.floor((i + 1) * samplesPerPoint); let minVal = 0; let maxVal = 0; let first = true; // Part 1: Before wrap (Logical indices < wrapS) // Physical indices: pos + s const end1 = (endS < wrapS) ? endS : wrapS; if (startS < end1) { let p = pos + startS; const pEnd = pos + end1; if (first && p < pEnd) { const val = buffer[p]; minVal = val; maxVal = val; first = false; p++; } for (; p < pEnd; p++) { const val = buffer[p]; if (val < minVal) minVal = val; else if (val > maxVal) maxVal = val; } } // Part 2: After wrap (Logical indices >= wrapS) // Physical indices: s - wrapS const start2 = (startS > wrapS) ? startS : wrapS; if (start2 < endS) { let p = start2 - wrapS; const pEnd = endS - wrapS; if (first && p < pEnd) { const val = buffer[p]; minVal = val; maxVal = val; first = false; p++; } for (; p < pEnd; p++) { const val = buffer[p]; if (val < minVal) minVal = val; else if (val > maxVal) maxVal = val; } } subsampledBuffer[i * 2] = minVal; subsampledBuffer[i * 2 + 1] = maxVal; } return subsampledBuffer; } /** * Get current audio metrics for UI visualization. */ getMetrics(): AudioMetrics { return { ...this.metrics }; } /** * Get current time in seconds (for waveform time markers). */ getCurrentTime(): number { return this.ringBuffer.getCurrentTime(); } /** * Get the visualization buffer duration in seconds. */ getVisualizationDuration(): number { return VISUALIZATION_BUFFER_DURATION; } /** * Subscribe to visualization updates. * Callback is invoked after each audio chunk is processed. */ onVisualizationUpdate(callback: (data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void): () => void { this.visualizationCallbacks.push(callback); return () => { this.visualizationCallbacks = this.visualizationCallbacks.filter((cb) => cb !== callback); }; } /** * Notify visualization subscribers with updated data. * Throttled to ~30fps to avoid UI stuttering. */ private notifyVisualizationUpdate(): void { const now = performance.now(); if (now - this.lastVisualizationNotifyTime < this.VISUALIZATION_NOTIFY_INTERVAL_MS) { return; } this.lastVisualizationNotifyTime = now; const data = this.getVisualizationData(400); // 400 points is enough for modern displays and saves CPU const bufferEndTime = this.ringBuffer.getCurrentTime(); this.visualizationCallbacks.forEach((cb) => cb(data, this.getMetrics(), bufferEndTime)); } }