// keet-streaming/src/lib/audio/AudioEngine.ts
import { AudioEngine as IAudioEngine, AudioEngineConfig, AudioSegment, IRingBuffer, AudioMetrics } from './types';
import { RingBuffer } from './RingBuffer';
import { AudioSegmentProcessor } from './AudioSegmentProcessor';
import { resampleLinear } from './utils';
/** Duration of the visualization buffer in seconds */
const VISUALIZATION_BUFFER_DURATION = 30;
/**
* AudioEngine implementation for capturing audio, buffering it, and performing VAD.
* Uses AudioSegmentProcessor for robust speech detection (incl. lookback).
*/
export class AudioEngine implements IAudioEngine {
private config: AudioEngineConfig;
private ringBuffer: IRingBuffer;
private audioProcessor: AudioSegmentProcessor; // Replaces EnergyVAD
private deviceId: string | null = null;
private audioContext: AudioContext | null = null;
private mediaStream: MediaStream | null = null;
private workletNode: AudioWorkletNode | null = null;
private sourceNode: MediaStreamAudioSourceNode | null = null;
// AnalyserNode for oscilloscope waveform (native getByteTimeDomainData)
private analyserNode: AnalyserNode | null = null;
private analyserSourceNode: MediaStreamAudioSourceNode | null = null;
private analyserGainNode: GainNode | null = null;
private analyserTimeBuffer: Uint8Array | null = null;
private waveformOut: Float32Array | null = null;
private readonly ANALYSER_FFT_SIZE = 256;
private readonly ANALYSER_SMOOTHING = 0.3; // Low = fast oscilloscope response
// Track device vs target sample rates
private deviceSampleRate: number = 48000;
private targetSampleRate: number = 16000;
private currentEnergy: number = 0;
private segmentCallbacks: Array<(segment: AudioSegment) => void> = [];
// Fixed-window streaming state (v3 token streaming mode)
private windowCallbacks: Array<{
windowDuration: number;
overlapDuration: number;
triggerInterval: number;
callback: (audio: Float32Array, startTime: number) => void;
lastWindowEnd: number; // Frame offset of last window end
}> = [];
// Resampled audio chunk callbacks (for mel worker, etc.)
private audioChunkCallbacks: Array<(chunk: Float32Array) => void> = [];
// SMA buffer for energy calculation
private energyHistory: number[] = [];
// Last N energy values for bar visualizer (oldest first when read)
private energyBarHistory: number[] = [];
private readonly BAR_LEVELS_SIZE = 64;
// Visualization Summary Buffer (Low-Res Min/Max pairs)
private visualizationSummary: Float32Array | null = null;
private visualizationSummaryPosition: number = 0;
private readonly VIS_SUMMARY_SIZE = 2000; // 2000 min/max pairs for 30 seconds = 15ms resolution
// Raw visualization buffer (still kept for higher-res requests if needed, but summary is preferred)
private visualizationBuffer: Float32Array | null = null;
private visualizationBufferPosition: number = 0;
private visualizationBufferSize: number = 0;
// Metrics for UI components
private metrics: AudioMetrics = {
currentEnergy: 0,
averageEnergy: 0,
peakEnergy: 0,
noiseFloor: 0.01,
currentSNR: 0,
isSpeaking: false,
};
// Subscribers for visualization updates
private visualizationCallbacks: Array<(data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void> = [];
private lastVisualizationNotifyTime: number = 0;
private readonly VISUALIZATION_NOTIFY_INTERVAL_MS = 16; // ~60fps for responsive oscilloscope
// Recent segments for visualization (stores timing info only)
private recentSegments: Array<{ startTime: number; endTime: number; isProcessed: boolean }> = [];
private readonly MAX_SEGMENTS_FOR_VISUALIZATION = 50;
constructor(config: Partial<AudioEngineConfig> = {}) {
this.config = {
sampleRate: 16000,
bufferDuration: 120,
energyThreshold: 0.08, // Match legacy UI project 'medium'
minSpeechDuration: 240, // Match legacy UI project
minSilenceDuration: 400, // Match legacy UI project
maxSegmentDuration: 4.8, // Match legacy UI project
// Advanced VAD defaults
lookbackDuration: 0.120,
speechHangover: 0.16,
minEnergyIntegral: 22,
minEnergyPerSecond: 5,
useAdaptiveEnergyThresholds: true,
adaptiveEnergyIntegralFactor: 25.0,
adaptiveEnergyPerSecondFactor: 10.0,
minAdaptiveEnergyIntegral: 3,
minAdaptiveEnergyPerSecond: 1,
maxSilenceWithinSpeech: 0.160,
endingSpeechTolerance: 0.240,
...config,
};
this.deviceId = this.config.deviceId || null;
this.targetSampleRate = this.config.sampleRate;
// RingBuffer operates at TARGET sample rate (16kHz)
this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration);
// Initialize AudioSegmentProcessor
this.audioProcessor = new AudioSegmentProcessor({
sampleRate: this.targetSampleRate,
energyThreshold: this.config.energyThreshold,
minSpeechDuration: this.config.minSpeechDuration,
silenceThreshold: this.config.minSilenceDuration,
maxSegmentDuration: this.config.maxSegmentDuration,
lookbackDuration: this.config.lookbackDuration,
maxSilenceWithinSpeech: this.config.maxSilenceWithinSpeech,
endingSpeechTolerance: this.config.endingSpeechTolerance,
snrThreshold: 3.0,
minSnrThreshold: 1.0,
noiseFloorAdaptationRate: 0.05,
fastAdaptationRate: 0.15,
minBackgroundDuration: 1.0,
energyRiseThreshold: 0.08
});
// Initialize visualization buffer (30 seconds at target sample rate)
this.visualizationBufferSize = Math.round(this.targetSampleRate * VISUALIZATION_BUFFER_DURATION);
this.visualizationBuffer = new Float32Array(this.visualizationBufferSize);
this.visualizationBufferPosition = 0;
// Initialize visualization summary (2000 points for 30s)
this.visualizationSummary = new Float32Array(this.VIS_SUMMARY_SIZE * 2);
this.visualizationSummaryPosition = 0;
console.log('[AudioEngine] Initialized with config:', this.config);
}
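// Usage sketch (illustrative, not part of this module; config values are
// examples, not recommendations):
//
//   const engine = new AudioEngine({ sampleRate: 16000, energyThreshold: 0.05 });
//   await engine.start();  // lazily init()s the graph, then resumes the context
//   // ... subscribe to segments / chunks / visualization here ...
//   engine.stop();         // suspends the AudioContext; buffers are kept
//   engine.reset();        // clears buffers + VAD state for a fresh session
//   engine.dispose();      // releases the mic and closes the AudioContext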
private isWorkletInitialized = false;
async init(): Promise<void> {
// Request microphone permission with optional deviceId
try {
if (this.mediaStream) {
this.mediaStream.getTracks().forEach(t => t.stop());
}
const constraints: MediaStreamConstraints = {
audio: {
deviceId: this.deviceId ? { exact: this.deviceId } : undefined,
channelCount: 1,
echoCancellation: false,
noiseSuppression: false,
autoGainControl: false,
},
};
console.log('[AudioEngine] Requesting microphone:', constraints);
this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
console.log('[AudioEngine] Microphone stream acquired:', this.mediaStream.id);
} catch (err) {
console.error('[AudioEngine] Failed to get media stream:', err);
throw err;
}
const track = this.mediaStream!.getAudioTracks()[0];
const trackSettings = track?.getSettings?.();
// Device sample rate (what the mic gives us)
this.deviceSampleRate = trackSettings?.sampleRate ?? 48000;
console.log('[AudioEngine] Device sample rate:', this.deviceSampleRate, '-> Target:', this.targetSampleRate);
if (this.audioContext && this.audioContext.sampleRate !== this.deviceSampleRate) {
await this.audioContext.close();
this.audioContext = null;
}
if (!this.audioContext) {
this.audioContext = new AudioContext({
sampleRate: this.deviceSampleRate,
latencyHint: 'interactive',
});
console.log('[AudioEngine] Created AudioContext:', this.audioContext.state, 'sampleRate:', this.audioContext.sampleRate);
}
// Re-initialize components with correct rates
this.ringBuffer = new RingBuffer(this.targetSampleRate, this.config.bufferDuration);
// Re-create the processor with the same options as the constructor,
// so the advanced VAD settings are not silently dropped on re-init
this.audioProcessor = new AudioSegmentProcessor({
sampleRate: this.targetSampleRate,
energyThreshold: this.config.energyThreshold,
minSpeechDuration: this.config.minSpeechDuration,
silenceThreshold: this.config.minSilenceDuration,
maxSegmentDuration: this.config.maxSegmentDuration,
lookbackDuration: this.config.lookbackDuration,
maxSilenceWithinSpeech: this.config.maxSilenceWithinSpeech,
endingSpeechTolerance: this.config.endingSpeechTolerance,
snrThreshold: 3.0, minSnrThreshold: 1.0,
noiseFloorAdaptationRate: 0.05, fastAdaptationRate: 0.15,
minBackgroundDuration: 1.0, energyRiseThreshold: 0.08
});
if (!this.isWorkletInitialized) {
const windowDuration = 0.080;
const processorCode = `
class CaptureProcessor extends AudioWorkletProcessor {
constructor(options) {
super(options);
const opts = options?.processorOptions || {};
this.inputSampleRate = opts.inputSampleRate || 16000;
this.targetSampleRate = opts.targetSampleRate || this.inputSampleRate;
this.ratio = this.inputSampleRate / this.targetSampleRate;
this.bufferSize = Math.round(${windowDuration} * this.inputSampleRate);
this.buffer = new Float32Array(this.bufferSize);
this.index = 0;
this._lastLog = 0;
}
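// Copy (or resample) the filled buffer and post it to the main thread,
// transferring ownership of the output buffer to avoid a copy.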
_emitChunk() {
let out;
let maxAbs = 0;
if (this.targetSampleRate === this.inputSampleRate) {
out = new Float32Array(this.bufferSize);
for (let i = 0; i < this.bufferSize; i++) {
const v = this.buffer[i];
out[i] = v;
const a = v < 0 ? -v : v;
if (a > maxAbs) maxAbs = a;
}
} else {
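// Linear-interpolation resample from inputSampleRate down to targetSampleRate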
const outLength = Math.floor(this.bufferSize / this.ratio);
out = new Float32Array(outLength);
for (let i = 0; i < outLength; i++) {
const srcIndex = i * this.ratio;
const srcIndexFloor = Math.floor(srcIndex);
const srcIndexCeil = Math.min(srcIndexFloor + 1, this.bufferSize - 1);
const t = srcIndex - srcIndexFloor;
const v = this.buffer[srcIndexFloor] * (1 - t) + this.buffer[srcIndexCeil] * t;
out[i] = v;
const a = v < 0 ? -v : v;
if (a > maxAbs) maxAbs = a;
}
}
this.port.postMessage(
{ type: 'audio', samples: out, sampleRate: this.targetSampleRate, maxAbs },
[out.buffer]
);
}
process(inputs) {
const input = inputs[0];
if (!input || !input[0]) return true;
const channelData = input[0];
// Buffer the data
for (let i = 0; i < channelData.length; i++) {
this.buffer[this.index++] = channelData[i];
if (this.index >= this.bufferSize) {
this._emitChunk();
this.index = 0;
// Debug log every ~5 seconds
const now = Date.now();
if (now - this._lastLog > 5000) {
this.port.postMessage({ type: 'log', message: '[AudioWorklet] Active' });
this._lastLog = now;
}
}
}
return true;
}
}
registerProcessor('capture-processor', CaptureProcessor);
`;
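// Load the inline processor source as a module via a Blob URL, so no separate
// worklet asset has to be served or wired into the bundler.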
const blob = new Blob([processorCode], { type: 'application/javascript' });
const url = URL.createObjectURL(blob);
try {
await this.audioContext.audioWorklet.addModule(url);
this.isWorkletInitialized = true;
console.log('[AudioEngine] AudioWorklet module loaded');
} catch (err) {
console.error('[AudioEngine] Failed to load worklet:', err);
if (err instanceof Error && err.name === 'InvalidStateError') {
// Ignore if already registered
this.isWorkletInitialized = true;
}
}
}
// Re-create the worklet node; disconnect any previous instance first
if (this.workletNode) this.workletNode.disconnect();
this.workletNode = new AudioWorkletNode(this.audioContext, 'capture-processor', {
processorOptions: { inputSampleRate: this.deviceSampleRate, targetSampleRate: this.targetSampleRate },
});
this.workletNode.port.onmessage = (event: MessageEvent<any>) => {
if (event.data?.type === 'audio' && event.data.samples instanceof Float32Array) {
this.handleAudioChunk(event.data.samples, event.data.maxAbs, event.data.sampleRate);
} else if (event.data instanceof Float32Array) {
this.handleAudioChunk(event.data, undefined, this.deviceSampleRate);
} else if (event.data?.type === 'log') {
console.log(event.data.message);
}
};
this.workletNode.onprocessorerror = (e) => {
console.error('[AudioEngine] Worklet processor error:', e);
};
// Reconnect source node
this.sourceNode?.disconnect();
this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream);
this.sourceNode.connect(this.workletNode);
// AnalyserNode branch for lightweight preview bars (native FFT, no mel worker)
this.disposeAnalyser();
this.analyserSourceNode = this.audioContext.createMediaStreamSource(this.mediaStream);
this.analyserNode = this.audioContext.createAnalyser();
this.analyserNode.fftSize = this.ANALYSER_FFT_SIZE;
this.analyserNode.smoothingTimeConstant = this.ANALYSER_SMOOTHING;
this.analyserTimeBuffer = new Uint8Array(this.analyserNode.fftSize);
this.waveformOut = new Float32Array(this.analyserNode.fftSize);
this.analyserGainNode = this.audioContext.createGain();
this.analyserGainNode.gain.value = 0;
this.analyserSourceNode.connect(this.analyserNode);
this.analyserNode.connect(this.analyserGainNode);
this.analyserGainNode.connect(this.audioContext.destination);
// Keep graph alive
this.workletNode.connect(this.audioContext.destination);
console.log('[AudioEngine] Graph connected: Source -> Worklet, AnalyserNode for oscilloscope');
}
async start(): Promise<void> {
if (!this.mediaStream || !this.audioContext || !this.workletNode) {
await this.init();
}
if (this.audioContext?.state === 'suspended') {
await this.audioContext.resume();
}
}
stop(): void {
if (this.audioContext?.state === 'running') {
this.audioContext.suspend();
}
}
/**
* Reset buffers and VAD state for a new session while keeping the audio graph.
* Aligns visualization + segment timebase to 0, matching legacy UI project behavior.
*/
reset(): void {
// Reset audio/VAD state
this.ringBuffer.reset();
this.audioProcessor.reset();
this.currentEnergy = 0;
// Reset metrics
this.metrics = {
currentEnergy: 0,
averageEnergy: 0,
peakEnergy: 0,
noiseFloor: 0.01,
currentSNR: 0,
isSpeaking: false,
};
// Clear segment history used by the visualizer
this.recentSegments = [];
this.energyBarHistory = [];
// Reset visualization buffer
if (this.visualizationBuffer) {
this.visualizationBuffer.fill(0);
}
this.visualizationBufferPosition = 0;
// Reset windowed streaming cursors
for (const entry of this.windowCallbacks) {
entry.lastWindowEnd = 0;
}
// Push a blank update so UI clears stale waveform/segments
this.notifyVisualizationUpdate();
}
getCurrentEnergy(): number {
return this.currentEnergy;
}
/** Oscilloscope waveform via AnalyserNode.getByteTimeDomainData (native, fast), values -1..1. Falls back to 0..1 energy-bar history when the analyser is unavailable. */
getBarLevels(): Float32Array {
if (this.analyserNode && this.analyserTimeBuffer && this.waveformOut) {
(this.analyserNode as { getByteTimeDomainData(array: Uint8Array): void }).getByteTimeDomainData(this.analyserTimeBuffer);
for (let i = 0; i < this.analyserTimeBuffer.length; i++) {
this.waveformOut[i] = (this.analyserTimeBuffer[i] - 128) / 128; // 0..255 -> -1..1
}
return this.waveformOut;
}
const out = new Float32Array(this.BAR_LEVELS_SIZE);
const h = this.energyBarHistory;
const start = h.length <= this.BAR_LEVELS_SIZE ? 0 : h.length - this.BAR_LEVELS_SIZE;
for (let i = 0; i < this.BAR_LEVELS_SIZE; i++) {
const idx = start + i;
out[i] = idx < h.length ? Math.min(1, Math.max(0, h[idx])) : 0;
}
return out;
}
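// Rendering sketch (illustrative; the canvas and rAF loop belong to the
// consuming component, not to this class):
//
//   const ctx = canvas.getContext('2d')!;
//   const draw = () => {
//     const wave = engine.getBarLevels(); // time-domain samples in -1..1
//     ctx.clearRect(0, 0, canvas.width, canvas.height);
//     ctx.beginPath();
//     for (let i = 0; i < wave.length; i++) {
//       const x = (i / (wave.length - 1)) * canvas.width;
//       const y = (0.5 - wave[i] * 0.5) * canvas.height;
//       i === 0 ? ctx.moveTo(x, y) : ctx.lineTo(x, y);
//     }
//     ctx.stroke();
//     requestAnimationFrame(draw);
//   };
//   draw();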
getSignalMetrics(): { noiseFloor: number; snr: number; threshold: number; snrThreshold: number } {
const stats = this.audioProcessor.getStats();
return {
noiseFloor: stats.noiseFloor ?? 0.0001,
snr: stats.snr ?? 0,
threshold: this.config.energyThreshold,
snrThreshold: stats.snrThreshold ?? 3.0
};
}
isSpeechActive(): boolean {
return this.audioProcessor.getStateInfo().inSpeech;
}
getRingBuffer(): IRingBuffer {
return this.ringBuffer;
}
onSpeechSegment(callback: (segment: AudioSegment) => void): () => void {
this.segmentCallbacks.push(callback);
return () => {
this.segmentCallbacks = this.segmentCallbacks.filter((cb) => cb !== callback);
};
}
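// Subscription sketch (illustrative): a segment carries frame offsets into
// the ring buffer, so its audio can be pulled out on demand:
//
//   const unsubscribe = engine.onSpeechSegment((seg) => {
//     const audio = engine.getRingBuffer().read(seg.startFrame, seg.endFrame);
//     console.log(`speech: ${seg.duration.toFixed(2)}s (${audio.length} samples)`);
//   });
//   // later: unsubscribe();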
/**
* Subscribe to fixed-window chunks for token streaming mode.
* Fires every triggerInterval seconds with windowDuration of audio.
*/
onWindowChunk(
windowDuration: number,
overlapDuration: number,
triggerInterval: number,
callback: (audio: Float32Array, startTime: number) => void
): () => void {
const entry = {
windowDuration,
overlapDuration,
triggerInterval,
callback,
lastWindowEnd: 0, // Will be set on first chunk
};
this.windowCallbacks.push(entry);
return () => {
this.windowCallbacks = this.windowCallbacks.filter((e) => e !== entry);
};
}
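// Usage sketch (illustrative values): with an 8 s window re-triggered every
// 1 s, consecutive windows share 7 s of audio:
//
//   const stop = engine.onWindowChunk(8.0, 7.0, 1.0, (audio, startTime) => {
//     // `audio` holds windowDuration seconds of 16 kHz samples starting at
//     // `startTime` seconds on the ring-buffer timebase
//   });
//   // later: stop();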
/**
* Subscribe to every resampled audio chunk (16kHz).
* Used to feed the continuous mel producer worker.
* Returns an unsubscribe function.
*/
onAudioChunk(callback: (chunk: Float32Array) => void): () => void {
this.audioChunkCallbacks.push(callback);
return () => {
this.audioChunkCallbacks = this.audioChunkCallbacks.filter((cb) => cb !== callback);
};
}
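// Feeding a worker (illustrative; `melWorker` is an assumed consumer, not part
// of this module). Chunks arrive after internal processing has finished, so
// transferring the underlying buffer is safe:
//
//   const stop = engine.onAudioChunk((chunk) => {
//     melWorker.postMessage({ type: 'audio', samples: chunk }, [chunk.buffer]);
//   });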
updateConfig(config: Partial<AudioEngineConfig>): void {
this.config = { ...this.config, ...config };
// Update processor config
if (config.energyThreshold !== undefined) this.audioProcessor.setThreshold(config.energyThreshold);
if (config.minSpeechDuration !== undefined) this.audioProcessor.setMinSpeechDuration(config.minSpeechDuration);
if (config.minSilenceDuration !== undefined) this.audioProcessor.setSilenceLength(config.minSilenceDuration);
if (config.maxSegmentDuration !== undefined) this.audioProcessor.setMaxSegmentDuration(config.maxSegmentDuration);
// Advanced VAD updates
if (config.lookbackDuration !== undefined) this.audioProcessor.setLookbackDuration(config.lookbackDuration);
if (config.overlapDuration !== undefined) this.audioProcessor.setOverlapDuration(config.overlapDuration);
if (config.maxSilenceWithinSpeech !== undefined) this.audioProcessor.setMaxSilenceWithinSpeech(config.maxSilenceWithinSpeech);
if (config.endingSpeechTolerance !== undefined) this.audioProcessor.setEndingSpeechTolerance(config.endingSpeechTolerance);
if (config.snrThreshold !== undefined) this.audioProcessor.setSnrThreshold(config.snrThreshold);
if (config.minSnrThreshold !== undefined) this.audioProcessor.setMinSnrThreshold(config.minSnrThreshold);
}
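// Runtime tuning sketch (illustrative): only the keys present in the partial
// config are forwarded to the processor; everything else keeps its value:
//
//   engine.updateConfig({ energyThreshold: 0.05, minSilenceDuration: 600 });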
async setDevice(deviceId: string): Promise<void> {
this.deviceId = deviceId;
await this.init();
// Reconnect if running
if (this.audioContext && this.workletNode) {
this.sourceNode?.disconnect();
this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream!);
this.sourceNode.connect(this.workletNode);
}
}
private disposeAnalyser(): void {
this.analyserSourceNode?.disconnect();
this.analyserNode?.disconnect();
this.analyserGainNode?.disconnect();
this.analyserSourceNode = null;
this.analyserNode = null;
this.analyserGainNode = null;
this.analyserTimeBuffer = null;
this.waveformOut = null;
}
dispose(): void {
this.stop();
this.disposeAnalyser();
this.mediaStream?.getTracks().forEach(track => track.stop());
this.audioContext?.close();
this.audioContext = null;
this.mediaStream = null;
this.workletNode = null;
this.sourceNode = null;
}
private handleAudioChunk(rawChunk: Float32Array, precomputedMaxAbs?: number, chunkSampleRate?: number): void {
// 0. Ensure chunk is at target sample rate (resample only if needed)
const sampleRate = chunkSampleRate ?? this.targetSampleRate;
const needsResample = sampleRate !== this.targetSampleRate;
const chunk = needsResample
? resampleLinear(rawChunk, sampleRate, this.targetSampleRate)
: rawChunk;
// Calculate chunk energy (Peak Amplitude) + SMA for VAD compatibility
let maxAbs = (!needsResample && precomputedMaxAbs !== undefined) ? precomputedMaxAbs : 0;
if (precomputedMaxAbs === undefined || needsResample) {
for (let i = 0; i < chunk.length; i++) {
const abs = Math.abs(chunk[i]);
if (abs > maxAbs) maxAbs = abs;
}
}
// SMA Smoothing (matching legacy UI project logic)
this.energyHistory.push(maxAbs);
if (this.energyHistory.length > 6) {
this.energyHistory.shift();
}
const energy = this.energyHistory.reduce((a: number, b: number) => a + b, 0) / this.energyHistory.length;
this.currentEnergy = energy;
this.energyBarHistory.push(energy);
if (this.energyBarHistory.length > this.BAR_LEVELS_SIZE) {
this.energyBarHistory.shift();
}
// Debug: log when the smoothed energy crosses the threshold and the reported speaking state is about to flip
const isSpeech = energy > this.config.energyThreshold;
const wasSpeaking = this.metrics.isSpeaking;
if (isSpeech !== wasSpeaking) {
console.debug(`[AudioEngine] Energy threshold crossed: ${energy.toFixed(6)} > ${this.config.energyThreshold} = ${isSpeech}`);
}
// 1. Write to ring buffer before any callbacks can transfer the chunk.
this.ringBuffer.write(chunk);
const endFrame = this.ringBuffer.getCurrentFrame();
// 2. Process VAD on resampled audio
// The processor uses its own internal history for lookback, but we pull full audio from ring buffer later.
const currentTime = this.ringBuffer.getCurrentTime();
const segments = this.audioProcessor.processAudioData(chunk, currentTime, energy);
// 2.5 Update visualization buffer
this.updateVisualizationBuffer(chunk);
// 2.6 Update metrics
const stats = this.audioProcessor.getStats();
const stateInfo = this.audioProcessor.getStateInfo();
this.metrics.currentEnergy = energy;
this.metrics.averageEnergy = this.metrics.averageEnergy * 0.95 + energy * 0.05;
this.metrics.peakEnergy = Math.max(this.metrics.peakEnergy * 0.99, energy);
this.metrics.noiseFloor = stats.noiseFloor ?? 0.01;
this.metrics.currentSNR = stats.snr ?? 0;
this.metrics.isSpeaking = stateInfo.inSpeech;
// Sampled debug log (~5% of chunks)
if (Math.random() < 0.05) {
console.debug(`[AudioEngine] Metrics: E=${energy.toFixed(6)}, NF=${this.metrics.noiseFloor.toFixed(6)}, SNR=${this.metrics.currentSNR.toFixed(2)}, Speaking=${this.metrics.isSpeaking}`);
}
// 3. Handle segments
if (segments.length > 0) {
for (const seg of segments) {
// Apply lookback and overlap adjustments matching legacy UI project
const lookbackDuration = this.config.lookbackDuration ?? 0.120;
const startTime = Math.max(0, seg.startTime - lookbackDuration);
// Calculate the sample positions for audio extraction
const startFrame = Math.round(startTime * this.targetSampleRate);
const segEndFrame = Math.round(seg.endTime * this.targetSampleRate); // avoid shadowing the outer endFrame
// Retrieval with padding (hangover)
const speechHangover = this.config.speechHangover ?? 0.16;
const paddedEndFrame = Math.min(
this.ringBuffer.getCurrentFrame(),
segEndFrame + Math.round(speechHangover * this.targetSampleRate)
);
try {
const audioData = this.ringBuffer.read(startFrame, paddedEndFrame);
// Calculate precise energy metrics for filtering
const metrics = this.calculateSegmentEnergyMetrics(audioData, this.targetSampleRate);
// Normalize power to 16kHz equivalent
const normalizedPowerAt16k = metrics.averagePower * 16000;
const normalizedEnergyIntegralAt16k = normalizedPowerAt16k * metrics.duration;
// Adaptive threshold calculation
let minEnergyIntegralThreshold = this.config.minEnergyIntegral ?? 22;
let minEnergyPerSecondThreshold = this.config.minEnergyPerSecond ?? 5;
if (this.config.useAdaptiveEnergyThresholds) {
const windowSize = this.config.windowSize ?? Math.round(0.080 * this.targetSampleRate);
const normalizedNoiseFloor = windowSize > 0 ? this.metrics.noiseFloor / windowSize : 0;
const noiseFloorAt16k = normalizedNoiseFloor * 16000;
const adaptiveMinEnergyIntegral = noiseFloorAt16k * (this.config.adaptiveEnergyIntegralFactor ?? 25.0);
minEnergyIntegralThreshold = Math.max(this.config.minAdaptiveEnergyIntegral ?? 3, adaptiveMinEnergyIntegral);
const adaptiveMinEnergyPerSecond = noiseFloorAt16k * (this.config.adaptiveEnergyPerSecondFactor ?? 10.0);
minEnergyPerSecondThreshold = Math.max(this.config.minAdaptiveEnergyPerSecond ?? 1, adaptiveMinEnergyPerSecond);
}
const isValidSpeech =
metrics.duration >= (this.config.minSpeechDuration / 1000) &&
normalizedPowerAt16k >= minEnergyPerSecondThreshold &&
normalizedEnergyIntegralAt16k >= minEnergyIntegralThreshold;
if (isValidSpeech) {
const audioSegment: AudioSegment = {
startFrame: startFrame,
endFrame: paddedEndFrame,
duration: metrics.duration,
averageEnergy: metrics.averagePower,
timestamp: Date.now(),
};
this.notifySegment(audioSegment);
} else {
console.log('[AudioEngine] Filtered out noise segment:', {
duration: metrics.duration,
power: normalizedPowerAt16k,
integral: normalizedEnergyIntegralAt16k
});
}
} catch (err) {
console.warn('[AudioEngine] Failed to extract audio for validation:', err);
}
}
}
// 4. Fixed-window streaming (v3 token streaming mode)
this.processWindowCallbacks(endFrame);
// 5. Notify audio chunk subscribers AFTER internal processing.
// Callbacks may transfer the chunk's buffer; do not use `chunk` after this.
for (const cb of this.audioChunkCallbacks) {
cb(chunk);
}
// 6. Notify visualization subscribers
this.notifyVisualizationUpdate();
}
/**
* Helper to read audio from ring buffer and calculate energy metrics for a detected segment.
*/
private calculateSegmentEnergyMetrics(audioData: Float32Array, sampleRate: number): { averagePower: number; duration: number; numSamples: number } {
if (!audioData || audioData.length === 0) {
return { averagePower: 0, duration: 0, numSamples: 0 };
}
const numSamples = audioData.length;
let sumOfSquares = 0;
for (let i = 0; i < numSamples; i++) {
sumOfSquares += audioData[i] * audioData[i];
}
const duration = numSamples / sampleRate;
const averagePower = numSamples > 0 ? sumOfSquares / numSamples : 0;
return {
averagePower,
duration,
numSamples
};
}
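// Worked example of the 16 kHz normalization applied in handleAudioChunk()
// (numbers are illustrative): a 1.0 s segment with averagePower (mean of
// squares) 0.002 yields normalizedPowerAt16k = 0.002 * 16000 = 32 and an
// energy integral of 32 * 1.0 = 32, which clears the static defaults
// (minEnergyPerSecond = 5, minEnergyIntegral = 22), so it counts as speech.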
/**
* Process fixed-window callbacks for token streaming mode.
* Fires when enough audio has accumulated for a new window.
*/
private processWindowCallbacks(currentFrame: number): void {
for (const entry of this.windowCallbacks) {
const windowFrames = Math.floor(entry.windowDuration * this.targetSampleRate);
const stepFrames = Math.floor(entry.triggerInterval * this.targetSampleRate);
// Initialize lastWindowEnd on first call
if (entry.lastWindowEnd === 0) {
entry.lastWindowEnd = currentFrame;
continue;
}
// Check if we have enough new audio for the next window
const framesSinceLastWindow = currentFrame - entry.lastWindowEnd;
if (framesSinceLastWindow >= stepFrames) {
// Calculate window boundaries
const windowEnd = currentFrame;
const windowStart = windowEnd - windowFrames;
// Ensure we have enough data in the ring buffer
const baseOffset = this.ringBuffer.getBaseFrameOffset();
if (windowStart >= baseOffset) {
try {
const audio = this.ringBuffer.read(windowStart, windowEnd);
const startTime = windowStart / this.targetSampleRate;
entry.callback(audio, startTime);
entry.lastWindowEnd = windowEnd;
} catch (e) {
console.warn('[AudioEngine] Window read failed:', e);
}
}
}
}
}
private notifySegment(segment: AudioSegment): void {
// Track segment for visualization
this.recentSegments.push({
startTime: segment.startFrame / this.targetSampleRate,
endTime: segment.endFrame / this.targetSampleRate,
isProcessed: false
});
// Limit segments count
if (this.recentSegments.length > this.MAX_SEGMENTS_FOR_VISUALIZATION) {
this.recentSegments.shift();
}
this.segmentCallbacks.forEach((cb) => cb(segment));
}
/**
* Get recent segments for visualization.
*/
getSegmentsForVisualization(): Array<{ startTime: number; endTime: number; isProcessed: boolean }> {
const segments = [...this.recentSegments];
// Add pending segment if speech is currently active
const vadState = this.audioProcessor.getStateInfo();
if (vadState.inSpeech && vadState.speechStartTime !== null) {
segments.push({
startTime: vadState.speechStartTime,
endTime: this.ringBuffer.getCurrentTime(),
isProcessed: false // Pending
});
}
return segments;
}
/**
* Mark a segment as processed (for visualization color coding).
*/
markSegmentProcessed(startTime: number): void {
const segment = this.recentSegments.find(s => Math.abs(s.startTime - startTime) < 0.1);
if (segment) {
segment.isProcessed = true;
}
}
/**
* Update the visualization buffer and summary with new audio data.
*/
private updateVisualizationBuffer(chunk: Float32Array): void {
if (!this.visualizationBuffer || !this.visualizationSummary) return;
const chunkLength = chunk.length;
const bufferLength = this.visualizationBufferSize;
// 1. Update raw circular buffer
if (chunkLength >= bufferLength) {
this.visualizationBuffer.set(chunk.subarray(chunkLength - bufferLength));
this.visualizationBufferPosition = 0;
} else {
const endPosition = this.visualizationBufferPosition + chunkLength;
if (endPosition <= bufferLength) {
this.visualizationBuffer.set(chunk, this.visualizationBufferPosition);
this.visualizationBufferPosition = endPosition % bufferLength;
} else {
const firstPart = bufferLength - this.visualizationBufferPosition;
this.visualizationBuffer.set(chunk.subarray(0, firstPart), this.visualizationBufferPosition);
this.visualizationBuffer.set(chunk.subarray(firstPart), 0);
this.visualizationBufferPosition = (chunkLength - firstPart) % bufferLength;
}
}
// 2. Update summary buffer (Low-res min/max pairs)
// Each point in VIS_SUMMARY_SIZE represents bufferLength / VIS_SUMMARY_SIZE samples
const samplesPerPoint = bufferLength / this.VIS_SUMMARY_SIZE;
const numNewPoints = Math.round(chunkLength / samplesPerPoint);
for (let i = 0; i < numNewPoints; i++) {
const start = Math.floor(i * samplesPerPoint);
const end = Math.min(chunkLength, Math.floor((i + 1) * samplesPerPoint));
if (start >= end) continue;
let min = chunk[start];
let max = chunk[start];
for (let s = start + 1; s < end; s++) {
const v = chunk[s];
if (v < min) min = v;
if (v > max) max = v;
}
// Write to circular summary
const targetIdx = this.visualizationSummaryPosition * 2;
this.visualizationSummary[targetIdx] = min;
this.visualizationSummary[targetIdx + 1] = max;
this.visualizationSummaryPosition = (this.visualizationSummaryPosition + 1) % this.VIS_SUMMARY_SIZE;
}
}
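// With the defaults (30 s at 16 kHz = 480,000 samples, VIS_SUMMARY_SIZE = 2000)
// each summary point covers 240 samples, i.e. one min/max pair per 15 ms,
// matching the resolution noted on VIS_SUMMARY_SIZE above.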
/**
* Get visualization data subsampled to fit the target width.
* Returns min/max pairs for each pixel to preserve peaks in the waveform.
* Zero-allocation except for the returned result.
* @param targetWidth - The desired number of data points (e.g., canvas width).
* @returns Float32Array containing alternating min/max values, length targetWidth * 2.
*/
getVisualizationData(targetWidth: number): Float32Array {
if (!this.visualizationSummary || !targetWidth || targetWidth <= 0) {
return new Float32Array(0);
}
// If targetWidth is at most the summary size, read from the summary (much faster)
if (targetWidth <= this.VIS_SUMMARY_SIZE) {
const subsampledBuffer = new Float32Array(targetWidth * 2);
const samplesPerTarget = this.VIS_SUMMARY_SIZE / targetWidth;
for (let i = 0; i < targetWidth; i++) {
const rangeStart = i * samplesPerTarget;
const rangeEnd = (i + 1) * samplesPerTarget;
let minVal = 0;
let maxVal = 0;
let first = true;
for (let s = Math.floor(rangeStart); s < Math.floor(rangeEnd); s++) {
const idx = ((this.visualizationSummaryPosition + s) % this.VIS_SUMMARY_SIZE) * 2;
const vMin = this.visualizationSummary[idx];
const vMax = this.visualizationSummary[idx + 1];
if (first) {
minVal = vMin;
maxVal = vMax;
first = false;
} else {
if (vMin < minVal) minVal = vMin;
if (vMax > maxVal) maxVal = vMax;
}
}
subsampledBuffer[i * 2] = minVal;
subsampledBuffer[i * 2 + 1] = maxVal;
}
return subsampledBuffer;
}
return this.getVisualizationDataFromRaw(targetWidth);
}
private getVisualizationDataFromRaw(targetWidth: number): Float32Array {
if (!this.visualizationBuffer) return new Float32Array(0);
const buffer = this.visualizationBuffer;
const bufferLength = this.visualizationBufferSize;
const pos = this.visualizationBufferPosition;
const samplesPerPoint = bufferLength / targetWidth;
const subsampledBuffer = new Float32Array(targetWidth * 2);
// Logical index s maps to physical index:
// if s < wrapS: pos + s
// else: s - wrapS (which is s - (bufferLength - pos) = s + pos - bufferLength)
const wrapS = bufferLength - pos;
for (let i = 0; i < targetWidth; i++) {
const startS = Math.floor(i * samplesPerPoint);
const endS = Math.floor((i + 1) * samplesPerPoint);
let minVal = 0;
let maxVal = 0;
let first = true;
// Part 1: Before wrap (Logical indices < wrapS)
// Physical indices: pos + s
const end1 = (endS < wrapS) ? endS : wrapS;
if (startS < end1) {
let p = pos + startS;
const pEnd = pos + end1;
if (first && p < pEnd) {
const val = buffer[p];
minVal = val;
maxVal = val;
first = false;
p++;
}
for (; p < pEnd; p++) {
const val = buffer[p];
if (val < minVal) minVal = val;
else if (val > maxVal) maxVal = val;
}
}
// Part 2: After wrap (Logical indices >= wrapS)
// Physical indices: s - wrapS
const start2 = (startS > wrapS) ? startS : wrapS;
if (start2 < endS) {
let p = start2 - wrapS;
const pEnd = endS - wrapS;
if (first && p < pEnd) {
const val = buffer[p];
minVal = val;
maxVal = val;
first = false;
p++;
}
for (; p < pEnd; p++) {
const val = buffer[p];
if (val < minVal) minVal = val;
else if (val > maxVal) maxVal = val;
}
}
subsampledBuffer[i * 2] = minVal;
subsampledBuffer[i * 2 + 1] = maxVal;
}
return subsampledBuffer;
}
/**
* Get current audio metrics for UI visualization.
*/
getMetrics(): AudioMetrics {
return { ...this.metrics };
}
/**
* Get current time in seconds (for waveform time markers).
*/
getCurrentTime(): number {
return this.ringBuffer.getCurrentTime();
}
/**
* Get the visualization buffer duration in seconds.
*/
getVisualizationDuration(): number {
return VISUALIZATION_BUFFER_DURATION;
}
/**
* Subscribe to visualization updates.
* Callback is invoked after each audio chunk is processed.
*/
onVisualizationUpdate(callback: (data: Float32Array, metrics: AudioMetrics, bufferEndTime: number) => void): () => void {
this.visualizationCallbacks.push(callback);
return () => {
this.visualizationCallbacks = this.visualizationCallbacks.filter((cb) => cb !== callback);
};
}
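// Subscription sketch (illustrative; `ctx` is a 2D canvas context owned by the
// consumer). `data` holds alternating min/max pairs from getVisualizationData():
//
//   const stop = engine.onVisualizationUpdate((data, metrics, endTime) => {
//     ctx.clearRect(0, 0, ctx.canvas.width, ctx.canvas.height);
//     const points = data.length / 2;
//     for (let i = 0; i < points; i++) {
//       const min = data[i * 2], max = data[i * 2 + 1];
//       const x = (i / points) * ctx.canvas.width;
//       const yTop = (0.5 - max * 0.5) * ctx.canvas.height;
//       const yBottom = (0.5 - min * 0.5) * ctx.canvas.height;
//       ctx.fillRect(x, yTop, 1, Math.max(1, yBottom - yTop));
//     }
//   });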
/**
* Notify visualization subscribers with updated data.
* Throttled to ~60fps (VISUALIZATION_NOTIFY_INTERVAL_MS) to avoid UI stuttering.
*/
private notifyVisualizationUpdate(): void {
const now = performance.now();
if (now - this.lastVisualizationNotifyTime < this.VISUALIZATION_NOTIFY_INTERVAL_MS) {
return;
}
this.lastVisualizationNotifyTime = now;
const data = this.getVisualizationData(400); // 400 points is enough for modern displays and saves CPU
const bufferEndTime = this.ringBuffer.getCurrentTime();
this.visualizationCallbacks.forEach((cb) => cb(data, this.getMetrics(), bufferEndTime));
}
}