keet-streaming / src /lib /vad /VADRingBuffer.ts
ysdede's picture
feat(space): migrate Hugging Face Space to keet SolidJS app
b8cc2bf
/**
* VADRingBuffer stores per-frame speech probabilities in a circular buffer,
* synchronized with the audio RingBuffer via global frame offsets.
*
* Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
* VAD probability at index i corresponds to audio frames [i * hopSize, (i+1) * hopSize).
*
* Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
*/
export class VADRingBuffer {
  /** Number of audio frames covered by one VAD probability entry. */
  readonly hopSize: number;
  /** Audio sample rate in Hz, used to convert entry counts to seconds. */
  readonly sampleRate: number;
  /** Capacity of the circular buffer, in VAD entries. */
  private readonly maxEntries: number;
  /** Circular storage: global entry `i` lives in slot `i % maxEntries`. */
  private readonly buffer: Float32Array;
  /** Global index of the next entry to be written (total entries written since the last reset). */
  private globalIndex = 0;

  /**
   * @param sampleRate - Audio sample rate in Hz (must be positive and finite)
   * @param durationSeconds - Maximum buffer duration in seconds (must be positive and finite)
   * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
   * @throws RangeError if any argument is not a positive finite number
   */
  constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
    // Reject values that would otherwise yield a zero/NaN/Infinite capacity:
    // a zero capacity makes `index % maxEntries` NaN, so writes are silently dropped.
    if (!(Number.isFinite(sampleRate) && sampleRate > 0)) {
      throw new RangeError(`sampleRate must be a positive finite number, got ${sampleRate}`);
    }
    if (!(Number.isFinite(durationSeconds) && durationSeconds > 0)) {
      throw new RangeError(`durationSeconds must be a positive finite number, got ${durationSeconds}`);
    }
    if (!(Number.isFinite(hopSize) && hopSize > 0)) {
      throw new RangeError(`hopSize must be a positive finite number, got ${hopSize}`);
    }

    this.sampleRate = sampleRate;
    this.hopSize = hopSize;
    // Round up so the buffer covers at least `durationSeconds`; never less than one slot.
    this.maxEntries = Math.max(1, Math.ceil((sampleRate * durationSeconds) / hopSize));
    this.buffer = new Float32Array(this.maxEntries);
  }

  /**
   * Append a single VAD probability, overwriting the oldest entry once the buffer is full.
   * The value is stored as a 32-bit float and covers the next `hopSize` audio frames.
   */
  write(probability: number): void {
    const writePos = this.globalIndex % this.maxEntries;
    this.buffer[writePos] = probability;
    this.globalIndex++;
  }

  /**
   * Append multiple VAD probabilities in order (equivalent to calling `write` for each).
   */
  writeBatch(probabilities: Float32Array | number[]): void {
    for (const probability of probabilities) {
      this.write(probability);
    }
  }

  /**
   * Read VAD probabilities for a range of audio frames.
   *
   * The frame range is widened to whole VAD entries (start rounded down, end rounded up)
   * and then clamped to the entries still held in the buffer, so the result may cover
   * fewer entries than requested.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @returns A new Float32Array of VAD probabilities covering the available part of the range
   */
  readForFrameRange(startFrame: number, endFrame: number): Float32Array {
    if (endFrame <= startFrame) return new Float32Array(0);

    const startEntry = Math.floor(startFrame / this.hopSize);
    const endEntry = Math.ceil(endFrame / this.hopSize);

    // Drop anything already overwritten (before base) or not yet written (at/after globalIndex).
    const clampedStart = Math.max(startEntry, this.getBaseEntry());
    const clampedEnd = Math.min(endEntry, this.globalIndex);
    if (clampedEnd <= clampedStart) return new Float32Array(0);

    const length = clampedEnd - clampedStart;
    const result = new Float32Array(length);
    for (let i = 0; i < length; i++) {
      result[i] = this.buffer[(clampedStart + i) % this.maxEntries];
    }
    return result;
  }

  /**
   * Get the duration of trailing silence (in seconds) from the current position.
   * Scans backward from the latest entry until a probability >= threshold is found
   * or the oldest retained entry is reached.
   *
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns Duration of trailing silence in seconds (0 if nothing has been written)
   */
  getSilenceTailDuration(threshold: number = 0.5): number {
    if (this.globalIndex === 0) return 0;

    const baseEntry = this.getBaseEntry();
    let silentEntries = 0;
    for (let i = this.globalIndex - 1; i >= baseEntry; i--) {
      if (this.buffer[i % this.maxEntries] >= threshold) break;
      silentEntries++;
    }
    return (silentEntries * this.hopSize) / this.sampleRate;
  }

  /**
   * Check if there is any speech in a frame range.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns true if any available VAD entry in the range is at or above the threshold
   */
  hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
    return this.readForFrameRange(startFrame, endFrame).some((p) => p >= threshold);
  }

  /**
   * Find a silence boundary (VAD probability below threshold) by scanning backward
   * from a given frame. Used by WindowBuilder to align window start to silence.
   *
   * Only entries that have actually been written and are still retained are examined.
   *
   * @param fromFrame - Frame to start scanning backward from
   * @param minFrame - Don't scan past this frame; the result is never below it
   * @param threshold - VAD threshold below which is considered silence (default: 0.3)
   * @returns Frame offset of the silence boundary, or minFrame if no silence found
   */
  findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
    // Clamp the starting entry to the newest written one. Without this, a fromFrame at or
    // beyond the write position reads a slot that is either still zero-initialised or holds
    // a much older wrapped-around entry, producing a bogus "silence" hit.
    const newestEntry = this.globalIndex - 1;
    const fromEntry = Math.min(Math.floor(fromFrame / this.hopSize), newestEntry);
    const minEntry = Math.max(Math.floor(minFrame / this.hopSize), this.getBaseEntry());

    for (let i = fromEntry; i >= minEntry; i--) {
      if (this.buffer[i % this.maxEntries] < threshold) {
        // The entry containing minFrame starts at or before it; never report a
        // boundary earlier than the caller's lower limit.
        return Math.max(i * this.hopSize, minFrame);
      }
    }
    return minFrame;
  }

  /**
   * Get the current global index (next entry to write).
   */
  getCurrentIndex(): number {
    return this.globalIndex;
  }

  /**
   * Get the oldest available entry index (0 until the buffer has wrapped).
   */
  getBaseEntry(): number {
    return Math.max(0, this.globalIndex - this.maxEntries);
  }

  /**
   * Get the global audio frame just past the latest written VAD entry,
   * i.e. the exclusive end of the audio covered so far.
   */
  getCurrentFrame(): number {
    return this.globalIndex * this.hopSize;
  }

  /**
   * Reset the buffer: rewind the global index to 0 and zero all stored probabilities.
   */
  reset(): void {
    this.globalIndex = 0;
    this.buffer.fill(0);
  }
}