Spaces:

ysdede
/

keet-streaming

Running

App Files Files Community

keet-streaming / src /lib /vad /VADRingBuffer.ts

ysdede

feat(space): migrate Hugging Face Space to keet SolidJS app

b8cc2bf about 2 months ago

raw

history blame contribute delete

6.11 kB

	/**
	 * VADRingBuffer stores per-hop speech probabilities in a fixed-size circular
	 * buffer, addressed by global (monotonically increasing) entry indices so it
	 * can be kept in step with an audio ring buffer that uses global frame offsets.
	 *
	 * Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
	 * VAD entry `i` corresponds to audio frames [i * hopSize, (i + 1) * hopSize).
	 *
	 * Only the most recent `maxEntries` entries are retained; older entries are
	 * overwritten and are excluded from every read/scan method.
	 *
	 * Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
	 */
	export class VADRingBuffer {
		/** Number of audio frames per VAD probability entry */
		readonly hopSize: number;
		/** Sample rate (Hz) used for frame <-> seconds conversions */
		readonly sampleRate: number;

		/** Capacity of the circular buffer, in VAD entries */
		private readonly maxEntries: number;
		/** Backing storage; global entry `i` lives in slot `i % maxEntries` */
		private readonly buffer: Float32Array;
		/** Global index of the next VAD entry to be written (= total entries written) */
		private globalIndex = 0;

		/**
		 * @param sampleRate - Audio sample rate in Hz
		 * @param durationSeconds - Maximum buffer duration in seconds
		 * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
		 */
		constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
			this.sampleRate = sampleRate;
			this.hopSize = hopSize;
			// Round up so the buffer always covers at least `durationSeconds` of audio.
			this.maxEntries = Math.ceil((sampleRate * durationSeconds) / hopSize);
			this.buffer = new Float32Array(this.maxEntries);
		}

		/**
		 * Write a single VAD probability.
		 * The value covers the next `hopSize` audio frames and overwrites the
		 * oldest entry once the buffer is full.
		 *
		 * @param probability - Speech probability for the next hop
		 */
		write(probability: number): void {
			const writePos = this.globalIndex % this.maxEntries;
			this.buffer[writePos] = probability;
			this.globalIndex++;
		}

		/**
		 * Write multiple VAD probabilities at once, in order.
		 *
		 * @param probabilities - Consecutive speech probabilities, one per hop
		 */
		writeBatch(probabilities: Float32Array | number[]): void {
			for (const probability of probabilities) {
				this.write(probability);
			}
		}

		/**
		 * Read VAD probabilities for a range of audio frames.
		 *
		 * The frame range is widened to whole VAD entries (start rounded down,
		 * end rounded up) and then clamped to the entries still held in the
		 * buffer, so the result may be shorter than the requested range.
		 *
		 * @param startFrame - Start audio frame (global offset, inclusive)
		 * @param endFrame - End audio frame (global offset, exclusive)
		 * @returns Float32Array (a copy) of VAD probabilities covering the range
		 */
		readForFrameRange(startFrame: number, endFrame: number): Float32Array {
			if (endFrame <= startFrame) return new Float32Array(0);

			const startEntry = Math.floor(startFrame / this.hopSize);
			const endEntry = Math.ceil(endFrame / this.hopSize);

			const clampedStart = Math.max(startEntry, this.getBaseEntry());
			const clampedEnd = Math.min(endEntry, this.globalIndex);

			if (clampedEnd <= clampedStart) return new Float32Array(0);

			const length = clampedEnd - clampedStart;
			const result = new Float32Array(length);

			// The clamped range spans at most `maxEntries` entries, so it maps to
			// at most two contiguous runs of slots: one up to the end of the
			// backing array and, if it wraps, one from the beginning.
			const startPos = clampedStart % this.maxEntries;
			const firstLen = Math.min(length, this.maxEntries - startPos);
			result.set(this.buffer.subarray(startPos, startPos + firstLen), 0);
			if (firstLen < length) {
				result.set(this.buffer.subarray(0, length - firstLen), firstLen);
			}

			return result;
		}

		/**
		 * Get the duration of trailing silence (in seconds) from the current position.
		 * Scans backward from the latest entry until a probability >= threshold is
		 * found or the oldest retained entry has been examined.
		 *
		 * @param threshold - Probability threshold for speech (default: 0.5)
		 * @returns Duration of trailing silence in seconds (0 if nothing has been written)
		 */
		getSilenceTailDuration(threshold: number = 0.5): number {
			if (this.globalIndex === 0) return 0;

			let silentEntries = 0;
			const baseEntry = this.getBaseEntry();

			for (let i = this.globalIndex - 1; i >= baseEntry; i--) {
				if (this.buffer[i % this.maxEntries] >= threshold) {
					break;
				}
				silentEntries++;
			}

			return (silentEntries * this.hopSize) / this.sampleRate;
		}

		/**
		 * Check if there is any speech in a frame range.
		 *
		 * @param startFrame - Start audio frame (global offset, inclusive)
		 * @param endFrame - End audio frame (global offset, exclusive)
		 * @param threshold - Probability threshold for speech (default: 0.5)
		 * @returns true if any retained VAD entry in the range is >= threshold
		 */
		hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
			return this.readForFrameRange(startFrame, endFrame).some((p) => p >= threshold);
		}

		/**
		 * Find a silence boundary (VAD probability below threshold) by scanning backward
		 * from a given frame. Used by WindowBuilder to align window start to silence.
		 *
		 * Only entries that have actually been written and are still retained are
		 * examined: the scan starts at the entry containing `fromFrame`, or at the
		 * newest written entry if `fromFrame` lies beyond the written data.
		 *
		 * The returned offset is the first frame of the silent entry, which can be
		 * slightly below `minFrame` when `minFrame` is not a multiple of `hopSize`.
		 *
		 * @param fromFrame - Frame to start scanning backward from
		 * @param minFrame - Don't scan past the entry containing this frame
		 * @param threshold - VAD threshold below which is considered silence (default: 0.3)
		 * @returns Frame offset of the silence boundary, or minFrame if no silence found
		 */
		findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
			// Clamp to the newest written entry: slots for entries >= globalIndex hold
			// either zero-initialised or stale (wrapped) values, not real VAD output.
			const newestEntry = this.globalIndex - 1;
			const fromEntry = Math.min(Math.floor(fromFrame / this.hopSize), newestEntry);
			const minEntry = Math.floor(minFrame / this.hopSize);
			const clampedMin = Math.max(minEntry, this.getBaseEntry());

			for (let i = fromEntry; i >= clampedMin; i--) {
				if (this.buffer[i % this.maxEntries] < threshold) {
					return i * this.hopSize;
				}
			}

			return minFrame;
		}

		/**
		 * Get the current global index (next entry to write).
		 */
		getCurrentIndex(): number {
			return this.globalIndex;
		}

		/**
		 * Get the global index of the oldest entry still held in the buffer.
		 */
		getBaseEntry(): number {
			return Math.max(0, this.globalIndex - this.maxEntries);
		}

		/**
		 * Get the global audio frame just past the latest VAD entry, i.e. the
		 * exclusive end of the audio covered by everything written so far.
		 */
		getCurrentFrame(): number {
			return this.globalIndex * this.hopSize;
		}

		/**
		 * Reset the buffer: rewinds the global index to 0 and zeroes all stored values.
		 */
		reset(): void {
			this.globalIndex = 0;
			this.buffer.fill(0);
		}
	}