// Space: ysdede/keet-streaming (status: Running) | File size: 6,110 Bytes | b8cc2bf

/**
 * VADRingBuffer stores per-frame speech probabilities in a circular buffer,
 * synchronized with the audio RingBuffer via global frame offsets.
 *
 * Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
 * VAD probability at index i corresponds to audio frames [i * hopSize, (i+1) * hopSize).
 *
 * Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
 */
export class VADRingBuffer {
    /** Number of audio frames per VAD probability entry */
    readonly hopSize: number;
    /** Sample rate in Hz, used to convert entry counts to seconds */
    readonly sampleRate: number;

    /** Capacity of the circular buffer, in VAD entries */
    private readonly maxEntries: number;
    /** Circular storage: global entry `g` lives in slot `g % maxEntries` */
    private readonly buffer: Float32Array;
    /** Global index of the next VAD entry to be written (= number of entries written so far) */
    private globalIndex: number = 0;

    /**
     * @param sampleRate - Audio sample rate in Hz
     * @param durationSeconds - Maximum buffer duration in seconds
     * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
     */
    constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
        this.sampleRate = sampleRate;
        this.hopSize = hopSize;
        // Round up so the buffer always covers at least `durationSeconds` of audio.
        this.maxEntries = Math.ceil((sampleRate * durationSeconds) / hopSize);
        this.buffer = new Float32Array(this.maxEntries);
    }

    /**
     * Write a single VAD probability, overwriting the oldest entry once the
     * buffer is full. The probability corresponds to `hopSize` audio frames.
     *
     * @param probability - Speech probability for the next hop
     */
    write(probability: number): void {
        const slot = this.globalIndex % this.maxEntries;
        this.buffer[slot] = probability;
        this.globalIndex++;
    }

    /**
     * Write multiple VAD probabilities at once, in order.
     *
     * @param probabilities - Speech probabilities, one per hop, oldest first
     */
    writeBatch(probabilities: Float32Array | number[]): void {
        for (const probability of probabilities) {
            this.write(probability);
        }
    }

    /**
     * Read VAD probabilities for a range of audio frames.
     *
     * The frame range is widened to whole VAD entries (start rounded down, end
     * rounded up) and then clamped to the entries still held in the buffer, so
     * the result may be shorter than requested or empty.
     *
     * @param startFrame - Start audio frame (global offset, inclusive)
     * @param endFrame - End audio frame (global offset, exclusive)
     * @returns Float32Array of VAD probabilities covering the range
     */
    readForFrameRange(startFrame: number, endFrame: number): Float32Array {
        if (endFrame <= startFrame) return new Float32Array(0);

        const firstEntry = Math.max(Math.floor(startFrame / this.hopSize), this.getBaseEntry());
        const endEntry = Math.min(Math.ceil(endFrame / this.hopSize), this.globalIndex);

        if (endEntry <= firstEntry) return new Float32Array(0);

        const result = new Float32Array(endEntry - firstEntry);
        for (let offset = 0; offset < result.length; offset++) {
            result[offset] = this.buffer[(firstEntry + offset) % this.maxEntries];
        }
        return result;
    }

    /**
     * Get the duration of trailing silence (in seconds) from the current position.
     * Scans backward from the latest entry until a probability >= threshold is found
     * or the oldest retained entry has been examined.
     *
     * @param threshold - Probability threshold for speech (default: 0.5)
     * @returns Duration of trailing silence in seconds (0 when nothing has been written)
     */
    getSilenceTailDuration(threshold: number = 0.5): number {
        if (this.globalIndex === 0) return 0;

        const baseEntry = this.getBaseEntry();
        let silentEntries = 0;

        for (let entry = this.globalIndex - 1; entry >= baseEntry; entry--) {
            if (this.buffer[entry % this.maxEntries] >= threshold) {
                break;
            }
            silentEntries++;
        }

        return (silentEntries * this.hopSize) / this.sampleRate;
    }

    /**
     * Check if there is any speech in a frame range.
     *
     * @param startFrame - Start audio frame (global offset, inclusive)
     * @param endFrame - End audio frame (global offset, exclusive)
     * @param threshold - Probability threshold for speech (default: 0.5)
     * @returns true if any retained VAD entry in the range is at or above the threshold
     */
    hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
        return this.readForFrameRange(startFrame, endFrame).some((probability) => probability >= threshold);
    }

    /**
     * Find a silence boundary (VAD probability below threshold) by scanning backward
     * from a given frame. Used by WindowBuilder to align window start to silence.
     *
     * Only entries that are actually held in the buffer are examined: the scan
     * starts no later than the newest written entry and stops at the oldest
     * retained entry (or the entry containing `minFrame`, whichever is later).
     *
     * @param fromFrame - Frame to start scanning backward from
     * @param minFrame - Don't scan past this frame
     * @param threshold - VAD threshold below which is considered silence (default: 0.3)
     * @returns Frame offset of the silence boundary (never earlier than minFrame),
     *          or minFrame if no silence found
     */
    findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
        // Clamp to the newest written entry: slots at or beyond globalIndex hold
        // either initial zeros or stale wrapped-around data, not real VAD output.
        const newestEntry = this.globalIndex - 1;
        const startEntry = Math.min(Math.floor(fromFrame / this.hopSize), newestEntry);
        const stopEntry = Math.max(Math.floor(minFrame / this.hopSize), this.getBaseEntry());

        for (let entry = startEntry; entry >= stopEntry; entry--) {
            if (this.buffer[entry % this.maxEntries] < threshold) {
                // The entry containing minFrame may begin before it; never
                // report a boundary earlier than the caller's lower limit.
                return Math.max(entry * this.hopSize, minFrame);
            }
        }

        return minFrame;
    }

    /**
     * Get the current global index (next entry to write).
     */
    getCurrentIndex(): number {
        return this.globalIndex;
    }

    /**
     * Get the global index of the oldest entry still held in the buffer.
     */
    getBaseEntry(): number {
        return Math.max(0, this.globalIndex - this.maxEntries);
    }

    /**
     * Get the global audio frame just past the latest VAD entry, i.e. the
     * number of audio frames covered by all entries written so far.
     */
    getCurrentFrame(): number {
        return this.globalIndex * this.hopSize;
    }

    /**
     * Reset the buffer: rewind the global index to 0 and zero all stored probabilities.
     */
    reset(): void {
        this.globalIndex = 0;
        this.buffer.fill(0);
    }
}