Spaces:
Running
Running
/**
 * VADRingBuffer stores per-frame speech probabilities in a circular buffer,
 * synchronized with the audio RingBuffer via global frame offsets.
 *
 * Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
 * VAD probability at index i corresponds to audio frames [i * hopSize, (i+1) * hopSize).
 *
 * Entries are addressed by a monotonically increasing global entry index. Only the
 * most recent `maxEntries` entries are retained; older ones are overwritten in place.
 *
 * Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
 */
export class VADRingBuffer {
  /** Number of audio frames per VAD probability entry */
  readonly hopSize: number;
  /** Sample rate for time conversions */
  readonly sampleRate: number;

  /** Capacity of the circular buffer, in VAD entries. */
  private readonly maxEntries: number;
  /** Backing storage; global entry `i` lives at slot `i % maxEntries`. */
  private readonly buffer: Float32Array;
  /** Next VAD entry to be written (global, never wraps). */
  private globalIndex = 0;

  /**
   * @param sampleRate - Audio sample rate in Hz
   * @param durationSeconds - Maximum buffer duration in seconds
   * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
   * @throws RangeError if any argument is not a finite number greater than zero
   */
  constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
    // Reject bad configuration up front: a NaN or zero value would otherwise
    // produce a zero-length buffer or NaN slot indices and fail silently later.
    VADRingBuffer.assertPositiveFinite('sampleRate', sampleRate);
    VADRingBuffer.assertPositiveFinite('durationSeconds', durationSeconds);
    VADRingBuffer.assertPositiveFinite('hopSize', hopSize);

    this.sampleRate = sampleRate;
    this.hopSize = hopSize;
    // Round up so the buffer always covers at least the requested duration.
    this.maxEntries = Math.ceil((sampleRate * durationSeconds) / hopSize);
    this.buffer = new Float32Array(this.maxEntries);
  }

  /**
   * Write a single VAD probability.
   * The probability corresponds to the next `hopSize` audio frames.
   */
  write(probability: number): void {
    this.buffer[this.globalIndex % this.maxEntries] = probability;
    this.globalIndex++;
  }

  /**
   * Write multiple VAD probabilities at once, in order.
   */
  writeBatch(probabilities: Float32Array | number[]): void {
    for (const probability of probabilities) {
      this.write(probability);
    }
  }

  /**
   * Read VAD probabilities for a range of audio frames.
   *
   * The range is widened to whole VAD entries and then clamped to the entries
   * still held in the buffer, so the result may be shorter than requested.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @returns Float32Array of VAD probabilities covering the range
   */
  readForFrameRange(startFrame: number, endFrame: number): Float32Array {
    const { start, end } = this.clampEntryRange(startFrame, endFrame);
    if (!(end > start)) return new Float32Array(0);

    const result = new Float32Array(end - start);
    for (let entry = start; entry < end; entry++) {
      result[entry - start] = this.buffer[entry % this.maxEntries];
    }
    return result;
  }

  /**
   * Get the duration of trailing silence (in seconds) from the current position.
   * Scans backward from the latest entry until a probability >= threshold is found
   * or the oldest retained entry is passed.
   *
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns Duration of trailing silence in seconds (0 if nothing has been written)
   */
  getSilenceTailDuration(threshold: number = 0.5): number {
    const oldest = this.getBaseEntry();
    let silentEntries = 0;
    for (let entry = this.globalIndex - 1; entry >= oldest; entry--) {
      if (this.buffer[entry % this.maxEntries] >= threshold) break;
      silentEntries++;
    }
    return (silentEntries * this.hopSize) / this.sampleRate;
  }

  /**
   * Check if there is any speech in a frame range.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns true if any retained VAD entry in the range is at or above the threshold
   */
  hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
    // Scan in place instead of copying the range out first.
    const { start, end } = this.clampEntryRange(startFrame, endFrame);
    for (let entry = start; entry < end; entry++) {
      if (this.buffer[entry % this.maxEntries] >= threshold) return true;
    }
    return false;
  }

  /**
   * Find a silence boundary (VAD probability below threshold) by scanning backward
   * from a given frame. Used by WindowBuilder to align window start to silence.
   *
   * Only entries that have actually been written and are still retained are
   * examined. The returned offset is the start of the silent entry, so it is
   * hop-aligned and may be slightly below `minFrame` when `minFrame` falls
   * inside that entry.
   *
   * @param fromFrame - Frame to start scanning backward from
   * @param minFrame - Don't scan past this frame
   * @param threshold - VAD threshold below which is considered silence (default: 0.3)
   * @returns Frame offset of the silence boundary, or minFrame if no silence found
   */
  findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
    // Never start beyond the newest written entry: slots past it hold either
    // their initial zeros or stale values from an earlier lap of the ring,
    // neither of which describes the requested frames.
    const newestEntry = this.globalIndex - 1;
    const fromEntry = Math.min(Math.floor(fromFrame / this.hopSize), newestEntry);
    const minEntry = Math.max(Math.floor(minFrame / this.hopSize), this.getBaseEntry());

    for (let entry = fromEntry; entry >= minEntry; entry--) {
      if (this.buffer[entry % this.maxEntries] < threshold) {
        return entry * this.hopSize;
      }
    }
    return minFrame;
  }

  /**
   * Get the current global index (next entry to write).
   */
  getCurrentIndex(): number {
    return this.globalIndex;
  }

  /**
   * Get the oldest available entry index.
   */
  getBaseEntry(): number {
    return Math.max(0, this.globalIndex - this.maxEntries);
  }

  /**
   * Get the global audio frame just past the latest VAD entry, i.e. the
   * exclusive end of the audio range covered by VAD data so far.
   */
  getCurrentFrame(): number {
    return this.globalIndex * this.hopSize;
  }

  /**
   * Reset the buffer: clears all stored probabilities and restarts global
   * indexing from zero.
   */
  reset(): void {
    this.globalIndex = 0;
    this.buffer.fill(0);
  }

  /**
   * Convert a frame range to the global entry range [start, end) that covers it,
   * clamped to the entries currently retained in the buffer.
   * An empty or inverted frame range yields an empty entry range.
   */
  private clampEntryRange(startFrame: number, endFrame: number): { start: number; end: number } {
    if (endFrame <= startFrame) return { start: 0, end: 0 };
    const start = Math.max(Math.floor(startFrame / this.hopSize), this.getBaseEntry());
    const end = Math.min(Math.ceil(endFrame / this.hopSize), this.globalIndex);
    return { start, end };
  }

  /** Throw a RangeError unless `value` is a finite number greater than zero. */
  private static assertPositiveFinite(name: string, value: number): void {
    if (!Number.isFinite(value) || value <= 0) {
      throw new RangeError(`${name} must be a finite number greater than 0, got ${value}`);
    }
  }
}