Spaces:
Running
Running
File size: 6,110 Bytes
/**
* VADRingBuffer stores per-frame speech probabilities in a circular buffer,
* synchronized with the audio RingBuffer via global frame offsets.
*
* Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
* VAD probability at index i corresponds to audio frames [i * hopSize, (i+1) * hopSize).
*
* Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
*/
export class VADRingBuffer {
  /** Number of audio frames covered by one VAD probability entry. */
  readonly hopSize: number;
  /** Audio sample rate in Hz, used for frame-to-seconds conversions. */
  readonly sampleRate: number;
  /** Capacity of the circular buffer, in VAD entries. */
  private readonly maxEntries: number;
  /** Circular storage: global entry `e` lives in slot `e % maxEntries`. */
  private readonly buffer: Float32Array;
  /** Global index of the next entry to be written (i.e. total entries written since the last reset). */
  private globalIndex = 0;

  /**
   * @param sampleRate - Audio sample rate in Hz
   * @param durationSeconds - Maximum buffer duration in seconds
   * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
   */
  constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
    this.sampleRate = sampleRate;
    this.hopSize = hopSize;
    // Round up so the buffer always spans at least `durationSeconds` of audio.
    this.maxEntries = Math.ceil((sampleRate * durationSeconds) / hopSize);
    this.buffer = new Float32Array(this.maxEntries);
  }

  /**
   * Write a single VAD probability.
   * The probability corresponds to the next `hopSize` audio frames; once the
   * buffer is full the oldest entry is overwritten.
   *
   * @param probability - Speech probability for the next hop
   */
  write(probability: number): void {
    this.buffer[this.globalIndex % this.maxEntries] = probability;
    this.globalIndex++;
  }

  /**
   * Write multiple VAD probabilities at once, in order.
   *
   * @param probabilities - Speech probabilities, one per consecutive hop
   */
  writeBatch(probabilities: Float32Array | number[]): void {
    for (const probability of probabilities) {
      this.write(probability);
    }
  }

  /**
   * Read VAD probabilities for a range of audio frames.
   *
   * The frame range is widened to whole VAD entries (start rounded down, end
   * rounded up) and then clipped to the entries still held in the buffer, so
   * the result may cover less than the requested range.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @returns Float32Array of VAD probabilities covering the range (empty if nothing is available)
   */
  readForFrameRange(startFrame: number, endFrame: number): Float32Array {
    if (endFrame <= startFrame) return new Float32Array(0);

    const firstEntry = Math.max(Math.floor(startFrame / this.hopSize), this.getBaseEntry());
    const endEntry = Math.min(Math.ceil(endFrame / this.hopSize), this.globalIndex);
    if (endEntry <= firstEntry) return new Float32Array(0);

    const result = new Float32Array(endEntry - firstEntry);
    for (let i = 0; i < result.length; i++) {
      result[i] = this.probabilityAt(firstEntry + i);
    }
    return result;
  }

  /**
   * Get the duration of trailing silence (in seconds) from the current position.
   * Scans backward from the latest entry until a probability >= threshold is found
   * or the oldest buffered entry has been examined.
   *
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns Duration of trailing silence in seconds
   */
  getSilenceTailDuration(threshold: number = 0.5): number {
    if (this.globalIndex === 0) return 0;

    const oldestEntry = this.getBaseEntry();
    let silentEntries = 0;
    for (let entry = this.globalIndex - 1; entry >= oldestEntry; entry--) {
      if (this.probabilityAt(entry) >= threshold) break;
      silentEntries++;
    }
    return (silentEntries * this.hopSize) / this.sampleRate;
  }

  /**
   * Check if there is any speech in a frame range.
   *
   * @param startFrame - Start audio frame (global offset, inclusive)
   * @param endFrame - End audio frame (global offset, exclusive)
   * @param threshold - Probability threshold for speech (default: 0.5)
   * @returns true if any buffered VAD entry in the range is at or above the threshold
   */
  hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
    return this.readForFrameRange(startFrame, endFrame).some((p) => p >= threshold);
  }

  /**
   * Find a silence boundary (VAD probability below threshold) by scanning backward
   * from a given frame. Used by WindowBuilder to align window start to silence.
   *
   * Only entries that have actually been written and are still buffered are
   * examined; the returned frame is never earlier than `minFrame`.
   *
   * @param fromFrame - Frame to start scanning backward from
   * @param minFrame - Don't scan past this frame
   * @param threshold - VAD threshold below which is considered silence (default: 0.3)
   * @returns Frame offset of the silence boundary, or minFrame if no silence found
   */
  findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
    const oldestEntry = Math.max(Math.floor(minFrame / this.hopSize), this.getBaseEntry());
    // Never start past the newest written entry: slots beyond it hold either the
    // initial zeros or stale wrapped-around data and would be misread as silence.
    const newestEntry = Math.min(Math.floor(fromFrame / this.hopSize), this.globalIndex - 1);

    for (let entry = newestEntry; entry >= oldestEntry; entry--) {
      if (this.probabilityAt(entry) < threshold) {
        // The entry containing minFrame may start before it; clamp so the
        // boundary honours the caller's lower limit.
        return Math.max(entry * this.hopSize, minFrame);
      }
    }
    return minFrame;
  }

  /**
   * Get the current global index (next entry to write).
   */
  getCurrentIndex(): number {
    return this.globalIndex;
  }

  /**
   * Get the oldest available entry index (0 until the buffer has wrapped).
   */
  getBaseEntry(): number {
    return Math.max(0, this.globalIndex - this.maxEntries);
  }

  /**
   * Get the global audio frame just past the latest VAD entry, i.e. the
   * exclusive end frame of everything written so far (`globalIndex * hopSize`).
   */
  getCurrentFrame(): number {
    return this.globalIndex * this.hopSize;
  }

  /**
   * Reset the buffer: zero all stored probabilities and restart global indexing at 0.
   */
  reset(): void {
    this.globalIndex = 0;
    this.buffer.fill(0);
  }

  /** Look up the stored probability for a global entry index (caller ensures it is buffered). */
  private probabilityAt(entry: number): number {
    return this.buffer[entry % this.maxEntries];
  }
}
|