File size: 6,110 Bytes
b8cc2bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/**
 * VADRingBuffer stores per-hop speech probabilities in a circular buffer,
 * addressed by global (monotonically increasing) entry indices so that it can be
 * kept in step with an audio ring buffer that uses global frame offsets.
 *
 * Each VAD probability covers `hopSize` audio frames (e.g., 512 frames = 32ms at 16kHz).
 * VAD probability at index i corresponds to audio frames [i * hopSize, (i+1) * hopSize).
 *
 * Only the most recent `maxEntries` entries are retained; older entries are
 * overwritten and are never returned by the read/scan methods.
 *
 * Ported from zdasr-main/src/zdasr/ring_buffer.py (VAD support).
 */
export class VADRingBuffer {
    /** Number of audio frames per VAD probability entry */
    readonly hopSize: number;
    /** Sample rate for time conversions */
    readonly sampleRate: number;

    /** Capacity of the ring, in VAD entries */
    private readonly maxEntries: number;
    /** Backing storage; slot for global entry i is `i % maxEntries` */
    private readonly buffer: Float32Array;
    /** Next VAD entry to be written (global index); also the count of entries ever written */
    private globalIndex = 0;

    /**
     * @param sampleRate - Audio sample rate in Hz
     * @param durationSeconds - Maximum buffer duration in seconds
     * @param hopSize - Number of audio frames per VAD probability (default: 512 for Silero at 16kHz)
     */
    constructor(sampleRate: number, durationSeconds: number, hopSize: number = 512) {
        this.sampleRate = sampleRate;
        this.hopSize = hopSize;
        // Round up so a partial trailing hop still gets a slot.
        this.maxEntries = Math.ceil((sampleRate * durationSeconds) / hopSize);
        this.buffer = new Float32Array(this.maxEntries);
    }

    /**
     * Write a single VAD probability, overwriting the oldest entry once the
     * buffer is full. The probability corresponds to the next `hopSize` audio frames.
     *
     * @param probability - Speech probability for the next hop
     */
    write(probability: number): void {
        this.buffer[this.globalIndex % this.maxEntries] = probability;
        this.globalIndex++;
    }

    /**
     * Write multiple VAD probabilities at once, in order.
     *
     * @param probabilities - Speech probabilities for consecutive hops
     */
    writeBatch(probabilities: Float32Array | number[]): void {
        for (const probability of probabilities) {
            this.write(probability);
        }
    }

    /**
     * Read VAD probabilities for a range of audio frames.
     *
     * The range is widened to whole VAD entries (start rounded down, end rounded up)
     * and then clamped to the entries that are still held in the buffer, so the
     * result may be shorter than requested or empty.
     *
     * @param startFrame - Start audio frame (global offset, inclusive)
     * @param endFrame - End audio frame (global offset, exclusive)
     * @returns Float32Array of VAD probabilities covering the range
     */
    readForFrameRange(startFrame: number, endFrame: number): Float32Array {
        if (endFrame <= startFrame) return new Float32Array(0);

        const startEntry = Math.floor(startFrame / this.hopSize);
        const endEntry = Math.ceil(endFrame / this.hopSize);

        // Restrict to [oldest retained entry, next entry to be written).
        const clampedStart = Math.max(startEntry, this.getBaseEntry());
        const clampedEnd = Math.min(endEntry, this.globalIndex);

        if (clampedEnd <= clampedStart) return new Float32Array(0);

        const result = new Float32Array(clampedEnd - clampedStart);
        for (let i = 0; i < result.length; i++) {
            result[i] = this.buffer[(clampedStart + i) % this.maxEntries];
        }

        return result;
    }

    /**
     * Get the duration of trailing silence (in seconds) from the current position.
     * Scans backward from the latest entry until a probability >= threshold is found
     * or the oldest retained entry has been examined.
     *
     * @param threshold - Probability threshold for speech (default: 0.5)
     * @returns Duration of trailing silence in seconds
     */
    getSilenceTailDuration(threshold: number = 0.5): number {
        if (this.globalIndex === 0) return 0;

        const baseEntry = this.getBaseEntry();
        let silentEntries = 0;

        for (let entry = this.globalIndex - 1; entry >= baseEntry; entry--) {
            if (this.buffer[entry % this.maxEntries] >= threshold) {
                break;
            }
            silentEntries++;
        }

        return (silentEntries * this.hopSize) / this.sampleRate;
    }

    /**
     * Check if there is any speech in a frame range.
     *
     * @param startFrame - Start audio frame (global offset, inclusive)
     * @param endFrame - End audio frame (global offset, exclusive)
     * @param threshold - Probability threshold for speech (default: 0.5)
     * @returns true if any retained VAD entry in the range is >= threshold
     */
    hasSpeechInRange(startFrame: number, endFrame: number, threshold: number = 0.5): boolean {
        return this.readForFrameRange(startFrame, endFrame).some((p) => p >= threshold);
    }

    /**
     * Find a silence boundary (VAD probability below threshold) by scanning backward
     * from a given frame. Used by WindowBuilder to align window start to silence.
     *
     * Only entries that have actually been written and are still retained are
     * examined; the result is never smaller than `minFrame`.
     *
     * @param fromFrame - Frame to start scanning backward from
     * @param minFrame - Don't scan past this frame
     * @param threshold - VAD threshold below which is considered silence (default: 0.3)
     * @returns Frame offset of the silence boundary, or minFrame if no silence found
     */
    findSilenceBoundary(fromFrame: number, minFrame: number, threshold: number = 0.3): number {
        const oldestEntry = Math.max(Math.floor(minFrame / this.hopSize), this.getBaseEntry());
        // Do not start beyond the newest written entry: slots past it hold either
        // zero-initialised memory or data from an older lap of the ring, and would
        // otherwise be misread as VAD results for frames that have none yet.
        const newestEntry = Math.min(Math.floor(fromFrame / this.hopSize), this.globalIndex - 1);

        for (let entry = newestEntry; entry >= oldestEntry; entry--) {
            if (this.buffer[entry % this.maxEntries] < threshold) {
                // The entry containing minFrame may begin before minFrame when
                // minFrame is not hop-aligned; keep the result within the bound.
                return Math.max(minFrame, entry * this.hopSize);
            }
        }

        return minFrame;
    }

    /**
     * Get the current global index (next entry to write).
     */
    getCurrentIndex(): number {
        return this.globalIndex;
    }

    /**
     * Get the oldest available entry index.
     */
    getBaseEntry(): number {
        return Math.max(0, this.globalIndex - this.maxEntries);
    }

    /**
     * Get the global audio frame just past the latest VAD entry, i.e. the
     * exclusive end of the audio range covered by the entries written so far.
     */
    getCurrentFrame(): number {
        return this.globalIndex * this.hopSize;
    }

    /**
     * Reset the buffer: rewind the global index to 0 and zero the storage.
     */
    reset(): void {
        this.globalIndex = 0;
        this.buffer.fill(0);
    }
}