File size: 1,173 Bytes
2651a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import webrtcvad
from collections import deque


class VADFrameSplitter:
    """
    Accumulate raw PCM frames and track speech/silence runs for endpointing.

    Each frame passed to ``process`` is classified by a ``webrtcvad.Vad``
    instance and appended to an internal buffer. The caller polls
    ``is_end_of_utterance`` and, when it reports True, calls ``flush`` to
    collect the buffered audio and reset the state.

    - 20ms frames (not enforced here; the frame is handed to the VAD as-is,
      so its length must be one the VAD accepts for the sample rate)
    - speech/silence detection
    - endpoint detection
    """

    # Default number of consecutive silent frames that must be *exceeded*
    # before an utterance is considered finished.
    DEFAULT_END_SILENCE_FRAMES = 15

    def __init__(
        self,
        sample_rate=16000,
        mode=2,
        end_silence_frames=DEFAULT_END_SILENCE_FRAMES,
    ):
        """
        Args:
            sample_rate: Sample rate in Hz passed to the VAD with every frame.
            mode: Value passed to ``webrtcvad.Vad``.
            end_silence_frames: Consecutive silent frames that must be
                exceeded for ``is_end_of_utterance`` to report True.

        Raises:
            ValueError: If ``end_silence_frames`` is negative.
        """
        # Validate before building the VAD so a bad configuration fails fast.
        if end_silence_frames < 0:
            raise ValueError("end_silence_frames must be non-negative")

        self.vad = webrtcvad.Vad(mode)
        self.sample_rate = sample_rate
        self.end_silence_frames = end_silence_frames

        # Every processed frame (speech and silence alike) since last flush.
        self.frame_buffer = deque()

        # speech_count: total speech frames since the last flush.
        # silence_count: length of the current run of silent frames.
        self.speech_count = 0
        self.silence_count = 0

    def process(self, frame: bytes) -> bool:
        """
        Classify one frame, buffer it, and update the speech/silence counters.

        Args:
            frame: Raw PCM bytes for a single frame.

        Returns:
            - True if speaking
            - False if silence
        """
        is_speech = self.vad.is_speech(frame, self.sample_rate)

        # The frame is buffered regardless of its classification.
        self.frame_buffer.append(frame)

        if is_speech:
            self.speech_count += 1
            # Any speech frame interrupts the current silence run.
            self.silence_count = 0
        else:
            self.silence_count += 1

        return is_speech

    def is_end_of_utterance(self) -> bool:
        """
        Detect speech completion.

        Returns:
            True once the current run of silent frames is strictly longer
            than ``end_silence_frames``.

        Note:
            Only the silence run is examined; ``speech_count`` is not
            consulted, so this also reports True after a long enough stretch
            of silence in which no speech was ever detected.
        """
        return self.silence_count > self.end_silence_frames

    def flush(self) -> bytes:
        """
        Return all buffered frames as one bytes object and reset the state.

        Returns:
            The buffered frames concatenated in arrival order (empty bytes
            if nothing was buffered).
        """
        audio = b"".join(self.frame_buffer)
        self.frame_buffer.clear()
        self.speech_count = 0
        self.silence_count = 0
        return audio