File size: 2,774 Bytes
872019a
0caa3bc
 
 
 
 
872019a
0caa3bc
 
 
 
 
 
 
 
 
 
 
 
872019a
0caa3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2cefd9
 
 
0caa3bc
 
 
 
a2cefd9
0caa3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from silero_vad_axera import load_silero_vad
import numpy as np
from datetime import datetime, timedelta


class StreamVAD:
    """Streaming voice-activity detector built on a Silero VAD model.

    Feed audio chunks to :meth:`run`; it yields one dict per finished
    speech segment once the segment has been followed by at least
    ``silence_ms`` milliseconds of silence.  Call :meth:`flush` at end of
    stream to recover a trailing segment that never hit the silence limit.
    """

    def __init__(self, backend='ax650',
                 sensitivity=0.5,
                 silence_ms=200,
                 datetime_format='%Y-%m-%d %H:%M:%S.%f'):
        '''
        backend: inference backend identifier passed to load_silero_vad
            (e.g. 'ax650')

        sensitivity: thresh of voice activation,
            higher means more sensitive,
            hence, low speech prob thresh

        silence_ms: pop audio after silence for silence_ms milliseconds

        datetime_format: format of datetime in return data
        '''
        self.model = load_silero_vad(backend)
        self.sensitivity = sensitivity
        self.silence_ms = silence_ms
        self.datetime_format = datetime_format

        self.reset()

    def reset(self):
        """Clear accumulated segment state and reset the model's states."""
        self.silence_count = 0   # consecutive silent frames seen after speech
        self.speech_count = 0    # frames accumulated in the current segment
        self.return_data = {
            "start_ts": '',
            "end_ts": '',
            "audio": None
        }
        self.vad_data_list = []  # raw audio slices of the pending segment
        self.model.reset_states()

    def _elapsed_ms(self, frames):
        """Duration in milliseconds covered by `frames` model frames."""
        return 1000 * frames * self.model.num_samples / self.model.sr

    def flush(self):
        """Return the pending, unfinished speech segment, if any.

        `run` only emits a segment after `silence_ms` of silence, so speech
        at the very end of a stream would otherwise be lost.  Returns the
        same dict shape that `run` yields, or None when nothing is pending.
        Internal state is reset when a segment is returned.
        """
        if self.speech_count == 0 or not self.vad_data_list:
            return None
        self.return_data['end_ts'] = datetime.now().strftime(self.datetime_format)
        self.return_data['audio'] = np.concatenate(self.vad_data_list, axis=-1)
        data = self.return_data
        self.reset()
        return data

    def run(self, audio: np.ndarray, sr: int = 16000):
        """Process one chunk of audio; yield finished speech segments.

        audio: 1-D waveform chunk sampled at `sr` Hz.
        sr: sample rate of `audio`; assumed to be an integer multiple of
            the model's native rate (self.model.sr) -- TODO confirm with
            callers, a smaller sr would produce empty slices.

        Yields dicts {"start_ts", "end_ts", "audio"}; "audio" is the
        concatenated speech plus up to `silence_ms` of trailing silence.
        NOTE: this is a generator -- nothing runs until it is iterated.
        """
        # Wall-clock timestamp of the start of this chunk.
        cur_ts = datetime.now()

        # Number of input samples that map onto one model-rate sample.
        freq_scale = sr // self.model.sr

        # Per-frame speech probabilities for the whole chunk.
        speech_probs = self.model.audio_forward(audio, sr)[0]

        for i, prob in enumerate(speech_probs):
            audio_slice = audio[i * self.model.num_samples * freq_scale : (i + 1) * self.model.num_samples * freq_scale]
            ts = cur_ts.strftime(self.datetime_format)

            # Higher sensitivity lowers the probability threshold.
            if prob > 1 - self.sensitivity:
                self.silence_count = 0
                if self.speech_count == 0:
                    # First frame of a new segment: record its timestamp.
                    self.return_data['start_ts'] = ts

                self.speech_count += 1
                self.vad_data_list.append(audio_slice)
            else:
                # Silence only matters once a segment has started.
                if self.speech_count > 0:
                    self.silence_count += 1

                    if self._elapsed_ms(self.silence_count) > self.silence_ms:
                        # Enough trailing silence: emit the segment.
                        self.return_data['end_ts'] = ts
                        self.return_data['audio'] = np.concatenate(self.vad_data_list, axis=-1)

                        yield self.return_data

                        self.reset()
                    else:
                        # Keep short pauses inside the segment.
                        self.vad_data_list.append(audio_slice)

            # Advance the wall-clock estimate by one model frame.
            cur_ts += timedelta(seconds=self.model.num_samples / self.model.sr)