|
|
|
|
|
from .base import MetaItem, BasePipe |
|
|
from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator |
|
|
import numpy as np |
|
|
from silero_vad import get_speech_timestamps |
|
|
import torch |
|
|
from typing import List |
|
|
|
|
|
|
|
|
def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Concatenate the speech segments of *wav* selected by *tss*.

    A fixed 0.3 s block of silence is inserted between two consecutive
    segments whenever the original gap between them exceeds 0.1 s, so the
    output keeps an audible pause where the speaker actually paused.

    Args:
        tss: Speech timestamps as dicts with integer ``start``/``end``
            sample indices (the format produced by silero-vad).
        wav: 1-D waveform tensor the indices refer to.
        sample_rate: Sampling rate of ``wav`` in Hz.

    Returns:
        A 1-D tensor of the selected chunks (plus inserted silences);
        an empty tensor when ``tss`` is empty.
    """
    # Guard: torch.cat() raises on an empty list, so bail out early.
    if not tss:
        return torch.zeros(0)

    silent_samples = int(0.3 * sample_rate)
    silence = torch.zeros(silent_samples)
    min_gap_samples = 0.1 * sample_rate  # hoisted loop invariant

    chunks = []
    for i, seg in enumerate(tss):
        chunks.append(wav[seg['start']:seg['end']])
        # Re-insert a pause only when the removed gap was noticeable.
        if i < len(tss) - 1:
            gap = tss[i + 1]['start'] - seg['end']
            if gap > min_gap_samples:
                chunks.append(silence)

    return torch.cat(chunks)
|
|
|
|
|
def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Concatenate speech segments of *wav*, merging near-adjacent ones.

    Improvements over :func:`collect_chunks`:

    * segments separated by less than ~20 ms are merged into one, and
    * the inserted pause scales with the real gap (half the gap, capped
      at 0.3 s) instead of being a fixed length.

    Args:
        tss: Speech timestamps as dicts with integer ``start``/``end``
            sample indices (the format produced by silero-vad).
        wav: 1-D waveform tensor the indices refer to.
        sample_rate: Sampling rate of ``wav`` in Hz.

    Returns:
        A 1-D tensor of the selected chunks (plus inserted silences);
        an empty tensor when ``tss`` is empty.
    """
    # Guard: torch.cat() raises on an empty list, so bail out early.
    if not tss:
        return torch.zeros(0)

    silent_samples = int(0.3 * sample_rate)
    min_gap_samples = int(0.1 * sample_rate)
    merge_gap_samples = 0.02 * sample_rate

    # Merge segments separated by less than ~20 ms. Work on *copies* so
    # the caller's timestamp dicts are never mutated (the previous code
    # wrote the merged 'end' straight through to the input dicts).
    smoothed_tss = []
    for ts in tss:
        if smoothed_tss and ts['start'] - smoothed_tss[-1]['end'] < merge_gap_samples:
            smoothed_tss[-1]['end'] = ts['end']
        else:
            smoothed_tss.append(dict(ts))

    chunks = []
    for i, seg in enumerate(smoothed_tss):
        chunks.append(wav[seg['start']:seg['end']])

        # Between segments, insert a pause proportional to the real gap,
        # but never longer than 0.3 s.
        if i < len(smoothed_tss) - 1:
            gap = smoothed_tss[i + 1]['start'] - seg['end']
            if gap > min_gap_samples:
                silence_length = min(gap // 2, silent_samples)
                chunks.append(torch.zeros(silence_length))

    return torch.cat(chunks)
|
|
|
|
|
class VadPipe(BasePipe):
    """Pipeline stage that strips non-speech audio using Silero VAD.

    The VAD model and streaming iterator are class-level state shared by
    all instances, created lazily by :meth:`init`.
    """

    # Shared SileroVADProcessor, built lazily in init().
    model = None
    # Sampling rate (Hz) every buffer through this pipe is assumed to use.
    sample_rate = 16000
    # Samples per VAD analysis window.
    window_size_samples = 512

    @classmethod
    def init(cls):
        """Lazily build the shared VAD model and streaming iterator."""
        if cls.model is None:
            cls.model = SileroVADProcessor(
                activate_threshold=0.45,
                fusion_threshold=0.45,
                min_speech_duration=0.2,
                max_speech_duration=20,
                min_silence_duration=300,
                sample_rate=cls.sample_rate,
            )
            cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate,)
            cls.vac.reset_states()

    def get_previous_buffer(self):
        """Return the most recent buffered chunk, or an empty float32 array.

        NOTE(review): relies on ``self.previous_buffer`` being maintained
        elsewhere (not visible in this file) and on it holding exactly two
        entries when history is available — confirm against BasePipe.
        """
        if len(self.previous_buffer) == 2:
            return self.previous_buffer[-1]
        return np.array([], dtype=np.float32)

    def process(self, in_data: MetaItem) -> MetaItem:
        """Run VAD over ``in_data.source_audio`` and keep only speech.

        The raw bytes are interpreted as float32 PCM at ``self.sample_rate``.
        Detected speech segments are concatenated (with shortened pauses)
        into ``in_data.audio``; when no speech is detected, ``in_data.audio``
        is set to empty bytes. ``source_audio`` is cleared afterwards.
        """
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)

        send_audio = b""
        # Consistency fix: use the configured class sample rate instead of a
        # hard-coded 16000 (same value today, but now changes in one place).
        speech_timestamps = get_speech_timestamps(
            torch.Tensor(source_audio),
            self.model.silero_vad,
            sampling_rate=self.sample_rate,
        )

        if speech_timestamps:
            send_audio = collect_chunks_improved(
                speech_timestamps,
                torch.Tensor(source_audio),
                sample_rate=self.sample_rate,
            ).numpy()
        in_data.audio = send_audio

        in_data.source_audio = b""
        return in_data