david
add custom vad silence
d8ef700
raw
history blame
4.1 kB
from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
import numpy as np
from silero_vad import get_speech_timestamps
import torch
from typing import List
# import noisereduce as nr
def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Concatenate the speech segments of *wav* selected by *tss*.

    A fixed 300 ms block of silence is inserted between two consecutive
    segments whenever the gap separating them exceeds 100 ms.

    Args:
        tss: speech timestamps, each a dict with sample-index 'start'/'end'.
        wav: 1-D audio tensor the indices refer to.
        sample_rate: sampling rate in Hz (default 16 kHz).

    Returns:
        A single tensor of the concatenated segments (raises if *tss* is empty).
    """
    pad = torch.zeros(int(0.3 * sample_rate))  # 300 ms of silence
    gap_threshold = 0.1 * sample_rate          # 100 ms expressed in samples
    pieces = []
    last = len(tss) - 1
    for idx, seg in enumerate(tss):
        pieces.append(wav[seg['start']: seg['end']])
        # Only pad when another segment follows and the gap is noticeable.
        if idx < last and tss[idx + 1]['start'] - seg['end'] > gap_threshold:
            pieces.append(pad)
    return torch.cat(pieces)
def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Concatenate speech segments with smoothing and adaptive silence padding.

    Segments separated by less than 20 ms are merged into one. Between the
    remaining segments, silence is inserted when the gap exceeds 100 ms; the
    silence length is half the gap, capped at 300 ms.

    Args:
        tss: speech timestamps, each a dict with sample-index 'start'/'end'.
             The input list and its dicts are NOT modified.
        wav: 1-D audio tensor the indices refer to.
        sample_rate: sampling rate in Hz (default 16 kHz).

    Returns:
        A single tensor of the concatenated segments; an empty tensor when
        *tss* is empty.
    """
    if not tss:
        # torch.cat requires a non-empty list; return an empty waveform instead.
        return torch.zeros(0)
    silent_samples = int(0.3 * sample_rate)   # cap: 300 ms of silence
    min_gap_samples = int(0.1 * sample_rate)  # padding threshold: 100 ms
    merge_gap = 0.02 * sample_rate            # merge threshold: 20 ms
    # Smooth the timestamps: merge near-contiguous segments. Copy each dict
    # before storing it so the caller's input is never mutated (the original
    # implementation wrote through shared references into the caller's list).
    smoothed = []
    for ts in tss:
        if smoothed and ts['start'] - smoothed[-1]['end'] < merge_gap:
            smoothed[-1]['end'] = ts['end']  # extend the previous segment
        else:
            smoothed.append(dict(ts))
    chunks = []
    for i, seg in enumerate(smoothed):
        chunks.append(wav[seg['start']: seg['end']])
        # Pad between segments when the gap is above the threshold,
        # scaling the silence with the gap but never beyond 300 ms.
        if i < len(smoothed) - 1:
            gap = smoothed[i + 1]['start'] - seg['end']
            if gap > min_gap_samples:
                chunks.append(torch.zeros(min(gap // 2, silent_samples)))
    return torch.cat(chunks)
class VadPipe(BasePipe):
    """Pipeline stage that removes silence from raw audio with Silero VAD.

    Consumes ``MetaItem.source_audio`` (float32 PCM bytes), keeps only the
    detected speech segments (with short silence padding between them, via
    ``collect_chunks_improved``) and stores the result in ``MetaItem.audio``.
    """

    model = None              # shared SileroVADProcessor, created lazily in init()
    sample_rate = 16000       # Hz; the rate the VAD model expects
    window_size_samples = 512

    @classmethod
    def init(cls):
        """Lazily build the shared VAD model and its streaming iterator."""
        if cls.model is None:
            cls.model = SileroVADProcessor(
                activate_threshold=0.45,   # lowered to capture more audio
                fusion_threshold=0.45,     # raised to better fuse speech segments
                min_speech_duration=0.2,   # slightly lowered to catch short syllables
                max_speech_duration=20,
                min_silence_duration=300,  # ms; allow natural pauses while speaking
                sample_rate=cls.sample_rate,
            )
            cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate)
            cls.vac.reset_states()

    def get_previous_buffer(self):
        """Return the most recent previous buffer, or an empty array.

        NOTE(review): ``self.previous_buffer`` is not set in this class —
        presumably maintained by BasePipe; the ``== 2`` length check suggests
        a two-slot history. Verify against the base class.
        """
        if len(self.previous_buffer) == 2:
            return self.previous_buffer[-1]
        return np.array([], dtype=np.float32)

    # def reduce_noise(self, data):
    #     return nr.reduce_noise(y=data, sr=self.sample_rate)

    def process(self, in_data: MetaItem) -> MetaItem:
        """Run VAD over the incoming audio and keep only speech.

        Args:
            in_data: item whose ``source_audio`` holds float32 PCM bytes.

        Returns:
            The same item with ``audio`` set to the speech-only samples
            (numpy array) or ``b""`` when no speech was detected, and
            ``source_audio`` cleared.
        """
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        # source_audio = self.reduce_noise(source_audio)
        send_audio = b""  # stays b"" when no speech is found
        speech_timestamps = get_speech_timestamps(
            torch.Tensor(source_audio),
            self.model.silero_vad,
            sampling_rate=self.sample_rate,  # was hard-coded 16000; keep consistent with init()
        )
        if speech_timestamps:
            send_audio = collect_chunks_improved(
                speech_timestamps, torch.Tensor(source_audio)
            ).numpy()
            # send_audio = self.reduce_noise(send_audio).tobytes()
        in_data.audio = send_audio
        in_data.source_audio = b""
        return in_data