# streaming/whisper_streaming/vac_online_processor.py
# (from commit d860e14 by rjzevallos: "Add simulstreaming_whisper module,
#  update requirements, improve Dockerfile and model handling")
from whisper_streaming.base import OnlineProcessorInterface
from whisper_streaming.silero_vad_iterator import FixedVADIterator
import numpy as np
import logging
logger = logging.getLogger(__name__)
import sys
class VACOnlineASRProcessor(OnlineProcessorInterface):
    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).

    It works the same way as OnlineASRProcessor: it receives chunks of audio
    (e.g. 0.04 seconds), it runs VAD and continuously detects whether there is
    speech or not. When it detects end of speech (non-voice for 500ms), it
    makes OnlineASRProcessor end the utterance immediately.
    '''

    def __init__(self, online_chunk_size, online, min_buffered_length=1):
        """
        online_chunk_size -- seconds of buffered speech required before
            process_iter() forwards work to the wrapped processor.
        online -- the wrapped OnlineASRProcessor instance.
        min_buffered_length -- seconds of trailing audio kept in the local
            buffer during non-voice, so the VAD can retroactively find a
            speech start inside it.
        """
        self.online_chunk_size = online_chunk_size
        self.online = online
        # SAMPLING_RATE is provided by OnlineProcessorInterface.
        self.min_buffered_frames = int(min_buffered_length * self.SAMPLING_RATE)

        # VAC: load the Silero VAD model; FixedVADIterator wraps it so it
        # accepts arbitrary-length chunks. We use the default options there:
        # 500ms silence, 100ms padding, etc.
        import torch
        model, _ = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad'
        )
        self.vac = FixedVADIterator(model)

        self.init()

    def init(self):
        """Reset the wrapped processor, the VAD and all local state."""
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0  # frames forwarded but not yet transcribed
        self.is_currently_final = False
        self.status = None  # or "voice" or "nonvoice"
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_offset = 0  # absolute stream position of audio_buffer[0], in frames

    def clear_buffer(self):
        self.audio_buffer = np.array([], dtype=np.float32)

    def _trim_buffer(self, frame):
        """Drop audio_buffer up to local index `frame`, keeping at most
        min_buffered_frames of trailing audio (the VAD may later find a voice
        start in it), and advance buffer_offset to match.
        """
        # Clamp to >= 0: `frame` can equal (or, with VAD jitter, exceed) the
        # buffer length, and a negative keep count would corrupt the offset.
        keep_frames = min(max(0, len(self.audio_buffer) - frame),
                          self.min_buffered_frames)
        self.buffer_offset += len(self.audio_buffer) - keep_frames
        if keep_frames > 0:
            self.audio_buffer = self.audio_buffer[-keep_frames:]
        else:
            # BUG FIX: audio_buffer[-0:] would keep the WHOLE buffer even
            # though buffer_offset was advanced past it, desynchronizing the
            # VAD frame bookkeeping. Clear explicitly instead.
            self.clear_buffer()

    def insert_audio_chunk(self, audio):
        """Feed one chunk of float32 mono audio sampled at SAMPLING_RATE.

        Runs the VAD on the chunk; voiced audio is forwarded to the wrapped
        online processor, and a detected end of speech marks the current
        utterance as final so the next process_iter() flushes it.
        """
        res = self.vac(audio)
        self.audio_buffer = np.append(self.audio_buffer, audio)

        if res is not None:
            # The VAD reports absolute frame indices; convert to an index
            # local to audio_buffer (clamped: the event may predate it).
            frame = list(res.values())[0] - self.buffer_offset
            frame = max(0, frame)
            if 'start' in res and 'end' not in res:
                # Speech begins and is still ongoing: forward the tail.
                self.status = 'voice'
                send_audio = self.audio_buffer[frame:]
                self.online.init(offset=(frame + self.buffer_offset) / self.SAMPLING_RATE)
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.buffer_offset += len(self.audio_buffer)
                self.clear_buffer()
            elif 'end' in res and 'start' not in res:
                # Ongoing speech ends inside this buffer.
                self.status = 'nonvoice'
                if frame > 0:
                    send_audio = self.audio_buffer[:frame]
                    self.online.insert_audio_chunk(send_audio)
                    self.current_online_chunk_buffer_size += len(send_audio)
                self.is_currently_final = True
                self._trim_buffer(frame)
            else:
                # Both start and end: a complete utterance inside the buffer.
                beg = max(0, res["start"] - self.buffer_offset)
                end = max(0, res["end"] - self.buffer_offset)
                self.status = 'nonvoice'
                if beg < end:
                    send_audio = self.audio_buffer[beg:end]
                    self.online.init(offset=((beg + self.buffer_offset) / self.SAMPLING_RATE))
                    self.online.insert_audio_chunk(send_audio)
                    self.current_online_chunk_buffer_size += len(send_audio)
                self.is_currently_final = True
                self._trim_buffer(end)
        else:
            if self.status == 'voice':
                # Inside an utterance: forward everything buffered so far.
                self.online.insert_audio_chunk(self.audio_buffer)
                self.current_online_chunk_buffer_size += len(self.audio_buffer)
                self.buffer_offset += len(self.audio_buffer)
                self.clear_buffer()
            else:
                # We keep min_buffered_length seconds because VAD may later
                # find start of voice in it. But we trim it to prevent OOM.
                self._trim_buffer(0)

    def process_iter(self):
        """Return a transcription update dict, or {} when only VAD ran."""
        if self.is_currently_final:
            # An utterance just ended: flush it immediately.
            return self.finish()
        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE * self.online_chunk_size:
            # Enough speech buffered for a regular online update.
            self.current_online_chunk_buffer_size = 0
            return self.online.process_iter()
        else:
            logger.info(f"no online update, only VAD. {self.status}")
            return {}

    def finish(self):
        """Flush the wrapped processor and reset per-utterance state."""
        ret = self.online.finish()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        return ret