Spaces:
Runtime error
Runtime error
Add simulstreaming_whisper module, update requirements, improve Dockerfile and model handling
d860e14
| from whisper_streaming.base import OnlineProcessorInterface | |
| from whisper_streaming.silero_vad_iterator import FixedVADIterator | |
| import numpy as np | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| import sys | |
| class VACOnlineASRProcessor(OnlineProcessorInterface): | |
| '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller). | |
| It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds), | |
| it runs VAD and continuously detects whether there is speech or not. | |
| When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately. | |
| ''' | |
| def __init__(self, online_chunk_size, online, min_buffered_length=1): | |
| self.online_chunk_size = online_chunk_size | |
| self.online = online | |
| self.min_buffered_frames = int(min_buffered_length * self.SAMPLING_RATE) | |
| # VAC: | |
| import torch | |
| model, _ = torch.hub.load( | |
| repo_or_dir='snakers4/silero-vad', | |
| model='silero_vad' | |
| ) | |
| self.vac = FixedVADIterator(model) # we use the default options there: 500ms silence, 100ms padding, etc. | |
| self.init() | |
| def init(self): | |
| self.online.init() | |
| self.vac.reset_states() | |
| self.current_online_chunk_buffer_size = 0 | |
| self.is_currently_final = False | |
| self.status = None # or "voice" or "nonvoice" | |
| self.audio_buffer = np.array([],dtype=np.float32) | |
| self.buffer_offset = 0 # in frames | |
| def clear_buffer(self): | |
| self.audio_buffer = np.array([],dtype=np.float32) | |
| def insert_audio_chunk(self, audio): | |
| res = self.vac(audio) | |
| self.audio_buffer = np.append(self.audio_buffer, audio) | |
| if res is not None: | |
| frame = list(res.values())[0] - self.buffer_offset | |
| frame = max(0, frame) | |
| if 'start' in res and 'end' not in res: | |
| self.status = 'voice' | |
| send_audio = self.audio_buffer[frame:] | |
| self.online.init(offset=(frame + self.buffer_offset)/self.SAMPLING_RATE) | |
| self.online.insert_audio_chunk(send_audio) | |
| self.current_online_chunk_buffer_size += len(send_audio) | |
| self.buffer_offset += len(self.audio_buffer) | |
| self.clear_buffer() | |
| elif 'end' in res and 'start' not in res: | |
| self.status = 'nonvoice' | |
| if frame > 0: | |
| send_audio = self.audio_buffer[:frame] | |
| self.online.insert_audio_chunk(send_audio) | |
| self.current_online_chunk_buffer_size += len(send_audio) | |
| self.is_currently_final = True | |
| keep_frames = min(len(self.audio_buffer) - frame, self.min_buffered_frames) | |
| self.buffer_offset += len(self.audio_buffer) - keep_frames | |
| self.audio_buffer = self.audio_buffer[-keep_frames:] | |
| else: | |
| beg = max(0, res["start"] - self.buffer_offset) | |
| end = max(0, res["end"] - self.buffer_offset) | |
| self.status = 'nonvoice' | |
| if beg < end: | |
| send_audio = self.audio_buffer[beg:end] | |
| self.online.init(offset=((beg + self.buffer_offset)/self.SAMPLING_RATE)) | |
| self.online.insert_audio_chunk(send_audio) | |
| self.current_online_chunk_buffer_size += len(send_audio) | |
| self.is_currently_final = True | |
| keep_frames = min(len(self.audio_buffer) - end, self.min_buffered_frames) | |
| self.buffer_offset += len(self.audio_buffer) - keep_frames | |
| self.audio_buffer = self.audio_buffer[-keep_frames:] | |
| else: | |
| if self.status == 'voice': | |
| self.online.insert_audio_chunk(self.audio_buffer) | |
| self.current_online_chunk_buffer_size += len(self.audio_buffer) | |
| self.buffer_offset += len(self.audio_buffer) | |
| self.clear_buffer() | |
| else: | |
| # We keep 1 second because VAD may later find start of voice in it. | |
| # But we trim it to prevent OOM. | |
| self.buffer_offset += max(0, len(self.audio_buffer) - self.min_buffered_frames) | |
| self.audio_buffer = self.audio_buffer[-self.min_buffered_frames:] | |
| def process_iter(self): | |
| if self.is_currently_final: | |
| return self.finish() | |
| elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size: | |
| self.current_online_chunk_buffer_size = 0 | |
| ret = self.online.process_iter() | |
| return ret | |
| else: | |
| logger.info(f"no online update, only VAD. {self.status}") | |
| return {} | |
| def finish(self): | |
| ret = self.online.finish() | |
| self.current_online_chunk_buffer_size = 0 | |
| self.is_currently_final = False | |
| return ret |