#!/usr/bin/env python3
from whisper_streaming.whisper_online_main import *

import sys
import argparse
import os
import logging
import numpy as np

logger = logging.getLogger(__name__)

SAMPLING_RATE = 16000

######### Server objects

import whisper_streaming.line_packet as line_packet
import socket


class Connection:
    """Thin wrapper around an accepted connection object.

    Text lines go out through ``line_packet``; raw audio bytes come in
    through ``recv`` on the wrapped object.
    """

    # Upper bound on the number of bytes requested by a single recv() call.
    PACKET_SIZE = 32000*5*60  # 5 minutes # was: 65536

    def __init__(self, conn):
        self.conn = conn
        # Most recently sent line, remembered so that send() can skip repeats.
        self.last_line = ""
        self.conn.setblocking(True)

    def send(self, line):
        """Send ``line`` unless it equals the previously sent line.

        Sending the same line twice was problematic in
        online-text-flow-events, so an immediate repeat is dropped.
        """
        if line != self.last_line:
            line_packet.send_one_line(self.conn, line)
            self.last_line = line

    def receive_lines(self):
        """Return whatever ``line_packet.receive_lines`` reads from the connection."""
        return line_packet.receive_lines(self.conn)

    def non_blocking_receive_audio(self):
        """Receive up to ``PACKET_SIZE`` bytes from the connection.

        Note that ``__init__`` switches the connection to blocking mode, so
        despite the method name this call may wait for data.

        Returns the received bytes, or ``None`` when the peer reset the
        connection.
        """
        try:
            received = self.conn.recv(self.PACKET_SIZE)
        except ConnectionResetError:
            return None
        else:
            return received


import io
import soundfile

# wraps socket and ASR object, and serves one client connection.
# next client should be served by a new instance of this object
class ServerProcessor:
    """Serve one client: read audio from the connection, feed it to the
    online ASR processor and send each iteration's output back.
    """

    def __init__(self, c, online_asr_proc, min_chunk):
        """Store the collaborators.

        c: connection wrapper providing ``non_blocking_receive_audio()`` and
            ``send(line)``.
        online_asr_proc: online processor providing ``init()``,
            ``insert_audio_chunk()`` and ``process_iter()``.
        min_chunk: minimum chunk length in seconds (it is multiplied by
            SAMPLING_RATE to obtain a sample count).
        """
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk

        # True until the first chunk has been handed to the ASR.
        self.is_first = True

    def receive_audio_chunk(self):
        """Collect audio from the connection until ``min_chunk`` seconds are available.

        Receives all audio that is available by this time. Blocks while less
        than ``self.min_chunk`` seconds is available; unblocks when the
        connection is closed or a chunk is available.

        Returns the concatenated audio samples, or ``None`` when nothing was
        received, or when the very first chunk is shorter than the minimum.
        """
        out = []
        minlimit = self.min_chunk*SAMPLING_RATE
        # Running sample count, so chunk lengths are not re-summed on every
        # iteration.
        received = 0
        while received < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                # None (connection reset) or b"" (no more data).
                break
            # print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
                                     samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
            audio, _ = librosa.load(sf, sr=SAMPLING_RATE, dtype=np.float32)
            out.append(audio)
            received += len(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        # Reuse the array built above instead of concatenating a second time.
        return conc

    def send_result(self, iteration_output):
        """Send one ASR iteration result to the client and echo it to stderr.

        iteration_output: falsy when there is no text, otherwise a mapping
            with 'start', 'end' and 'text' keys; 'start' and 'end' are
            multiplied by 1000 before formatting.
        """
        # output format in stdout is like:
        # 0 1720 Takhle to je
        # - the first two words are:
        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
        # - the next words: segment transcript
        if iteration_output:
            message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000,
                                          iteration_output['end'] * 1000,
                                          iteration_output['text'])
            print(message, flush=True, file=sys.stderr)
            self.connection.send(message)
        else:
            logger.debug("No text in this segment")

    def process(self):
        """Handle one client connection until audio stops or the pipe breaks."""
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                break
            self.online_asr_proc.insert_audio_chunk(a)
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                logger.info("broken pipe -- connection closed?")
                break

#        o = online.finish()  # this should be working
#        self.send_result(o)


def main_server(factory, add_args):
    '''Parse the command line, build the ASR objects and serve clients one at a time.

    factory: function that creates the ASR and online processor object from args and logger.
            or in the default WhisperStreaming local agreement backends (not implemented but could be).
            It is passed to asr_factory together with the parsed args.
    add_args: add specific args for the backend; called with the argument parser.
    '''
    logger = logging.getLogger(__name__)
    parser = argparse.ArgumentParser()

    # server options
    parser.add_argument("--host", type=str, default='localhost')
    parser.add_argument("--port", type=int, default=43007)
    parser.add_argument("--warmup-file", type=str, dest="warmup_file",
            help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. "
            "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")

    # options from whisper_online
    processor_args(parser)

    add_args(parser)

    args = parser.parse_args()

    set_logging(args,logger)

    # setting whisper object by args
    asr, online = asr_factory(args, factory)
    if args.vac:
        min_chunk = args.vac_chunk_size
    else:
        min_chunk = args.min_chunk_size

    # warm up the ASR because the very first transcribe takes more time than the others.
    # Test results in https://github.com/ufal/whisper_streaming/pull/81
    msg = "Whisper is not warmed up. The first chunk processing may take longer."
    if args.warmup_file:
        if os.path.isfile(args.warmup_file):
            a = load_audio_chunk(args.warmup_file,0,1)
            asr.warmup(a)
            logger.info("Whisper is warmed up.")
        else:
            logger.critical("The warm up file is not available. "+msg)
            sys.exit(1)
    else:
        logger.warning(msg)

    # server loop
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((args.host, args.port))
        s.listen(1)
        logger.info('Listening on ' + str((args.host, args.port)))
        while True:
            conn, addr = s.accept()
            logger.info('Connected to client on {}'.format(addr))
            try:
                connection = Connection(conn)
                proc = ServerProcessor(connection, online, min_chunk)
                proc.process()
            finally:
                # Close the client socket even when processing raises, so the
                # descriptor is not leaked while the exception propagates.
                conn.close()
            logger.info('Connection to client closed')
    # NOTE: the accept loop above has no exit path, so this line is only
    # reached if one is added later.
    logger.info('Connection closed, terminating.')