Spaces:

rjzevallos
/

streaming

Runtime error

App Files Files Community

streaming / whisper_streaming /whisper_server.py

rjzevallos

Add simulstreaming_whisper module, update requirements, improve Dockerfile and model handling

d860e14 about 2 months ago

raw

history blame contribute delete

6.09 kB

	#!/usr/bin/env python3
	from whisper_streaming.whisper_online_main import *

	import sys
	import argparse
	import os
	import logging
	import numpy as np

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Sample rate (samples per second) used both to decode the raw audio received
	# from clients and to convert the minimum chunk length from seconds to samples.
	SAMPLING_RATE = 16000


	######### Server objects

	import whisper_streaming.line_packet as line_packet
	import socket

	class Connection:
	    '''Thin wrapper around one connected client socket.

	    It sends text lines through ``line_packet`` (suppressing exact repeats)
	    and reads raw audio bytes straight from the socket.
	    '''

	    # Maximum number of bytes requested from one recv() call: 5 minutes of
	    # audio at 32000 bytes per second (16000 samples/s, 2 bytes per sample,
	    # one channel).  # was: 65536
	    PACKET_SIZE = 32000 * 5 * 60

	    def __init__(self, conn):
	        '''Store the socket ``conn`` and put it into blocking mode.'''
	        self.conn = conn
	        # Last line passed to send(); used to suppress immediate duplicates.
	        self.last_line = ""

	        self.conn.setblocking(True)

	    def send(self, line):
	        '''Send ``line`` to the client unless it equals the previously sent line.

	        it doesn't send the same line twice, because it was problematic in online-text-flow-events
	        '''
	        if line == self.last_line:
	            return
	        line_packet.send_one_line(self.conn, line)
	        self.last_line = line

	    def receive_lines(self):
	        '''Return whatever ``line_packet.receive_lines`` reads from the socket.'''
	        return line_packet.receive_lines(self.conn)

	    def non_blocking_receive_audio(self):
	        '''Read up to ``PACKET_SIZE`` bytes of raw audio from the socket.

	        Despite its name this call blocks: the socket was switched to
	        blocking mode in ``__init__``.

	        Returns the received bytes (empty once the peer has closed the
	        connection), or ``None`` when the connection was reset.
	        '''
	        try:
	            return self.conn.recv(self.PACKET_SIZE)
	        except ConnectionResetError:
	            return None

	import io
	import soundfile

	# wraps socket and ASR object, and serves one client connection.
	# next client should be served by a new instance of this object
	class ServerProcessor:
	    '''Serves one client: reads raw audio from the connection, feeds it to the
	    online ASR processor and sends each result back as a text line.

	    Create a new instance for every client connection.

	    c: connection object providing ``non_blocking_receive_audio()`` and ``send(line)``.
	    online_asr_proc: online processor providing ``init()``, ``insert_audio_chunk(audio)``
	        and ``process_iter()``.
	    min_chunk: minimum amount of audio, in seconds, to collect before processing.
	    '''

	    def __init__(self, c, online_asr_proc, min_chunk):
	        self.connection = c
	        self.online_asr_proc = online_asr_proc
	        self.min_chunk = min_chunk

	        self.is_first = True
	        # First byte of a 16-bit sample whose second byte has not arrived yet.
	        self._pending_bytes = b""

	    def receive_audio_chunk(self):
	        '''Collect audio from the connection and return it as one float32 array.

	        receive all audio that is available by this time
	        blocks operation if less than self.min_chunk seconds is available
	        unblocks if connection is closed or a chunk is available

	        Returns ``None`` when nothing was received, or when the very first
	        chunk of the connection is shorter than ``min_chunk`` seconds.
	        '''
	        out = []
	        received = 0  # running number of decoded samples in ``out``
	        minlimit = self.min_chunk*SAMPLING_RATE
	        while received < minlimit:
	            raw_bytes = self.connection.non_blocking_receive_audio()
	            if not raw_bytes:
	                break
	            # The stream is 16-bit PCM, but recv() may stop in the middle of a
	            # sample. Keep the dangling byte for the next read so that later
	            # samples stay aligned instead of being decoded one byte off.
	            raw_bytes = self._pending_bytes + raw_bytes
	            usable = len(raw_bytes) - (len(raw_bytes) % 2)
	            self._pending_bytes = raw_bytes[usable:]
	            if not usable:
	                continue
	            sf = soundfile.SoundFile(io.BytesIO(raw_bytes[:usable]), channels=1, endian="LITTLE", samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
	            audio, _ = librosa.load(sf, sr=SAMPLING_RATE, dtype=np.float32)
	            out.append(audio)
	            received += len(audio)
	        if not out:
	            return None
	        conc = np.concatenate(out)
	        if self.is_first and len(conc) < minlimit:
	            return None
	        self.is_first = False
	        return conc

	    def send_result(self, iteration_output):
	        '''Format one ASR result and send it to the client.

	        output format in stdout is like:
	        0 1720 Takhle to je
	        - the first two words are:
	            - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
	        - the next words: segment transcript

	        iteration_output: mapping with 'start', 'end' (multiplied by 1000 for
	            output) and 'text'; a falsy value means there is nothing to send.
	        '''
	        if iteration_output:
	            message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text'])
	            # The same line is echoed to stderr and sent to the client.
	            print(message, flush=True, file=sys.stderr)
	            self.connection.send(message)
	        else:
	            logger.debug("No text in this segment")

	    def process(self):
	        '''Handle one client connection until no more audio arrives or the pipe breaks.'''
	        self.online_asr_proc.init()
	        while True:
	            a = self.receive_audio_chunk()
	            if a is None:
	                break
	            self.online_asr_proc.insert_audio_chunk(a)
	            o = self.online_asr_proc.process_iter()
	            try:
	                self.send_result(o)
	            except BrokenPipeError:
	                logger.info("broken pipe -- connection closed?")
	                break

	        # TODO: the original author's note, kept disabled:
	        # o = online.finish() # this should be working
	        # self.send_result(o)

	def main_server(factory, add_args):
	    '''
	    Parse the command line, build the ASR objects, optionally warm them up,
	    then accept TCP clients and serve them one after another.

	    factory: function that creates the ASR and online processor object from args and logger.
	            or in the default WhisperStreaming local agreement backends (not implemented but could be).
	    add_args: add specific args for the backend

	    Exits with status 1 when --warmup-file is given but is not an existing file.
	    '''
	    logger = logging.getLogger(__name__)
	    parser = argparse.ArgumentParser()

	    # server options
	    parser.add_argument("--host", type=str, default='localhost')
	    parser.add_argument("--port", type=int, default=43007)
	    parser.add_argument("--warmup-file", type=str, dest="warmup_file",
	            help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. "
	            "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")

	    # options from whisper_online
	    processor_args(parser)

	    # backend-specific options
	    add_args(parser)

	    args = parser.parse_args()

	    set_logging(args, logger)

	    # setting whisper object by args
	    asr, online = asr_factory(args, factory)
	    # The minimum chunk length comes from a different option when --vac is set.
	    if args.vac:
	        min_chunk = args.vac_chunk_size
	    else:
	        min_chunk = args.min_chunk_size

	    # warm up the ASR because the very first transcribe takes more time than the others.
	    # Test results in https://github.com/ufal/whisper_streaming/pull/81
	    msg = "Whisper is not warmed up. The first chunk processing may take longer."
	    if args.warmup_file:
	        if os.path.isfile(args.warmup_file):
	            a = load_audio_chunk(args.warmup_file, 0, 1)
	            asr.warmup(a)
	            logger.info("Whisper is warmed up.")
	        else:
	            logger.critical("The warm up file is not available. "+msg)
	            sys.exit(1)
	    else:
	        logger.warning(msg)

	    # server loop
	    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	        s.bind((args.host, args.port))
	        s.listen(1)
	        logger.info('Listening on %s', (args.host, args.port))
	        while True:
	            conn, addr = s.accept()
	            logger.info('Connected to client on {}'.format(addr))
	            try:
	                connection = Connection(conn)
	                proc = ServerProcessor(connection, online, min_chunk)
	                proc.process()
	            finally:
	                # Release the client socket even when processing raises.
	                conn.close()
	            logger.info('Connection to client closed')
	    # Only reached if the accept loop above is ever given a way to end.
	    logger.info('Connection closed, terminating.')