tear_5 / webui /omni_streamlit.py

Upload folder using huggingface_hub

11eefb3 verified about 1 year ago

13 kB

	import sys
	import os

	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from utils.vad import get_speech_timestamps, collect_chunks, VadOptions

	import streamlit as st
	import wave

	# from ASR import recognize
	import requests
	import pyaudio
	import numpy as np
	import base64
	import io
	from typing import List
	import av
	import os
	import time
	import tempfile
	import librosa
	import traceback
	from pydub import AudioSegment
	from datetime import datetime
	from PIL import Image
	import streamlit_webrtc
	from io import BytesIO

	# set wide mode
	# st.set_page_config(layout="wide")


	last_video_frame = None
	last_video_frame_ts = time.time()


	API_URL = os.getenv("API_URL", "http://127.0.0.1:60808/chat")
	API_URL = None if API_URL == "" else API_URL

	# recording parameters
	IN_FORMAT = pyaudio.paInt16
	IN_CHANNELS = 1
	IN_RATE = 24000
	IN_CHUNK = 1024
	IN_SAMPLE_WIDTH = 2
	VAD_STRIDE = 0.5

	# playing parameters
	OUT_FORMAT = pyaudio.paInt16
	OUT_CHANNELS = 1
	OUT_RATE = 24000
	OUT_SAMPLE_WIDTH = 2
	OUT_CHUNK = 5760


	# Initialize chat history
	if "messages" not in st.session_state:
	st.session_state.messages = []


	def run_vad(ori_audio, sr):
	_st = time.time()
	try:
	audio = np.frombuffer(ori_audio, dtype=np.int16)
	audio = audio.astype(np.float32) / 32768.0
	sampling_rate = 16000
	if sr != sampling_rate:
	audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)

	vad_parameters = {}
	vad_parameters = VadOptions(**vad_parameters)
	speech_chunks = get_speech_timestamps(audio, vad_parameters)
	audio = collect_chunks(audio, speech_chunks)
	duration_after_vad = audio.shape[0] / sampling_rate

	if sr != sampling_rate:
	# resample to original sampling rate
	vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
	else:
	vad_audio = audio
	vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
	vad_audio_bytes = vad_audio.tobytes()

	return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
	except Exception as e:
	msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
	print(msg)
	return -1, ori_audio, round(time.time() - _st, 4)


	def warm_up():
	frames = b"\x00\x00" * 1024 * 2 # 1024 frames of 2 bytes each
	dur, frames, tcost = run_vad(frames, 16000)
	print(f"warm up done, time_cost: {tcost:.3f} s")


	def save_tmp_audio(audio_bytes):
	with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
	file_name = tmpfile.name
	audio = AudioSegment(
	data=audio_bytes,
	sample_width=OUT_SAMPLE_WIDTH,
	frame_rate=OUT_RATE,
	channels=OUT_CHANNELS,
	)
	audio.export(file_name, format="wav")
	return file_name


	def speaking(status, resp_text_holder=None, encoded_img=None):

	# Initialize PyAudio
	p = pyaudio.PyAudio()

	# Open PyAudio stream
	stream = p.open(
	format=OUT_FORMAT, channels=OUT_CHANNELS, rate=OUT_RATE, output=True
	)

	audio_buffer = io.BytesIO()
	wf = wave.open(audio_buffer, "wb")
	wf.setnchannels(IN_CHANNELS)
	wf.setsampwidth(IN_SAMPLE_WIDTH)
	wf.setframerate(IN_RATE)
	total_frames = b"".join(st.session_state.frames)
	dur = len(total_frames) / (IN_RATE * IN_CHANNELS * IN_SAMPLE_WIDTH)
	status.warning(f"Speaking... recorded audio duration: {dur:.3f} s")
	wf.writeframes(total_frames)

	with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
	with open(tmpfile.name, "wb") as f:
	f.write(audio_buffer.getvalue())

	with open("input_audio.wav", "wb") as f:
	f.write(audio_buffer.getvalue())

	file_name = tmpfile.name
	with st.chat_message("user"):
	st.audio(file_name, format="audio/wav", loop=False, autoplay=False)
	st.session_state.messages.append(
	{"role": "assistant", "content": file_name, "type": "audio"}
	)

	st.session_state.frames = []

	audio_bytes = audio_buffer.getvalue()
	base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
	if API_URL is not None:
	output_audio_bytes = b""
	files = {"audio": base64_encoded}
	if encoded_img is not None:
	files["image"] = encoded_img
	print("sending request to server")
	resp_text_holder.empty()
	resp_text = ""
	with requests.post(API_URL, json=files, stream=True) as response:
	try:
	buffer = b''
	for chunk in response.iter_content(chunk_size=2048):
	buffer += chunk
	while b'\r\n--frame\r\n' in buffer:
	frame, buffer = buffer.split(b'\r\n--frame\r\n', 1)
	if b'Content-Type: audio/wav' in frame:
	audio_data = frame.split(b'\r\n\r\n', 1)[1]
	# audio_data = base64.b64decode(audio_data)
	output_audio_bytes += audio_data
	audio_array = np.frombuffer(audio_data, dtype=np.int8)
	stream.write(audio_array)
	elif b'Content-Type: text/plain' in frame:
	text_data = frame.split(b'\r\n\r\n', 1)[1].decode()
	resp_text += text_data
	if len(text_data) > 0:
	print(resp_text, end='\r')
	resp_text_holder.write(resp_text)

	except Exception as e:
	st.error(f"Error during audio streaming: {e}")

	out_file = save_tmp_audio(output_audio_bytes)
	with st.chat_message("assistant"):
	st.write(resp_text)
	with st.chat_message("assistant"):
	st.audio(out_file, format="audio/wav", loop=False, autoplay=False)
	st.session_state.messages.append(
	{"role": "assistant", "content": resp_text, "type": "text"}
	)
	st.session_state.messages.append(
	{"role": "assistant", "content": out_file, "type": "audio"}
	)
	else:
	st.error("API_URL is not set. Please set the API_URL environment variable.")
	time.sleep(1)

	wf.close()
	# Close PyAudio stream and terminate PyAudio
	stream.stop_stream()
	stream.close()
	p.terminate()
	st.session_state.speaking = False
	st.session_state.recording = True


	def recording(status):
	audio = pyaudio.PyAudio()

	stream = audio.open(
	format=IN_FORMAT,
	channels=IN_CHANNELS,
	rate=IN_RATE,
	input=True,
	frames_per_buffer=IN_CHUNK,
	)

	temp_audio = b""
	vad_audio = b""

	start_talking = False
	last_temp_audio = None
	st.session_state.frames = []

	while st.session_state.recording:
	status.success("Listening...")
	audio_bytes = stream.read(IN_CHUNK)
	temp_audio += audio_bytes

	if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
	dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)

	print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")

	if dur_vad > 0.2 and not start_talking:
	if last_temp_audio is not None:
	st.session_state.frames.append(last_temp_audio)
	start_talking = True
	if start_talking:
	st.session_state.frames.append(temp_audio)
	if dur_vad < 0.1 and start_talking:
	st.session_state.recording = False
	print(f"speech end detected. excit")
	last_temp_audio = temp_audio
	temp_audio = b""

	stream.stop_stream()
	stream.close()

	audio.terminate()


	async def queued_video_frames_callback(frames: List[av.VideoFrame]) -> List[av.VideoFrame]:
	# print(f"test-------queued_video_frames_callback")
	global last_video_frame
	global last_video_frame_ts
	if len(frames) != 0:
	if time.time() - last_video_frame_ts > 1:
	last_frame = frames[-1]
	# with video_frame_lock:
	# last_video_frame[0] = last_frame.to_image()
	# last_video_frame_ts[0] = time.time()
	last_video_frame = last_frame.to_image()
	last_video_frame_ts = time.time()

	return frames



	def main():

	st.title("Chat Mini-Omni2 Demo")
	status = st.empty()

	# Mode selection
	mode = st.radio(
	"Select mode:",
	("Audio-only", "Audio-vision"),
	key="mode_selection",
	horizontal=True
	)

	if mode == "Audio-only":
	st.session_state.use_vision = False
	st.info("Audio-only mode selected. The system will process only audio input.")
	else: # Audio-vision
	st.session_state.use_vision = True
	st.info("Audio-vision mode selected. The system will process both audio and video input.")

	if "warm_up" not in st.session_state:
	warm_up()
	st.session_state.warm_up = True
	if "start" not in st.session_state:
	st.session_state.start = False
	if "recording" not in st.session_state:
	st.session_state.recording = False
	if "speaking" not in st.session_state:
	st.session_state.speaking = False
	if "frames" not in st.session_state:
	st.session_state.frames = []

	if not st.session_state.start:
	status.warning("Click Start to chat")

	start_col, stop_col, _ = st.columns([0.2, 0.2, 0.6])
	start_button = start_col.button("Start", key="start_button")
	# stop_button = stop_col.button("Stop", key="stop_button")
	if start_button:
	time.sleep(1)
	st.session_state.recording = True
	st.session_state.start = True

	if st.session_state.use_vision:
	with st.sidebar:

	webrtc_ctx = streamlit_webrtc.webrtc_streamer(
	key="speech-w-video",
	mode=streamlit_webrtc.WebRtcMode.SENDRECV,
	# rtc_configuration={"iceServers": get_ice_servers()},
	media_stream_constraints={"video": True, "audio": False},
	# video_receiver_size=10, # Increased from default 4 to 10
	queued_video_frames_callback=queued_video_frames_callback,
	)

	if not webrtc_ctx.state.playing:
	st.warning("Please allow camera access and try again.")
	return

	resp_text_holder = st.empty()

	for message in st.session_state.messages:
	with st.chat_message(message["role"]):
	if message["type"] == "text":
	st.markdown(message["content"])
	elif message["type"] == "img":
	st.image(message["content"], width=300)
	elif message["type"] == "audio":
	st.audio(
	message["content"], format="audio/wav", loop=False, autoplay=False
	)

	while st.session_state.start:
	if st.session_state.recording:
	recording(status)

	if not st.session_state.recording and st.session_state.start:
	encoded_img = None
	if st.session_state.use_vision:
	# last_img = webrtc_ctx.video_receiver.get_frame(timeout=5).to_image()
	last_img = last_video_frame
	if last_img:
	with st.chat_message("user"):
	st.image(last_img, width=300)
	st.session_state.messages.append({"role": "user", "content": last_img, "type": "img"})

	input_img = last_img
	buffer = BytesIO()
	input_img.save(buffer, format="JPEG")

	with open("input_image.jpg", "wb") as f:
	f.write(buffer.getvalue())

	encoded_img = str(base64.b64encode(buffer.getvalue()), encoding="utf-8")
	else:
	st.error("No image captured. Please allow camera access and try again.")
	return

	st.session_state.speaking = True
	speaking(status, resp_text_holder, encoded_img)

	# if stop_button:
	# status.warning("Stopped, click Start to chat")
	# st.session_state.start = False
	# st.session_state.recording = False
	# st.session_state.frames = []
	# break


	if __name__ == "__main__":
	main()