# voiceBot/app.py
import tempfile
import gradio as gr
from _data_model import AppState
from _utils import audio_to_bytes
from test1 import asr_transcribe
import numpy as np
import io
from pydub import AudioSegment
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui
DURATION_TOAST_TIMEOUT: int = 5
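# NOTE: AppState (defined in _data_model.py) is expected to expose at least
# `llm_conversation` (messages sent to the LLM), `display_conversation`
# (messages rendered in the chatbot), and `llm_response` (the latest reply),
# based on how the handlers below use it.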
# def start_recording_user(state: AppState):
# if not state.stopped:
# return gr.Audio(recording=True)
def chat_llm(conversation, max_retries=3):
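    """Send the conversation to the NIM-hosted Llama 3.1 405B model and return its reply.

    Retries with exponential backoff when the API rate-limits, retries after a short
    delay on connection/API errors, and returns None once retries are exhausted or
    an unexpected error occurs.
    """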
retries = 0
from _utils import client
from openai import RateLimitError, APIConnectionError, APIError
import time
while retries < max_retries:
try:
# Try to start streaming LLM output
completion = client.chat.completions.create(
model="meta/llama-3.1-405b-instruct",
messages=conversation,
temperature=0.2,
top_p=0.7,
max_tokens=4000,
stream=False
)
answer = completion.choices[0].message.content
            return answer
except RateLimitError:
retries += 1
wait_time = 2 ** retries
gr.Warning("⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
time.sleep(wait_time)
except (APIConnectionError, APIError) as e:
retries += 1
gr.Warning("⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
time.sleep(2)
except Exception as e:
gr.Error(f"❌ Unexpected error: {e}")
return None
def step_transcribe(audio:tuple, state: AppState):
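    """Transcribe the user's recording with Whisper and append it to the conversation state.

    `audio` is the (sample_rate, samples) tuple produced by gr.Audio(type="numpy").
    The recording is also exported to a temporary .wav file so the voice note can be
    shown in the chat history.
    """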
    # --- Step 1: ASR (speech-to-text with Whisper)
gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
audio_bytes = audio_to_bytes(audio)
transcription = asr_transcribe(audio_bytes)
    # Convert the raw numpy audio into an in-memory .wav file
audio_buffer = io.BytesIO()
segment = AudioSegment(
audio[1].tobytes(),
frame_rate=audio[0],
sample_width=audio[1].dtype.itemsize,
channels=(1 if len(audio[1].shape) == 1 else audio[1].shape[1]),
)
segment.export(audio_buffer, format="wav")
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(audio_buffer.getvalue())
conversation_audio_bit: dict = {"role": "user",
"content": {"path": f.name,
"mime_type": "audio/wav"}}
conversation_text_bit: dict = {
"role": "user",
"content": transcription
}
# update state variables
state.llm_conversation.append(conversation_text_bit)
#
state.display_conversation.append(conversation_text_bit)
state.display_conversation.append(conversation_audio_bit)
return None, state
def step_llm_response(state: AppState):
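    """Send the running conversation to the LLM and store the reply in the state."""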
    # --- Step 2: LLM (ask the model for a reply to the conversation so far)
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
llm_response = chat_llm(state.llm_conversation)
# update conversation
state.llm_response = llm_response
conversation_a_text_bit: dict = {
"role": "assistant",
"content": llm_response
}
state.llm_conversation.append(conversation_a_text_bit)
state.display_conversation.append(conversation_a_text_bit)
return state
def step_synth_audio(state: AppState):
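    """Synthesize the latest LLM reply with Riva TTS and return audio gr.Audio can play.

    The synthesized speech is also exported to a temporary .wav file so the bot's
    voice note can be shown in the chat history.
    """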
gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)
    # --- TTS: synthesize the response audio with Riva
    audio_bytes: bytes = riva_tts_service(state.llm_response)
# --- Convert bytes to numpy for Gradio playback
audio_segment = AudioSegment(
data=audio_bytes,
sample_width=2, # bytes per sample (e.g. 16-bit PCM → 2)
frame_rate=44100, # or whatever your TTS sample rate is
channels=1 # mono, or adjust as needed
)
samples = np.array(audio_segment.get_array_of_samples())
if audio_segment.channels > 1:
samples = samples.reshape((-1, audio_segment.channels))
playable_audio = (audio_segment.frame_rate, samples) # usable directly by gr.Audio(type="numpy")
# --- (Optional) Export to .wav file for persistent chat storage
audio_buffer = io.BytesIO()
audio_segment.export(audio_buffer, format="wav")
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(audio_buffer.getvalue())
audio_path = f.name
conversation_audio_bit = {
"role": "assistant",
"content": {
"path": audio_path,
"mime_type": "audio/wav"
}
}
# update state variables
state.display_conversation.append(conversation_audio_bit)
return playable_audio, state
with gr.Blocks(css=css_ui) as demo:
gr.Markdown("""
# 💬 Talk To Julia about Me (Deepak)
""")
# Subtitle / description
gr.Markdown("""
**Powered by NVIDIA RIVA + NVIDIA NIM ⚡**
*Start by asking: “Can you hear me?”*
""")
# LinkedIn link
gr.Markdown("""
Reach out to me on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
""")
with gr.Row():
        with gr.Column(scale=4):
chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
output_audio: gr.Audio = gr.Audio(
label="Last voice note from the bot",
visible=True,
autoplay=True
)
with gr.Column(scale=1):
input_audio: gr.Audio = gr.Audio(
label="Press Record button to speak with the chatbot. Stop it to send your voice note. :)",
sources="microphone", type="numpy",
scale=1
)
gr.Markdown('''
## Models in use:
1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
''')
state = gr.State(value=AppState(
llm_conversation=[
{
"role": "system",
"content": SYSTEM_PROMPT
}
]
))
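    # Event chain: stopping the recording transcribes the voice note, refreshes the chat,
    # generates the LLM reply, refreshes the chat, synthesizes the reply audio, and
    # refreshes the chat once more.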
respond = input_audio.stop_recording(
step_transcribe,
[input_audio, state],
[input_audio, state],
show_progress="full",
show_progress_on=[input_audio, output_audio]
)
respond.then(
lambda s: s.display_conversation, [state], [chatbot]
).then(
step_llm_response, [state], [state], show_progress="full", show_progress_on=[input_audio, output_audio]
).then(
lambda s: s.display_conversation, [state], [chatbot]
).then(
step_synth_audio, [state], [output_audio, state], show_progress="full", show_progress_on=[input_audio, output_audio]
).then(
lambda s: s.display_conversation, [state], [chatbot]
)
if __name__ == "__main__":
demo.launch()