import tempfile
import io
import numpy as np
import gradio as gr
from pydub import AudioSegment

from _data_model import AppState
from _utils import audio_to_bytes
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui
from test1 import asr_transcribe

DURATION_TOAST_TIMEOUT: int = 5


# def start_recording_user(state: AppState):
#     if not state.stopped:
#         return gr.Audio(recording=True)


def chat_llm(conversation, max_retries=3):
    from _utils import client
    from openai import RateLimitError, APIConnectionError, APIError
    import time

    retries = 0
    while retries < max_retries:
        try:
            # Request a (non-streaming) completion from the LLM
            completion = client.chat.completions.create(
                model="meta/llama-3.1-405b-instruct",
                messages=conversation,
                temperature=0.2,
                top_p=0.7,
                max_tokens=4000,
                stream=False,
            )
            return completion.choices[0].message.content
        except RateLimitError:
            retries += 1
            wait_time = 2 ** retries
            gr.Warning(f"⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except (APIConnectionError, APIError) as e:
            retries += 1
            gr.Warning(f"⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2)
        except Exception as e:
            raise gr.Error(f"❌ Unexpected error: {e}")
    return None


def step_transcribe(audio: tuple, state: AppState):
    # --- Step 1: ASR (Whisper)
    gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
    audio_bytes = audio_to_bytes(audio)
    transcription = asr_transcribe(audio_bytes)

    # Write the recorded audio to a .wav file so it can be shown in the chat
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=(1 if len(audio[1].shape) == 1 else audio[1].shape[1]),
    )
    segment.export(audio_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())

    conversation_audio_bit: dict = {
        "role": "user",
        "content": {"path": f.name, "mime_type": "audio/wav"},
    }
    conversation_text_bit: dict = {
        "role": "user",
        "content": transcription,
    }

    # Update state: the LLM sees the transcription, the chat displays the voice note
    state.llm_conversation.append(conversation_text_bit)
    # state.display_conversation.append(conversation_text_bit)
    state.display_conversation.append(conversation_audio_bit)

    return None, state


def step_llm_response(state: AppState):
    # --- Step 2: LLM call
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
    llm_response = chat_llm(state.llm_conversation)

    # Update conversation with the assistant's answer
    state.llm_response = llm_response
    conversation_a_text_bit: dict = {
        "role": "assistant",
        "content": llm_response,
    }
    state.llm_conversation.append(conversation_a_text_bit)
    state.display_conversation.append(conversation_a_text_bit)

    return state


def step_synth_audio(state: AppState):
    # --- Step 3: TTS
    gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)
    audio_bytes: bytes = riva_tts_service(state.llm_response)

    # Convert bytes to numpy for Gradio playback
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,     # bytes per sample (16-bit PCM → 2)
        frame_rate=44100,   # must match the TTS sample rate
        channels=1,         # mono, adjust as needed
    )
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        samples = samples.reshape((-1, audio_segment.channels))
    playable_audio = (audio_segment.frame_rate, samples)  # usable directly by gr.Audio(type="numpy")

    # Export to a .wav file for persistent chat storage
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
        audio_path = f.name

    conversation_audio_bit = {
        "role": "assistant",
        "content": {
            "path": audio_path,
            "mime_type": "audio/wav",
        },
    }

    # Update state: the chat shows the synthesized voice note
    state.display_conversation.append(conversation_audio_bit)

    return playable_audio, state


with gr.Blocks(css=css_ui) as demo:
    gr.Markdown("""
    # 💬 Talk To Julia about Me (Deepak)
    """)

    # Subtitle / description
    gr.Markdown("""
    **Powered by NVIDIA RIVA + NVIDIA NIM ⚡**
    *Start by asking: “Can you hear me?”*
    """)

    # LinkedIn link
    gr.Markdown("""
    Reach out to me on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
    """)

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
            output_audio: gr.Audio = gr.Audio(
                label="Last voice note from the bot",
                visible=True,
                autoplay=True,
            )
        with gr.Column(scale=1):
            input_audio: gr.Audio = gr.Audio(
                label="Press the Record button to speak with the chatbot. Stop it to send your voice note. :)",
                sources=["microphone"],
                type="numpy",
                scale=1,
            )
            gr.Markdown('''
            ## Models in use:
            1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
            2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
            3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
            ''')

    state = gr.State(value=AppState(
        llm_conversation=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            }
        ]
    ))

    respond = input_audio.stop_recording(
        step_transcribe,
        [input_audio, state],
        [input_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio],
    )
    respond.then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_llm_response, [state], [state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio],
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_synth_audio, [state], [output_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio],
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    )

if __name__ == "__main__":
    demo.launch()