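"""Gradio voice-chat demo: Whisper ASR → Llama 3.1 LLM → NVIDIA Riva (Magpie) TTS."""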
import tempfile
import io

import numpy as np
import gradio as gr
from pydub import AudioSegment

from _data_model import AppState
from _utils import audio_to_bytes
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui
from test1 import asr_transcribe
DURATION_TOAST_TIMEOUT: int = 5

# def start_recording_user(state: AppState):
#     if not state.stopped:
#         return gr.Audio(recording=True)

def chat_llm(conversation, max_retries=3):
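    """Send the running conversation to the LLM and return its reply.

    Retries with exponential backoff on rate limits and brief waits on
    API/network errors; returns None once max_retries is exhausted.
    """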
    import time

    from _utils import client
    from openai import RateLimitError, APIConnectionError, APIError

    retries = 0
    while retries < max_retries:
        try:
            # Request a completion from the LLM (non-streaming)
            completion = client.chat.completions.create(
                model="meta/llama-3.1-405b-instruct",
                messages=conversation,
                temperature=0.2,
                top_p=0.7,
                max_tokens=4000,
                stream=False
            )
            answer = completion.choices[0].message.content
            return answer
        except RateLimitError:
            retries += 1
            wait_time = 2 ** retries  # exponential backoff
            gr.Warning(f"⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except (APIConnectionError, APIError) as e:
            retries += 1
            gr.Warning(f"⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2)
        except Exception as e:
            gr.Error(f"❌ Unexpected error: {e}")
            return None
    # All retries exhausted
    return None

def step_transcribe(audio: tuple, state: AppState):
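    """Step 1 (ASR): transcribe the recorded audio and log it in the chat.

    `audio` is the Gradio numpy tuple (sample_rate, samples). The raw
    recording is also exported to a .wav file so the user's voice note
    can be displayed in the chatbot.
    """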
    # --- Step 1: ASR (Whisper)
    gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
    audio_bytes = audio_to_bytes(audio)
    transcription = asr_transcribe(audio_bytes)
    # Export the raw recording to a .wav file
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=(1 if len(audio[1].shape) == 1 else audio[1].shape[1]),
    )
    segment.export(audio_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
    conversation_audio_bit: dict = {"role": "user",
                                    "content": {"path": f.name,
                                                "mime_type": "audio/wav"}}
    conversation_text_bit: dict = {
        "role": "user",
        "content": transcription
    }
    # Update state: the LLM only sees text; the display also gets the voice note
    state.llm_conversation.append(conversation_text_bit)
    state.display_conversation.append(conversation_text_bit)
    state.display_conversation.append(conversation_audio_bit)
    return None, state

def step_llm_response(state: AppState):
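    """Step 2 (LLM): generate the assistant's reply and append it to the chat."""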
    # --- Step 2: LLM call
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
    llm_response = chat_llm(state.llm_conversation)
    # Update conversation state
    state.llm_response = llm_response
    conversation_a_text_bit: dict = {
        "role": "assistant",
        "content": llm_response
    }
    state.llm_conversation.append(conversation_a_text_bit)
    state.display_conversation.append(conversation_a_text_bit)
    return state

def step_synth_audio(state: AppState):
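    """Step 3 (TTS): synthesize the reply with Riva and return playable audio.

    Returns a (sample_rate, samples) tuple for gr.Audio(type="numpy") and
    saves a .wav copy so the voice note persists in the chatbot.
    """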
    gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)
    # --- TTS: get audio
    audio_bytes: bytes = riva_tts_service(state.llm_response)
    # --- Convert bytes to numpy for Gradio playback
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,    # bytes per sample (e.g. 16-bit PCM → 2)
        frame_rate=44100,  # or whatever your TTS sample rate is
        channels=1         # mono, or adjust as needed
    )
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        samples = samples.reshape((-1, audio_segment.channels))
    playable_audio = (audio_segment.frame_rate, samples)  # usable directly by gr.Audio(type="numpy")
    # --- (Optional) Export to .wav file for persistent chat storage
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
        audio_path = f.name
    conversation_audio_bit = {
        "role": "assistant",
        "content": {
            "path": audio_path,
            "mime_type": "audio/wav"
        }
    }
    # Update state variables
    state.display_conversation.append(conversation_audio_bit)
    return playable_audio, state

with gr.Blocks(css=css_ui) as demo:
    gr.Markdown("""
    # 💬 Talk To Julia about Me (Deepak)
    """)
    # Subtitle / description
    gr.Markdown("""
    **Powered by NVIDIA RIVA + NVIDIA NIM ⚡**
    *Start by asking: “Can you hear me?”*
    """)
    # LinkedIn link
    gr.Markdown("""
    Reach out to me on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
    """)
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
            output_audio: gr.Audio = gr.Audio(
                label="Last voice note from the bot",
                visible=True,
                autoplay=True
            )
        with gr.Column(scale=1):
            input_audio: gr.Audio = gr.Audio(
                label="Press Record to speak with the chatbot; press Stop to send your voice note. :)",
                sources=["microphone"], type="numpy",
                scale=1
            )
            gr.Markdown('''
            ## Models in use:
            1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
            2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
            3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
            ''')
    state = gr.State(value=AppState(
        llm_conversation=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            }
        ]
    ))
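    # Event wiring: stopping the recording runs the pipeline
    # transcribe -> LLM -> TTS, re-rendering the chatbot from
    # state.display_conversation after each step.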
    respond = input_audio.stop_recording(
        step_transcribe,
        [input_audio, state],
        [input_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio]
    )
    respond.then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_llm_response, [state], [state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_synth_audio, [state], [output_audio, state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    )

if __name__ == "__main__":
    demo.launch()