# Gradio voice-chat app ("Talk To Julia") for a HuggingFace Space.
# NOTE(review): removed page-scrape residue (Space status lines, file size,
# commit hashes, and a dump of line numbers) that was not valid Python.
import tempfile
import gradio as gr
from _data_model import AppState
from _utils import audio_to_bytes
from test1 import asr_transcribe
import numpy as np
import io
from pydub import AudioSegment
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui
DURATION_TOAST_TIMEOUT: int = 5
# def start_recording_user(state: AppState):
# if not state.stopped:
# return gr.Audio(recording=True)
def chat_llm(conversation, max_retries=3):
    """Ask the LLaMA chat model for a completion, retrying transient failures.

    Args:
        conversation: List of OpenAI-style message dicts ({"role", "content"}).
        max_retries: Maximum attempts for rate-limit / network errors.

    Returns:
        The assistant's reply text, or None if retries are exhausted or an
        unexpected error occurs (the error is surfaced via a Gradio toast).
    """
    import time
    from _utils import client
    from openai import RateLimitError, APIConnectionError, APIError
    retries = 0
    while retries < max_retries:
        try:
            completion = client.chat.completions.create(
                model="meta/llama-3.1-405b-instruct",
                messages=conversation,
                temperature=0.2,
                top_p=0.7,
                max_tokens=4000,
                stream=False
            )
            return completion.choices[0].message.content
        except RateLimitError:
            retries += 1
            wait_time = 2 ** retries  # exponential backoff
            # BUG FIX: message was a plain string, so {wait_time} was shown
            # literally instead of being interpolated.
            gr.Warning(f"⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except (APIConnectionError, APIError) as e:
            retries += 1
            # BUG FIX: missing f-prefix — placeholders were shown literally.
            gr.Warning(f"⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2)
        except Exception as e:
            gr.Error(f"❌ Unexpected error: {e}")
            return None
    # All retries exhausted without a successful completion.
    return None
def step_transcribe(audio: tuple, state: AppState):
    """Transcribe the recorded voice note and append it to the conversation.

    Args:
        audio: Gradio numpy audio tuple — (sample_rate, samples ndarray).
        state: Per-session AppState holding both conversation views.

    Returns:
        (None, state): None clears the input widget; state is propagated.
    """
    # --- Step 1: ASR (Whisper)
    gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
    transcription = asr_transcribe(audio_to_bytes(audio))

    # --- Step 2: persist the raw recording as a .wav for the chat display
    sample_rate, samples = audio
    channel_count = 1 if len(samples.shape) == 1 else samples.shape[1]
    segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=channel_count,
    )
    wav_buffer = io.BytesIO()
    segment.export(wav_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_file.write(wav_buffer.getvalue())

    text_message: dict = {"role": "user", "content": transcription}
    audio_message: dict = {
        "role": "user",
        "content": {"path": wav_file.name, "mime_type": "audio/wav"},
    }

    # The LLM only ever sees text; the display gets both text and audio.
    state.llm_conversation.append(text_message)
    state.display_conversation.append(text_message)
    state.display_conversation.append(audio_message)
    return None, state
def step_llm_response(state: AppState):
    """Request the assistant's reply for the conversation so far.

    Stores the reply on state.llm_response and appends it to both the
    LLM-facing and display-facing conversation lists.

    Returns:
        The updated AppState.
    """
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
    answer = chat_llm(state.llm_conversation)
    state.llm_response = answer
    assistant_message: dict = {"role": "assistant", "content": answer}
    state.llm_conversation.append(assistant_message)
    state.display_conversation.append(assistant_message)
    return state
def step_synth_audio(state: AppState):
    """Synthesize the stored LLM response to speech and attach it to the chat.

    Args:
        state: Per-session AppState; reads state.llm_response.

    Returns:
        (playable_audio, state) where playable_audio is a
        (sample_rate, numpy samples) tuple usable by gr.Audio(type="numpy").
    """
    gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)
    # BUG FIX: riva_tts_service was called twice back-to-back with the same
    # input, doubling latency and TTS API usage; one call is sufficient.
    audio_bytes: bytes = riva_tts_service(state.llm_response)
    # --- Convert raw bytes to numpy for Gradio playback
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,    # 16-bit PCM → 2 bytes per sample
        frame_rate=44100,  # assumes the TTS emits 44.1 kHz — TODO confirm
        channels=1         # mono
    )
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        # Interleaved multi-channel → (frames, channels) for gr.Audio.
        samples = samples.reshape((-1, audio_segment.channels))
    playable_audio = (audio_segment.frame_rate, samples)  # usable directly by gr.Audio(type="numpy")
    # --- Export to .wav so the chat history can replay the note later
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
        audio_path = f.name
    conversation_audio_bit = {
        "role": "assistant",
        "content": {
            "path": audio_path,
            "mime_type": "audio/wav"
        }
    }
    # update state variables
    state.display_conversation.append(conversation_audio_bit)
    return playable_audio, state
# --- Gradio UI layout and event wiring ---
with gr.Blocks(css=css_ui) as demo:
    gr.Markdown("""
    # 💬 Talk To Julia about Me (Deepak)
    """)
    # Subtitle / description
    gr.Markdown("""
    **Powered by NVIDIA RIVA + NVIDIA NIM ⚡**
    *Start by asking: “Can you hear me?”*
    """)
    # LinkedIn link
    gr.Markdown("""
    Reach me out on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
    """)
    with gr.Row():
        # FIX: scale was passed positionally as gr.Column(4); keyword form is
        # explicit and consistent with the sibling column below.
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
            output_audio: gr.Audio = gr.Audio(
                label="Last voice note from the bot",
                visible=True,
                autoplay=True
            )
        with gr.Column(scale=1):
            input_audio: gr.Audio = gr.Audio(
                label="Press Record button to speak with the chatbot. Stop it to send your voice note. :)",
                sources="microphone", type="numpy",
                scale=1
            )
            # FIX: typo in user-facing text — "Models is use" → "Models in use".
            gr.Markdown('''
            ## Models in use:
            1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
            2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
            3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
            ''')
    # Per-session conversation state, seeded with the system prompt.
    state = gr.State(value=AppState(
        llm_conversation=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            }
        ]
    ))
    # Pipeline: stop recording → transcribe → refresh chat → LLM reply →
    # refresh chat → synthesize speech → refresh chat.
    respond = input_audio.stop_recording(
        step_transcribe,
        [input_audio, state],
        [input_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio]
    )
    respond.then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_llm_response, [state], [state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_synth_audio, [state], [output_audio, state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    )

if __name__ == "__main__":
    demo.launch()