File size: 7,195 Bytes
51c9eb3
07f8ff0
51c9eb3
 
 
f5d5c69
 
51c9eb3
5d6c840
 
 
 
 
f5d5c69
51c9eb3
 
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
51c9eb3
 
f5d5c69
51c9eb3
 
 
 
 
 
 
 
 
 
 
f5d5c69
51c9eb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c840
f5d5c69
5d6c840
 
 
 
51c9eb3
5d6c840
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6fff79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
 
51c9eb3
5d6c840
 
 
 
 
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
51c9eb3
 
5d6c840
 
51c9eb3
5d6c840
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
 
51c9eb3
f5d5c69
51c9eb3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import tempfile
import gradio as gr
from _data_model import AppState
from _utils import audio_to_bytes
from test1 import asr_transcribe
import numpy as np
import io
from pydub import AudioSegment
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui

# Seconds before each gr.Info / gr.Warning toast auto-dismisses.
DURATION_TOAST_TIMEOUT: int = 5

# def start_recording_user(state: AppState):
#     if not state.stopped:
#         return gr.Audio(recording=True)

def chat_llm(conversation, max_retries=3):
    """Send *conversation* to the LLM and return the assistant's reply text.

    Retries with exponential backoff on rate limits and with a short fixed
    delay on API/network errors.

    Args:
        conversation: list of OpenAI-style message dicts ({"role", "content"}).
        max_retries: maximum number of attempts before giving up.

    Returns:
        The assistant's response text, or None on failure.
    """
    # Deferred imports: keep module import light and avoid a cycle with _utils.
    from _utils import client
    from openai import RateLimitError, APIConnectionError, APIError
    import time

    retries = 0
    while retries < max_retries:
        try:
            # Single (non-streaming) completion request.
            completion = client.chat.completions.create(
                model="meta/llama-3.1-405b-instruct",
                messages=conversation,
                temperature=0.2,
                top_p=0.7,
                max_tokens=4000,
                stream=False
            )
            return completion.choices[0].message.content

        except RateLimitError:
            retries += 1
            wait_time = 2 ** retries  # exponential backoff: 2, 4, 8... seconds
            # BUG FIX: these warnings were plain strings, so the user saw the
            # literal text "{wait_time}" instead of the interpolated values.
            gr.Warning(f"⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

        except (APIConnectionError, APIError) as e:
            retries += 1
            gr.Warning(f"⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2)

        except Exception as e:
            gr.Error(f"❌ Unexpected error: {e}")
            return None

    # Previously fell off the loop returning None silently; now tell the user.
    gr.Error("❌ LLM request failed after all retries.")
    return None
        
def step_transcribe(audio: tuple, state: AppState):
    """Transcribe the recorded clip and log the user's turn in the state.

    Args:
        audio: Gradio numpy audio tuple (sample_rate, samples ndarray).
        state: per-session AppState holding both conversation logs.

    Returns:
        (None, state) — None clears the microphone input widget.
    """
    # --- Step 1: ASR (Whisper)
    gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
    transcription = asr_transcribe(audio_to_bytes(audio))

    # --- Step 2: persist the raw recording as a .wav for the chat display
    sample_rate, samples = audio
    channel_count = 1 if len(samples.shape) == 1 else samples.shape[1]
    wav_buffer = io.BytesIO()
    AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=channel_count,
    ).export(wav_buffer, format="wav")
    # delete=False: the file must outlive this call so the chat UI can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_file.write(wav_buffer.getvalue())
        wav_path = wav_file.name

    # --- Step 3: record the turn
    text_turn: dict = {"role": "user", "content": transcription}
    audio_turn: dict = {
        "role": "user",
        "content": {"path": wav_path, "mime_type": "audio/wav"},
    }

    # The LLM only ever sees text; the display log gets text plus the voice note.
    state.llm_conversation.append(text_turn)
    state.display_conversation.append(text_turn)
    state.display_conversation.append(audio_turn)

    return None, state

def step_llm_response(state: AppState):
    """Ask the LLM for a reply to the conversation and record it in the state.

    Args:
        state: per-session AppState; its llm_conversation is the prompt.

    Returns:
        The updated state, with llm_response set and the assistant turn
        appended to both conversation logs.
    """
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
    answer = chat_llm(state.llm_conversation)

    # Stash the raw text for the TTS step, then log the assistant turn.
    state.llm_response = answer
    assistant_turn: dict = {"role": "assistant", "content": answer}
    state.llm_conversation.append(assistant_turn)
    state.display_conversation.append(assistant_turn)

    return state

def step_synth_audio(state: AppState):
    """Synthesize the LLM response to speech and append it to the display log.

    Args:
        state: per-session AppState; reads state.llm_response.

    Returns:
        (playable_audio, state) where playable_audio is a
        (sample_rate, np.ndarray) tuple consumable by gr.Audio(type="numpy").
    """
    gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)

    # --- TTS: get audio
    # BUG FIX: the TTS service was invoked twice back-to-back; one call suffices.
    audio_bytes: bytes = riva_tts_service(state.llm_response)

    # --- Convert bytes to numpy for Gradio playback
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,      # bytes per sample (16-bit PCM)
        frame_rate=44100,    # assumed TTS sample rate — TODO confirm against Riva config
        channels=1           # mono
    )
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        samples = samples.reshape((-1, audio_segment.channels))

    playable_audio = (audio_segment.frame_rate, samples)  # usable directly by gr.Audio(type="numpy")

    # --- Export to .wav file for persistent chat storage
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")

    # delete=False: the file must outlive this call so the chat UI can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
        audio_path = f.name

    conversation_audio_bit = {
        "role": "assistant",
        "content": {
            "path": audio_path,
            "mime_type": "audio/wav"
        }
    }

    # Only the display log gets the voice note; the LLM log stays text-only.
    state.display_conversation.append(conversation_audio_bit)

    return playable_audio, state
    


# --- UI layout and event wiring ---------------------------------------------
with gr.Blocks(css=css_ui) as demo:

    # Title
    gr.Markdown("""
# 💬 Talk To Julia about Me (Deepak)
""")

    # Subtitle / description
    gr.Markdown("""
**Powered by NVIDIA RIVA + NVIDIA NIM ⚡**  
*Start by asking: “Can you hear me?”*
""")

    # LinkedIn link
    gr.Markdown("""
Reach me out on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
""")

    with gr.Row():
        # BUG FIX: was gr.Column(4) — `scale` must be passed by keyword
        # (the sibling column already uses scale=1).
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
            output_audio: gr.Audio = gr.Audio(
                label="Last voice note from the bot",
                visible=True,
                autoplay=True
            )

        with gr.Column(scale=1):
            input_audio: gr.Audio = gr.Audio(
                label="Press Record button to speak with the chatbot. Stop it to send your voice note. :)",
                sources="microphone", type="numpy",
                scale=1
            )
            # TYPO FIX: "Models is use" -> "Models in use"
            gr.Markdown('''
## Models in use: 
1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
                        ''')

    # Per-session state seeded with the system prompt for the LLM log.
    state = gr.State(value=AppState(
        llm_conversation=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            }
        ]
    ))

    # Pipeline: stop recording -> transcribe -> refresh chat -> LLM ->
    # refresh chat -> synthesize speech -> refresh chat.
    respond = input_audio.stop_recording(
        step_transcribe,
        [input_audio, state],
        [input_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio]
    )
    respond.then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_llm_response, [state], [state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_synth_audio, [state], [output_audio, state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    )

if __name__ == "__main__":
    demo.launch()