File size: 7,195 Bytes
51c9eb3
07f8ff0
51c9eb3
 
 
f5d5c69
 
51c9eb3
5d6c840
 
 
 
 
f5d5c69
51c9eb3
 
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
51c9eb3
 
f5d5c69
51c9eb3
 
 
 
 
 
 
 
 
 
 
f5d5c69
51c9eb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c840
f5d5c69
5d6c840
 
 
 
51c9eb3
5d6c840
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
f5d5c69
5d6c840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6fff79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
51c9eb3
5d6c840
 
 
51c9eb3
5d6c840
 
 
 
 
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
51c9eb3
 
5d6c840
 
51c9eb3
5d6c840
 
51c9eb3
5d6c840
 
 
 
 
 
 
 
 
 
51c9eb3
f5d5c69
51c9eb3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import tempfile
import gradio as gr
from _data_model import AppState
from _utils import audio_to_bytes
from test1 import asr_transcribe
import numpy as np
import io
from pydub import AudioSegment
from _riva import riva_tts_service
from _prompts import SYSTEM_PROMPT
from _css import css_ui

# Seconds before each gr.Info / gr.Warning toast auto-dismisses.
DURATION_TOAST_TIMEOUT: int = 5

# def start_recording_user(state: AppState):
#     if not state.stopped:
#         return gr.Audio(recording=True)

def chat_llm(conversation, max_retries=3):
    """Send *conversation* to the LLM and return the assistant's reply text.

    Retries with exponential backoff on rate limits and with a short fixed
    delay on API/network errors.

    Args:
        conversation: list of OpenAI-style message dicts ({"role", "content"}).
        max_retries: maximum number of attempts before giving up.

    Returns:
        The assistant's response text, or None on failure.
    """
    # Deferred imports: keep module import light and avoid a cycle with _utils.
    from _utils import client
    from openai import RateLimitError, APIConnectionError, APIError
    import time

    retries = 0
    while retries < max_retries:
        try:
            # Single (non-streaming) completion request.
            completion = client.chat.completions.create(
                model="meta/llama-3.1-405b-instruct",
                messages=conversation,
                temperature=0.2,
                top_p=0.7,
                max_tokens=4000,
                stream=False
            )
            return completion.choices[0].message.content

        except RateLimitError:
            retries += 1
            wait_time = 2 ** retries  # exponential backoff: 2, 4, 8... seconds
            # BUG FIX: these warnings were plain strings, so the user saw the
            # literal text "{wait_time}" instead of the interpolated values.
            gr.Warning(f"⚠️ Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

        except (APIConnectionError, APIError) as e:
            retries += 1
            gr.Warning(f"⚠️ API/network error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2)

        except Exception as e:
            gr.Error(f"❌ Unexpected error: {e}")
            return None

    # Previously fell off the loop returning None silently; now tell the user.
    gr.Error("❌ LLM request failed after all retries.")
    return None
        
def step_transcribe(audio: tuple, state: AppState):
    """Transcribe the recorded clip and log the user's turn in the state.

    Args:
        audio: Gradio numpy audio tuple (sample_rate, samples ndarray).
        state: per-session AppState holding both conversation logs.

    Returns:
        (None, state) — None clears the microphone input widget.
    """
    # --- Step 1: ASR (Whisper)
    gr.Info("🎤 Asking `Whisper` to listen to your sweet voice...", DURATION_TOAST_TIMEOUT)
    transcription = asr_transcribe(audio_to_bytes(audio))

    # --- Step 2: persist the raw recording as a .wav for the chat display
    sample_rate, samples = audio
    channel_count = 1 if len(samples.shape) == 1 else samples.shape[1]
    wav_buffer = io.BytesIO()
    AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=channel_count,
    ).export(wav_buffer, format="wav")
    # delete=False: the file must outlive this call so the chat UI can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_file.write(wav_buffer.getvalue())
        wav_path = wav_file.name

    # --- Step 3: record the turn
    text_turn: dict = {"role": "user", "content": transcription}
    audio_turn: dict = {
        "role": "user",
        "content": {"path": wav_path, "mime_type": "audio/wav"},
    }

    # The LLM only ever sees text; the display log gets text plus the voice note.
    state.llm_conversation.append(text_turn)
    state.display_conversation.append(text_turn)
    state.display_conversation.append(audio_turn)

    return None, state

def step_llm_response(state: AppState):
    """Ask the LLM for a reply to the conversation and record it in the state.

    Args:
        state: per-session AppState; its llm_conversation is the prompt.

    Returns:
        The updated state, with llm_response set and the assistant turn
        appended to both conversation logs.
    """
    gr.Info("🧠 Now using `LLaMA` to build an answer for you...", DURATION_TOAST_TIMEOUT)
    answer = chat_llm(state.llm_conversation)

    # Stash the raw text for the TTS step, then log the assistant turn.
    state.llm_response = answer
    assistant_turn: dict = {"role": "assistant", "content": answer}
    state.llm_conversation.append(assistant_turn)
    state.display_conversation.append(assistant_turn)

    return state

def step_synth_audio(state: AppState):
    """Synthesize the LLM response to speech and append it to the display log.

    Args:
        state: per-session AppState; reads state.llm_response.

    Returns:
        (playable_audio, state) where playable_audio is a
        (sample_rate, np.ndarray) tuple consumable by gr.Audio(type="numpy").
    """
    gr.Info("🎤 Asking `magpie` to read Julia's response", DURATION_TOAST_TIMEOUT)

    # --- TTS: get audio
    # BUG FIX: the TTS service was invoked twice back-to-back; one call suffices.
    audio_bytes: bytes = riva_tts_service(state.llm_response)

    # --- Convert bytes to numpy for Gradio playback
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,      # bytes per sample (16-bit PCM)
        frame_rate=44100,    # assumed TTS sample rate — TODO confirm against Riva config
        channels=1           # mono
    )
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        samples = samples.reshape((-1, audio_segment.channels))

    playable_audio = (audio_segment.frame_rate, samples)  # usable directly by gr.Audio(type="numpy")

    # --- Export to .wav file for persistent chat storage
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")

    # delete=False: the file must outlive this call so the chat UI can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
        audio_path = f.name

    conversation_audio_bit = {
        "role": "assistant",
        "content": {
            "path": audio_path,
            "mime_type": "audio/wav"
        }
    }

    # Only the display log gets the voice note; the LLM log stays text-only.
    state.display_conversation.append(conversation_audio_bit)

    return playable_audio, state
    


# --- UI layout and event wiring ---------------------------------------------
with gr.Blocks(css=css_ui) as demo:

    # Title
    gr.Markdown("""
# 💬 Talk To Julia about Me (Deepak)
""")

    # Subtitle / description
    gr.Markdown("""
**Powered by NVIDIA RIVA + NVIDIA NIM ⚡**  
*Start by asking: “Can you hear me?”*
""")

    # LinkedIn link
    gr.Markdown("""
Reach me out on [LinkedIn](https://www.linkedin.com/in/deepak-sahu-7a6894159/)
""")

    with gr.Row():
        # BUG FIX: was gr.Column(4) — `scale` must be passed by keyword
        # (the sibling column already uses scale=1).
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Conversation", type="messages", elem_classes=["chatbox"])
            output_audio: gr.Audio = gr.Audio(
                label="Last voice note from the bot",
                visible=True,
                autoplay=True
            )

        with gr.Column(scale=1):
            input_audio: gr.Audio = gr.Audio(
                label="Press Record button to speak with the chatbot. Stop it to send your voice note. :)",
                sources="microphone", type="numpy",
                scale=1
            )
            # TYPO FIX: "Models is use" -> "Models in use"
            gr.Markdown('''
## Models in use: 
1. Automatic Speech Recognition: [OpenAI: Whisper-large V3](https://build.nvidia.com/openai/whisper-large-v3)
2. LLM: [Meta Llama 3.1 405B](https://build.nvidia.com/meta/llama-3_1-405b-instruct)
3. Text to Speech: [NVIDIA Magpie](https://build.nvidia.com/nvidia/magpie-tts-multilingual)
                        ''')

    # Per-session state seeded with the system prompt for the LLM log.
    state = gr.State(value=AppState(
        llm_conversation=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            }
        ]
    ))

    # Pipeline: stop recording -> transcribe -> refresh chat -> LLM ->
    # refresh chat -> synthesize speech -> refresh chat.
    respond = input_audio.stop_recording(
        step_transcribe,
        [input_audio, state],
        [input_audio, state],
        show_progress="full",
        show_progress_on=[input_audio, output_audio]
    )
    respond.then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_llm_response, [state], [state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    ).then(
        step_synth_audio, [state], [output_audio, state], show_progress="full", show_progress_on=[input_audio, output_audio]
    ).then(
        lambda s: s.display_conversation, [state], [chatbot]
    )

if __name__ == "__main__":
    demo.launch()