|
|
import gradio as gr |
|
|
from llama_cpp import Llama |
|
|
from huggingface_hub import hf_hub_download |
|
|
import os |
|
|
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Configuration ---

# Deepgram API key read from the environment; the placeholder default
# triggers the warning below when the variable is unset.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")

# Hugging Face Hub repository and file name of the quantized GGUF model.
REPO_ID = "Kezovic/iris-q4gguf-v2"
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"

# llama.cpp context window (tokens) and generation settings.
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7

# Warn loudly (but keep running) when no real key was provided —
# Deepgram calls will fail later with auth errors.
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")

# Shared Deepgram client used for both STT (listen) and TTS (speak).
deepgram = DeepgramClient(DEEPGRAM_API_KEY)

# Module-level handle to the llama.cpp model; populated by load_llm().
llm = None
|
|
def load_llm():
    """Download the GGUF model from the Hub and initialize llama.cpp.

    Sets the module-level ``llm`` handle and returns it. Idempotent: if
    the model is already loaded, the existing instance is returned
    without re-downloading or re-initializing.

    Returns:
        The initialized ``Llama`` instance, or None if download or
        initialization failed (callers degrade gracefully on None).
    """
    global llm
    if llm is not None:
        # Already initialized — avoid a second download / model load.
        return llm

    print("Downloading LLM...")
    try:
        # hf_hub_download caches locally and returns the on-disk path.
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
        )
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,   # conservative default for small shared hosts
            verbose=False,
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        # Best-effort startup: report and return None so the UI can
        # surface "Model not loaded" instead of crashing at import.
        print(f"Error loading model: {e}")
        return None
|
|
|
|
|
|
|
|
# Eagerly load the model at import time so the first user request
# doesn't pay the download/initialization cost.
load_llm()
|
|
|
|
|
|
|
|
def transcribe_audio(audio_filepath):
    """Transcribe a local audio file with Deepgram's prerecorded API.

    Args:
        audio_filepath: Path to the recorded audio file, or None/"" when
            no audio was captured by the UI.

    Returns:
        The transcript string, or "" on missing input or any API error.
    """
    if not audio_filepath:
        return ""

    try:
        # Read the whole file into memory: the Deepgram SDK expects raw
        # bytes in the "buffer" payload; passing the open file handle
        # risks it being closed before the request is actually sent.
        with open(audio_filepath, "rb") as audio_file:
            buffer_data = audio_file.read()

        payload = {"buffer": buffer_data}
        options = PrerecordedOptions(
            smart_format=True,
            model="nova-2",
            language="en-US",
        )
        response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
        # First channel / best alternative is the standard single-speaker path.
        return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        # Best-effort: log and return empty so the UI degrades gracefully.
        print(f"STT Error: {e}")
        return ""
|
|
|
|
|
|
|
|
def text_to_speech(text):
    """Synthesize speech for *text* via Deepgram and save it to disk.

    Args:
        text: The assistant reply to vocalize.

    Returns:
        Path to the written audio file, or None on any API error.
    """
    try:
        # The options request linear16 PCM in a WAV container, so the
        # file must carry a .wav extension — the previous .mp3 name
        # mislabeled the actual format and could confuse players.
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",
            encoding="linear16",
            container="wav",
        )

        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        # Best-effort: log and return None; the UI shows no audio.
        print(f"TTS Error: {e}")
        return None
|
|
|
|
|
|
|
|
def process_conversation(audio_input):
    """Run one full voice round-trip: STT -> LLM completion -> TTS.

    Args:
        audio_input: Filepath of the recorded user audio (from Gradio).

    Returns:
        A (user_transcript, audio_path, assistant_text) tuple matching
        the three Gradio output components.
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."

    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""

    print(f"User said: {user_text}")

    # Alpaca-style turn markers; stop generation before the model starts
    # hallucinating the next human turn.
    prompt = f"### Human: {user_text}\n### Assistant:"

    completion = llm(
        prompt=prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False,
    )
    assistant_text = completion['choices'][0]['text'].strip()
    print(f"LLM said: {assistant_text}")

    # Synthesize the reply; on TTS failure the path is None and the UI
    # still shows the text response.
    spoken_path = text_to_speech(assistant_text)

    return user_text, spoken_path, assistant_text
|
|
|
|
|
|
|
|
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")

    with gr.Row():

        with gr.Column():
            # Microphone input recorded to a temp file; type="filepath"
            # gives transcribe_audio() a path it can open directly.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now"
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")

        with gr.Column():
            # Synthesized reply; autoplay keeps the exchange conversational.
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,
                interactive=False
            )

    # Text transcripts of both sides of the exchange.
    user_transcript = gr.Textbox(label="You said:")
    ai_response_text = gr.Textbox(label="AI Response:")

    # Wire the STT -> LLM -> TTS pipeline to the submit button; outputs
    # map positionally to process_conversation's returned tuple.
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text]
    )

if __name__ == "__main__":
    demo.launch()