import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
# --- Configuration ---
# 1. API KEY: Ensure you have your Deepgram API Key ready
# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")
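# NOTE (assumption): on Hugging Face Spaces this is usually configured as a
# Space secret, which is exposed to the app as an environment variable, so
# os.getenv() picks it up without hardcoding the key in the source.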
# 2. Model Config
REPO_ID = "Kezovic/iris-q4gguf-v2"
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
# --- Initialize Deepgram ---
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")
deepgram = DeepgramClient(DEEPGRAM_API_KEY)
# --- Model Loading Function ---
llm = None
def load_llm():
    """Downloads the GGUF model and initializes LlamaCPP."""
    global llm
    print("Downloading LLM...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME
        )
        # n_threads=2 is good for free Hugging Face CPU tiers
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
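# NOTE (assumption): on GPU-backed hardware, adding n_gpu_layers=-1 to the
# Llama(...) call above offloads all layers to the GPU; it is unnecessary on
# the CPU-only free tier this script targets.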
# Load model on startup
load_llm()
# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Sends the audio file to Deepgram and returns the transcript text."""
    if not audio_filepath:
        return ""
    try:
        # Read the file into bytes, matching the SDK's documented FileSource shape
        with open(audio_filepath, "rb") as audio_file:
            buffer_data = audio_file.read()
        payload = {"buffer": buffer_data}
        options = PrerecordedOptions(
            smart_format=True,
            model="nova-2",
            language="en-US"
        )
        response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
        return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return ""
# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Sends text to Deepgram and returns the path to the audio file."""
    try:
        # linear16 in a WAV container, so use a .wav extension (not .mp3)
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16",
            container="wav"
        )
        # Save the audio to a file
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None
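# NOTE (assumption): a fixed output filename is overwritten on every request,
# so concurrent Gradio sessions could clash. A minimal tempfile-based sketch:
#   import tempfile
#   fd, filename = tempfile.mkstemp(suffix=".wav")
#   os.close(fd)  # Deepgram's save() writes to the path itself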
# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """
    1. Transcribe Audio (STT)
    2. Query LLM
    3. Synthesize Speech (TTS)
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."
    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""
    print(f"User said: {user_text}")
    # Step B: LLM Inference
    # Using the "### Human:/### Assistant:" prompt format from the original code
    full_prompt = f"### Human: {user_text}\n### Assistant:"
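    # NOTE (assumption): Llama 3.2 Instruct ships with its own chat template,
    # which the plain-text format above bypasses. A template-aware sketch using
    # llama-cpp-python's chat API would look like:
    #   output = llm.create_chat_completion(
    #       messages=[{"role": "user", "content": user_text}],
    #       max_tokens=MAX_NEW_TOKENS,
    #       temperature=TEMPERATURE,
    #   )
    #   response_text = output["choices"][0]["message"]["content"].strip()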
    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False
    )
    response_text = output['choices'][0]['text'].strip()
    print(f"LLM said: {response_text}")
    # Step C: Speak Response
    output_audio_path = text_to_speech(response_text)
    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
    return user_text, output_audio_path, response_text
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
    with gr.Row():
        # Input Column
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now"
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")
        # Output Column
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,  # Automatically plays the response
                interactive=False
            )
            # Debugging/Visuals
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")
    # Event Listener
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text]
    )
if __name__ == "__main__":
    demo.launch()
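# NOTE (assumption): if several users hit the Space at once, enabling Gradio's
# request queue serializes calls into the single CPU-bound model, e.g.:
#   demo.queue().launch()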