# (removed non-source artifacts captured by the extractor: file-size banner,
#  git blame hashes, and the line-number gutter)
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
# --- Configuration ---
# 1. API KEY: Ensure you have your Deepgram API Key ready
# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")
# 2. Model Config
REPO_ID = "Kezovic/iris-q4gguf-v2"  # Hugging Face repo holding the GGUF weights
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"  # quantized model file to download
CONTEXT_WINDOW = 4096  # n_ctx passed to llama.cpp
MAX_NEW_TOKENS = 512   # completion cap per turn
TEMPERATURE = 0.7      # LLM sampling temperature
# --- Initialize Deepgram ---
# Warn (but don't crash) when the placeholder key is still in place;
# STT/TTS calls will simply fail and be caught by their try/except blocks.
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")
deepgram = DeepgramClient(DEEPGRAM_API_KEY)  # shared client for both STT and TTS
# --- Model Loading Function ---
# Module-level handle to the llama.cpp model; populated by load_llm() below.
llm = None
def load_llm():
    """Download the GGUF model from the Hugging Face Hub and initialize LlamaCPP.

    Returns:
        The loaded ``Llama`` instance (also stored in the module-level ``llm``
        global), or ``None`` if the download or initialization failed.
    """
    global llm
    # Idempotent: reuse an already-loaded model instead of re-downloading
    # and re-initializing on a repeated call.
    if llm is not None:
        return llm
    print("Downloading LLM...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
        )
        # n_threads=2 is good for free Hugging Face CPU tiers
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False,
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        # Best-effort: let the app start anyway; process_conversation()
        # reports the failure to the user instead of crashing at import time.
        print(f"Error loading model: {e}")
        return None
# Load model on startup (first run downloads the GGUF file, which may be slow).
# On failure `llm` stays None and process_conversation() surfaces the error.
load_llm()
# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Send a local audio file to Deepgram STT and return the transcript text.

    Args:
        audio_filepath: Path to the recorded audio file (from Gradio), or
            a falsy value when nothing was recorded.

    Returns:
        The transcript string, or "" when there is no input or the request
        failed (the caller treats "" as "could not hear audio").
    """
    if not audio_filepath:
        return ""
    try:
        # Read the bytes while the handle is open and close it before the
        # (potentially slow) network call — passing an open handle that
        # outlives the `with` block would hand Deepgram a closed file.
        with open(audio_filepath, "rb") as audio_file:
            payload = {"buffer": audio_file.read()}
        options = PrerecordedOptions(
            smart_format=True,
            model="nova-2",
            language="en-US",
        )
        response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
        return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        # Best-effort: log and return empty so the UI degrades gracefully.
        print(f"STT Error: {e}")
        return ""
# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Synthesize `text` with Deepgram TTS and return the path to the audio file.

    Args:
        text: The response text to speak.

    Returns:
        Path to the written audio file, or ``None`` on failure.
    """
    try:
        # BUG FIX: the request asks for linear16 PCM in a WAV container, so
        # the saved file must carry a .wav extension. It was previously
        # "output_response.mp3", which mislabels the content for any player
        # or browser that trusts the file suffix.
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16",
            container="wav",
        )
        # Save the audio to a file
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        # Best-effort: caller passes None through to the audio output widget.
        print(f"TTS Error: {e}")
        return None
# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """Run the full voice-chat pipeline: STT -> LLM -> TTS.

    Args:
        audio_input: Filepath of the recorded audio from the Gradio widget.

    Returns:
        A 3-tuple for the Gradio outputs:
        (user transcript, path to synthesized reply audio, reply text).
    """
    # Guard: the model may have failed to load at startup.
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."

    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""
    print(f"User said: {user_text}")

    # Step B: LLM Inference — prompt format matches the fine-tune's template.
    completion = llm(
        prompt=f"### Human: {user_text}\n### Assistant:",
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False,
    )
    reply_text = completion['choices'][0]['text'].strip()
    print(f"LLM said: {reply_text}")

    # Step C: Speak Response (None on TTS failure — widget just stays empty).
    reply_audio_path = text_to_speech(reply_text)

    # Transcription (for display), audio (for playback), LLM text (for display).
    return user_text, reply_audio_path, reply_text
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")

    with gr.Row():
        # Input Column: microphone capture + submit trigger.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now",
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")

        # Output Column: the synthesized reply, played automatically.
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,  # play the response as soon as it arrives
                interactive=False,
            )

    # Debugging/Visuals: show both sides of the exchange as text.
    user_transcript = gr.Textbox(label="You said:")
    ai_response_text = gr.Textbox(label="AI Response:")

    # Wire the button to the STT -> LLM -> TTS pipeline.
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text],
    )

if __name__ == "__main__":
    demo.launch()