import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
# --- Configuration ---
# 1. API KEY: Ensure you have your Deepgram API Key ready
# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")
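# NOTE (assumption): on Hugging Face Spaces this is usually configured as a
# Space secret, which is exposed to the app as an environment variable, so
# os.getenv() picks it up without hardcoding the key in the source.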
# 2. Model Config
REPO_ID = "Kezovic/iris-q4gguf-v2"
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
# --- Initialize Deepgram ---
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")
deepgram = DeepgramClient(DEEPGRAM_API_KEY)
# --- Model Loading Function ---
llm = None
def load_llm():
    """Downloads the GGUF model and initializes LlamaCPP."""
    global llm
    print("Downloading LLM...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME
        )
        # n_threads=2 is good for free Hugging Face CPU tiers
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
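# NOTE (assumption): on GPU-backed hardware, adding n_gpu_layers=-1 to the
# Llama(...) call above offloads all layers to the GPU; it is unnecessary on
# the CPU-only free tier this script targets.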
# Load model on startup
load_llm()
# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Sends the audio file to Deepgram and returns the transcript text."""
    if not audio_filepath:
        return ""
    try:
        # Read the file into bytes, matching the SDK's documented FileSource shape
        with open(audio_filepath, "rb") as audio_file:
            buffer_data = audio_file.read()
        payload = {"buffer": buffer_data}
        options = PrerecordedOptions(
            smart_format=True,
            model="nova-2",
            language="en-US"
        )
        response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
        return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return ""
# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Sends text to Deepgram and returns the path to the audio file."""
    try:
        # linear16 in a WAV container, so use a .wav extension (not .mp3)
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16",
            container="wav"
        )
        # Save the audio to a file
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None
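# NOTE (assumption): a fixed output filename is overwritten on every request,
# so concurrent Gradio sessions could clash. A minimal tempfile-based sketch:
#   import tempfile
#   fd, filename = tempfile.mkstemp(suffix=".wav")
#   os.close(fd)  # Deepgram's save() writes to the path itself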
# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """
    1. Transcribe Audio (STT)
    2. Query LLM
    3. Synthesize Speech (TTS)
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."
    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""
    print(f"User said: {user_text}")
    # Step B: LLM Inference
    # Using the "### Human:/### Assistant:" prompt format from the original code
    full_prompt = f"### Human: {user_text}\n### Assistant:"
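    # NOTE (assumption): Llama 3.2 Instruct ships with its own chat template,
    # which the plain-text format above bypasses. A template-aware sketch using
    # llama-cpp-python's chat API would look like:
    #   output = llm.create_chat_completion(
    #       messages=[{"role": "user", "content": user_text}],
    #       max_tokens=MAX_NEW_TOKENS,
    #       temperature=TEMPERATURE,
    #   )
    #   response_text = output["choices"][0]["message"]["content"].strip()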
    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False
    )
    response_text = output['choices'][0]['text'].strip()
    print(f"LLM said: {response_text}")
    # Step C: Speak Response
    output_audio_path = text_to_speech(response_text)
    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
    return user_text, output_audio_path, response_text
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
    with gr.Row():
        # Input Column
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now"
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")
        # Output Column
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,  # Automatically plays the response
                interactive=False
            )
            # Debugging/Visuals
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")
    # Event Listener
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text]
    )
if __name__ == "__main__":
    demo.launch()
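# NOTE (assumption): if several users hit the Space at once, enabling Gradio's
# request queue serializes calls into the single CPU-bound model, e.g.:
#   demo.queue().launch()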