import os

import gradio as gr
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration ---
# 1. API key: set DEEPGRAM_API_KEY in your environment variables.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")

# 2. Model config
REPO_ID = "Kezovic/iris-q4gguf-v2"
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7

# --- Initialize Deepgram ---
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")

deepgram = DeepgramClient(DEEPGRAM_API_KEY)

# --- Model Loading ---
llm = None


def load_llm():
    """Downloads the GGUF model and initializes llama.cpp."""
    global llm
    print("Downloading LLM...")
    try:
        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
        # n_threads=2 is a good fit for the free Hugging Face CPU tier.
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False,
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


# Load the model on startup.
load_llm()


# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Sends an audio file to Deepgram and returns the transcript."""
    if not audio_filepath:
        return ""
    try:
        # Read the recording into memory and hand Deepgram the raw bytes.
        with open(audio_filepath, "rb") as audio_file:
            buffer_data = audio_file.read()
        payload = {"buffer": buffer_data}
        options = PrerecordedOptions(
            smart_format=True,
            model="nova-2",
            language="en-US",
        )
        response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
        return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return ""


# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Sends text to Deepgram and returns the path to the synthesized audio file."""
    try:
        # linear16 audio in a WAV container, so the output file is a .wav.
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Other voices: aura-helios-en, etc.
            encoding="linear16",
            container="wav",
        )
        # Save the synthesized audio to a file.
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None


# --- 3. Main Pipeline ---
def process_conversation(audio_input):
    """
    1. Transcribe audio (STT)
    2. Query the LLM
    3. Synthesize speech (TTS)
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."
    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""
    print(f"User said: {user_text}")

    # Step B: LLM inference, using the prompt format this model expects.
    full_prompt = f"### Human: {user_text}\n### Assistant:"
    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False,
    )
    response_text = output["choices"][0]["text"].strip()
    print(f"LLM said: {response_text}")

    # Step C: Speak the response.
    output_audio_path = text_to_speech(response_text)

    # Return: transcription (display), audio (playback), LLM text (display).
    return user_text, output_audio_path, response_text


# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")

    with gr.Row():
        # Input column
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now",
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")

        # Output column
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,  # Automatically plays the response.
                interactive=False,
            )
            # Debugging / visuals
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")

    # Event listener
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text],
    )

if __name__ == "__main__":
    demo.launch()
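
# --- Quick start (a sketch; assumes the standard PyPI package names and that
# this file is saved as app.py — adjust to your setup) ---
#
#     pip install gradio llama-cpp-python huggingface_hub deepgram-sdk
#     export DEEPGRAM_API_KEY="..."
#     python app.py
#
# To smoke-test the pipeline without the UI, call it directly with a short
# recording (the "sample.wav" filename here is hypothetical):
#
#     transcript, audio_path, reply = process_conversation("sample.wav")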