Spaces:

WWMachine
/

test

Sleeping

App Files Files Community

WWMachine committed on Dec 2, 2025

Commit

592b831

verified ·

1 Parent(s): f4264e5

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -112

app.py CHANGED Viewed

@@ -3,10 +3,13 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
-import time
 # --- Configuration ---
-DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") # Ensure this is set in Space Settings
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
@@ -14,164 +17,143 @@ MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
 # --- Initialize Deepgram ---
-if not DEEPGRAM_API_KEY:
-    print("Error: DEEPGRAM_API_KEY is missing.")
-    deepgram = None
-else:
-    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
-# --- Load LLM ---
 llm = None
 def load_llm():
     global llm
     print("Downloading LLM...")
     try:
-        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
-        print("LLM loaded!")
     except Exception as e:
         print(f"Error loading model: {e}")
 load_llm()
-# --- Helper Functions ---
-def transcribe(audio_path):
-    """Converts Speech to Text using Deepgram Nova-2"""
-    if not audio_path or deepgram is None:
-        return None
     try:
-        with open(audio_path, "rb") as buffer:
             payload = {"buffer": buffer}
-            options = PrerecordedOptions(smart_format=True, model="nova-2", language="en-US")
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
-        return None
-def speak(text):
-    """Converts Text to Speech using Deepgram Aura"""
-    if not text or deepgram is None:
-        return None
     try:
-        filename = f"response_{int(time.time())}.mp3"
-        options = SpeakOptions(model="aura-asteria-en", encoding="linear16", container="wav")
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None
-# --- Main Logic ---
-def run_chat_pipeline(audio_input, history, state_messages):
     """
-    1. Transcribe Audio -> Update UI with User Text
-    2. Query LLM -> Update UI with AI Text
-    3. Generate Audio -> Auto-play response
     """
     if llm is None:
-        return history, state_messages, None
-    # --- Step 1: User Speech to Text ---
-    user_text = transcribe(audio_input)
     if not user_text:
-        # If silence or error, return existing state without changes
-        return history, state_messages, None
-    # Update internal memory (Standard OpenAI/Llama format)
-    state_messages.append({"role": "user", "content": user_text})
-    # Update UI History (Gradio Chatbot format: list of [user_msg, bot_msg])
-    # We add the user message temporarily with a pending bot response
-    history.append((user_text, None))
-    # --- Step 2: LLM Generation ---
-    try:
-        completion = llm.create_chat_completion(
-            messages=state_messages,
-            max_tokens=MAX_NEW_TOKENS,
-            temperature=TEMPERATURE
-        )
-        ai_text = completion['choices'][0]['message']['content']
-    except Exception as e:
-        ai_text = f"Error: {str(e)}"
-    # Update internal memory with AI response
-    state_messages.append({"role": "assistant", "content": ai_text})
-    # Update UI History: Replace the 'None' with the actual AI text
-    history[-1] = (user_text, ai_text)
-    # --- Step 3: Text to Speech ---
-    audio_path = speak(ai_text)
-    # Return: Updated Chatbot UI, Updated Internal State, Audio File
-    return history, state_messages, audio_path
-# --- Gradio UI Layout ---
-with gr.Blocks(title="Voice Chatbot") as demo:
-    gr.Markdown("## 🎙️ Voice-First AI Chat")
-    # 1. Visual Conversation History (The "Screen")
-    chatbot = gr.Chatbot(
-        label="Conversation",
-        type="messages", # Uses newer Gradio format if available, else standard
-        height=500
     )
-    # 2. State (Hidden Memory)
-    state_messages = gr.State([]) # Stores [{"role":"user", "content":"..."}, ...]
-    # 3. Audio Interaction Area
     with gr.Row():
-        with gr.Column(scale=4):
-            # Input Microphone
             audio_input = gr.Audio(
                 sources=["microphone"],
-                type="filepath",
-                label="Record Your Message"
             )
-        with gr.Column(scale=1):
-            # Send Button
-            submit_btn = gr.Button("Send Voice 💬", variant="primary")
-            clear_btn = gr.Button("Clear Chat 🗑️")
-    # 4. Hidden Output Audio (For Autoplay)
-    # We make it visible=False so it doesn't clutter UI,
-    # but Gradio still plays it if we return it to this component.
-    # Note: Some browsers block autoplay from hidden components.
-    # If it doesn't play, set visible=True.
-    audio_player = gr.Audio(
-        label="AI Voice",
-        autoplay=True,
-        visible=True, # Kept visible for control, can set to False
-        interactive=False
-    )
-    # --- Event Wiring ---
     submit_btn.click(
-        fn=run_chat_pipeline,
-        inputs=[audio_input, chatbot, state_messages],
-        outputs=[chatbot, state_messages, audio_player]
-    )
-    # Clear Logic
-    def clear_all():
-        return [], [], None
-    clear_btn.click(
-        fn=clear_all,
-        inputs=None,
-        outputs=[chatbot, state_messages, audio_player]
     )
 if __name__ == "__main__":

 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
 # --- Configuration ---
+# 1. API KEY: Ensure you have your Deepgram API Key ready
+# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
+DEEPGRAM_API_KEY = "19d640a011569d78395c814e5f875b15cc84deb8"
+# 2. Model Config
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 TEMPERATURE = 0.7
 # --- Initialize Deepgram ---
+if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
+    print("WARNING: Please set your DEEPGRAM_API_KEY.")
+deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+# --- Model Loading Function ---
 llm = None
 def load_llm():
+    """Downloads the GGUF model and initializes LlamaCPP."""
     global llm
     print("Downloading LLM...")
     try:
+        model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=FILENAME
+        )
+        # n_threads=2 is good for free Hugging Face CPU tiers
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
+        print("LLM loaded successfully!")
+        return llm
     except Exception as e:
         print(f"Error loading model: {e}")
+        return None
+# Load model on startup
 load_llm()
+# --- 1. Speech-to-Text (Deepgram) ---
+def transcribe_audio(audio_filepath):
+    """Sends audio file to Deepgram and returns text."""
+    if not audio_filepath:
+        return ""
     try:
+        with open(audio_filepath, "rb") as buffer:
             payload = {"buffer": buffer}
+            options = PrerecordedOptions(
+                smart_format=True,
+                model="nova-2",
+                language="en-US"
+            )
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
+        return ""
+# --- 2. Text-to-Speech (Deepgram) ---
+def text_to_speech(text):
+    """Sends text to Deepgram and returns path to audio file."""
     try:
+        filename = "output_response.mp3"
+        options = SpeakOptions(
+            model="aura-asteria-en", # Choices: aura-asteria-en, aura-helios-en, etc.
+            encoding="linear16",
+            container="wav"
+        )
+        # Save the audio to a file
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None
+# --- 3. Main Pipeline Function ---
+def process_conversation(audio_input):
     """
+    1. Transcribe Audio (STT)
+    2. Query LLM
+    3. Synthesize Speech (TTS)
     """
     if llm is None:
+        return "Model not loaded.", None, "System Error: Model failed to load."
+    # Step A: Transcribe
+    user_text = transcribe_audio(audio_input)
+    print(audio_input)
     if not user_text:
+        return "Could not hear audio.", None, ""
+    print(f"User said: {user_text}")
+    # Step B: LLM Inference
+    # Using the prompt format from your original code
+    full_prompt = f"### Human: {user_text}\n### Assistant:"
+    output = llm(
+        prompt=full_prompt,
+        max_tokens=MAX_NEW_TOKENS,
+        temperature=TEMPERATURE,
+        stop=["### Human:"],
+        echo=False
     )
+    response_text = output['choices'][0]['text'].strip()
+    print(f"LLM said: {response_text}")
+    # Step C: Speak Response
+    output_audio_path = text_to_speech(response_text)
+    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
+    return user_text, output_audio_path, response_text
+# --- Gradio UI ---
+with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
+    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
+    # Layout: a single row with two columns, microphone input and submit
+    # button on one side, the spoken reply and both text boxes on the other.
     with gr.Row():
+        # Input Column
+        with gr.Column():
             audio_input = gr.Audio(
                 sources=["microphone"],
+                type="filepath",
+                label="Speak Now"
             )
+            submit_btn = gr.Button("Submit Audio", variant="primary")
+        # Output Column
+        with gr.Column():
+            audio_output = gr.Audio(
+                label="Assistant Voice",
+                autoplay=True, # Automatically plays the response
+                interactive=False
+            )
+            # Debugging/Visuals
+            user_transcript = gr.Textbox(label="You said:")
+            ai_response_text = gr.Textbox(label="AI Response:")
+    # Event Listener
+    # The order of `outputs` must match the tuple process_conversation returns:
+    # (transcript, audio path, reply text).
     submit_btn.click(
+        fn=process_conversation,
+        inputs=[audio_input],
+        outputs=[user_transcript, audio_output, ai_response_text]
     )
 if __name__ == "__main__":