Spaces:

WWMachine
/

test

Sleeping

App Files Files Community

WWMachine committed on Dec 2, 2025

Commit

f4264e5

verified ·

1 Parent(s): 9c039c3

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -94

app.py CHANGED Viewed

@@ -3,13 +3,10 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
 # --- Configuration ---
-# 1. API KEY: Ensure you have your Deepgram API Key ready
-# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
-DEEPGRAM_API_KEY = "YOUR_DEEPGRAM_KEY_HERE"
-# 2. Model Config
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
@@ -17,143 +14,164 @@ MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
 # --- Initialize Deepgram ---
-if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
-    print("WARNING: Please set your DEEPGRAM_API_KEY.")
-deepgram = DeepgramClient(DEEPGRAM_API_KEY)
-# --- Model Loading Function ---
 llm = None
 def load_llm():
-    """Downloads the GGUF model and initializes LlamaCPP."""
     global llm
     print("Downloading LLM...")
     try:
-        model_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=FILENAME
-        )
-        # n_threads=2 is good for free Hugging Face CPU tiers
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
-        print("LLM loaded successfully!")
-        return llm
     except Exception as e:
         print(f"Error loading model: {e}")
-        return None
-# Load model on startup
 load_llm()
-# --- 1. Speech-to-Text (Deepgram) ---
-def transcribe_audio(audio_filepath):
-    """Sends audio file to Deepgram and returns text."""
-    if not audio_filepath:
-        return ""
     try:
-        with open(audio_filepath, "rb") as buffer:
             payload = {"buffer": buffer}
-            options = PrerecordedOptions(
-                smart_format=True,
-                model="nova-2",
-                language="en-US"
-            )
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
-        return ""
-# --- 2. Text-to-Speech (Deepgram) ---
-def text_to_speech(text):
-    """Sends text to Deepgram and returns path to audio file."""
     try:
-        filename = "output_response.mp3"
-        options = SpeakOptions(
-            model="aura-asteria-en", # Choices: aura-asteria-en, aura-helios-en, etc.
-            encoding="linear16",
-            container="wav"
-        )
-        # Save the audio to a file
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None
-# --- 3. Main Pipeline Function ---
-def process_conversation(audio_input):
     """
-    1. Transcribe Audio (STT)
-    2. Query LLM
-    3. Synthesize Speech (TTS)
     """
     if llm is None:
-        return "Model not loaded.", None, "System Error: Model failed to load."
-    # Step A: Transcribe
-    user_text = transcribe_audio(audio_input)
-    print(audio_input)
     if not user_text:
-        return "Could not hear audio.", None, ""
-    print(f"User said: {user_text}")
-    # Step B: LLM Inference
-    # Using the prompt format from your original code
-    full_prompt = f"### Human: {user_text}\n### Assistant:"
-    output = llm(
-        prompt=full_prompt,
-        max_tokens=MAX_NEW_TOKENS,
-        temperature=TEMPERATURE,
-        stop=["### Human:"],
-        echo=False
-    )
-    response_text = output['choices'][0]['text'].strip()
-    print(f"LLM said: {response_text}")
-    # Step C: Speak Response
-    output_audio_path = text_to_speech(response_text)
-    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
-    return user_text, output_audio_path, response_text
-# --- Gradio UI ---
-with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
-    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
     with gr.Row():
-        # Input Column
-        with gr.Column():
             audio_input = gr.Audio(
                 sources=["microphone"],
-                type="filepath",
-                label="Speak Now"
             )
-            submit_btn = gr.Button("Submit Audio", variant="primary")
-        # Output Column
-        with gr.Column():
-            audio_output = gr.Audio(
-                label="Assistant Voice",
-                autoplay=True, # Automatically plays the response
-                interactive=False
-            )
-            # Debugging/Visuals
-            user_transcript = gr.Textbox(label="You said:")
-            ai_response_text = gr.Textbox(label="AI Response:")
-    # Event Listener
     submit_btn.click(
-        fn=process_conversation,
-        inputs=[audio_input],
-        outputs=[user_transcript, audio_output, ai_response_text]
     )
 if __name__ == "__main__":

 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
+import time
 # --- Configuration ---
+# The Deepgram key is read from the environment (set it in Space Settings)
+# instead of being hard-coded in this file.
+DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 TEMPERATURE = 0.7
 # --- Initialize Deepgram ---
+# Without a key the client is left as None; transcribe() and speak() check
+# for that and return None instead of calling the API.
+if not DEEPGRAM_API_KEY:
+    print("Error: DEEPGRAM_API_KEY is missing.")
+    deepgram = None
+else:
+    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+# --- Load LLM ---
 llm = None
 def load_llm():
+    """Download the GGUF file from the Hub and build the global ``llm``.
+    Any exception is caught and printed, in which case this call leaves
+    ``llm`` unassigned.
+    """
     global llm
     print("Downloading LLM...")
     try:
+        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
+        print("LLM loaded!")
     except Exception as e:
         print(f"Error loading model: {e}")
 load_llm()
+# --- Helper Functions ---
+def transcribe(audio_path):
+    """Transcribe the audio file at ``audio_path`` with Deepgram (model "nova-2").
+    Returns the transcript of the first alternative of the first channel, or
+    ``None`` when no path is given, the Deepgram client is unavailable, or
+    the request fails (the error is printed).
+    """
+    if not audio_path or deepgram is None:
+        return None
     try:
+        with open(audio_path, "rb") as buffer:
             payload = {"buffer": buffer}
+            options = PrerecordedOptions(smart_format=True, model="nova-2", language="en-US")
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
+        return None
+def speak(text):
+    """Synthesize ``text`` with Deepgram Aura and return the saved file's path.
+    Returns ``None`` when ``text`` is empty, the Deepgram client is
+    unavailable, or the request fails (the error is printed).
+    """
+    if not text or deepgram is None:
+        return None
     try:
+        # The request below asks for linear16 audio in a WAV container, so the
+        # file is named .wav to match its contents (it was mislabelled .mp3).
+        filename = f"response_{int(time.time())}.wav"
+        options = SpeakOptions(model="aura-asteria-en", encoding="linear16", container="wav")
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None
+# --- Main Logic ---
+def run_chat_pipeline(audio_input, history, state_messages):
     """
+    Run one voice turn: speech -> text -> LLM reply -> speech.
+    1. Transcribe Audio -> Update UI with User Text
+    2. Query LLM -> Update UI with AI Text
+    3. Generate Audio -> Auto-play response
+    Args:
+        audio_input: path of the recorded audio file, passed to transcribe().
+        history: current Chatbot value, a list of {"role", "content"} dicts.
+        state_messages: messages sent to the LLM, in the same dict format.
+    Returns:
+        (history, state_messages, audio_path). The inputs are returned
+        unchanged with audio_path None when the model is not loaded or no
+        transcript was obtained.
     """
     if llm is None:
+        return history, state_messages, None
+    # --- Step 1: User Speech to Text ---
+    user_text = transcribe(audio_input)
     if not user_text:
+        # If silence or error, return existing state without changes
+        return history, state_messages, None
+    # Update internal memory (Standard OpenAI/Llama format)
+    state_messages.append({"role": "user", "content": user_text})
+    # Update UI history. The Chatbot is created with type="messages", so each
+    # entry must be a {"role", "content"} dict, not a (user_msg, bot_msg) tuple.
+    history.append({"role": "user", "content": user_text})
+    # --- Step 2: LLM Generation ---
+    try:
+        completion = llm.create_chat_completion(
+            messages=state_messages,
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE
+        )
+        ai_text = completion['choices'][0]['message']['content']
+    except Exception as e:
+        # The error text is shown (and spoken) as the assistant's reply.
+        ai_text = f"Error: {str(e)}"
+    # Update internal memory with AI response
+    state_messages.append({"role": "assistant", "content": ai_text})
+    # Update UI history with the assistant's reply
+    history.append({"role": "assistant", "content": ai_text})
+    # --- Step 3: Text to Speech ---
+    audio_path = speak(ai_text)
+    # Return: Updated Chatbot UI, Updated Internal State, Audio File
+    return history, state_messages, audio_path
+# --- Gradio UI Layout ---
+with gr.Blocks(title="Voice Chatbot") as demo:
+    gr.Markdown("## 🎙️ Voice-First AI Chat")
+    # 1. Visual Conversation History (The "Screen")
+    chatbot = gr.Chatbot(
+        label="Conversation",
+        type="messages", # History is a list of {"role", "content"} dicts
+        height=500
+    )
+    # 2. State (Hidden Memory)
+    state_messages = gr.State([]) # Stores [{"role":"user", "content":"..."}, ...]
+    # 3. Audio Interaction Area
     with gr.Row():
+        with gr.Column(scale=4):
+            # Input Microphone
             audio_input = gr.Audio(
                 sources=["microphone"],
+                type="filepath",
+                label="Record Your Message"
             )
+        with gr.Column(scale=1):
+            # Send and Clear Buttons
+            submit_btn = gr.Button("Send Voice 💬", variant="primary")
+            clear_btn = gr.Button("Clear Chat 🗑️")
+    # 4. Output Audio Player (For Autoplay)
+    # The player is kept visible so the user can control playback.
+    # It could be hidden with visible=False to reduce clutter, but
+    # some browsers may block autoplay from hidden components.
+    # If the response does not play, keep visible=True.
+    audio_player = gr.Audio(
+        label="AI Voice",
+        autoplay=True,
+        visible=True, # Kept visible for control, can set to False
+        interactive=False
+    )
+    # --- Event Wiring ---
     submit_btn.click(
+        fn=run_chat_pipeline,
+        inputs=[audio_input, chatbot, state_messages],
+        outputs=[chatbot, state_messages, audio_player]
+    )
+    # Clear Logic: empty the chat display and the LLM memory, and reset the player
+    def clear_all():
+        return [], [], None
+    clear_btn.click(
+        fn=clear_all,
+        inputs=None,
+        outputs=[chatbot, state_messages, audio_player]
     )
 if __name__ == "__main__":