Spaces:

WWMachine
/

test

Sleeping

App Files Files Community

WWMachine committed on Dec 2, 2025

Commit

02799cd

verified ·

1 Parent(s): e34cdab

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -102

app.py CHANGED Viewed

@@ -2,158 +2,211 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
 # --- Configuration ---
-# 1. API KEY: Ensure you have your Deepgram API Key ready
-# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
 DEEPGRAM_API_KEY = "19d640a011569d78395c814e5f875b15cc84deb8"
-# 2. Model Config
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
-# --- Initialize Deepgram ---
-if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
-    print("WARNING: Please set your DEEPGRAM_API_KEY.")
-deepgram = DeepgramClient(DEEPGRAM_API_KEY)
# --- Model Loading Function ---
llm = None


def load_llm():
    """Fetch the GGUF weights from the Hub and build the global Llama instance.

    Returns:
        The initialised model (also stored in the module-level ``llm``), or
        ``None`` when the download or initialisation raised.
    """
    global llm
    print("Downloading LLM...")
    try:
        gguf_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)
        # n_threads=2 is good for free Hugging Face CPU tiers
        llama_kwargs = {
            "model_path": gguf_path,
            "n_ctx": CONTEXT_WINDOW,
            "n_threads": 2,
            "verbose": False,
        }
        llm = Llama(**llama_kwargs)
        print("LLM loaded successfully!")
    except Exception as err:
        print(f"Error loading model: {err}")
        return None
    else:
        return llm


# Load model on startup
load_llm()
# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Transcribe a recorded audio file through Deepgram.

    Args:
        audio_filepath: Path of the audio file to upload; may be empty/None.

    Returns:
        The transcript of the first alternative on the first channel, or an
        empty string when no path was given or any step raised.
    """
    if audio_filepath:
        try:
            with open(audio_filepath, "rb") as audio_file:
                source = {"buffer": audio_file}
                stt_options = PrerecordedOptions(
                    smart_format=True,
                    model="nova-2",
                    language="en-US",
                )
                result = deepgram.listen.rest.v("1").transcribe_file(source, stt_options)
                first_channel = result.results.channels[0]
                return first_channel.alternatives[0].transcript
        except Exception as err:
            # Best-effort: log and fall through to the empty-string result.
            print(f"STT Error: {err}")
    return ""
# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Synthesize ``text`` with Deepgram and return the path of the audio file.

    Args:
        text: The text to speak.

    Returns:
        Path of the written WAV file, or ``None`` when ``text`` is empty or
        blank, or when the Deepgram request fails.
    """
    # Nothing to say: skip the API round-trip entirely.
    if not text or not text.strip():
        return None
    try:
        # The request below asks for linear16 audio in a WAV container, so the
        # file must carry a .wav extension (it was previously named .mp3).
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16",
            container="wav",
        )
        # Save the audio to a file
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None
# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """Run one voice turn: transcribe (STT), query the LLM, synthesize (TTS).

    Args:
        audio_input: Path of the recorded audio, as delivered by the Gradio
            audio component.

    Returns:
        A 3-tuple ``(user_text, audio_path, response_text)`` matching the
        three UI outputs. ``audio_path`` is ``None`` whenever no speech could
        be produced (model missing, nothing heard, LLM/TTS failure).
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."

    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""
    print(f"User said: {user_text}")

    # Step B: LLM Inference
    full_prompt = f"### Human: {user_text}\n### Assistant:"
    try:
        output = llm(
            prompt=full_prompt,
            max_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            stop=["### Human:"],
            echo=False,
        )
        response_text = output['choices'][0]['text'].strip()
    except Exception as e:
        # Surface the failure in the UI instead of crashing the event handler.
        print(f"LLM Error: {e}")
        return user_text, None, f"System Error: LLM generation failed: {e}"
    print(f"LLM said: {response_text}")

    # Step C: Speak Response (skipped when the model produced no text)
    output_audio_path = text_to_speech(response_text) if response_text else None

    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
    return user_text, output_audio_path, response_text
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")

    with gr.Row():
        # Left: microphone capture and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak Now")
            submit_btn = gr.Button(value="Submit Audio", variant="primary")

        # Right: spoken reply plus both transcripts for inspection.
        with gr.Column():
            # autoplay=True plays the response as soon as it arrives
            audio_output = gr.Audio(label="Assistant Voice", autoplay=True, interactive=False)
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")

    # Wire the button to the pipeline; output order mirrors its return tuple.
    submit_btn.click(
        process_conversation,
        [audio_input],
        [user_transcript, audio_output, ai_response_text],
    )
 if __name__ == "__main__":

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import re
+import time
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
+from pydub import AudioSegment # Added for audio stitching
 # --- Configuration ---
 DEEPGRAM_API_KEY = "19d640a011569d78395c814e5f875b15cc84deb8"
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
+# Deepgram Limit: Maximum 2000 characters per TTS request.
+TTS_MAX_CHARS = 1900 # Use slightly less than max for safety
+# --- Initialize Deepgram & LLM ---
+deepgram = DeepgramClient(DEEPGRAM_API_KEY) if DEEPGRAM_API_KEY else None
 llm = None
 def load_llm():
     global llm
     try:
+        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
+            n_threads=2,
             verbose=False
         )
     except Exception as e:
+        print(f"Error loading LLM: {e}")
 load_llm()
# --- Helper Functions for Splitting ---
def split_text_for_tts(text, max_chars=None):
    """Split ``text`` into chunks of at most ``max_chars`` characters for TTS.

    Text is cut at sentence boundaries (``.``, ``?`` or ``!`` followed by
    whitespace, or one or more newlines) and consecutive sentences are packed
    greedily into a chunk. A single sentence that is longer than
    ``max_chars`` is hard-wrapped, at a space when one is available, so that
    no returned chunk ever exceeds the limit.

    Args:
        text: The text to split. ``None`` or ``""`` yields an empty list.
        max_chars: Maximum length of each chunk. Defaults to the module-level
            ``TTS_MAX_CHARS``, looked up at call time.

    Returns:
        A list of non-empty, whitespace-stripped chunks in original order.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars is None:
        max_chars = TTS_MAX_CHARS
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if not text:
        return []

    chunks = []
    current_chunk = ""
    # The capture group keeps each delimiter as its own segment, so the
    # punctuation is re-attached to the sentence while packing.
    for segment in re.split(r'([.?!]\s+|\n+)', text):
        if len(current_chunk) + len(segment) <= max_chars:
            current_chunk += segment
            continue

        # The segment does not fit: flush what has been packed so far.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # A lone sentence may itself be over the limit; wrap it, preferring
        # the last space inside the window so words stay intact.
        while len(segment) > max_chars:
            cut = segment.rfind(" ", 0, max_chars + 1)
            if cut <= 0:
                cut = max_chars
            piece = segment[:cut].strip()
            if piece:
                chunks.append(piece)
            segment = segment[cut:].lstrip()
        current_chunk = segment

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# --- 1. Speech-to-Text (STT) with File Size Check ---
def transcribe(audio_path):
    """Convert speech to text using Deepgram.

    Args:
        audio_path: Path of the recorded audio file; may be empty/None.

    Returns:
        The transcript of the first alternative on the first channel, or
        ``None`` when no path was given, Deepgram is not configured, the file
        cannot be read, or the request fails.
    """
    if not audio_path or deepgram is None:
        return None

    # Only a warning threshold; large files are still submitted.
    large_file_bytes = 200 * 1024 * 1024

    try:
        # The size probe lives inside the try block so that a missing or
        # unreadable file is reported like any other STT failure instead of
        # raising out of the event handler.
        if os.path.getsize(audio_path) > large_file_bytes:
            print("Warning: Audio file is large. Transcription may take a moment.")

        with open(audio_path, "rb") as buffer:
            payload = {"buffer": buffer}
            options = PrerecordedOptions(
                smart_format=True, model="nova-2", language="en-US",
                # Add diarization=True if you want speaker separation in the transcript
            )
            response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
            return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return None
# --- 2. Text-to-Speech (TTS) with Stitching ---
def text_to_speech(text):
    """Convert text to speech, splitting long text and stitching the audio.

    The text is split with ``split_text_for_tts``; each chunk is synthesized
    by Deepgram into a temporary WAV file, loaded with pydub, and the pieces
    are concatenated with a 200 ms pause between them. Chunks whose request
    fails are skipped.

    Args:
        text: The text to speak.

    Returns:
        Path of the stitched WAV file, or ``None`` when ``text`` is empty or
        blank, Deepgram is not configured, or no chunk could be synthesized.
    """
    if not text or not text.strip() or deepgram is None:
        return None

    # Step A: Split text into small chunks
    text_chunks = split_text_for_tts(text)

    # Step B: Generate audio for each chunk
    options = SpeakOptions(
        model="aura-asteria-en", encoding="linear16", container="wav"
    )
    audio_segments = []
    for i, chunk in enumerate(text_chunks):
        temp_filename = f"temp_tts_chunk_{i}_{int(time.time())}.wav"
        try:
            deepgram.speak.rest.v("1").save(temp_filename, {"text": chunk}, options)
            # Load the temporary audio into pydub (fully read into memory)
            audio_segments.append(AudioSegment.from_wav(temp_filename))
        except Exception as e:
            print(f"TTS API FAILED for chunk {i}: {e}. Skipping chunk.")
        finally:
            # Remove the temp file on failure too, so a failed save or decode
            # does not leave stray files behind.
            if os.path.exists(temp_filename):
                os.remove(temp_filename)

    if not audio_segments:
        return None

    # Step C: Stitch the audio files together
    stitched_audio = audio_segments[0]
    for segment in audio_segments[1:]:
        # Add a 200ms pause between sentences for better flow
        stitched_audio += AudioSegment.silent(duration=200)
        stitched_audio += segment

    # Step D: Export the final stitched file
    final_filename = f"final_response_{int(time.time())}.wav"
    stitched_audio.export(final_filename, format="wav")
    return final_filename
# --- Main Chat Logic ---
def run_chat_pipeline(audio_input, history, state_messages):
    """Run one voice turn: STT -> contextual LLM reply -> TTS.

    Args:
        audio_input: Path of the recorded audio from the microphone widget.
        history: Chatbot display history as a list of ``(user, assistant)``
            tuples; ``None`` is treated as an empty history.
        state_messages: Conversation memory as a list of
            ``{"role", "content"}`` dicts passed to the LLM; ``None`` is
            treated as empty.

    Returns:
        ``(history, state_messages, audio_path)`` for the three UI outputs.
        ``audio_path`` is ``None`` when there is nothing to play.
    """
    history = [] if history is None else history
    state_messages = [] if state_messages is None else state_messages

    if llm is None:
        return history, state_messages, None

    # 1. Transcribe Audio (STT)
    user_text = transcribe(audio_input)
    if not user_text:
        # If transcription fails (e.g., bad audio, API key error), inform the user via the chat.
        history.append(("", "System Error: Could not process audio. Check API Key or try speaking louder."))
        return history, state_messages, None

    # 2. Update UI and State with User Message
    state_messages.append({"role": "user", "content": user_text})
    history.append((user_text, None))

    # 3. LLM Generation (Contextual)
    try:
        completion = llm.create_chat_completion(
            messages=state_messages,
            max_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE
        )
        ai_text = completion['choices'][0]['message']['content']
    except Exception as e:
        # Show the failure in the chat only. The error text must not become
        # an assistant turn in the model's memory, and the unanswered user
        # turn is rolled back so the next request starts from a consistent
        # user/assistant history. Nothing is sent to TTS.
        state_messages.pop()
        history[-1] = (user_text, f"LLM Generation Error: {str(e)}")
        return history, state_messages, None

    # 4. Update UI and State with AI Response
    state_messages.append({"role": "assistant", "content": ai_text})
    history[-1] = (user_text, ai_text)

    # 5. Generate Audio (TTS with splitting and stitching)
    audio_path = text_to_speech(ai_text)
    return history, state_messages, audio_path
# --- Gradio UI Layout ---
with gr.Blocks(title="Voice Chatbot") as demo:
    gr.Markdown("## 🎙️ Voice-First AI Chat (Memory & Long-Text Handled)")

    # Visible transcript plus the hidden message list fed to the LLM.
    chatbot = gr.Chatbot(label="Conversation", height=500)
    state_messages = gr.State(value=[])

    with gr.Row():
        with gr.Column(scale=4):
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Message")
        with gr.Column(scale=1):
            submit_btn = gr.Button(value="Send Voice 💬", variant="primary")
            clear_btn = gr.Button(value="Clear Memory 🗑️")

    audio_player = gr.Audio(label="AI Voice", autoplay=True, interactive=False)

    def clear_all():
        """Reset the chat display, the LLM memory and the audio player."""
        return [], [], None

    # --- Event Wiring ---
    # Both handlers write to the same three components, in this order.
    submit_btn.click(
        run_chat_pipeline,
        [audio_input, chatbot, state_messages],
        [chatbot, state_messages, audio_player],
    )
    clear_btn.click(
        clear_all,
        None,
        [chatbot, state_messages, audio_player],
    )
 if __name__ == "__main__":