Upload 4 files

- app.py  +148 -153
- requirements.txt  +1 -2
app.py
CHANGED
@@ -4,16 +4,11 @@ import tempfile
 import os
 import wave
 import requests
-import …
-import time
-import speech_recognition as sr
+import json
 from gtts import gTTS
 
 # Conversation state
 conversation = []
-is_active = False
-listen_thread = None
-stop_listening = False
 
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
@@ -24,91 +19,63 @@ headers = {
     "Content-Type": "application/json"
 }
 
-def …
-    """…"""
-    recognizer = sr.Recognizer()
-    recognizer.dynamic_energy_threshold = True
-
-    # …
-    conversation = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
-
-    conversation.append({"role": "assistant", "content": welcome_msg})
-
-    # …
-    …
-            # Generate speech
-            speech_file = text_to_speech(ai_response)
-
-            # Update the display
-            update_conversation_display(speech_file)
-        except sr.UnknownValueError:
-            print("Could not understand audio")
-        except sr.RequestError as e:
-            print(f"Could not request results; {e}")
-
-        except Exception as e:
-            print(f"Listening error: {e}")
-            time.sleep(0.1)
-
-    print("Stopped listening.")
-    return
-
-# Variables for storing outputs (needed for updating the interface)
-output_audio = None
-conversation_text = ""
-
-def …
-    """…"""
-    …
-    output_audio = audio_path
-
-def get_llm_response():
-    """Get response from LLM API"""
+def transcribe_audio(audio):
+    """Transcribe audio to text using Gradio's built-in speech recognition"""
+    if audio is None:
+        return None
+
+    # Gradio 3.50.0 passes (sample_rate, audio_data)
+    sample_rate, audio_data = audio
+
+    if len(audio_data) == 0:
+        return None
+
+    # Simple energy check to see if there's actually speech
+    if np.max(np.abs(audio_data)) < 0.05:
+        return None
+
+    # Create a temporary WAV file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        temp_filename = temp_file.name
+
+    try:
+        with wave.open(temp_filename, 'wb') as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)  # 16-bit audio
+            wf.setframerate(sample_rate)
+            wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
+
+        # Use Gradio's default transcription
+        import speech_recognition as sr
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(temp_filename) as source:
+            audio_data = recognizer.record(source)
+        text = recognizer.recognize_google(audio_data)
+        return text.strip()
+    except Exception as e:
+        print(f"Error in transcription: {e}")
+        return None
+    finally:
+        # Clean up
+        if os.path.exists(temp_filename):
+            os.unlink(temp_filename)
+
+def get_ai_response(user_text):
+    """Get AI response from LLM API"""
+    if not user_text:
+        return "I didn't catch that. Could you speak again?"
+
+    # Add user message to conversation
+    conversation.append({"role": "user", "content": user_text})
+
+    # Prepare messages for API
+    messages = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
+    messages.extend(conversation)
+
     try:
         if not HF_API_TOKEN:
-            …
+            ai_response = "Please add a Hugging Face API token to enable AI responses."
         else:
-            # Prepare messages for API
-            messages = [msg for msg in conversation]  # Include system message
-
             # Make API call
             payload = {
                 "inputs": messages,
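Note on the `transcribe_audio` hunk: the energy gate (`< 0.05`) and the `* 32767` scaling both assume `audio_data` arrives as floats in [-1, 1], but Gradio 3.50's numpy audio type typically delivers int16 samples, which would throw off both checks. A minimal dtype guard as a sketch; the helper name `to_float32` is ours, not part of the commit, and it assumes `import numpy as np` in the file's unchanged header:

```python
import numpy as np

def to_float32(audio_data: np.ndarray) -> np.ndarray:
    """Normalize integer PCM (e.g. int16 from Gradio) to float32 in [-1.0, 1.0]."""
    if np.issubdtype(audio_data.dtype, np.integer):
        return audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    return audio_data.astype(np.float32)

# Inside transcribe_audio, before the energy check and the WAV write:
#     audio_data = to_float32(audio_data)
# the 0.05 threshold and (audio_data * 32767).astype(np.int16) are then safe.
```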
@@ -122,15 +89,16 @@ def get_llm_response():
             response = requests.post(HF_API_URL, headers=headers, json=payload)
 
             if response.status_code == 200:
-                …
-                conversation.append({"role": "assistant", "content": generated_text})
-                response_text = generated_text
+                ai_response = response.json()[0]["generated_text"]
             else:
-                …
-
-        return response_text
+                ai_response = f"I'm having trouble connecting. Error: {response.status_code}"
     except Exception as e:
-        …
+        ai_response = f"Error: {str(e)}"
+
+    # Add assistant response to conversation
+    conversation.append({"role": "assistant", "content": ai_response})
+
+    return ai_response
 
 def text_to_speech(text):
     """Convert text to speech"""
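One caveat on the API call above: the serverless Inference API's text-generation task expects `"inputs"` to be a single string, so posting the raw `messages` list will typically be rejected. A sketch of flattening the history into a prompt; `messages_to_prompt` is an illustrative helper, not from the commit, and a production prompt for Llama-2-chat should follow its `[INST]`/`<<SYS>>` template instead:

```python
def messages_to_prompt(messages):
    """Flatten chat messages into one prompt string for the text-generation task."""
    lines = [f"{m['role']}: {m['content']}" for m in messages]
    lines.append("assistant:")  # cue the model to answer
    return "\n".join(lines)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant. Keep responses brief."},
    {"role": "user", "content": "What can you do?"},
]
payload = {
    "inputs": messages_to_prompt(messages),
    "parameters": {"max_new_tokens": 150, "return_full_text": False},
}
```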
@@ -149,84 +117,111 @@ def text_to_speech(text):
         print(f"TTS error: {e}")
         return None
 
-def …
-    """…"""
-    global …
-    …
+def start_conversation():
+    """Start a new conversation"""
+    global conversation
+    conversation = []
+
+    # Add welcome message
+    welcome = "Hello! I'm your AI assistant. Speak into the microphone and I'll respond to you."
+    conversation.append({"role": "assistant", "content": welcome})
+
+    # Generate speech
+    welcome_audio = text_to_speech(welcome)
+
+    return welcome_audio, "Conversation started. Speak into the microphone."
+
+def process_interaction(audio):
+    """Process a single interaction"""
+    if audio is None:
+        return None, get_conversation_text()
+
+    # Transcribe audio to text
+    user_text = transcribe_audio(audio)
+
+    if not user_text:
+        return None, get_conversation_text()
+
+    # Get AI response
+    ai_response = get_ai_response(user_text)
+
+    # Convert to speech
+    speech_file = text_to_speech(ai_response)
+
+    # Update conversation display
+    conversation_text = get_conversation_text()
+
+    return speech_file, conversation_text
+
+def get_conversation_text():
+    """Format conversation history for display"""
+    result = ""
+    for msg in conversation:
+        if msg["role"] != "system":  # Skip system messages
+            prefix = "You: " if msg["role"] == "user" else "Assistant: "
+            result += f"{prefix}{msg['content']}\n\n"
+    return result
 
-def …
-    """…"""
-    …
-
-# Create …
-with gr.Blocks(title="Interactive Voice Assistant"…
+# Create Gradio interface
+with gr.Blocks(title="Interactive Voice Assistant") as demo:
     with gr.Column():
         gr.Markdown("# Interactive Voice Assistant")
-        gr.Markdown("…
+        gr.Markdown("""
+        Just click "Start" and begin speaking with the assistant.
+        The interaction is simple: speak, get a response, speak again.
+        """)
 
-        # …
-        …
-            outputs=[…
+        # Two-panel layout
+        with gr.Row():
+            with gr.Column(scale=1):
+                start_button = gr.Button("Start New Conversation", variant="primary")
+
+                # Recording component that captures voice
+                audio_input = gr.Audio(
+                    label="Speak Here",
+                    type="numpy",
+                    sources=None,
+                    interactive=True
+                )
+
+            with gr.Column(scale=2):
+                # Display conversation
+                conversation_display = gr.Textbox(
+                    label="Conversation History",
+                    lines=15,
+                    value=""
+                )
+
+                # Audio output for assistant responses
+                audio_output = gr.Audio(
+                    label="Assistant's Voice",
+                    type="filepath",
+                    autoplay=True
+                )
+
+        gr.Markdown("""
+        ## How to use
+        1. Click "Start New Conversation" to begin
+        2. Click the microphone button below "Speak Here" and talk to the assistant
+        3. When done speaking, click the stop button
+        4. The assistant will respond with voice and text
+        5. Continue the conversation by speaking again
+
+        This assistant works like Alexa - just speak, and get a response!
+        """)
+
+    # Set up the interactions
+    start_button.click(
+        start_conversation,
+        outputs=[audio_output, conversation_display]
     )
 
-        …
-            outputs=[conversation_display, audio_output],
-            every=1  # Check every second
+    audio_input.change(
+        process_interaction,
+        inputs=[audio_input],
+        outputs=[audio_output, conversation_display]
     )
-
-        gr.Markdown("""
-        ## How to use
-        1. Click 'Start Conversation'
-        2. Start speaking directly to the AI assistant
-        3. The assistant will respond when you pause speaking
-        4. Keep the conversation going naturally
-        5. Click 'Stop Conversation' when done
-
-        ## Notes
-        - Make sure your microphone is enabled in your browser
-        - Speak clearly with pauses between your questions
-        - The assistant will update the conversation history in real-time
-        """)
 
 # Launch the app
 if __name__ == "__main__":
-    …
-    demo.queue(max_size=20).launch(debug=True)
+    demo.launch()
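A note on the interface wiring: with `gradio==3.50.0` (as pinned in requirements.txt), `gr.Audio` takes a singular `source` string; `sources` is the Gradio 4.x spelling, so `sources=None` likely leaves the component in its default upload mode rather than enabling the microphone. A sketch of the input under the 3.50 API, our correction rather than what the commit ships:

```python
import gradio as gr

audio_input = gr.Audio(
    label="Speak Here",
    source="microphone",  # Gradio 3.x kwarg; Gradio 4.x uses sources=["microphone"]
    type="numpy",
    interactive=True,
)
```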
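Finally, `start_conversation` returns a status string into `conversation_display`, while every later update flows through `get_conversation_text()`, so right after Start the transcript panel shows a status line instead of the welcome text. A sketch that reuses the commit's own helpers to keep the display consistent from the first turn:

```python
def start_conversation():
    """Reset state and show the welcome line in the transcript panel."""
    global conversation
    welcome = "Hello! I'm your AI assistant. Speak into the microphone and I'll respond to you."
    conversation = [{"role": "assistant", "content": welcome}]
    return text_to_speech(welcome), get_conversation_text()
```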
requirements.txt
CHANGED
@@ -2,5 +2,4 @@ gradio==3.50.0
 numpy>=1.19.0
 SpeechRecognition>=3.8.1
 requests>=2.25.1
-gTTS>=2.3.2
-pyaudio>=0.2.11
+gTTS>=2.3.2