Spaces:

SreekarB
/

SLP

Running

App Files Files Community

SreekarB committed on Mar 18, 2025

Commit

6dcd45d

verified ·

1 Parent(s): 6cd20fc

Upload 4 files

Browse files

Files changed (2) hide show

app.py +185 -283
requirements.txt +4 -2

app.py CHANGED Viewed

@@ -3,34 +3,21 @@ import numpy as np
 import tempfile
 import os
 import wave
-import queue
-import threading
 import time
-from datetime import datetime
 import speech_recognition as sr
 import requests
 import json
-from gtts import gTTS
 import io
-# Queue for audio chunks
-audio_queue = queue.Queue()
-# Flag to control real-time processing thread
-is_running = False
-# Store conversation history
 conversation_history = []
-# LLM response queue
-response_queue = queue.Queue()
-# For tracking if speech is active
-speech_active = False
-# For tracking silence periods
-last_speech_time = time.time()
-# Silence threshold in seconds before processing
-SILENCE_THRESHOLD = 1.0
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
-# Get API token from environment
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 headers = {
@@ -38,191 +25,50 @@ headers = {
     "Content-Type": "application/json"
 }
-def start_real_time_processing():
-    """Start real-time audio processing"""
-    global is_running, speech_active, last_speech_time
-    is_running = True
-    speech_active = False
-    last_speech_time = time.time()
-    # Clear previous history
-    conversation_history.clear()
-    # Add system message
-    conversation_history.append({
-        "role": "system",
-        "content": "You are a helpful, friendly AI assistant engaged in a natural voice conversation. Keep responses brief, conversational, and engaging. Ask follow-up questions when appropriate to maintain the dialogue flow."
-    })
-    # Add initial greeting to conversation history
-    greeting = "Hello! I'm your voice assistant. How can I help you today?"
-    conversation_history.append({"role": "assistant", "content": greeting})
-    # Convert greeting to speech and add to response queue
-    greeting_audio = text_to_speech(greeting)
-    if greeting_audio:
-        response_queue.put(greeting_audio)
-    # Start the processing thread
-    processing_thread = threading.Thread(target=process_audio_queue)
-    processing_thread.daemon = True
-    processing_thread.start()
-    # Start the response playback thread
-    response_thread = threading.Thread(target=process_response_queue)
-    response_thread.daemon = True
-    response_thread.start()
-    # Start the speech activity monitor thread
-    activity_thread = threading.Thread(target=monitor_speech_activity)
-    activity_thread.daemon = True
-    activity_thread.start()
-    return "Starting conversation... Please speak when ready."
-def stop_real_time_processing():
-    """Stop real-time audio processing"""
-    global is_running
-    is_running = False
-    return "Conversation ended."
-def process_audio_chunk(audio_chunk, sample_rate):
-    """Process incoming audio chunk and add to queue"""
-    global speech_active, last_speech_time
-    if is_running and audio_chunk is not None and len(audio_chunk) > 0:
-        # Check if there's actual speech (not just silence)
-        rms = np.sqrt(np.mean(audio_chunk**2))
-        if rms > 0.01:  # Simple threshold for detecting speech
-            speech_active = True
-            last_speech_time = time.time()
-        # Add to queue for processing
-        audio_queue.put((audio_chunk, sample_rate))
-        # Join the conversation history into a single string for display
-        conversation_text = ""
-        for message in conversation_history:
-            if message["role"] != "system":  # Skip system messages in display
-                prefix = "You: " if message["role"] == "user" else "Assistant: "
-                conversation_text += f"{prefix}{message['content']}\n\n"
-        # Get the current response audio if available
-        try:
-            response_audio = response_queue.get_nowait()
-            response_queue.task_done()
-        except queue.Empty:
-            response_audio = None
-        # Return the input audio for immediate playback if no response audio
-        if response_audio is None:
-            playback_audio = (sample_rate, audio_chunk)
-        else:
-            playback_audio = response_audio
-        # Also return speech activity status
-        status = "Listening..." if speech_active else "Ready for your input..."
-        if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
-            status = "Processing your response..."
-        return playback_audio, conversation_text + "\n" + status
-    return None, "Click 'Start Conversation' to begin"
-def monitor_speech_activity():
-    """Monitor speech activity and trigger processing when speech stops"""
-    global speech_active, last_speech_time
-    while is_running:
-        # If speech was active but has been silent for a while
-        if speech_active and (time.time() - last_speech_time) > SILENCE_THRESHOLD:
-            speech_active = False
-            # Signal to process the accumulated speech
-            process_accumulated_speech()
-        time.sleep(0.1)
-def process_accumulated_speech():
-    """Process all accumulated speech when a silence is detected"""
-    recognizer = sr.Recognizer()
-    # Create a temporary WAV file for all accumulated audio chunks
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
-        temp_filename = temp_file.name
-    # Check if we have enough accumulated audio
-    if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
-        process_accumulated_speech.accumulated_chunks = []
-    # If we have accumulated audio chunks
-    if process_accumulated_speech.accumulated_chunks:
-        # Get the sample rate from the first chunk
-        sample_rate = process_accumulated_speech.accumulated_chunks[0][1]
-        # Concatenate all audio chunks
-        all_audio = np.concatenate([chunk[0] for chunk in process_accumulated_speech.accumulated_chunks])
-        # Save to WAV file
-        with wave.open(temp_filename, 'wb') as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(sample_rate)
-            wf.writeframes((all_audio * 32767).astype(np.int16).tobytes())
-        # Perform speech recognition
-        try:
-            with sr.AudioFile(temp_filename) as source:
-                audio = recognizer.record(source)
-                text = recognizer.recognize_google(audio)
-                if text.strip():
-                    # Add user message to conversation history
-                    conversation_history.append({"role": "user", "content": text})
-        except sr.UnknownValueError:
-            # No speech detected
-            pass
-        except sr.RequestError as e:
-            print(f"Speech recognition error: {e}")
-        # Clean up
-        os.unlink(temp_filename)
-        process_accumulated_speech.accumulated_chunks = []
-def text_to_speech(text):
-    """Convert text to speech using gTTS"""
-    if not text.strip():
         return None
-    tts = gTTS(text=text, lang='en', slow=False)
-    # Save to a BytesIO object
-    fp = io.BytesIO()
-    tts.write_to_fp(fp)
-    fp.seek(0)
-    # Convert to audio array
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
-        temp_filename = temp_file.name
-    # Save the gTTS output to the temp file
-    with open(temp_filename, 'wb') as f:
-        f.write(fp.read())
-    # Read WAV file
-    with wave.open(temp_filename, 'rb') as wf:
-        sample_rate = wf.getframerate()
-        frames = wf.readframes(wf.getnframes())
-        audio_array = np.frombuffer(frames, dtype=np.int16)
-        audio_array = audio_array.astype(np.float32) / 32767.0
-    # Clean up temp file
-    os.unlink(temp_filename)
-    return (sample_rate, audio_array)
-def get_llm_response():
-    """Get response from LLM API"""
-    # Build conversation for the LLM
     messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
     try:
@@ -241,123 +87,179 @@ def get_llm_response():
             if response.status_code == 200:
                 response_json = response.json()
-                return response_json[0]["generated_text"]
             else:
-                return f"I'm having trouble connecting to my language model. Error: {response.status_code}"
         else:
-            # No API token available
-            return "To enable AI responses, please add a Hugging Face API token in the Space settings. For now, I can hear you but can't generate intelligent responses."
     except Exception as e:
-        return f"I encountered an error: {str(e)}. Please try again in a moment."
-def process_response_queue():
-    """Process responses and convert to audio"""
-    while is_running:
-        try:
-            # Check if new user message and no pending assistant response
-            if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
-                # Get LLM response
-                response_text = get_llm_response()
-                # Add to conversation history
-                conversation_history.append({"role": "assistant", "content": response_text})
-                # Convert to speech
-                audio = text_to_speech(response_text)
-                # Add to response queue
-                if audio is not None:
-                    response_queue.put(audio)
-            time.sleep(0.2)
-        except Exception as e:
-            print(f"Error in response thread: {e}")
-            time.sleep(0.5)
-def process_audio_queue():
-    """Process audio chunks from the queue"""
-    # Initialize accumulated chunks
-    if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
-        process_accumulated_speech.accumulated_chunks = []
-    while is_running:
-        try:
-            # Get audio chunk from queue with timeout
-            audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
-            if audio_chunk is not None and len(audio_chunk) > 0:
-                # Store in accumulated chunks for later processing
-                process_accumulated_speech.accumulated_chunks.append((audio_chunk, sample_rate))
-            # Mark this task as done
-            audio_queue.task_done()
-        except queue.Empty:
-            # Queue is empty, just continue
-            pass
-        except Exception as e:
-            print(f"Error in processing thread: {e}")
-            time.sleep(0.1)
-# Create Gradio interface
-with gr.Blocks(title="Real-Time Voice Conversation Assistant") as demo:
-    gr.Markdown("# Real-Time Voice Conversation Assistant")
-    gr.Markdown("Speak naturally and have an interactive conversation with the AI assistant.")
     with gr.Row():
-        start_button = gr.Button("Start Conversation", variant="primary", scale=2)
-        stop_button = gr.Button("End Conversation", variant="stop", scale=1)
-    # Real-time microphone input
-    audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy",
-                         label="Your Voice", elem_id="mic-input")
     with gr.Row():
-        # Audio output for playback
-        audio_output = gr.Audio(label="Audio", autoplay=True, elem_id="audio-output")
-        # Conversation output
-        conversation_display = gr.Textbox(label="Conversation",
-                                        lines=15,
-                                        elem_id="conversation-display")
     # Connect the components
-    start_button.click(start_real_time_processing, outputs=conversation_display)
-    stop_button.click(stop_real_time_processing, outputs=conversation_display)
-    # Stream processing
-    audio_input.stream(
-        process_audio_chunk,
-        inputs=[audio_input],
-        outputs=[audio_output, conversation_display],
-        show_progress=False
     )
     gr.Markdown("""
     ## How to use
-    1. Click the "Start Conversation" button
-    2. Speak naturally into your microphone
-    3. Pause briefly when you finish speaking to let the AI respond
-    4. The AI will respond audibly - just like a natural conversation!
-    5. Continue the conversation as long as you like
-    6. Click "End Conversation" when done
     """)
-    with gr.Accordion("Setup Instructions", open=True):
         gr.Markdown("""
-        ### Important: Setting up your API Token
-        This app requires a Hugging Face API token to enable AI responses:
         1. Create an account on [Hugging Face](https://huggingface.co/)
         2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
         3. Add the token in your Space settings:
            - Go to Settings > Repository Secrets
            - Add a secret with the key `HF_API_TOKEN` and your token as the value
-        Without a token, the app will still transcribe your speech but won't generate AI responses.
         """)
-# Launch the app with higher queue concurrency
 if __name__ == "__main__":
-    demo.queue(concurrency_count=3, max_size=20).launch()

 import tempfile
 import os
 import wave
 import time
+import subprocess
 import speech_recognition as sr
 import requests
 import json
 import io
+from gtts import gTTS
+import soundfile as sf
+# Conversation state
 conversation_history = []
+is_active = False
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 headers = {
     "Content-Type": "application/json"
 }
+def tts_with_ffmpeg(text):
+    """Convert text to speech using gTTS and ffmpeg"""
+    if not text or not text.strip():
+        return None
+    # Create temp files
+    mp3_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+    try:
+        # Generate speech with gTTS
+        tts = gTTS(text=text, lang='en', slow=False)
+        tts.save(mp3_file)
+        # Convert MP3 to WAV using ffmpeg (subprocess to ensure it works in all environments)
+        subprocess.run(["ffmpeg", "-i", mp3_file, "-ar", "22050", wav_file, "-y"],
+                      stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        # Load the WAV file
+        audio_data, sample_rate = sf.read(wav_file)
+        # Clean up temp files
+        os.unlink(mp3_file)
+        os.unlink(wav_file)
+        return (sample_rate, audio_data)
+    except Exception as e:
+        print(f"Error in TTS: {e}")
+        # Clean up temp files
+        if os.path.exists(mp3_file):
+            os.unlink(mp3_file)
+        if os.path.exists(wav_file):
+            os.unlink(wav_file)
         return None
+def get_ai_response(user_text):
+    """Get response from LLM"""
+    if not user_text or not user_text.strip():
+        return "I couldn't hear you clearly. Could you try again?"
+    # Add user message to history
+    conversation_history.append({"role": "user", "content": user_text})
+    # Build messages for API
     messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
     try:
             if response.status_code == 200:
                 response_json = response.json()
+                ai_text = response_json[0]["generated_text"]
             else:
+                ai_text = f"I'm having trouble connecting. Error: {response.status_code}"
         else:
+            ai_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
     except Exception as e:
+        ai_text = f"Error: {str(e)}. Please try again."
+    # Add AI response to history
+    conversation_history.append({"role": "assistant", "content": ai_text})
+    return ai_text
+def start_assistant():
+    """Start the voice assistant"""
+    global is_active, conversation_history
+    is_active = True
+    conversation_history = []
+    # Add system message
+    conversation_history.append({
+        "role": "system",
+        "content": "You are a helpful, friendly AI assistant like Alexa. Keep responses brief and conversational. When appropriate, ask follow-up questions to maintain the conversation."
+    })
+    # Welcome message
+    welcome = "Hello! I'm your AI assistant. I'm listening. What can I help you with?"
+    conversation_history.append({"role": "assistant", "content": welcome})
+    # Generate welcome audio
+    welcome_audio = tts_with_ffmpeg(welcome)
+    # Format conversation for display
+    conversation_text = "Assistant: " + welcome + "\n\n"
+    # Set initial state to listening
+    status = "Listening... (Click Record to speak)"
+    return welcome_audio, conversation_text, status, True
+def stop_assistant():
+    """Stop the voice assistant"""
+    global is_active
+    is_active = False
+    return None, "Assistant stopped.", "Inactive", False
+def process_voice(audio, listen_state, conversation_state, status_state):
+    """Process voice input and generate response"""
+    if not is_active or not listen_state:
+        return None, conversation_state, "Please start the assistant first", listen_state
+    if audio is None:
+        return None, conversation_state, status_state, listen_state
+    # Process the audio recording
+    sample_rate, audio_data = audio
+    # Save to temporary WAV file for speech recognition
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        temp_filename = temp_file.name
+    with wave.open(temp_filename, 'wb') as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)  # 16-bit audio
+        wf.setframerate(sample_rate)
+        wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
+    # Perform speech recognition
+    recognizer = sr.Recognizer()
+    transcription = ""
+    try:
+        with sr.AudioFile(temp_filename) as source:
+            audio = recognizer.record(source)
+            transcription = recognizer.recognize_google(audio)
+    except sr.UnknownValueError:
+        os.unlink(temp_filename)
+        return None, conversation_state, "I didn't catch that. Please try again.", listen_state
+    except sr.RequestError as e:
+        os.unlink(temp_filename)
+        return None, conversation_state, f"Speech recognition error: {e}", listen_state
+    # Clean up temp file
+    os.unlink(temp_filename)
+    # Update status
+    status = "Processing your request..."
+    # Get AI response
+    ai_response = get_ai_response(transcription)
+    # Generate audio response
+    audio_response = tts_with_ffmpeg(ai_response)
+    # Format conversation for display
+    conversation_text = ""
+    for message in conversation_history:
+        if message["role"] != "system":  # Skip system messages
+            prefix = "You: " if message["role"] == "user" else "Assistant: "
+            conversation_text += f"{prefix}{message['content']}\n\n"
+    # Set status back to listening
+    status = "Listening... (Click Record to speak)"
+    return audio_response, conversation_text, status, listen_state
+# Create the Gradio interface
+with gr.Blocks(title="Voice Assistant (Alexa-style)") as demo:
+    gr.Markdown("# Voice Assistant")
+    gr.Markdown("Speak naturally with the AI assistant like you would with Alexa")
+    # State variables
+    listening = gr.State(False)
     with gr.Row():
+        start_button = gr.Button("Start Assistant", variant="primary", scale=2)
+        stop_button = gr.Button("Stop Assistant", variant="stop", scale=1)
+    with gr.Row():
+        status_display = gr.Textbox(label="Status", value="Inactive")
     with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
+        with gr.Column(scale=2):
+            conversation_display = gr.Textbox(label="Conversation", lines=10, interactive=False)
+    audio_output = gr.Audio(label="Assistant's Voice", autoplay=True)
     # Connect the components
+    start_button.click(
+        start_assistant,
+        outputs=[audio_output, conversation_display, status_display, listening]
+    )
+    stop_button.click(
+        stop_assistant,
+        outputs=[audio_output, conversation_display, status_display, listening]
+    )
+    audio_input.change(
+        process_voice,
+        inputs=[audio_input, listening, conversation_display, status_display],
+        outputs=[audio_output, conversation_display, status_display, listening]
     )
     gr.Markdown("""
     ## How to use
+    1. Click "Start Assistant" to begin
+    2. Click the microphone button and speak your question or command
+    3. When done speaking, click Stop on the recording control
+    4. Listen to the assistant's response
+    5. Continue the conversation by speaking again
+    6. Click "Stop Assistant" when you're finished
+    For the best experience, make sure your question or command is clear and complete before stopping the recording.
     """)
+    with gr.Accordion("Setup Guide", open=True):
         gr.Markdown("""
+        ### API Token Setup
+        This app requires a Hugging Face API token for AI responses:
         1. Create an account on [Hugging Face](https://huggingface.co/)
         2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
         3. Add the token in your Space settings:
            - Go to Settings > Repository Secrets
            - Add a secret with the key `HF_API_TOKEN` and your token as the value
         """)
+# Launch the app
 if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,7 @@
-gradio>=3.50.0
 numpy>=1.19.0
 SpeechRecognition>=3.8.1
 requests>=2.25.1
-gTTS>=2.3.2

+gradio==3.50.0
 numpy>=1.19.0
 SpeechRecognition>=3.8.1
 requests>=2.25.1
+gTTS>=2.3.2
+soundfile>=0.12.1
+ffmpeg-python>=0.2.0