Spaces:

SreekarB
/

SLP

Running

App Files Community

SreekarB committed on Mar 18, 2025

Commit

6cd20fc

verified ·

1 Parent(s): 4d6f567

Upload 4 files

Browse files

Files changed (1) hide show

app.py +153 -72

app.py CHANGED Viewed

@@ -21,10 +21,16 @@ is_running = False
 conversation_history = []
 # LLM response queue
 response_queue = queue.Queue()
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
-# Replace with your actual Hugging Face API token
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 headers = {
@@ -34,8 +40,10 @@ headers = {
 def start_real_time_processing():
     """Start real-time audio processing"""
-    global is_running
     is_running = True
     # Clear previous history
     conversation_history.clear()
@@ -43,9 +51,18 @@ def start_real_time_processing():
     # Add system message
     conversation_history.append({
         "role": "system",
-        "content": "You are a helpful, friendly AI assistant. Keep responses brief and conversational."
     })
     # Start the processing thread
     processing_thread = threading.Thread(target=process_audio_queue)
     processing_thread.daemon = True
@@ -56,17 +73,30 @@ def start_real_time_processing():
     response_thread.daemon = True
     response_thread.start()
-    return "Real-time assistant started. Speak into your microphone..."
 def stop_real_time_processing():
     """Stop real-time audio processing"""
     global is_running
     is_running = False
-    return "Real-time assistant stopped."
 def process_audio_chunk(audio_chunk, sample_rate):
     """Process incoming audio chunk and add to queue"""
     if is_running and audio_chunk is not None and len(audio_chunk) > 0:
         # Add to queue for processing
         audio_queue.put((audio_chunk, sample_rate))
@@ -90,9 +120,73 @@ def process_audio_chunk(audio_chunk, sample_rate):
         else:
             playback_audio = response_audio
-        return playback_audio, conversation_text
-    return None, "Click 'Start' to begin real-time processing"
 def text_to_speech(text):
     """Convert text to speech using gTTS"""
@@ -106,16 +200,27 @@ def text_to_speech(text):
     tts.write_to_fp(fp)
     fp.seek(0)
     # Read WAV file
-    with wave.open(fp, 'rb') as wf:
         sample_rate = wf.getframerate()
         frames = wf.readframes(wf.getnframes())
         audio_array = np.frombuffer(frames, dtype=np.int16)
         audio_array = audio_array.astype(np.float32) / 32767.0
     return (sample_rate, audio_array)
-def get_llm_response(input_text):
     """Get response from LLM API"""
     # Build conversation for the LLM
     messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
@@ -138,22 +243,21 @@ def get_llm_response(input_text):
                 response_json = response.json()
                 return response_json[0]["generated_text"]
             else:
-                return f"API Error: {response.status_code} - {response.text}"
         else:
-            # Fallback response if no API token is provided
-            return "I don't have an API token configured, but I heard you! Please check the README for setup instructions."
     except Exception as e:
-        return f"Error: {str(e)}"
 def process_response_queue():
     """Process responses and convert to audio"""
     while is_running:
         try:
             if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
-                user_message = conversation_history[-1]["content"]
                 # Get LLM response
-                response_text = get_llm_response(user_message)
                 # Add to conversation history
                 conversation_history.append({"role": "assistant", "content": response_text})
@@ -165,14 +269,16 @@ def process_response_queue():
                 if audio is not None:
                     response_queue.put(audio)
-            time.sleep(0.5)
         except Exception as e:
             print(f"Error in response thread: {e}")
             time.sleep(0.5)
 def process_audio_queue():
     """Process audio chunks from the queue"""
-    recognizer = sr.Recognizer()
     while is_running:
         try:
@@ -180,35 +286,8 @@ def process_audio_queue():
             audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
             if audio_chunk is not None and len(audio_chunk) > 0:
-                # Create a temporary WAV file for speech recognition
-                with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
-                    temp_filename = temp_file.name
-                # Save the audio chunk to the temporary file
-                with wave.open(temp_filename, 'wb') as wf:
-                    wf.setnchannels(1)
-                    wf.setsampwidth(2)  # 16-bit audio
-                    wf.setframerate(sample_rate)
-                    wf.writeframes((audio_chunk * 32767).astype(np.int16).tobytes())
-                # Perform speech recognition
-                try:
-                    with sr.AudioFile(temp_filename) as source:
-                        audio = recognizer.record(source)
-                        text = recognizer.recognize_google(audio)
-                        # Only add to conversation if not empty
-                        if text.strip():
-                            # Add user message to conversation history
-                            conversation_history.append({"role": "user", "content": text})
-                except sr.UnknownValueError:
-                    # No speech detected, ignore
-                    pass
-                except sr.RequestError as e:
-                    print(f"Speech recognition error: {e}")
-                # Clean up temporary file
-                os.unlink(temp_filename)
             # Mark this task as done
             audio_queue.task_done()
@@ -221,23 +300,26 @@ def process_audio_queue():
             time.sleep(0.1)
 # Create Gradio interface
-with gr.Blocks(title="Real-Time Voice Assistant") as demo:
-    gr.Markdown("# Real-Time Voice Assistant")
-    gr.Markdown("Speak into your microphone and get AI responses in real-time.")
     with gr.Row():
-        start_button = gr.Button("Start Conversation", variant="primary")
-        stop_button = gr.Button("End Conversation", variant="stop")
     # Real-time microphone input
-    audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Your Voice")
     with gr.Row():
         # Audio output for playback
-        audio_output = gr.Audio(label="Audio", autoplay=True)
         # Conversation output
-        conversation_display = gr.Textbox(label="Conversation", lines=15)
     # Connect the components
     start_button.click(start_real_time_processing, outputs=conversation_display)
@@ -253,30 +335,29 @@ with gr.Blocks(title="Real-Time Voice Assistant") as demo:
     gr.Markdown("""
     ## How to use
-    1. **Important**: Add your Hugging Face API token as an environment variable `HF_API_TOKEN`
-    2. Click the "Start Conversation" button
-    3. Speak into your microphone
-    4. Listen to the AI's voice responses
-    5. Continue the conversation naturally
     6. Click "End Conversation" when done
     """)
-    with gr.Accordion("Setup Instructions", open=False):
         gr.Markdown("""
-        ### Setting up your API Token
         1. Create an account on [Hugging Face](https://huggingface.co/)
         2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
-        3. Add it as an environment variable when launching this app:
-        ```bash
-        export HF_API_TOKEN="your-token-here"
-        python app.py
-        ```
-        Or when deploying to Hugging Face Spaces, add it in the repository settings.
         """)
-# Launch the app
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch()

 conversation_history = []
 # LLM response queue
 response_queue = queue.Queue()
+# For tracking if speech is active
+speech_active = False
+# For tracking silence periods
+last_speech_time = time.time()
+# Silence threshold in seconds before processing
+SILENCE_THRESHOLD = 1.0
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
+# Get API token from environment
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 headers = {
 def start_real_time_processing():
     """Start real-time audio processing"""
+    global is_running, speech_active, last_speech_time
     is_running = True
+    speech_active = False
+    last_speech_time = time.time()
     # Clear previous history
     conversation_history.clear()
     # Add system message
     conversation_history.append({
         "role": "system",
+        "content": "You are a helpful, friendly AI assistant engaged in a natural voice conversation. Keep responses brief, conversational, and engaging. Ask follow-up questions when appropriate to maintain the dialogue flow."
     })
+    # Add initial greeting to conversation history
+    greeting = "Hello! I'm your voice assistant. How can I help you today?"
+    conversation_history.append({"role": "assistant", "content": greeting})
+    # Convert greeting to speech and add to response queue
+    greeting_audio = text_to_speech(greeting)
+    if greeting_audio:
+        response_queue.put(greeting_audio)
     # Start the processing thread
     processing_thread = threading.Thread(target=process_audio_queue)
     processing_thread.daemon = True
     response_thread.daemon = True
     response_thread.start()
+    # Start the speech activity monitor thread
+    activity_thread = threading.Thread(target=monitor_speech_activity)
+    activity_thread.daemon = True
+    activity_thread.start()
+    return "Starting conversation... Please speak when ready."
 def stop_real_time_processing():
     """Stop real-time audio processing"""
     global is_running
     is_running = False
+    return "Conversation ended."
 def process_audio_chunk(audio_chunk, sample_rate):
     """Process incoming audio chunk and add to queue"""
+    global speech_active, last_speech_time
     if is_running and audio_chunk is not None and len(audio_chunk) > 0:
+        # Check if there's actual speech (not just silence)
+        rms = np.sqrt(np.mean(audio_chunk**2))
+        if rms > 0.01:  # Simple threshold for detecting speech
+            speech_active = True
+            last_speech_time = time.time()
         # Add to queue for processing
         audio_queue.put((audio_chunk, sample_rate))
         else:
             playback_audio = response_audio
+        # Also return speech activity status
+        status = "Listening..." if speech_active else "Ready for your input..."
+        if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
+            status = "Processing your response..."
+        return playback_audio, conversation_text + "\n" + status
+    return None, "Click 'Start Conversation' to begin"
+def monitor_speech_activity():
+    """Monitor speech activity and trigger processing when speech stops"""
+    global speech_active, last_speech_time
+    while is_running:
+        # If speech was active but has been silent for a while
+        if speech_active and (time.time() - last_speech_time) > SILENCE_THRESHOLD:
+            speech_active = False
+            # Signal to process the accumulated speech
+            process_accumulated_speech()
+        time.sleep(0.1)
+def process_accumulated_speech():
+    """Process all accumulated speech when a silence is detected"""
+    recognizer = sr.Recognizer()
+    # Create a temporary WAV file for all accumulated audio chunks
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
+        temp_filename = temp_file.name
+    # Check if we have enough accumulated audio
+    if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
+        process_accumulated_speech.accumulated_chunks = []
+    # If we have accumulated audio chunks
+    if process_accumulated_speech.accumulated_chunks:
+        # Get the sample rate from the first chunk
+        sample_rate = process_accumulated_speech.accumulated_chunks[0][1]
+        # Concatenate all audio chunks
+        all_audio = np.concatenate([chunk[0] for chunk in process_accumulated_speech.accumulated_chunks])
+        # Save to WAV file
+        with wave.open(temp_filename, 'wb') as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(sample_rate)
+            wf.writeframes((all_audio * 32767).astype(np.int16).tobytes())
+        # Perform speech recognition
+        try:
+            with sr.AudioFile(temp_filename) as source:
+                audio = recognizer.record(source)
+                text = recognizer.recognize_google(audio)
+                if text.strip():
+                    # Add user message to conversation history
+                    conversation_history.append({"role": "user", "content": text})
+        except sr.UnknownValueError:
+            # No speech detected
+            pass
+        except sr.RequestError as e:
+            print(f"Speech recognition error: {e}")
+        # Clean up
+        os.unlink(temp_filename)
+        process_accumulated_speech.accumulated_chunks = []
 def text_to_speech(text):
     """Convert text to speech using gTTS"""
     tts.write_to_fp(fp)
     fp.seek(0)
+    # Convert to audio array
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
+        temp_filename = temp_file.name
+    # Save the gTTS output to the temp file
+    with open(temp_filename, 'wb') as f:
+        f.write(fp.read())
     # Read WAV file
+    with wave.open(temp_filename, 'rb') as wf:
         sample_rate = wf.getframerate()
         frames = wf.readframes(wf.getnframes())
         audio_array = np.frombuffer(frames, dtype=np.int16)
         audio_array = audio_array.astype(np.float32) / 32767.0
+    # Clean up temp file
+    os.unlink(temp_filename)
     return (sample_rate, audio_array)
+def get_llm_response():
     """Get response from LLM API"""
     # Build conversation for the LLM
     messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
                 response_json = response.json()
                 return response_json[0]["generated_text"]
             else:
+                return f"I'm having trouble connecting to my language model. Error: {response.status_code}"
         else:
+            # No API token available
+            return "To enable AI responses, please add a Hugging Face API token in the Space settings. For now, I can hear you but can't generate intelligent responses."
     except Exception as e:
+        return f"I encountered an error: {str(e)}. Please try again in a moment."
 def process_response_queue():
     """Process responses and convert to audio"""
     while is_running:
         try:
+            # Check if new user message and no pending assistant response
             if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
                 # Get LLM response
+                response_text = get_llm_response()
                 # Add to conversation history
                 conversation_history.append({"role": "assistant", "content": response_text})
                 if audio is not None:
                     response_queue.put(audio)
+            time.sleep(0.2)
         except Exception as e:
             print(f"Error in response thread: {e}")
             time.sleep(0.5)
 def process_audio_queue():
     """Process audio chunks from the queue"""
+    # Initialize accumulated chunks
+    if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
+        process_accumulated_speech.accumulated_chunks = []
     while is_running:
         try:
             audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
             if audio_chunk is not None and len(audio_chunk) > 0:
+                # Store in accumulated chunks for later processing
+                process_accumulated_speech.accumulated_chunks.append((audio_chunk, sample_rate))
             # Mark this task as done
             audio_queue.task_done()
             time.sleep(0.1)
 # Create Gradio interface
+with gr.Blocks(title="Real-Time Voice Conversation Assistant") as demo:
+    gr.Markdown("# Real-Time Voice Conversation Assistant")
+    gr.Markdown("Speak naturally and have an interactive conversation with the AI assistant.")
     with gr.Row():
+        start_button = gr.Button("Start Conversation", variant="primary", scale=2)
+        stop_button = gr.Button("End Conversation", variant="stop", scale=1)
     # Real-time microphone input
+    audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy",
+                         label="Your Voice", elem_id="mic-input")
     with gr.Row():
         # Audio output for playback
+        audio_output = gr.Audio(label="Audio", autoplay=True, elem_id="audio-output")
         # Conversation output
+        conversation_display = gr.Textbox(label="Conversation",
+                                        lines=15,
+                                        elem_id="conversation-display")
     # Connect the components
     start_button.click(start_real_time_processing, outputs=conversation_display)
     gr.Markdown("""
     ## How to use
+    1. Click the "Start Conversation" button
+    2. Speak naturally into your microphone
+    3. Pause briefly when you finish speaking to let the AI respond
+    4. The AI will respond audibly - just like a natural conversation!
+    5. Continue the conversation as long as you like
     6. Click "End Conversation" when done
     """)
+    with gr.Accordion("Setup Instructions", open=True):
         gr.Markdown("""
+        ### Important: Setting up your API Token
+        This app requires a Hugging Face API token to enable AI responses:
         1. Create an account on [Hugging Face](https://huggingface.co/)
         2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
+        3. Add the token in your Space settings:
+           - Go to Settings > Repository Secrets
+           - Add a secret with the key `HF_API_TOKEN` and your token as the value
+        Without a token, the app will still transcribe your speech but won't generate AI responses.
         """)
+# Launch the app with higher queue concurrency
 if __name__ == "__main__":
+    demo.queue(concurrency_count=3, max_size=20).launch()