Upload 4 files
Browse files
app.py
CHANGED
|
@@ -4,11 +4,12 @@ import tempfile
|
|
| 4 |
import os
|
| 5 |
import wave
|
| 6 |
import requests
|
| 7 |
-
import
|
| 8 |
from gtts import gTTS
|
| 9 |
|
| 10 |
# Conversation state
|
| 11 |
conversation = []
|
|
|
|
| 12 |
|
| 13 |
# Hugging Face API configuration
|
| 14 |
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
|
|
@@ -20,20 +21,13 @@ headers = {
|
|
| 20 |
}
|
| 21 |
|
| 22 |
def transcribe_audio(audio):
|
| 23 |
-
"""Transcribe audio to text using
|
| 24 |
if audio is None:
|
| 25 |
return None
|
| 26 |
|
| 27 |
# Gradio 3.50.0 passes (sample_rate, audio_data)
|
| 28 |
sample_rate, audio_data = audio
|
| 29 |
|
| 30 |
-
if len(audio_data) == 0:
|
| 31 |
-
return None
|
| 32 |
-
|
| 33 |
-
# Simple energy check to see if there's actually speech
|
| 34 |
-
if np.max(np.abs(audio_data)) < 0.05:
|
| 35 |
-
return None
|
| 36 |
-
|
| 37 |
# Create a temporary WAV file
|
| 38 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
| 39 |
temp_filename = temp_file.name
|
|
@@ -45,8 +39,7 @@ def transcribe_audio(audio):
|
|
| 45 |
wf.setframerate(sample_rate)
|
| 46 |
wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
|
| 47 |
|
| 48 |
-
#
|
| 49 |
-
import speech_recognition as sr
|
| 50 |
recognizer = sr.Recognizer()
|
| 51 |
with sr.AudioFile(temp_filename) as source:
|
| 52 |
audio_data = recognizer.record(source)
|
|
@@ -56,14 +49,14 @@ def transcribe_audio(audio):
|
|
| 56 |
print(f"Error in transcription: {e}")
|
| 57 |
return None
|
| 58 |
finally:
|
| 59 |
-
# Clean up
|
| 60 |
if os.path.exists(temp_filename):
|
| 61 |
os.unlink(temp_filename)
|
| 62 |
|
| 63 |
def get_ai_response(user_text):
|
| 64 |
"""Get AI response from LLM API"""
|
| 65 |
if not user_text:
|
| 66 |
-
return "I
|
| 67 |
|
| 68 |
# Add user message to conversation
|
| 69 |
conversation.append({"role": "user", "content": user_text})
|
|
@@ -123,16 +116,29 @@ def start_conversation():
|
|
| 123 |
conversation = []
|
| 124 |
|
| 125 |
# Add welcome message
|
| 126 |
-
welcome = "Hello! I'm your AI assistant.
|
| 127 |
conversation.append({"role": "assistant", "content": welcome})
|
| 128 |
|
| 129 |
# Generate speech
|
| 130 |
welcome_audio = text_to_speech(welcome)
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
-
def
|
| 135 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
if audio is None:
|
| 137 |
return None, get_conversation_text()
|
| 138 |
|
|
@@ -140,18 +146,18 @@ def process_interaction(audio):
|
|
| 140 |
user_text = transcribe_audio(audio)
|
| 141 |
|
| 142 |
if not user_text:
|
| 143 |
-
return None,
|
| 144 |
|
| 145 |
# Get AI response
|
| 146 |
ai_response = get_ai_response(user_text)
|
| 147 |
|
| 148 |
-
#
|
| 149 |
speech_file = text_to_speech(ai_response)
|
| 150 |
|
| 151 |
# Update conversation display
|
| 152 |
-
|
| 153 |
|
| 154 |
-
return speech_file,
|
| 155 |
|
| 156 |
def get_conversation_text():
|
| 157 |
"""Format conversation history for display"""
|
|
@@ -163,64 +169,70 @@ def get_conversation_text():
|
|
| 163 |
return result
|
| 164 |
|
| 165 |
# Create Gradio interface
|
| 166 |
-
with gr.Blocks(title="
|
| 167 |
with gr.Column():
|
| 168 |
-
gr.Markdown("#
|
| 169 |
-
gr.Markdown(""
|
| 170 |
-
Just click "Start" and begin speaking with the assistant.
|
| 171 |
-
The interaction is simple: speak, get a response, speak again.
|
| 172 |
-
""")
|
| 173 |
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
audio_input = gr.Audio(
|
| 181 |
-
label="Speak Here",
|
| 182 |
-
type="numpy",
|
| 183 |
-
sources=None,
|
| 184 |
-
interactive=True
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
with gr.Column(scale=2):
|
| 188 |
-
# Display conversation
|
| 189 |
-
conversation_display = gr.Textbox(
|
| 190 |
-
label="Conversation History",
|
| 191 |
-
lines=15,
|
| 192 |
-
value=""
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
-
# Audio output for assistant responses
|
| 196 |
-
audio_output = gr.Audio(
|
| 197 |
-
label="Assistant's Voice",
|
| 198 |
-
type="filepath",
|
| 199 |
-
autoplay=True
|
| 200 |
-
)
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
5. Continue the conversation by speaking again
|
| 209 |
|
| 210 |
-
|
| 211 |
-
""")
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
start_button.click(
|
| 215 |
start_conversation,
|
| 216 |
outputs=[audio_output, conversation_display]
|
| 217 |
)
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
audio_input.change(
|
| 220 |
-
|
| 221 |
inputs=[audio_input],
|
| 222 |
outputs=[audio_output, conversation_display]
|
| 223 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
# Launch the app
|
| 226 |
if __name__ == "__main__":
|
|
|
|
| 4 |
import os
|
| 5 |
import wave
|
| 6 |
import requests
|
| 7 |
+
import speech_recognition as sr
|
| 8 |
from gtts import gTTS
|
| 9 |
|
| 10 |
# Conversation state
|
| 11 |
conversation = []
|
| 12 |
+
is_listening = False
|
| 13 |
|
| 14 |
# Hugging Face API configuration
|
| 15 |
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
|
|
|
|
| 21 |
}
|
| 22 |
|
| 23 |
def transcribe_audio(audio):
|
| 24 |
+
"""Transcribe audio to text using Google Speech Recognition"""
|
| 25 |
if audio is None:
|
| 26 |
return None
|
| 27 |
|
| 28 |
# Gradio 3.50.0 passes (sample_rate, audio_data)
|
| 29 |
sample_rate, audio_data = audio
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Create a temporary WAV file
|
| 32 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
| 33 |
temp_filename = temp_file.name
|
|
|
|
| 39 |
wf.setframerate(sample_rate)
|
| 40 |
wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
|
| 41 |
|
| 42 |
+
# Perform speech recognition
|
|
|
|
| 43 |
recognizer = sr.Recognizer()
|
| 44 |
with sr.AudioFile(temp_filename) as source:
|
| 45 |
audio_data = recognizer.record(source)
|
|
|
|
| 49 |
print(f"Error in transcription: {e}")
|
| 50 |
return None
|
| 51 |
finally:
|
| 52 |
+
# Clean up temp file
|
| 53 |
if os.path.exists(temp_filename):
|
| 54 |
os.unlink(temp_filename)
|
| 55 |
|
| 56 |
def get_ai_response(user_text):
|
| 57 |
"""Get AI response from LLM API"""
|
| 58 |
if not user_text:
|
| 59 |
+
return "I couldn't hear what you said. Please try speaking again."
|
| 60 |
|
| 61 |
# Add user message to conversation
|
| 62 |
conversation.append({"role": "user", "content": user_text})
|
|
|
|
| 116 |
conversation = []
|
| 117 |
|
| 118 |
# Add welcome message
|
| 119 |
+
welcome = "Hello! I'm your AI assistant. Press the SPEAK button and start talking to me."
|
| 120 |
conversation.append({"role": "assistant", "content": welcome})
|
| 121 |
|
| 122 |
# Generate speech
|
| 123 |
welcome_audio = text_to_speech(welcome)
|
| 124 |
|
| 125 |
+
# Format for display
|
| 126 |
+
display_text = "Assistant: " + welcome + "\n\n"
|
| 127 |
+
|
| 128 |
+
return welcome_audio, display_text
|
| 129 |
|
| 130 |
+
def toggle_recording(state):
    """Flip the recording flag and report the matching button caption.

    Args:
        state: The current recording state held in the Gradio ``gr.State``
            component (bool).

    Returns:
        tuple: ``(new_state, button_label)`` — the inverted state and the
        caption the record button should display for that state.
    """
    global is_listening
    # Mirror the UI state into the module-level flag.
    is_listening = not state

    label = "RECORDING... CLICK TO STOP" if is_listening else "CLICK TO SPEAK"
    return is_listening, label
|
| 139 |
+
|
| 140 |
+
def process_voice(audio):
    """Run one voice interaction: transcribe, get a reply, synthesize speech.

    Args:
        audio: ``(sample_rate, audio_data)`` tuple from the Gradio Audio
            component, or ``None`` when nothing was recorded.

    Returns:
        tuple: ``(speech_file, display_text)`` — path to the synthesized
        reply audio (or ``None`` on failure) and the conversation transcript
        to show in the UI.
    """
    if audio is None:
        return None, get_conversation_text()

    # Transcribe the user's speech to text
    user_text = transcribe_audio(audio)

    if not user_text:
        # Fix: previously this returned the bare error string as the whole
        # display, wiping the visible conversation history. Keep the
        # transcript and append the notice instead, matching the
        # audio-is-None branch above.
        return None, get_conversation_text() + "System: I couldn't hear what you said. Please try speaking again.\n\n"

    # Get AI response
    ai_response = get_ai_response(user_text)

    # Generate speech for the response
    speech_file = text_to_speech(ai_response)

    # Update conversation display
    display_text = get_conversation_text()

    return speech_file, display_text
|
| 161 |
|
| 162 |
def get_conversation_text():
|
| 163 |
"""Format conversation history for display"""
|
|
|
|
| 169 |
return result
|
| 170 |
|
| 171 |
# Create Gradio interface
|
| 172 |
+
# Build the Gradio UI: a single column with the transcript, the assistant's
# audio reply, a start button, a toggle "speak" button, and a hidden
# microphone input wired to process_voice.
with gr.Blocks(title="One-Click Voice Assistant") as demo:
    with gr.Column():
        gr.Markdown("# One-Click Voice Assistant")
        gr.Markdown("Just one button to talk with the AI assistant!")

        # Conversation history display
        conversation_display = gr.Textbox(
            label="Conversation",
            lines=10,
            value="Click 'Start Conversation' below to begin"
        )

        # Audio output for responses (autoplay so the reply is spoken
        # without an extra click)
        audio_output = gr.Audio(
            label="AI Voice Response",
            type="filepath",
            autoplay=True
        )

        # Start conversation button
        start_button = gr.Button("START CONVERSATION", variant="primary", size="lg")

        # Single recording button that toggles state
        with gr.Row():
            # Boolean recording state shared between clicks
            recording_state = gr.State(False)
            recording_button = gr.Button("CLICK TO SPEAK", variant="secondary", size="lg")

        # Audio input (hidden)
        # NOTE(review): this widget is visible=False, and toggle_recording
        # only flips a flag and relabels the button — nothing visible here
        # starts or stops the microphone, so audio_input.change may never
        # fire from user interaction. Confirm the intended wiring.
        # NOTE(review): `source="microphone"` is the Gradio 3.x parameter;
        # elsewhere the file uses `sources=` (4.x style) — verify against
        # the installed Gradio version.
        audio_input = gr.Audio(
            label="Voice Input",
            type="numpy",
            visible=False,
            source="microphone",
            streaming=False
        )

        # Connect the components
        # Start: play the welcome audio and seed the transcript
        start_button.click(
            start_conversation,
            outputs=[audio_output, conversation_display]
        )

        # Toggle: flip recording_state and relabel the button
        recording_button.click(
            toggle_recording,
            inputs=[recording_state],
            outputs=[recording_state, recording_button]
        )

        # New recording -> transcribe, answer, speak, refresh transcript
        audio_input.change(
            process_voice,
            inputs=[audio_input],
            outputs=[audio_output, conversation_display]
        )

        gr.Markdown("""
        ## How to use:
        
        1. Click "START CONVERSATION" to begin
        2. Click "CLICK TO SPEAK" and speak to the assistant
        3. Click again to stop recording and get a response
        4. Continue the conversation - just click the button again to speak
        
        This assistant is designed to be as simple as possible - just one button to talk!
        """)
|
| 236 |
|
| 237 |
# Launch the app
|
| 238 |
if __name__ == "__main__":
|