Upload 5 files
Browse files
- app.py +149 -128
- recording.py +176 -0

app.py
CHANGED
@@ -1,14 +1,17 @@
 import gradio as gr
-import numpy as np
 import tempfile
+import numpy as np
 import os
+import time
 import wave
 import requests
-import speech_recognition as sr
+import json
 from gtts import gTTS
+import speech_recognition as sr
 
 # Conversation state
 conversation = []
+recording_status = False
 
 # Hugging Face API configuration
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
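The lines between this hunk and the next, where `headers` and `HF_API_TOKEN` are defined (old lines 15-18), are collapsed as unchanged context. A minimal sketch of what that configuration presumably looks like, assuming the token is read from a Space secret/environment variable; the file's actual handling may differ:

```python
import os

# Assumption: the token comes from an HF_API_TOKEN secret or env var.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
headers = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json",  # shown as context in the next hunk
}
```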
@@ -19,54 +22,21 @@ headers = {
     "Content-Type": "application/json"
 }
 
-def transcribe_audio(audio):
-    """Transcribe audio to text using Google Speech Recognition"""
-    if audio is None:
-        return None
-
-    # Gradio 3.50.0 passes (sample_rate, audio_data)
-    sample_rate, audio_data = audio
-
-    # Create a temporary WAV file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-        temp_filename = temp_file.name
-
-    try:
-        with wave.open(temp_filename, 'wb') as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)  # 16-bit audio
-            wf.setframerate(sample_rate)
-            wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
-
-        # Perform speech recognition
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(temp_filename) as source:
-            audio_data = recognizer.record(source)
-        text = recognizer.recognize_google(audio_data)
-        return text.strip()
-    except Exception as e:
-        print(f"Error in transcription: {e}")
-        return None
-    finally:
-        # Clean up temp file
-        if os.path.exists(temp_filename):
-            os.unlink(temp_filename)
-
 def get_ai_response(user_text):
-    """Get AI response from …
+    """Get AI response from Hugging Face API"""
     if not user_text:
-        return "I couldn't …
+        return "I couldn't understand what you said. Could you try again?"
 
-    # Add user …
+    # Add user input to conversation history
     conversation.append({"role": "user", "content": user_text})
 
-    # Prepare …
+    # Prepare for API call
     messages = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
     messages.extend(conversation)
 
     try:
         if not HF_API_TOKEN:
-            …
+            response_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
         else:
             # Make API call
             payload = {
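The body of `payload` (new lines 43-50) is collapsed as unchanged context between hunks. Based on how the response is parsed below (`response.json()[0]["generated_text"]`), it is presumably a standard text-generation request; a hedged sketch, with keys and values that are illustrative rather than taken from the file:

```python
# Sketch only: the actual payload in app.py is not shown in this diff.
payload = {
    "inputs": "\n".join(f"{m['role']}: {m['content']}" for m in messages),
    "parameters": {
        "max_new_tokens": 128,
        "temperature": 0.7,
        "return_full_text": False,  # ask the API for only the completion
    },
}
```

Without `return_full_text: False` (or equivalent stripping), `generated_text` from this endpoint typically echoes the prompt as well as the completion.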
@@ -81,64 +51,73 @@ def get_ai_response(user_text):
             response = requests.post(HF_API_URL, headers=headers, json=payload)
 
             if response.status_code == 200:
-                …
+                response_text = response.json()[0]["generated_text"]
             else:
-                …
+                response_text = f"I'm having trouble connecting to my language model. Error: {response.status_code}"
     except Exception as e:
-        …
+        response_text = f"An error occurred: {str(e)}"
 
-    # Add assistant response to conversation
-    conversation.append({"role": "assistant", "content": …
+    # Add assistant response to conversation history
+    conversation.append({"role": "assistant", "content": response_text})
 
-    return …
+    return response_text
 
 def text_to_speech(text):
-    """Convert text to speech"""
+    """Convert text to speech using gTTS"""
     try:
-        # Create …
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as …
-            …
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp:
+            filename = temp.name
 
         # Generate speech
-        tts = gTTS(text=text, lang=…
-        tts.save(…
+        tts = gTTS(text=text, lang="en", slow=False)
+        tts.save(filename)
 
-        return mp3_filename
+        return filename
     except Exception as e:
-        print(f"TTS …
+        print(f"TTS Error: {e}")
         return None
 
-def …
-    """…
+def speech_to_text(audio):
+    """Convert speech to text using SpeechRecognition"""
     if audio is None:
-        …
-
-    if not conversation:
-        welcome = "Hello! I'm your AI assistant. Click the Talk button below and speak to me."
-        conversation.append({"role": "assistant", "content": welcome})
-        welcome_audio = text_to_speech(welcome)
-        return welcome_audio, "Assistant: " + welcome + "\n\n"
-    return None, get_conversation_text()
-
-    # Transcribe audio to text
-    user_text = transcribe_audio(audio)
-
-    if not user_text:
-        return None, get_conversation_text() + "\n\nI couldn't hear you clearly. Please try again."
-
-    # Get AI response
-    ai_response = get_ai_response(user_text)
-
-    # …
-    …
-
-    # …
-    …
+        return None
+
+    # Extract audio data
+    sample_rate, audio_data = audio
+
+    # Create a temporary WAV file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        temp_path = temp_file.name
+
+    try:
+        # Save audio to file
+        with wave.open(temp_path, 'wb') as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)  # 16-bit audio
+            wf.setframerate(sample_rate)
+            wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
+
+        # Use SpeechRecognition to transcribe
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(temp_path) as source:
+            audio_data = recognizer.record(source)
+        text = recognizer.recognize_google(audio_data)
+        return text
+    except sr.UnknownValueError:
+        return None
+    except sr.RequestError:
+        return "Sorry, I couldn't access the speech recognition service."
+    except Exception as e:
+        print(f"STT Error: {e}")
+        return None
+    finally:
+        # Clean up
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
 
-def get_conversation_text():
-    """Format conversation history for display"""
+def format_conversation():
+    """Format the conversation history for display"""
     result = ""
     for msg in conversation:
         if msg["role"] != "system":  # Skip system messages
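One thing to watch in `speech_to_text` (and in the removed `transcribe_audio`): `(audio_data * 32767).astype(np.int16)` assumes the microphone samples arrive as floats in [-1, 1], but Gradio's `type="numpy"` microphone audio is often already `int16`, in which case the multiply overflows and the saved WAV becomes noise. A defensive conversion sketch (a hypothetical helper, not part of this commit):

```python
import numpy as np

def to_int16(audio_data: np.ndarray) -> np.ndarray:
    """Normalize mic samples to int16 regardless of the incoming dtype."""
    if audio_data.dtype == np.int16:
        return audio_data                      # already 16-bit PCM
    if audio_data.dtype == np.int32:
        return (audio_data >> 16).astype(np.int16)
    # float32/float64 assumed to lie in [-1, 1]
    return (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
```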
@@ -146,63 +125,105 @@ def get_conversation_text():
             result += f"{prefix}{msg['content']}\n\n"
     return result
 
-…
-…
-…
-
-# …
-…
-
-…
-…
-    gr.Markdown("""
-    <div style="text-align: center; margin: 10px 0; padding: 10px; background-color: #f0f0f0; border-radius: 5px;">
-        <p style="font-size: 20px; font-weight: bold;">👆 CLICK THE MICROPHONE ABOVE TO SPEAK 👆</p>
-    </div>
+def process_audio(audio):
+    """Process recorded audio and generate response"""
+    if audio is None:
+        return None, "No audio detected. Please try again."
+
+    # Convert speech to text
+    transcript = speech_to_text(audio)
+
+    if not transcript:
+        return None, format_conversation() + "\nI couldn't understand your speech. Please try again."
+
+    # Get AI response
+    response = get_ai_response(transcript)
+
+    # Convert response to speech
+    audio_file = text_to_speech(response)
+
+    # Return response
+    return audio_file, format_conversation()
+
+def initialize_conversation():
+    """Initialize the conversation with a welcome message"""
+    global conversation
+    conversation = []
+
+    # Add welcome message
+    welcome = "Hello! I'm your voice assistant. Click the Record button below, speak to me, and I'll respond."
+    conversation.append({"role": "assistant", "content": welcome})
+
+    # Generate speech
+    welcome_audio = text_to_speech(welcome)
+
+    return welcome_audio, format_conversation()
+
+# Create Gradio interface with simplified layout
+with gr.Blocks(title="Interactive Voice Assistant") as demo:
+    gr.Markdown("# Interactive Voice Assistant")
+    gr.Markdown("Speak to the AI and get voice responses in real-time")
+
+    with gr.Row():
+        # Left panel - Controls
+        with gr.Column(scale=1):
+            # Start button
+            start_button = gr.Button("Start Conversation", variant="primary")
+
+            # Microphone input
+            audio_input = gr.Audio(
+                label="🎤 SPEAK HERE",
+                type="numpy",
+                sources=["microphone"],
+                streaming=False
+            )
+
+            # Status display
+            status_display = gr.Markdown("Click 'Start Conversation' to begin")
+
+        # Right panel - Conversation
+        with gr.Column(scale=2):
+            # Conversation display
+            conversation_display = gr.Textbox(
+                label="Conversation History",
+                lines=12,
+                value=""
+            )
+
+            # Audio playback
+            audio_output = gr.Audio(
+                label="AI Response",
+                type="filepath",
+                autoplay=True
+            )
+
+    # Instructions
+    with gr.Accordion("How to use", open=True):
+        gr.Markdown("""
+        ## Simple Instructions:
+
+        1. Click **Start Conversation** to begin
+        2. Click the microphone button to record your voice
+        3. Speak your question or request
+        4. Click the stop button when done speaking
+        5. The AI will respond with voice and text
+        6. Continue the conversation by recording more messages
+
+        The assistant maintains context throughout your conversation, so you can refer back to previous exchanges.
         """)
 
-    # Connect …
-    …
-        fn=…
-        inputs=[audio_recorder],
+    # Connect components
+    start_button.click(
+        fn=initialize_conversation,
         outputs=[audio_output, conversation_display]
     )
 
-    …
-        inputs=None,
+    audio_input.change(
+        fn=process_audio,
+        inputs=[audio_input],
         outputs=[audio_output, conversation_display]
     )
-
-    gr.Markdown("""
-    ## How to use - JUST ONE BUTTON!
-
-    1. Click the microphone button and start speaking
-    2. Click Stop when you're done speaking
-    3. The AI will respond with voice
-    4. Click the microphone button again to continue the conversation
-    """)
 
 # Launch the app
 if __name__ == "__main__":
-    demo.…
+    demo.launch()
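`audio_input.change` fires on every value change, including when the component is cleared, which is why `process_audio` guards against `audio is None`. On Gradio 4.x (implied by the `sources=["microphone"]` signature used here), `stop_recording` is the more targeted trigger for "user finished speaking"; a hedged alternative wiring, not what the commit itself uses:

```python
# Sketch: react only when the user stops recording, instead of on every
# value change (assumes Gradio 4.x, where gr.Audio has stop_recording).
audio_input.stop_recording(
    fn=process_audio,
    inputs=[audio_input],
    outputs=[audio_output, conversation_display],
)
```

Either way, returning a fresh file path to `audio_output` with `autoplay=True` is what makes the reply play without a click.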
recording.py
ADDED
@@ -0,0 +1,176 @@
+import sounddevice as sd
+import numpy as np
+import torch
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+import librosa
+import scipy.io.wavfile as wavf
+import threading
+import queue
+import time
+from datasets import load_dataset
+import io
+import tempfile
+import soundfile as sf
+from scipy.io import wavfile
+import os
+
+class VoiceAssistant:
+    def __init__(self):
+        print("Initializing Voice Assistant...")
+
+        # Initialize speech recognition model
+        print("Loading speech recognition model...")
+        self.asr_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-small",
+            device=0 if torch.cuda.is_available() else -1
+        )
+
+        # Initialize text generation model
+        print("Loading language model...")
+        self.model_name = "HuggingFaceH4/zephyr-7b-beta"
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True,
+            device_map="auto"
+        )
+
+        # Initialize text-to-speech model
+        print("Loading text-to-speech model...")
+        self.tts_pipeline = pipeline(
+            "text-to-speech",
+            model="microsoft/speecht5_tts",
+            device=0 if torch.cuda.is_available() else -1
+        )
+
+        # Load speaker embedding for TTS
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+        # Audio parameters
+        self.sample_rate = 16000
+        self.duration = 5  # Record 5 seconds at a time
+        self.is_listening = False
+        self.audio_queue = queue.Queue()
+        self.conversation_history = []
+
+        print("Voice Assistant initialized and ready!")
+
+    def record_audio(self):
+        """Record audio from microphone and put in queue"""
+        def callback(indata, frames, time, status):
+            if status:
+                print(f"Error in audio callback: {status}")
+            self.audio_queue.put(indata.copy())
+
+        print("Listening... (Press Ctrl+C to stop)")
+        self.is_listening = True
+
+        try:
+            with sd.InputStream(samplerate=self.sample_rate, channels=1, callback=callback):
+                while self.is_listening:
+                    time.sleep(0.1)
+        except KeyboardInterrupt:
+            print("\nStopping...")
+            self.is_listening = False
+        except Exception as e:
+            print(f"Error recording audio: {e}")
+            self.is_listening = False
+
+    def process_audio(self):
+        """Process audio from queue and respond"""
+        while self.is_listening:
+            try:
+                # Wait for audio chunks to accumulate for self.duration seconds
+                chunks = []
+                start_time = time.time()
+
+                while time.time() - start_time < self.duration and self.is_listening:
+                    try:
+                        chunk = self.audio_queue.get(timeout=1)
+                        chunks.append(chunk)
+                    except queue.Empty:
+                        continue
+
+                if not chunks:
+                    continue
+
+                # Combine audio chunks
+                audio = np.concatenate(chunks)
+
+                # Convert to expected format
+                audio_float = audio.flatten().astype(np.float32) / np.iinfo(np.int16).max
+
+                # Save audio to temporary file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+                    temp_filename = temp_audio.name
+                    wavf.write(temp_filename, self.sample_rate, audio)
+
+                # Transcribe audio
+                result = self.asr_pipeline(temp_filename)
+                transcript = result["text"].strip()
+                os.unlink(temp_filename)  # Delete temp file
+
+                if not transcript:
+                    continue
+
+                print(f"\nYou: {transcript}")
+
+                # Process transcription with language model
+                if len(self.conversation_history) == 0:
+                    prompt = f"<|system|>\nYou are a friendly and helpful assistant.\n<|user|>\n{transcript}\n<|assistant|>"
+                else:
+                    prompt = "<|assistant|>".join(self.conversation_history) + f"<|user|>\n{transcript}\n<|assistant|>"
+
+                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=100,
+                    temperature=0.7,
+                    do_sample=True
+                )
+
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+                # Extract the assistant's response
+                if "<|assistant|>" in response:
+                    response = response.split("<|assistant|>")[-1].strip()
+
+                print(f"Assistant: {response}")
+
+                # Update conversation history
+                self.conversation_history.append(f"<|user|>\n{transcript}\n<|assistant|>\n{response}")
+                if len(self.conversation_history) > 5:  # Keep only last 5 exchanges to save memory
+                    self.conversation_history.pop(0)
+
+                # Convert response to speech
+                speech = self.tts_pipeline(
+                    response,
+                    forward_params={"speaker_embeddings": self.speaker_embeddings}
+                )
+
+                # Play audio response
+                sd.play(speech["audio"], speech["sampling_rate"])
+                sd.wait()
+
+            except Exception as e:
+                print(f"Error processing audio: {e}")
+
+    def run(self):
+        """Run the voice assistant"""
+        record_thread = threading.Thread(target=self.record_audio)
+        process_thread = threading.Thread(target=self.process_audio)
+
+        record_thread.start()
+        process_thread.start()
+
+        record_thread.join()
+        process_thread.join()
+
+
+if __name__ == "__main__":
+    assistant = VoiceAssistant()
+    assistant.run()
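The manual `<|user|>`/`<|assistant|>` prompt assembly above is easy to get subtly wrong; note that the history is joined on the literal string `<|assistant|>`. Zephyr's tokenizer ships a chat template, so a less fragile construction would look roughly like this, sketched under the assumption that history is kept as role/content dicts and that `tokenizer`, `model`, and `transcript` are the objects from the class:

```python
# Sketch: build the prompt from structured turns instead of string-joining
# on "<|assistant|>" (assumes transformers >= 4.34).
history = [
    {"role": "system", "content": "You are a friendly and helpful assistant."},
    {"role": "user", "content": transcript},
]
prompt = tokenizer.apply_chat_template(
    history,
    tokenize=False,
    add_generation_prompt=True,  # appends the assistant turn marker
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
```

Separately, `run()` starts `process_audio` before `record_audio` has necessarily set `is_listening = True`, so the processing loop can exit immediately; setting the flag in `run()` before starting either thread avoids that race.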