SreekarB committed on
Commit
b24a089
·
verified ·
1 Parent(s): 83ba3e1

Upload 5 files

Browse files
Files changed (2) hide show
  1. app.py +149 -128
  2. recording.py +176 -0
app.py CHANGED
@@ -1,14 +1,17 @@
1
  import gradio as gr
2
- import numpy as np
3
  import tempfile
 
4
  import os
 
5
  import wave
6
  import requests
7
- import speech_recognition as sr
8
  from gtts import gTTS
 
9
 
10
  # Conversation state
11
  conversation = []
 
12
 
13
  # Hugging Face API configuration
14
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
@@ -19,54 +22,21 @@ headers = {
19
  "Content-Type": "application/json"
20
  }
21
 
22
- def transcribe_audio(audio):
23
- """Transcribe audio to text using Google Speech Recognition"""
24
- if audio is None:
25
- return None
26
-
27
- # Gradio 3.50.0 passes (sample_rate, audio_data)
28
- sample_rate, audio_data = audio
29
-
30
- # Create a temporary WAV file
31
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
32
- temp_filename = temp_file.name
33
-
34
- try:
35
- with wave.open(temp_filename, 'wb') as wf:
36
- wf.setnchannels(1)
37
- wf.setsampwidth(2) # 16-bit audio
38
- wf.setframerate(sample_rate)
39
- wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
40
-
41
- # Perform speech recognition
42
- recognizer = sr.Recognizer()
43
- with sr.AudioFile(temp_filename) as source:
44
- audio_data = recognizer.record(source)
45
- text = recognizer.recognize_google(audio_data)
46
- return text.strip()
47
- except Exception as e:
48
- print(f"Error in transcription: {e}")
49
- return None
50
- finally:
51
- # Clean up temp file
52
- if os.path.exists(temp_filename):
53
- os.unlink(temp_filename)
54
-
55
  def get_ai_response(user_text):
56
- """Get AI response from LLM API"""
57
  if not user_text:
58
- return "I couldn't hear what you said. Please try speaking again."
59
 
60
- # Add user message to conversation
61
  conversation.append({"role": "user", "content": user_text})
62
 
63
- # Prepare messages for API
64
  messages = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
65
  messages.extend(conversation)
66
 
67
  try:
68
  if not HF_API_TOKEN:
69
- ai_response = "Please add a Hugging Face API token to enable AI responses."
70
  else:
71
  # Make API call
72
  payload = {
@@ -81,64 +51,73 @@ def get_ai_response(user_text):
81
  response = requests.post(HF_API_URL, headers=headers, json=payload)
82
 
83
  if response.status_code == 200:
84
- ai_response = response.json()[0]["generated_text"]
85
  else:
86
- ai_response = f"I'm having trouble connecting. Error: {response.status_code}"
87
  except Exception as e:
88
- ai_response = f"Error: {str(e)}"
89
 
90
- # Add assistant response to conversation
91
- conversation.append({"role": "assistant", "content": ai_response})
92
 
93
- return ai_response
94
 
95
  def text_to_speech(text):
96
- """Convert text to speech"""
97
  try:
98
- # Create temp file
99
- with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
100
- mp3_filename = temp_file.name
101
 
102
- # Generate speech with gTTS
103
- tts = gTTS(text=text, lang='en', slow=False)
104
- tts.save(mp3_filename)
105
 
106
- # Return the path to the audio file
107
- return mp3_filename
108
  except Exception as e:
109
- print(f"TTS error: {e}")
110
  return None
111
 
112
- def process_voice(audio):
113
- """Process voice input and generate response"""
114
  if audio is None:
115
- # Just start with a greeting if no audio
116
- if not conversation:
117
- welcome = "Hello! I'm your AI assistant. Click the Talk button below and speak to me."
118
- conversation.append({"role": "assistant", "content": welcome})
119
- welcome_audio = text_to_speech(welcome)
120
- return welcome_audio, "Assistant: " + welcome + "\n\n"
121
- return None, get_conversation_text()
122
-
123
- # Transcribe audio to text
124
- user_text = transcribe_audio(audio)
125
-
126
- if not user_text:
127
- return None, get_conversation_text() + "\n\nI couldn't hear you clearly. Please try again."
128
-
129
- # Get AI response
130
- ai_response = get_ai_response(user_text)
131
 
132
- # Generate speech
133
- speech_file = text_to_speech(ai_response)
134
 
135
- # Update conversation display
136
- display_text = get_conversation_text()
 
137
 
138
- return speech_file, display_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- def get_conversation_text():
141
- """Format conversation history for display"""
142
  result = ""
143
  for msg in conversation:
144
  if msg["role"] != "system": # Skip system messages
@@ -146,63 +125,105 @@ def get_conversation_text():
146
  result += f"{prefix}{msg['content']}\n\n"
147
  return result
148
 
149
- # Create Gradio interface
150
- with gr.Blocks(title="Super Simple Voice Assistant") as demo:
151
- with gr.Column():
152
- gr.Markdown("# Super Simple Voice Assistant")
153
- gr.Markdown("Just one button to talk with the AI!")
154
-
155
- # Conversation history display
156
- conversation_display = gr.Textbox(
157
- label="Conversation",
158
- lines=10,
159
- value=""
160
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Audio output for responses
163
- audio_output = gr.Audio(
164
- label="AI Response",
165
- type="filepath",
166
- autoplay=True
167
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
- # Direct audio recording button
170
- audio_recorder = gr.Audio(
171
- sources=["microphone"],
172
- type="numpy",
173
- label="CLICK & TALK - Then click stop when done speaking"
174
- )
175
 
176
- # Make button more prominent and obvious
177
- gr.Markdown("""
178
- <div style="text-align: center; margin: 10px 0; padding: 10px; background-color: #f0f0f0; border-radius: 5px;">
179
- <p style="font-size: 20px; font-weight: bold;">👆 CLICK THE MICROPHONE ABOVE TO SPEAK 👆</p>
180
- </div>
181
  """)
182
 
183
- # Connect the components
184
- audio_recorder.change(
185
- fn=process_voice,
186
- inputs=[audio_recorder],
187
  outputs=[audio_output, conversation_display]
188
  )
189
 
190
- # Auto-start the greeting
191
- demo.load(
192
- fn=lambda: process_voice(None),
193
- inputs=None,
194
  outputs=[audio_output, conversation_display]
195
  )
196
-
197
- gr.Markdown("""
198
- ## How to use - JUST ONE BUTTON!
199
-
200
- 1. Click the microphone button and start speaking
201
- 2. Click Stop when you're done speaking
202
- 3. The AI will respond with voice
203
- 4. Click the microphone button again to continue the conversation
204
- """)
205
 
206
  # Launch the app
207
  if __name__ == "__main__":
208
- demo.queue().launch()
 
1
  import gradio as gr
 
2
  import tempfile
3
+ import numpy as np
4
  import os
5
+ import time
6
  import wave
7
  import requests
8
+ import json
9
  from gtts import gTTS
10
+ import speech_recognition as sr
11
 
12
  # Conversation state
13
  conversation = []
14
+ recording_status = False
15
 
16
  # Hugging Face API configuration
17
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 
22
  "Content-Type": "application/json"
23
  }
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def get_ai_response(user_text):
26
+ """Get AI response from Hugging Face API"""
27
  if not user_text:
28
+ return "I couldn't understand what you said. Could you try again?"
29
 
30
+ # Add user input to conversation history
31
  conversation.append({"role": "user", "content": user_text})
32
 
33
+ # Prepare for API call
34
  messages = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
35
  messages.extend(conversation)
36
 
37
  try:
38
  if not HF_API_TOKEN:
39
+ response_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
40
  else:
41
  # Make API call
42
  payload = {
 
51
  response = requests.post(HF_API_URL, headers=headers, json=payload)
52
 
53
  if response.status_code == 200:
54
+ response_text = response.json()[0]["generated_text"]
55
  else:
56
+ response_text = f"I'm having trouble connecting to my language model. Error: {response.status_code}"
57
  except Exception as e:
58
+ response_text = f"An error occurred: {str(e)}"
59
 
60
+ # Add assistant response to conversation history
61
+ conversation.append({"role": "assistant", "content": response_text})
62
 
63
+ return response_text
64
 
def text_to_speech(text):
    """Convert text to speech using gTTS.

    Args:
        text: The sentence(s) to synthesise.

    Returns:
        Path of a newly created temporary ``.mp3`` file, or ``None`` when
        *text* is empty/blank or synthesis fails.  The caller owns the file.
    """
    # Nothing to say: skip the temp file and the synthesis call entirely.
    if not text or not str(text).strip():
        return None

    filename = None
    try:
        # Reserve a path only; gTTS writes to it after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp:
            filename = temp.name

        # Generate speech
        tts = gTTS(text=text, lang="en", slow=False)
        tts.save(filename)

        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        # Do not leave an empty/partial temp file behind on failure.
        if filename is not None:
            try:
                os.unlink(filename)
            except OSError:
                pass
        return None
80
 
81
def _to_pcm16_mono(audio_data):
    """Return *audio_data* as a 1-D little-endian int16 array.

    Integer input is normalised by its own dtype range and float input is
    taken to be in [-1.0, 1.0], so samples that are already int16 are not
    multiplied a second time (which would overflow).
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        normalized = samples.astype(np.float32) / float(np.iinfo(samples.dtype).max)
    else:
        normalized = samples.astype(np.float32)

    if normalized.ndim > 1:
        # assumes (frames, channels) layout — TODO confirm against the Gradio version in use
        normalized = normalized.mean(axis=1)

    normalized = np.clip(normalized, -1.0, 1.0)
    return (normalized * 32767).astype(np.dtype("<i2"))


def speech_to_text(audio):
    """Convert speech to text using SpeechRecognition.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by a Gradio
            ``Audio`` component with ``type="numpy"``, or ``None``.

    Returns:
        The recognised text; ``None`` when there is no audio, the speech is
        unintelligible or an unexpected error occurs; or a fixed apology
        string when the recognition service cannot be reached.
    """
    if audio is None:
        return None

    # Extract audio data
    sample_rate, audio_data = audio
    pcm = _to_pcm16_mono(audio_data)
    if pcm.size == 0:
        # An empty recording cannot be transcribed.
        return None

    # Reserve a temporary WAV path; the file is written after the handle closes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # Save audio to file
        with wave.open(temp_path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())

        # Use SpeechRecognition to transcribe
        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_path) as source:
            recorded = recognizer.record(source)
        return recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        return None
    except sr.RequestError:
        # NOTE(review): callers cannot tell this message apart from a real transcript.
        return "Sorry, I couldn't access the speech recognition service."
    except Exception as e:
        print(f"STT Error: {e}")
        return None
    finally:
        # Clean up
        if os.path.exists(temp_path):
            os.unlink(temp_path)
118
 
119
+ def format_conversation():
120
+ """Format the conversation history for display"""
121
  result = ""
122
  for msg in conversation:
123
  if msg["role"] != "system": # Skip system messages
 
125
  result += f"{prefix}{msg['content']}\n\n"
126
  return result
127
 
128
def process_audio(audio):
    """Process recorded audio and generate a spoken reply.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the microphone
            component, or ``None`` (e.g. when the recording was cleared).

    Returns:
        ``(audio_path, display_text)``: the path of the synthesised reply
        (``None`` when there is nothing to play) and the text for the
        conversation textbox.
    """
    if audio is None:
        # Keep the existing history on screen instead of replacing it with
        # the notice, matching the failed-transcription branch below.
        return None, format_conversation() + "\nNo audio detected. Please try again."

    # Convert speech to text
    transcript = speech_to_text(audio)
    if not transcript:
        return None, format_conversation() + "\nI couldn't understand your speech. Please try again."

    # Get AI response (this also records both turns in the history)
    response = get_ai_response(transcript)

    # Convert response to speech
    audio_file = text_to_speech(response)

    return audio_file, format_conversation()
147
+
148
def initialize_conversation():
    """Start a fresh conversation that opens with a spoken greeting.

    Rebinds the module-level ``conversation`` list so it holds only the
    assistant's welcome message.

    Returns:
        ``(welcome_audio, display_text)``: whatever ``text_to_speech``
        returns for the greeting, and the formatted conversation text.
    """
    global conversation

    greeting = "Hello! I'm your voice assistant. Click the Record button below, speak to me, and I'll respond."
    # Discard any earlier history; the greeting becomes the only entry.
    conversation = [{"role": "assistant", "content": greeting}]

    # Synthesise first, then format, in the same order as before.
    return text_to_speech(greeting), format_conversation()
161
+
162
# Gradio UI: recording controls on the left, conversation and playback on the right
with gr.Blocks(title="Interactive Voice Assistant") as demo:
    gr.Markdown("# Interactive Voice Assistant")
    gr.Markdown("Speak to the AI and get voice responses in real-time")

    with gr.Row():
        # Controls column
        with gr.Column(scale=1):
            start_button = gr.Button("Start Conversation", variant="primary")

            # Microphone recorder; handlers receive (sample_rate, samples)
            audio_input = gr.Audio(
                label="🎤 SPEAK HERE",
                type="numpy",
                sources=["microphone"],
                streaming=False,
            )

            status_display = gr.Markdown("Click 'Start Conversation' to begin")

        # Conversation column
        with gr.Column(scale=2):
            conversation_display = gr.Textbox(
                label="Conversation History",
                lines=12,
                value="",
            )

            # Plays each synthesised reply as soon as it arrives
            audio_output = gr.Audio(
                label="AI Response",
                type="filepath",
                autoplay=True,
            )

    # Usage notes
    with gr.Accordion("How to use", open=True):
        gr.Markdown("""
        ## Simple Instructions:

        1. Click **Start Conversation** to begin
        2. Click the microphone button to record your voice
        3. Speak your question or request
        4. Click the stop button when done speaking
        5. The AI will respond with voice and text
        6. Continue the conversation by recording more messages

        The assistant maintains context throughout your conversation, so you can refer back to previous exchanges.
        """)

    # Event wiring: both handlers update the player and the history box
    start_button.click(
        fn=initialize_conversation,
        outputs=[audio_output, conversation_display]
    )

    audio_input.change(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, conversation_display]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
recording.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sounddevice as sd
2
+ import numpy as np
3
+ import torch
4
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
5
+ import librosa
6
+ import scipy.io.wavfile as wavf
7
+ import threading
8
+ import queue
9
+ import time
10
+ from datasets import load_dataset
11
+ import io
12
+ import tempfile
13
+ import soundfile as sf
14
+ from scipy.io import wavfile
15
+ import os
16
+
17
class VoiceAssistant:
    """Microphone-driven voice assistant that runs its models in-process.

    Audio captured with ``sounddevice`` is transcribed by an
    ``openai/whisper-small`` pipeline, answered by the
    ``HuggingFaceH4/zephyr-7b-beta`` causal language model and spoken back
    through a ``microsoft/speecht5_tts`` pipeline.  ``run()`` is the entry
    point: it starts one recording thread and one processing thread and
    blocks until listening stops.
    """

    # Instruction placed at the top of every prompt sent to the language model.
    SYSTEM_PROMPT = "You are a friendly and helpful assistant."
    # Number of past user/assistant exchanges kept and replayed in each prompt.
    MAX_HISTORY = 5

    def __init__(self):
        """Load the ASR, language and TTS models and reset the audio state."""
        print("Initializing Voice Assistant...")
        cuda_available = torch.cuda.is_available()
        pipeline_device = 0 if cuda_available else -1

        # Initialize speech recognition model
        print("Loading speech recognition model...")
        self.asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=pipeline_device,
        )

        # Initialize text generation model
        print("Loading language model...")
        self.model_name = "HuggingFaceH4/zephyr-7b-beta"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if cuda_available else torch.float32,
            low_cpu_mem_usage=True,
            device_map="auto",
        )

        # Initialize text-to-speech model
        print("Loading text-to-speech model...")
        self.tts_pipeline = pipeline(
            "text-to-speech",
            model="microsoft/speecht5_tts",
            device=pipeline_device,
        )

        # Load speaker embedding for TTS
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        # Audio parameters
        self.sample_rate = 16000
        self.duration = 5  # Seconds of audio gathered before each transcription
        self.is_listening = False
        self.audio_queue = queue.Queue()
        # Each entry is one finished exchange: "<|user|>\n...\n<|assistant|>\n..."
        self.conversation_history = []

        print("Voice Assistant initialized and ready!")

    def record_audio(self):
        """Record audio from the microphone into the queue until stopped."""
        self.is_listening = True
        self._record_loop()

    def _record_loop(self):
        """Stream microphone blocks into the queue while ``is_listening`` is set."""

        # The third callback argument is named ``time_info`` so that it does
        # not shadow the ``time`` module; it is passed positionally.
        def callback(indata, frames, time_info, status):
            if status:
                print(f"Error in audio callback: {status}")
            self.audio_queue.put(indata.copy())

        print("Listening... (Press Ctrl+C to stop)")

        try:
            with sd.InputStream(samplerate=self.sample_rate, channels=1, callback=callback):
                while self.is_listening:
                    time.sleep(0.1)
        except KeyboardInterrupt:
            # Only reachable when this runs on the main thread.
            print("\nStopping...")
        except Exception as e:
            print(f"Error recording audio: {e}")
        finally:
            # Whatever ended the stream, tell the processing loop to stop too.
            self.is_listening = False

    def process_audio(self):
        """Transcribe queued audio, generate a reply and speak it, in a loop."""
        while self.is_listening:
            try:
                audio = self._collect_audio()
                if audio is None:
                    continue

                transcript = self._transcribe(audio)
                if not transcript:
                    continue

                print(f"\nYou: {transcript}")

                response = self._generate_response(transcript)
                print(f"Assistant: {response}")

                self._remember(transcript, response)
                self._speak(response)

                # Audio captured while thinking/speaking is stale and may
                # include the assistant's own voice, so drop it.
                self._discard_pending_audio()
            except Exception as e:
                # Keep the loop alive; one bad round must not end the session.
                print(f"Error processing audio: {e}")

    def _collect_audio(self):
        """Gather queued chunks for ``self.duration`` seconds; ``None`` if none arrived."""
        chunks = []
        deadline = time.monotonic() + self.duration
        while time.monotonic() < deadline and self.is_listening:
            try:
                chunks.append(self.audio_queue.get(timeout=1))
            except queue.Empty:
                continue
        return np.concatenate(chunks) if chunks else None

    def _transcribe(self, audio):
        """Write *audio* to a temporary WAV file and return the stripped ASR text."""
        # Reserve a path only; the WAV is written after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_filename = temp_audio.name
        try:
            wavf.write(temp_filename, self.sample_rate, audio)
            result = self.asr_pipeline(temp_filename)
        finally:
            # Remove the temp file even when writing or transcription fails.
            if os.path.exists(temp_filename):
                os.unlink(temp_filename)
        return result["text"].strip()

    def _build_prompt(self, transcript):
        """Return system prompt + remembered exchanges + the new user turn."""
        parts = [f"<|system|>\n{self.SYSTEM_PROMPT}"]
        parts.extend(self.conversation_history)
        parts.append(f"<|user|>\n{transcript}\n<|assistant|>")
        return "\n".join(parts)

    def _generate_response(self, transcript):
        """Run the language model on *transcript* and return only its reply text."""
        prompt = self._build_prompt(transcript)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
        )

        # Decode only the newly generated tokens so the prompt is never echoed.
        prompt_length = inputs["input_ids"].shape[1]
        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # If the model emitted another assistant marker, keep the last segment.
        if "<|assistant|>" in response:
            response = response.split("<|assistant|>")[-1]
        return response.strip()

    def _remember(self, transcript, response):
        """Append one exchange to the history, keeping the last ``MAX_HISTORY``."""
        self.conversation_history.append(f"<|user|>\n{transcript}\n<|assistant|>\n{response}")
        excess = len(self.conversation_history) - self.MAX_HISTORY
        if excess > 0:
            del self.conversation_history[:excess]

    def _speak(self, text):
        """Synthesise *text* and play it, blocking until playback ends."""
        if not text:
            return
        speech = self.tts_pipeline(
            text,
            forward_params={"speaker_embeddings": self.speaker_embeddings},
        )
        sd.play(speech["audio"], speech["sampling_rate"])
        sd.wait()

    def _discard_pending_audio(self):
        """Empty the audio queue without processing its contents."""
        while True:
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                return

    def run(self):
        """Run the voice assistant until recording stops or Ctrl+C is pressed."""
        # Set the flag before starting either thread so the processing loop
        # cannot observe False and exit before the recorder has started.
        self.is_listening = True

        record_thread = threading.Thread(target=self._record_loop, daemon=True)
        process_thread = threading.Thread(target=self.process_audio, daemon=True)

        record_thread.start()
        process_thread.start()

        try:
            # Join with a timeout so the main thread, the only one that
            # receives KeyboardInterrupt, stays responsive to Ctrl+C.
            while record_thread.is_alive() or process_thread.is_alive():
                record_thread.join(timeout=0.2)
                process_thread.join(timeout=0.2)
        except KeyboardInterrupt:
            print("\nStopping...")
        finally:
            self.is_listening = False
            record_thread.join()
            process_thread.join()


if __name__ == "__main__":
    assistant = VoiceAssistant()
    assistant.run()