SreekarB committed on
Commit
6dcd45d
·
verified ·
1 Parent(s): 6cd20fc

Upload 4 files

Browse files
Files changed (2) hide show
  1. app.py +185 -283
  2. requirements.txt +4 -2
app.py CHANGED
@@ -3,34 +3,21 @@ import numpy as np
3
  import tempfile
4
  import os
5
  import wave
6
- import queue
7
- import threading
8
  import time
9
- from datetime import datetime
10
  import speech_recognition as sr
11
  import requests
12
  import json
13
- from gtts import gTTS
14
  import io
 
 
15
 
16
- # Queue for audio chunks
17
- audio_queue = queue.Queue()
18
- # Flag to control real-time processing thread
19
- is_running = False
20
- # Store conversation history
21
  conversation_history = []
22
- # LLM response queue
23
- response_queue = queue.Queue()
24
- # For tracking if speech is active
25
- speech_active = False
26
- # For tracking silence periods
27
- last_speech_time = time.time()
28
- # Silence threshold in seconds before processing
29
- SILENCE_THRESHOLD = 1.0
30
 
31
  # Hugging Face API configuration
32
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
33
- # Get API token from environment
34
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
35
 
36
  headers = {
@@ -38,191 +25,50 @@ headers = {
38
  "Content-Type": "application/json"
39
  }
40
 
41
- def start_real_time_processing():
42
- """Start real-time audio processing"""
43
- global is_running, speech_active, last_speech_time
44
- is_running = True
45
- speech_active = False
46
- last_speech_time = time.time()
47
-
48
- # Clear previous history
49
- conversation_history.clear()
50
-
51
- # Add system message
52
- conversation_history.append({
53
- "role": "system",
54
- "content": "You are a helpful, friendly AI assistant engaged in a natural voice conversation. Keep responses brief, conversational, and engaging. Ask follow-up questions when appropriate to maintain the dialogue flow."
55
- })
56
-
57
- # Add initial greeting to conversation history
58
- greeting = "Hello! I'm your voice assistant. How can I help you today?"
59
- conversation_history.append({"role": "assistant", "content": greeting})
60
-
61
- # Convert greeting to speech and add to response queue
62
- greeting_audio = text_to_speech(greeting)
63
- if greeting_audio:
64
- response_queue.put(greeting_audio)
65
-
66
- # Start the processing thread
67
- processing_thread = threading.Thread(target=process_audio_queue)
68
- processing_thread.daemon = True
69
- processing_thread.start()
70
-
71
- # Start the response playback thread
72
- response_thread = threading.Thread(target=process_response_queue)
73
- response_thread.daemon = True
74
- response_thread.start()
75
-
76
- # Start the speech activity monitor thread
77
- activity_thread = threading.Thread(target=monitor_speech_activity)
78
- activity_thread.daemon = True
79
- activity_thread.start()
80
-
81
- return "Starting conversation... Please speak when ready."
82
-
83
- def stop_real_time_processing():
84
- """Stop real-time audio processing"""
85
- global is_running
86
- is_running = False
87
- return "Conversation ended."
88
-
89
- def process_audio_chunk(audio_chunk, sample_rate):
90
- """Process incoming audio chunk and add to queue"""
91
- global speech_active, last_speech_time
92
-
93
- if is_running and audio_chunk is not None and len(audio_chunk) > 0:
94
- # Check if there's actual speech (not just silence)
95
- rms = np.sqrt(np.mean(audio_chunk**2))
96
- if rms > 0.01: # Simple threshold for detecting speech
97
- speech_active = True
98
- last_speech_time = time.time()
99
-
100
- # Add to queue for processing
101
- audio_queue.put((audio_chunk, sample_rate))
102
-
103
- # Join the conversation history into a single string for display
104
- conversation_text = ""
105
- for message in conversation_history:
106
- if message["role"] != "system": # Skip system messages in display
107
- prefix = "You: " if message["role"] == "user" else "Assistant: "
108
- conversation_text += f"{prefix}{message['content']}\n\n"
109
-
110
- # Get the current response audio if available
111
- try:
112
- response_audio = response_queue.get_nowait()
113
- response_queue.task_done()
114
- except queue.Empty:
115
- response_audio = None
116
-
117
- # Return the input audio for immediate playback if no response audio
118
- if response_audio is None:
119
- playback_audio = (sample_rate, audio_chunk)
120
- else:
121
- playback_audio = response_audio
122
-
123
- # Also return speech activity status
124
- status = "Listening..." if speech_active else "Ready for your input..."
125
- if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
126
- status = "Processing your response..."
127
-
128
- return playback_audio, conversation_text + "\n" + status
129
-
130
- return None, "Click 'Start Conversation' to begin"
131
-
132
- def monitor_speech_activity():
133
- """Monitor speech activity and trigger processing when speech stops"""
134
- global speech_active, last_speech_time
135
-
136
- while is_running:
137
- # If speech was active but has been silent for a while
138
- if speech_active and (time.time() - last_speech_time) > SILENCE_THRESHOLD:
139
- speech_active = False
140
- # Signal to process the accumulated speech
141
- process_accumulated_speech()
142
-
143
- time.sleep(0.1)
144
-
145
- def process_accumulated_speech():
146
- """Process all accumulated speech when a silence is detected"""
147
- recognizer = sr.Recognizer()
148
-
149
- # Create a temporary WAV file for all accumulated audio chunks
150
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
151
- temp_filename = temp_file.name
152
 
153
- # Check if we have enough accumulated audio
154
- if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
155
- process_accumulated_speech.accumulated_chunks = []
156
 
157
- # If we have accumulated audio chunks
158
- if process_accumulated_speech.accumulated_chunks:
159
- # Get the sample rate from the first chunk
160
- sample_rate = process_accumulated_speech.accumulated_chunks[0][1]
161
 
162
- # Concatenate all audio chunks
163
- all_audio = np.concatenate([chunk[0] for chunk in process_accumulated_speech.accumulated_chunks])
 
164
 
165
- # Save to WAV file
166
- with wave.open(temp_filename, 'wb') as wf:
167
- wf.setnchannels(1)
168
- wf.setsampwidth(2)
169
- wf.setframerate(sample_rate)
170
- wf.writeframes((all_audio * 32767).astype(np.int16).tobytes())
171
 
172
- # Perform speech recognition
173
- try:
174
- with sr.AudioFile(temp_filename) as source:
175
- audio = recognizer.record(source)
176
- text = recognizer.recognize_google(audio)
177
-
178
- if text.strip():
179
- # Add user message to conversation history
180
- conversation_history.append({"role": "user", "content": text})
181
- except sr.UnknownValueError:
182
- # No speech detected
183
- pass
184
- except sr.RequestError as e:
185
- print(f"Speech recognition error: {e}")
186
 
187
- # Clean up
188
- os.unlink(temp_filename)
189
- process_accumulated_speech.accumulated_chunks = []
190
-
191
- def text_to_speech(text):
192
- """Convert text to speech using gTTS"""
193
- if not text.strip():
 
194
  return None
 
 
 
 
 
195
 
196
- tts = gTTS(text=text, lang='en', slow=False)
197
-
198
- # Save to a BytesIO object
199
- fp = io.BytesIO()
200
- tts.write_to_fp(fp)
201
- fp.seek(0)
202
-
203
- # Convert to audio array
204
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
205
- temp_filename = temp_file.name
206
-
207
- # Save the gTTS output to the temp file
208
- with open(temp_filename, 'wb') as f:
209
- f.write(fp.read())
210
-
211
- # Read WAV file
212
- with wave.open(temp_filename, 'rb') as wf:
213
- sample_rate = wf.getframerate()
214
- frames = wf.readframes(wf.getnframes())
215
- audio_array = np.frombuffer(frames, dtype=np.int16)
216
- audio_array = audio_array.astype(np.float32) / 32767.0
217
-
218
- # Clean up temp file
219
- os.unlink(temp_filename)
220
 
221
- return (sample_rate, audio_array)
222
-
223
- def get_llm_response():
224
- """Get response from LLM API"""
225
- # Build conversation for the LLM
226
  messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
227
 
228
  try:
@@ -241,123 +87,179 @@ def get_llm_response():
241
 
242
  if response.status_code == 200:
243
  response_json = response.json()
244
- return response_json[0]["generated_text"]
245
  else:
246
- return f"I'm having trouble connecting to my language model. Error: {response.status_code}"
247
  else:
248
- # No API token available
249
- return "To enable AI responses, please add a Hugging Face API token in the Space settings. For now, I can hear you but can't generate intelligent responses."
250
  except Exception as e:
251
- return f"I encountered an error: {str(e)}. Please try again in a moment."
 
 
 
 
 
252
 
253
- def process_response_queue():
254
- """Process responses and convert to audio"""
255
- while is_running:
256
- try:
257
- # Check if new user message and no pending assistant response
258
- if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
259
- # Get LLM response
260
- response_text = get_llm_response()
261
-
262
- # Add to conversation history
263
- conversation_history.append({"role": "assistant", "content": response_text})
264
-
265
- # Convert to speech
266
- audio = text_to_speech(response_text)
267
-
268
- # Add to response queue
269
- if audio is not None:
270
- response_queue.put(audio)
271
-
272
- time.sleep(0.2)
273
- except Exception as e:
274
- print(f"Error in response thread: {e}")
275
- time.sleep(0.5)
 
 
 
276
 
277
- def process_audio_queue():
278
- """Process audio chunks from the queue"""
279
- # Initialize accumulated chunks
280
- if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
281
- process_accumulated_speech.accumulated_chunks = []
282
-
283
- while is_running:
284
- try:
285
- # Get audio chunk from queue with timeout
286
- audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
287
-
288
- if audio_chunk is not None and len(audio_chunk) > 0:
289
- # Store in accumulated chunks for later processing
290
- process_accumulated_speech.accumulated_chunks.append((audio_chunk, sample_rate))
291
-
292
- # Mark this task as done
293
- audio_queue.task_done()
294
-
295
- except queue.Empty:
296
- # Queue is empty, just continue
297
- pass
298
- except Exception as e:
299
- print(f"Error in processing thread: {e}")
300
- time.sleep(0.1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
- # Create Gradio interface
303
- with gr.Blocks(title="Real-Time Voice Conversation Assistant") as demo:
304
- gr.Markdown("# Real-Time Voice Conversation Assistant")
305
- gr.Markdown("Speak naturally and have an interactive conversation with the AI assistant.")
 
 
 
306
 
307
  with gr.Row():
308
- start_button = gr.Button("Start Conversation", variant="primary", scale=2)
309
- stop_button = gr.Button("End Conversation", variant="stop", scale=1)
310
 
311
- # Real-time microphone input
312
- audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy",
313
- label="Your Voice", elem_id="mic-input")
314
 
315
  with gr.Row():
316
- # Audio output for playback
317
- audio_output = gr.Audio(label="Audio", autoplay=True, elem_id="audio-output")
318
 
319
- # Conversation output
320
- conversation_display = gr.Textbox(label="Conversation",
321
- lines=15,
322
- elem_id="conversation-display")
323
 
324
  # Connect the components
325
- start_button.click(start_real_time_processing, outputs=conversation_display)
326
- stop_button.click(stop_real_time_processing, outputs=conversation_display)
327
-
328
- # Stream processing
329
- audio_input.stream(
330
- process_audio_chunk,
331
- inputs=[audio_input],
332
- outputs=[audio_output, conversation_display],
333
- show_progress=False
 
 
 
 
 
334
  )
335
 
336
  gr.Markdown("""
337
  ## How to use
338
- 1. Click the "Start Conversation" button
339
- 2. Speak naturally into your microphone
340
- 3. Pause briefly when you finish speaking to let the AI respond
341
- 4. The AI will respond audibly - just like a natural conversation!
342
- 5. Continue the conversation as long as you like
343
- 6. Click "End Conversation" when done
 
 
 
344
  """)
345
 
346
- with gr.Accordion("Setup Instructions", open=True):
347
  gr.Markdown("""
348
- ### Important: Setting up your API Token
349
 
350
- This app requires a Hugging Face API token to enable AI responses:
351
 
352
  1. Create an account on [Hugging Face](https://huggingface.co/)
353
  2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
354
  3. Add the token in your Space settings:
355
  - Go to Settings > Repository Secrets
356
  - Add a secret with the key `HF_API_TOKEN` and your token as the value
357
-
358
- Without a token, the app will still transcribe your speech but won't generate AI responses.
359
  """)
360
 
361
- # Launch the app with higher queue concurrency
362
  if __name__ == "__main__":
363
- demo.queue(concurrency_count=3, max_size=20).launch()
 
3
  import tempfile
4
  import os
5
  import wave
 
 
6
  import time
7
+ import subprocess
8
  import speech_recognition as sr
9
  import requests
10
  import json
 
11
  import io
12
+ from gtts import gTTS
13
+ import soundfile as sf
14
 
15
+ # Conversation state
 
 
 
 
16
  conversation_history = []
17
+ is_active = False
 
 
 
 
 
 
 
18
 
19
  # Hugging Face API configuration
20
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 
21
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
22
 
23
  headers = {
 
25
  "Content-Type": "application/json"
26
  }
27
 
28
def tts_with_ffmpeg(text):
    """Convert text to speech using gTTS and ffmpeg.

    The text is synthesized to an MP3 file with gTTS, converted to a
    22050 Hz WAV file by the ``ffmpeg`` executable, and then loaded with
    ``soundfile``.

    Args:
        text: The text to speak.

    Returns:
        A ``(sample_rate, audio_data)`` tuple built from ``sf.read``, or
        ``None`` when ``text`` is empty/whitespace-only or any step fails
        (the error is printed).
    """
    if not text or not text.strip():
        return None

    temp_paths = []
    try:
        # Reserve the two paths and close the handles immediately: gTTS and
        # ffmpeg reopen the files by name, so holding them open only leaks
        # descriptors.
        for suffix in (".mp3", ".wav"):
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                temp_paths.append(tmp.name)
        mp3_file, wav_file = temp_paths

        # Generate speech with gTTS
        gTTS(text=text, lang='en', slow=False).save(mp3_file)

        # Convert MP3 to WAV with the ffmpeg executable. check=True turns a
        # failed conversion into an error here instead of an obscure read
        # failure on an empty WAV file below.
        subprocess.run(
            ["ffmpeg", "-y", "-i", mp3_file, "-ar", "22050", wav_file],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

        # Load the WAV file
        audio_data, sample_rate = sf.read(wav_file)
        return (sample_rate, audio_data)
    except Exception as e:
        # Best-effort boundary: a TTS failure must not crash the UI callback.
        print(f"Error in TTS: {e}")
        return None
    finally:
        # Single cleanup path for both success and failure.
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
62
+
63
+ def get_ai_response(user_text):
64
+ """Get response from LLM"""
65
+ if not user_text or not user_text.strip():
66
+ return "I couldn't hear you clearly. Could you try again?"
67
 
68
+ # Add user message to history
69
+ conversation_history.append({"role": "user", "content": user_text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ # Build messages for API
 
 
 
 
72
  messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
73
 
74
  try:
 
87
 
88
  if response.status_code == 200:
89
  response_json = response.json()
90
+ ai_text = response_json[0]["generated_text"]
91
  else:
92
+ ai_text = f"I'm having trouble connecting. Error: {response.status_code}"
93
  else:
94
+ ai_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
 
95
  except Exception as e:
96
+ ai_text = f"Error: {str(e)}. Please try again."
97
+
98
+ # Add AI response to history
99
+ conversation_history.append({"role": "assistant", "content": ai_text})
100
+
101
+ return ai_text
102
 
103
def start_assistant():
    """Start the voice assistant.

    Marks the assistant active, replaces the module-level conversation
    history with a system prompt followed by a welcome message, and
    synthesizes the welcome message.

    Returns:
        A 4-tuple ``(welcome_audio, conversation_text, status, True)`` in
        the order the start button's outputs are wired.
    """
    global is_active, conversation_history

    greeting = "Hello! I'm your AI assistant. I'm listening. What can I help you with?"
    system_prompt = {
        "role": "system",
        "content": "You are a helpful, friendly AI assistant like Alexa. Keep responses brief and conversational. When appropriate, ask follow-up questions to maintain the conversation.",
    }

    is_active = True
    # Fresh history: system prompt first, then the spoken greeting.
    conversation_history = [
        system_prompt,
        {"role": "assistant", "content": greeting},
    ]

    greeting_audio = tts_with_ffmpeg(greeting)
    transcript = f"Assistant: {greeting}\n\n"
    listening_status = "Listening... (Click Record to speak)"

    return greeting_audio, transcript, listening_status, True
129
 
130
def stop_assistant():
    """Stop the voice assistant.

    Clears the module-level ``is_active`` flag.

    Returns:
        A 4-tuple ``(None, "Assistant stopped.", "Inactive", False)`` in the
        order the stop button's outputs are wired.
    """
    global is_active
    is_active = False

    cleared_audio, listening_flag = None, False
    return cleared_audio, "Assistant stopped.", "Inactive", listening_flag
135
+
136
def _to_int16_mono(audio_data):
    """Return ``audio_data`` as a 1-D int16 array for a 16-bit mono WAV."""
    samples = np.asarray(audio_data)
    # Decide before downmixing: averaging integer channels yields floats
    # that are still on the integer scale and must not be rescaled.
    is_float = np.issubdtype(samples.dtype, np.floating)
    if samples.ndim > 1:
        # assumes a (samples, channels) layout — TODO confirm against Gradio
        samples = samples.mean(axis=1)
    if is_float:
        # Float audio is taken to be in [-1.0, 1.0]; clip so out-of-range
        # values saturate instead of wrapping around.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    # Integer input is presumably already 16-bit PCM and is written as-is.
    return samples.astype(np.int16)


def process_voice(audio, listen_state, conversation_state, status_state):
    """Process voice input and generate a spoken response.

    Args:
        audio: ``(sample_rate, audio_data)`` from the audio input, or
            ``None`` when the component was cleared.
        listen_state: Current value of the ``listening`` state.
        conversation_state: Current text of the conversation display.
        status_state: Current text of the status display.

    Returns:
        A 4-tuple ``(audio_response, conversation_text, status,
        listen_state)``. ``audio_response`` is ``None`` when the assistant
        is not started, no audio was supplied, or recognition failed; in
        those cases the conversation text is returned unchanged.
    """
    if not listen_state or not is_active:
        return None, conversation_state, "Please start the assistant first", listen_state

    if audio is None:
        return None, conversation_state, status_state, listen_state

    sample_rate, audio_data = audio

    # Reserve a path for the WAV file that speech_recognition reads back.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_filename = temp_file.name

    recognizer = sr.Recognizer()
    try:
        with wave.open(temp_filename, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes(_to_int16_mono(audio_data).tobytes())

        with sr.AudioFile(temp_filename) as source:
            recorded = recognizer.record(source)
        transcription = recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        return None, conversation_state, "I didn't catch that. Please try again.", listen_state
    except sr.RequestError as e:
        return None, conversation_state, f"Speech recognition error: {e}", listen_state
    finally:
        # One cleanup path for success and every failure.
        os.unlink(temp_filename)

    # get_ai_response records both the user text and the reply in
    # conversation_history, which is rendered below.
    ai_response = get_ai_response(transcription)
    audio_response = tts_with_ffmpeg(ai_response)

    # Format conversation for display, skipping system messages.
    rendered = []
    for message in conversation_history:
        if message["role"] == "system":
            continue
        prefix = "You: " if message["role"] == "user" else "Assistant: "
        rendered.append(f"{prefix}{message['content']}\n\n")
    conversation_text = "".join(rendered)

    status = "Listening... (Click Record to speak)"
    return audio_response, conversation_text, status, listen_state
195
 
196
+ # Create the Gradio interface
197
+ with gr.Blocks(title="Voice Assistant (Alexa-style)") as demo:
198
+ gr.Markdown("# Voice Assistant")
199
+ gr.Markdown("Speak naturally with the AI assistant like you would with Alexa")
200
+
201
+ # State variables
202
+ listening = gr.State(False)
203
 
204
  with gr.Row():
205
+ start_button = gr.Button("Start Assistant", variant="primary", scale=2)
206
+ stop_button = gr.Button("Stop Assistant", variant="stop", scale=1)
207
 
208
+ with gr.Row():
209
+ status_display = gr.Textbox(label="Status", value="Inactive")
 
210
 
211
  with gr.Row():
212
with gr.Column(scale=1):
    # requirements.txt pins gradio==3.50.0, where gr.Audio defaults to
    # file upload; request the microphone explicitly so the "click the
    # microphone button" usage instructions actually apply.
    audio_input = gr.Audio(source="microphone", type="numpy", label="Speak", interactive=True)
214
 
215
+ with gr.Column(scale=2):
216
+ conversation_display = gr.Textbox(label="Conversation", lines=10, interactive=False)
217
+
218
+ audio_output = gr.Audio(label="Assistant's Voice", autoplay=True)
219
 
220
  # Connect the components
221
+ start_button.click(
222
+ start_assistant,
223
+ outputs=[audio_output, conversation_display, status_display, listening]
224
+ )
225
+
226
+ stop_button.click(
227
+ stop_assistant,
228
+ outputs=[audio_output, conversation_display, status_display, listening]
229
+ )
230
+
231
+ audio_input.change(
232
+ process_voice,
233
+ inputs=[audio_input, listening, conversation_display, status_display],
234
+ outputs=[audio_output, conversation_display, status_display, listening]
235
  )
236
 
237
  gr.Markdown("""
238
  ## How to use
239
+
240
+ 1. Click "Start Assistant" to begin
241
+ 2. Click the microphone button and speak your question or command
242
+ 3. When done speaking, click Stop on the recording control
243
+ 4. Listen to the assistant's response
244
+ 5. Continue the conversation by speaking again
245
+ 6. Click "Stop Assistant" when you're finished
246
+
247
+ For the best experience, make sure your question or command is clear and complete before stopping the recording.
248
  """)
249
 
250
+ with gr.Accordion("Setup Guide", open=True):
251
  gr.Markdown("""
252
+ ### API Token Setup
253
 
254
+ This app requires a Hugging Face API token for AI responses:
255
 
256
  1. Create an account on [Hugging Face](https://huggingface.co/)
257
  2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
258
  3. Add the token in your Space settings:
259
  - Go to Settings > Repository Secrets
260
  - Add a secret with the key `HF_API_TOKEN` and your token as the value
 
 
261
  """)
262
 
263
+ # Launch the app
264
  if __name__ == "__main__":
265
+ demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
- gradio>=3.50.0
2
  numpy>=1.19.0
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
- gTTS>=2.3.2
 
 
 
1
+ gradio==3.50.0
2
  numpy>=1.19.0
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
+ gTTS>=2.3.2
6
+ soundfile>=0.12.1
7
+ ffmpeg-python>=0.2.0