SreekarB committed on
Commit
6cd20fc
·
verified ·
1 Parent(s): 4d6f567

Upload 4 files

Browse files
Files changed (1) hide show
  1. app.py +153 -72
app.py CHANGED
@@ -21,10 +21,16 @@ is_running = False
21
  conversation_history = []
22
  # LLM response queue
23
  response_queue = queue.Queue()
 
 
 
 
 
 
24
 
25
  # Hugging Face API configuration
26
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
27
- # Replace with your actual Hugging Face API token
28
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
29
 
30
  headers = {
@@ -34,8 +40,10 @@ headers = {
34
 
35
  def start_real_time_processing():
36
  """Start real-time audio processing"""
37
- global is_running
38
  is_running = True
 
 
39
 
40
  # Clear previous history
41
  conversation_history.clear()
@@ -43,9 +51,18 @@ def start_real_time_processing():
43
  # Add system message
44
  conversation_history.append({
45
  "role": "system",
46
- "content": "You are a helpful, friendly AI assistant. Keep responses brief and conversational."
47
  })
48
 
 
 
 
 
 
 
 
 
 
49
  # Start the processing thread
50
  processing_thread = threading.Thread(target=process_audio_queue)
51
  processing_thread.daemon = True
@@ -56,17 +73,30 @@ def start_real_time_processing():
56
  response_thread.daemon = True
57
  response_thread.start()
58
 
59
- return "Real-time assistant started. Speak into your microphone..."
 
 
 
 
 
60
 
61
  def stop_real_time_processing():
62
  """Stop real-time audio processing"""
63
  global is_running
64
  is_running = False
65
- return "Real-time assistant stopped."
66
 
67
  def process_audio_chunk(audio_chunk, sample_rate):
68
  """Process incoming audio chunk and add to queue"""
 
 
69
  if is_running and audio_chunk is not None and len(audio_chunk) > 0:
 
 
 
 
 
 
70
  # Add to queue for processing
71
  audio_queue.put((audio_chunk, sample_rate))
72
 
@@ -90,9 +120,73 @@ def process_audio_chunk(audio_chunk, sample_rate):
90
  else:
91
  playback_audio = response_audio
92
 
93
- return playback_audio, conversation_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- return None, "Click 'Start' to begin real-time processing"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def text_to_speech(text):
98
  """Convert text to speech using gTTS"""
@@ -106,16 +200,27 @@ def text_to_speech(text):
106
  tts.write_to_fp(fp)
107
  fp.seek(0)
108
 
 
 
 
 
 
 
 
 
109
  # Read WAV file
110
- with wave.open(fp, 'rb') as wf:
111
  sample_rate = wf.getframerate()
112
  frames = wf.readframes(wf.getnframes())
113
  audio_array = np.frombuffer(frames, dtype=np.int16)
114
  audio_array = audio_array.astype(np.float32) / 32767.0
115
 
 
 
 
116
  return (sample_rate, audio_array)
117
 
118
- def get_llm_response(input_text):
119
  """Get response from LLM API"""
120
  # Build conversation for the LLM
121
  messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
@@ -138,22 +243,21 @@ def get_llm_response(input_text):
138
  response_json = response.json()
139
  return response_json[0]["generated_text"]
140
  else:
141
- return f"API Error: {response.status_code} - {response.text}"
142
  else:
143
- # Fallback response if no API token is provided
144
- return "I don't have an API token configured, but I heard you! Please check the README for setup instructions."
145
  except Exception as e:
146
- return f"Error: {str(e)}"
147
 
148
  def process_response_queue():
149
  """Process responses and convert to audio"""
150
  while is_running:
151
  try:
 
152
  if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
153
- user_message = conversation_history[-1]["content"]
154
-
155
  # Get LLM response
156
- response_text = get_llm_response(user_message)
157
 
158
  # Add to conversation history
159
  conversation_history.append({"role": "assistant", "content": response_text})
@@ -165,14 +269,16 @@ def process_response_queue():
165
  if audio is not None:
166
  response_queue.put(audio)
167
 
168
- time.sleep(0.5)
169
  except Exception as e:
170
  print(f"Error in response thread: {e}")
171
  time.sleep(0.5)
172
 
173
  def process_audio_queue():
174
  """Process audio chunks from the queue"""
175
- recognizer = sr.Recognizer()
 
 
176
 
177
  while is_running:
178
  try:
@@ -180,35 +286,8 @@ def process_audio_queue():
180
  audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
181
 
182
  if audio_chunk is not None and len(audio_chunk) > 0:
183
- # Create a temporary WAV file for speech recognition
184
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
185
- temp_filename = temp_file.name
186
-
187
- # Save the audio chunk to the temporary file
188
- with wave.open(temp_filename, 'wb') as wf:
189
- wf.setnchannels(1)
190
- wf.setsampwidth(2) # 16-bit audio
191
- wf.setframerate(sample_rate)
192
- wf.writeframes((audio_chunk * 32767).astype(np.int16).tobytes())
193
-
194
- # Perform speech recognition
195
- try:
196
- with sr.AudioFile(temp_filename) as source:
197
- audio = recognizer.record(source)
198
- text = recognizer.recognize_google(audio)
199
-
200
- # Only add to conversation if not empty
201
- if text.strip():
202
- # Add user message to conversation history
203
- conversation_history.append({"role": "user", "content": text})
204
- except sr.UnknownValueError:
205
- # No speech detected, ignore
206
- pass
207
- except sr.RequestError as e:
208
- print(f"Speech recognition error: {e}")
209
-
210
- # Clean up temporary file
211
- os.unlink(temp_filename)
212
 
213
  # Mark this task as done
214
  audio_queue.task_done()
@@ -221,23 +300,26 @@ def process_audio_queue():
221
  time.sleep(0.1)
222
 
223
  # Create Gradio interface
224
- with gr.Blocks(title="Real-Time Voice Assistant") as demo:
225
- gr.Markdown("# Real-Time Voice Assistant")
226
- gr.Markdown("Speak into your microphone and get AI responses in real-time.")
227
 
228
  with gr.Row():
229
- start_button = gr.Button("Start Conversation", variant="primary")
230
- stop_button = gr.Button("End Conversation", variant="stop")
231
 
232
  # Real-time microphone input
233
- audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Your Voice")
 
234
 
235
  with gr.Row():
236
  # Audio output for playback
237
- audio_output = gr.Audio(label="Audio", autoplay=True)
238
 
239
  # Conversation output
240
- conversation_display = gr.Textbox(label="Conversation", lines=15)
 
 
241
 
242
  # Connect the components
243
  start_button.click(start_real_time_processing, outputs=conversation_display)
@@ -253,30 +335,29 @@ with gr.Blocks(title="Real-Time Voice Assistant") as demo:
253
 
254
  gr.Markdown("""
255
  ## How to use
256
- 1. **Important**: Add your Hugging Face API token as an environment variable `HF_API_TOKEN`
257
- 2. Click the "Start Conversation" button
258
- 3. Speak into your microphone
259
- 4. Listen to the AI's voice responses
260
- 5. Continue the conversation naturally
261
  6. Click "End Conversation" when done
262
  """)
263
-
264
- with gr.Accordion("Setup Instructions", open=False):
265
  gr.Markdown("""
266
- ### Setting up your API Token
 
 
267
 
268
  1. Create an account on [Hugging Face](https://huggingface.co/)
269
  2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
270
- 3. Add it as an environment variable when launching this app:
271
-
272
- ```bash
273
- export HF_API_TOKEN="your-token-here"
274
- python app.py
275
- ```
276
 
277
- Or when deploying to Hugging Face Spaces, add it in the repository settings.
278
  """)
279
 
280
- # Launch the app
281
  if __name__ == "__main__":
282
- demo.queue(max_size=10).launch()
 
21
  conversation_history = []
22
  # LLM response queue
23
  response_queue = queue.Queue()
24
+ # For tracking if speech is active
25
+ speech_active = False
26
+ # For tracking silence periods
27
+ last_speech_time = time.time()
28
+ # Silence threshold in seconds before processing
29
+ SILENCE_THRESHOLD = 1.0
30
 
31
  # Hugging Face API configuration
32
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
33
+ # Get API token from environment
34
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
35
 
36
  headers = {
 
40
 
41
  def start_real_time_processing():
42
  """Start real-time audio processing"""
43
+ global is_running, speech_active, last_speech_time
44
  is_running = True
45
+ speech_active = False
46
+ last_speech_time = time.time()
47
 
48
  # Clear previous history
49
  conversation_history.clear()
 
51
  # Add system message
52
  conversation_history.append({
53
  "role": "system",
54
+ "content": "You are a helpful, friendly AI assistant engaged in a natural voice conversation. Keep responses brief, conversational, and engaging. Ask follow-up questions when appropriate to maintain the dialogue flow."
55
  })
56
 
57
+ # Add initial greeting to conversation history
58
+ greeting = "Hello! I'm your voice assistant. How can I help you today?"
59
+ conversation_history.append({"role": "assistant", "content": greeting})
60
+
61
+ # Convert greeting to speech and add to response queue
62
+ greeting_audio = text_to_speech(greeting)
63
+ if greeting_audio:
64
+ response_queue.put(greeting_audio)
65
+
66
  # Start the processing thread
67
  processing_thread = threading.Thread(target=process_audio_queue)
68
  processing_thread.daemon = True
 
73
  response_thread.daemon = True
74
  response_thread.start()
75
 
76
+ # Start the speech activity monitor thread
77
+ activity_thread = threading.Thread(target=monitor_speech_activity)
78
+ activity_thread.daemon = True
79
+ activity_thread.start()
80
+
81
+ return "Starting conversation... Please speak when ready."
82
 
83
def stop_real_time_processing():
    """Flag the assistant as stopped and return the status text for the UI."""
    global is_running

    # The worker loops are all `while is_running:` and exit on their next check.
    is_running = False

    return "Conversation ended."
88
 
89
  def process_audio_chunk(audio_chunk, sample_rate):
90
  """Process incoming audio chunk and add to queue"""
91
+ global speech_active, last_speech_time
92
+
93
  if is_running and audio_chunk is not None and len(audio_chunk) > 0:
94
+ # Check if there's actual speech (not just silence)
95
+ rms = np.sqrt(np.mean(audio_chunk**2))
96
+ if rms > 0.01: # Simple threshold for detecting speech
97
+ speech_active = True
98
+ last_speech_time = time.time()
99
+
100
  # Add to queue for processing
101
  audio_queue.put((audio_chunk, sample_rate))
102
 
 
120
  else:
121
  playback_audio = response_audio
122
 
123
+ # Also return speech activity status
124
+ status = "Listening..." if speech_active else "Ready for your input..."
125
+ if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
126
+ status = "Processing your response..."
127
+
128
+ return playback_audio, conversation_text + "\n" + status
129
+
130
+ return None, "Click 'Start Conversation' to begin"
131
+
132
def monitor_speech_activity():
    """Watch for the end of an utterance and trigger its transcription.

    Runs as the target of a background thread. While ``is_running`` is true
    it polls every 0.1 s; once speech has been flagged active and more than
    ``SILENCE_THRESHOLD`` seconds have elapsed since ``last_speech_time``,
    it clears ``speech_active`` and calls ``process_accumulated_speech()``.

    Errors raised while processing an utterance are printed and the loop
    keeps polling, so a single failure does not end speech processing for
    the rest of the session.
    """
    global speech_active

    while is_running:
        try:
            silent_for = time.time() - last_speech_time
            if speech_active and silent_for > SILENCE_THRESHOLD:
                # Clear the flag first so the same pause is handled only once,
                # even if transcription below takes longer than one poll.
                speech_active = False
                process_accumulated_speech()
        except Exception as e:
            # Same policy as the response thread: report and keep running.
            print(f"Error in speech activity monitor: {e}")

        time.sleep(0.1)
144
+
145
def _to_int16_pcm(samples):
    """Return *samples* as 16-bit PCM, scaling float audio and clipping overshoot."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # NOTE(review): integer input is assumed to already be 16-bit PCM
        # (what Gradio documents for type="numpy") -- confirm against caller.
        return samples.astype(np.int16)
    # Float audio is treated as [-1.0, 1.0]; clip so overshoot saturates
    # instead of wrapping around when cast to int16.
    return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)


def process_accumulated_speech():
    """Transcribe the audio buffered so far and record it as a user turn.

    Takes the ``(audio_chunk, sample_rate)`` tuples stored on
    ``process_accumulated_speech.accumulated_chunks``, joins them into one
    mono 16-bit WAV file (using the first chunk's sample rate), runs Google
    speech recognition on it and, when non-blank text comes back, appends
    ``{"role": "user", "content": text}`` to ``conversation_history``.

    Unintelligible audio (``sr.UnknownValueError``) is ignored and
    recognition service failures (``sr.RequestError``) are printed.
    Returns ``None``.
    """
    # Detach the buffer up front instead of clearing it after recognition:
    # chunks that arrive while the (slow) recognition request is in flight
    # then land in the fresh list and are kept for the next utterance.
    pending = getattr(process_accumulated_speech, 'accumulated_chunks', [])
    process_accumulated_speech.accumulated_chunks = []

    if not pending:
        return

    # Get the sample rate from the first chunk
    sample_rate = pending[0][1]
    pcm = _to_int16_pcm(np.concatenate([chunk for chunk, _ in pending]))

    recognizer = sr.Recognizer()

    # delete=False: the file is reopened by name below and removed in `finally`
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
        temp_filename = temp_file.name

    try:
        with wave.open(temp_filename, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())

        try:
            with sr.AudioFile(temp_filename) as source:
                audio = recognizer.record(source)
            text = recognizer.recognize_google(audio)

            if text.strip():
                # Add user message to conversation history
                conversation_history.append({"role": "user", "content": text})
        except sr.UnknownValueError:
            # No speech detected
            pass
        except sr.RequestError as e:
            print(f"Speech recognition error: {e}")
    finally:
        # Remove the temp file even if writing or recognition raised
        os.unlink(temp_filename)
190
 
191
  def text_to_speech(text):
192
  """Convert text to speech using gTTS"""
 
200
  tts.write_to_fp(fp)
201
  fp.seek(0)
202
 
203
+ # Convert to audio array
204
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
205
+ temp_filename = temp_file.name
206
+
207
+ # Save the gTTS output to the temp file
208
+ with open(temp_filename, 'wb') as f:
209
+ f.write(fp.read())
210
+
211
  # Read WAV file
212
+ with wave.open(temp_filename, 'rb') as wf:
213
  sample_rate = wf.getframerate()
214
  frames = wf.readframes(wf.getnframes())
215
  audio_array = np.frombuffer(frames, dtype=np.int16)
216
  audio_array = audio_array.astype(np.float32) / 32767.0
217
 
218
+ # Clean up temp file
219
+ os.unlink(temp_filename)
220
+
221
  return (sample_rate, audio_array)
222
 
223
+ def get_llm_response():
224
  """Get response from LLM API"""
225
  # Build conversation for the LLM
226
  messages = [{"role": msg["role"], "content": msg["content"]} for msg in conversation_history]
 
243
  response_json = response.json()
244
  return response_json[0]["generated_text"]
245
  else:
246
+ return f"I'm having trouble connecting to my language model. Error: {response.status_code}"
247
  else:
248
+ # No API token available
249
+ return "To enable AI responses, please add a Hugging Face API token in the Space settings. For now, I can hear you but can't generate intelligent responses."
250
  except Exception as e:
251
+ return f"I encountered an error: {str(e)}. Please try again in a moment."
252
 
253
  def process_response_queue():
254
  """Process responses and convert to audio"""
255
  while is_running:
256
  try:
257
+ # Check if new user message and no pending assistant response
258
  if len(conversation_history) > 1 and conversation_history[-1]["role"] == "user":
 
 
259
  # Get LLM response
260
+ response_text = get_llm_response()
261
 
262
  # Add to conversation history
263
  conversation_history.append({"role": "assistant", "content": response_text})
 
269
  if audio is not None:
270
  response_queue.put(audio)
271
 
272
+ time.sleep(0.2)
273
  except Exception as e:
274
  print(f"Error in response thread: {e}")
275
  time.sleep(0.5)
276
 
277
  def process_audio_queue():
278
  """Process audio chunks from the queue"""
279
+ # Initialize accumulated chunks
280
+ if not hasattr(process_accumulated_speech, 'accumulated_chunks'):
281
+ process_accumulated_speech.accumulated_chunks = []
282
 
283
  while is_running:
284
  try:
 
286
  audio_chunk, sample_rate = audio_queue.get(timeout=0.5)
287
 
288
  if audio_chunk is not None and len(audio_chunk) > 0:
289
+ # Store in accumulated chunks for later processing
290
+ process_accumulated_speech.accumulated_chunks.append((audio_chunk, sample_rate))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
  # Mark this task as done
293
  audio_queue.task_done()
 
300
  time.sleep(0.1)
301
 
302
  # Create Gradio interface
303
+ with gr.Blocks(title="Real-Time Voice Conversation Assistant") as demo:
304
+ gr.Markdown("# Real-Time Voice Conversation Assistant")
305
+ gr.Markdown("Speak naturally and have an interactive conversation with the AI assistant.")
306
 
307
  with gr.Row():
308
+ start_button = gr.Button("Start Conversation", variant="primary", scale=2)
309
+ stop_button = gr.Button("End Conversation", variant="stop", scale=1)
310
 
311
  # Real-time microphone input
312
+ audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy",
313
+ label="Your Voice", elem_id="mic-input")
314
 
315
  with gr.Row():
316
  # Audio output for playback
317
+ audio_output = gr.Audio(label="Audio", autoplay=True, elem_id="audio-output")
318
 
319
  # Conversation output
320
+ conversation_display = gr.Textbox(label="Conversation",
321
+ lines=15,
322
+ elem_id="conversation-display")
323
 
324
  # Connect the components
325
  start_button.click(start_real_time_processing, outputs=conversation_display)
 
335
 
336
  gr.Markdown("""
337
  ## How to use
338
+ 1. Click the "Start Conversation" button
339
+ 2. Speak naturally into your microphone
340
+ 3. Pause briefly when you finish speaking to let the AI respond
341
+ 4. The AI will respond audibly - just like a natural conversation!
342
+ 5. Continue the conversation as long as you like
343
  6. Click "End Conversation" when done
344
  """)
345
+
346
+ with gr.Accordion("Setup Instructions", open=True):
347
  gr.Markdown("""
348
+ ### Important: Setting up your API Token
349
+
350
+ This app requires a Hugging Face API token to enable AI responses:
351
 
352
  1. Create an account on [Hugging Face](https://huggingface.co/)
353
  2. Generate an API token in your [settings page](https://huggingface.co/settings/tokens)
354
+ 3. Add the token in your Space settings:
355
+ - Go to Settings > Repository Secrets
356
+ - Add a secret with the key `HF_API_TOKEN` and your token as the value
 
 
 
357
 
358
+ Without a token, the app will still transcribe your speech but won't generate AI responses.
359
  """)
360
 
361
# Launch the app with higher queue concurrency.
# `concurrency_count` was removed from `queue()` in Gradio 4 (the release line
# that introduced the `sources=[...]` Audio argument used in this file);
# `default_concurrency_limit` is its replacement for per-event worker limits.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=3, max_size=20).launch()