SreekarB committed on
Commit
f4deca2
·
verified ·
1 Parent(s): 6243a25

Upload 4 files

Browse files
Files changed (2) hide show
  1. app.py +148 -153
  2. requirements.txt +1 -2
app.py CHANGED
@@ -4,16 +4,11 @@ import tempfile
4
  import os
5
  import wave
6
  import requests
7
- import threading
8
- import time
9
- import speech_recognition as sr
10
  from gtts import gTTS
11
 
12
  # Conversation state
13
  conversation = []
14
- is_active = False
15
- listen_thread = None
16
- stop_listening = False
17
 
18
  # Hugging Face API configuration
19
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
@@ -24,91 +19,63 @@ headers = {
24
  "Content-Type": "application/json"
25
  }
26
 
27
- def recognize_from_microphone(timeout=None):
28
- """Recognize speech from microphone continuously"""
29
- global conversation, stop_listening, output_audio, conversation_text
30
-
31
- recognizer = sr.Recognizer()
32
- recognizer.dynamic_energy_threshold = True
33
 
34
- # Initialize conversation with system message
35
- if not conversation:
36
- conversation = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
37
 
38
- # Add welcome message
39
- welcome_msg = "Hello! I'm your AI assistant. What can I help you with today?"
40
- conversation.append({"role": "assistant", "content": welcome_msg})
41
 
42
- # Create welcome speech
43
- welcome_audio = text_to_speech(welcome_msg)
 
44
 
45
- # Update display
46
- update_conversation_display(welcome_audio)
 
47
 
48
- # Start listening
49
- with sr.Microphone() as source:
50
- # Initial calibration
51
- print("Calibrating for ambient noise...")
52
- recognizer.adjust_for_ambient_noise(source, duration=1)
 
53
 
54
- # Listening loop
55
- while not stop_listening:
56
- try:
57
- print("Listening...")
58
- audio = recognizer.listen(source, timeout=10, phrase_time_limit=10)
59
-
60
- try:
61
- text = recognizer.recognize_google(audio)
62
- print(f"Recognized: {text}")
63
-
64
- if text.strip():
65
- # Get LLM response
66
- conversation.append({"role": "user", "content": text})
67
- ai_response = get_llm_response()
68
-
69
- # Generate speech
70
- speech_file = text_to_speech(ai_response)
71
-
72
- # Update the display
73
- update_conversation_display(speech_file)
74
- except sr.UnknownValueError:
75
- print("Could not understand audio")
76
- except sr.RequestError as e:
77
- print(f"Could not request results; {e}")
78
-
79
- except Exception as e:
80
- print(f"Listening error: {e}")
81
- time.sleep(0.1)
82
-
83
- print("Stopped listening.")
84
- return
85
-
86
- # Variables for storing outputs (needed for updating the interface)
87
- output_audio = None
88
- conversation_text = ""
89
 
90
- def update_conversation_display(audio_path):
91
- """Update the conversation display with latest content"""
92
- global output_audio, conversation_text
 
93
 
94
- # Format conversation for display
95
- conversation_text = ""
96
- for msg in conversation:
97
- if msg["role"] != "system": # Skip system messages
98
- prefix = "You: " if msg["role"] == "user" else "Assistant: "
99
- conversation_text += f"{prefix}{msg['content']}\n\n"
100
 
101
- output_audio = audio_path
102
-
103
- def get_llm_response():
104
- """Get response from LLM API"""
105
  try:
106
  if not HF_API_TOKEN:
107
- response_text = "Please add a Hugging Face API token to enable AI responses."
108
  else:
109
- # Prepare messages for API
110
- messages = [msg for msg in conversation] # Include system message
111
-
112
  # Make API call
113
  payload = {
114
  "inputs": messages,
@@ -122,15 +89,16 @@ def get_llm_response():
122
  response = requests.post(HF_API_URL, headers=headers, json=payload)
123
 
124
  if response.status_code == 200:
125
- generated_text = response.json()[0]["generated_text"]
126
- conversation.append({"role": "assistant", "content": generated_text})
127
- response_text = generated_text
128
  else:
129
- response_text = f"Error from API: {response.status_code}"
130
-
131
- return response_text
132
  except Exception as e:
133
- return f"Error: {str(e)}"
 
 
 
 
 
134
 
135
  def text_to_speech(text):
136
  """Convert text to speech"""
@@ -149,84 +117,111 @@ def text_to_speech(text):
149
  print(f"TTS error: {e}")
150
  return None
151
 
152
- def toggle_conversation():
153
- """Toggle the conversation on/off"""
154
- global is_active, listen_thread, stop_listening
155
-
156
- if not is_active:
157
- # Start conversation
158
- is_active = True
159
- stop_listening = False
160
-
161
- # Start the listening thread
162
- listen_thread = threading.Thread(target=recognize_from_microphone)
163
- listen_thread.daemon = True
164
- listen_thread.start()
165
-
166
- return "Stop Conversation", conversation_text, output_audio
167
- else:
168
- # Stop conversation
169
- is_active = False
170
- stop_listening = True
171
-
172
- # Wait for the thread to end
173
- if listen_thread and listen_thread.is_alive():
174
- listen_thread.join(timeout=1)
175
-
176
- return "Start Conversation", conversation_text, output_audio
 
 
 
 
 
 
 
 
 
 
177
 
178
- def check_for_updates():
179
- """Check for updates in the conversation (used by the periodic event)"""
180
- return conversation_text, output_audio
 
 
 
 
 
181
 
182
- # Create the Gradio interface
183
- with gr.Blocks(title="Interactive Voice Assistant", css=".gradio-container {background-color: #f0f8ff;}") as demo:
184
  with gr.Column():
185
  gr.Markdown("# Interactive Voice Assistant")
186
- gr.Markdown("Click 'Start Conversation' and begin talking naturally with the AI assistant")
 
 
 
187
 
188
- # The main conversation button
189
- conversation_button = gr.Button("Start Conversation", variant="primary", scale=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- # Display area for conversation history
192
- conversation_display = gr.Textbox(
193
- label="Conversation",
194
- lines=10,
195
- value="Click 'Start Conversation' to begin talking with the AI assistant..."
196
- )
 
197
 
198
- # Audio output (hidden, but will autoplay)
199
- audio_output = gr.Audio(label="Assistant Voice", type="filepath", autoplay=True)
200
 
201
- # Button click toggles conversation
202
- conversation_button.click(
203
- toggle_conversation,
204
- outputs=[conversation_button, conversation_display, audio_output]
205
  )
206
 
207
- # Periodic event to update interface with new conversation and audio
208
- demo.load(
209
- check_for_updates,
210
- inputs=None,
211
- outputs=[conversation_display, audio_output],
212
- every=1 # Check every second
213
  )
214
-
215
- gr.Markdown("""
216
- ## How to use
217
- 1. Click 'Start Conversation'
218
- 2. Start speaking directly to the AI assistant
219
- 3. The assistant will respond when you pause speaking
220
- 4. Keep the conversation going naturally
221
- 5. Click 'Stop Conversation' when done
222
-
223
- ## Notes
224
- - Make sure your microphone is enabled in your browser
225
- - Speak clearly with pauses between your questions
226
- - The assistant will update the conversation history in real-time
227
- """)
228
 
229
  # Launch the app
230
  if __name__ == "__main__":
231
- # Use a larger queue size to allow for continuous updates
232
- demo.queue(max_size=20).launch(debug=True)
 
4
  import os
5
  import wave
6
  import requests
7
+ import json
 
 
8
  from gtts import gTTS
9
 
10
  # Conversation state
11
  conversation = []
 
 
 
12
 
13
  # Hugging Face API configuration
14
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 
19
  "Content-Type": "application/json"
20
  }
21
 
22
def transcribe_audio(audio):
    """Transcribe a recorded clip to text with Google's free recognizer.

    Parameters:
        audio: Gradio ``type="numpy"`` payload ``(sample_rate, samples)``,
            or None when nothing was recorded.

    Returns:
        The recognized text (stripped), or None when there is no usable
        speech (no clip, empty clip, near-silence, or recognition failure).
    """
    if audio is None:
        return None

    # Gradio 3.50.0 passes (sample_rate, samples) for type="numpy".
    sample_rate, samples = audio

    if len(samples) == 0:
        return None

    # Gradio delivers int16 PCM for numpy audio. Normalize to float in
    # [-1, 1] so the silence gate is meaningful and the 16-bit conversion
    # below is correct for either integer or float input. (The previous
    # code multiplied int16 samples by 32767 again, wrapping the values
    # and writing garbage into the WAV file.)
    if np.issubdtype(samples.dtype, np.integer):
        floats = samples.astype(np.float32) / 32768.0
    else:
        floats = samples.astype(np.float32)

    # Simple energy check to see if there's actually speech.
    if np.max(np.abs(floats)) < 0.05:
        return None

    # Create a temporary WAV file for the recognizer to read.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_filename = temp_file.name

    try:
        with wave.open(temp_filename, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes((floats * 32767).astype(np.int16).tobytes())

        # SpeechRecognition performs the actual transcription.
        import speech_recognition as sr
        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_filename) as source:
            clip = recognizer.record(source)
        text = recognizer.recognize_google(clip)
        return text.strip()
    except Exception as e:
        print(f"Error in transcription: {e}")
        return None
    finally:
        # Always clean up the temp file, even when recognition fails.
        if os.path.exists(temp_filename):
            os.unlink(temp_filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ def get_ai_response(user_text):
64
+ """Get AI response from LLM API"""
65
+ if not user_text:
66
+ return "I didn't catch that. Could you speak again?"
67
 
68
+ # Add user message to conversation
69
+ conversation.append({"role": "user", "content": user_text})
70
+
71
+ # Prepare messages for API
72
+ messages = [{"role": "system", "content": "You are a helpful AI assistant like Alexa. Keep responses brief and conversational."}]
73
+ messages.extend(conversation)
74
 
 
 
 
 
75
  try:
76
  if not HF_API_TOKEN:
77
+ ai_response = "Please add a Hugging Face API token to enable AI responses."
78
  else:
 
 
 
79
  # Make API call
80
  payload = {
81
  "inputs": messages,
 
89
  response = requests.post(HF_API_URL, headers=headers, json=payload)
90
 
91
  if response.status_code == 200:
92
+ ai_response = response.json()[0]["generated_text"]
 
 
93
  else:
94
+ ai_response = f"I'm having trouble connecting. Error: {response.status_code}"
 
 
95
  except Exception as e:
96
+ ai_response = f"Error: {str(e)}"
97
+
98
+ # Add assistant response to conversation
99
+ conversation.append({"role": "assistant", "content": ai_response})
100
+
101
+ return ai_response
102
 
103
  def text_to_speech(text):
104
  """Convert text to speech"""
 
117
  print(f"TTS error: {e}")
118
  return None
119
 
120
def start_conversation():
    """Reset the dialogue history and greet the user.

    Returns:
        (welcome_audio, status): the path of the spoken greeting produced
        by text_to_speech(), and a status string for the UI.
    """
    global conversation
    conversation = []

    # Seed the history with the assistant's greeting.
    greeting = "Hello! I'm your AI assistant. Speak into the microphone and I'll respond to you."
    conversation.append({"role": "assistant", "content": greeting})

    # Speak the greeting aloud.
    greeting_audio = text_to_speech(greeting)

    return greeting_audio, "Conversation started. Speak into the microphone."
133
+
134
def process_interaction(audio):
    """Handle one voice turn: transcribe, query the LLM, speak the reply.

    Returns:
        (speech_file, conversation_text): the reply audio path (or None
        when there is nothing to say) and the rendered history for the UI.
    """
    if audio is None:
        return None, get_conversation_text()

    # Speech -> text; bail out quietly when nothing was understood.
    heard = transcribe_audio(audio)
    if not heard:
        return None, get_conversation_text()

    # Text -> LLM reply (also records both turns in the history).
    reply = get_ai_response(heard)

    # Reply -> speech.
    reply_audio = text_to_speech(reply)

    return reply_audio, get_conversation_text()
155
 
156
def get_conversation_text():
    """Render the dialogue history as display text, one turn per block.

    System messages are internal and never shown.
    """
    pieces = []
    for entry in conversation:
        if entry["role"] == "system":
            continue
        speaker = "You: " if entry["role"] == "user" else "Assistant: "
        pieces.append(f"{speaker}{entry['content']}\n\n")
    return "".join(pieces)
164
 
165
# Create Gradio interface
with gr.Blocks(title="Interactive Voice Assistant") as demo:
    with gr.Column():
        gr.Markdown("# Interactive Voice Assistant")
        gr.Markdown("""
        Just click "Start" and begin speaking with the assistant.
        The interaction is simple: speak, get a response, speak again.
        """)

        # Two-panel layout
        with gr.Row():
            with gr.Column(scale=1):
                start_button = gr.Button("Start New Conversation", variant="primary")

                # Recording component that captures voice.
                # NOTE: gradio==3.50.0 (pinned in requirements.txt) takes a
                # singular `source` argument; the plural `sources=` keyword
                # only exists in gradio 4.x and raises TypeError here.
                audio_input = gr.Audio(
                    label="Speak Here",
                    type="numpy",
                    source="microphone",
                    interactive=True
                )

            with gr.Column(scale=2):
                # Display conversation
                conversation_display = gr.Textbox(
                    label="Conversation History",
                    lines=15,
                    value=""
                )

                # Audio output for assistant responses (autoplays the reply)
                audio_output = gr.Audio(
                    label="Assistant's Voice",
                    type="filepath",
                    autoplay=True
                )

        gr.Markdown("""
        ## How to use
        1. Click "Start New Conversation" to begin
        2. Click the microphone button below "Speak Here" and talk to the assistant
        3. When done speaking, click the stop button
        4. The assistant will respond with voice and text
        5. Continue the conversation by speaking again

        This assistant works like Alexa - just speak, and get a response!
        """)

    # Set up the interactions
    start_button.click(
        start_conversation,
        outputs=[audio_output, conversation_display]
    )

    # Fires whenever a new recording lands in the audio component.
    audio_input.change(
        process_interaction,
        inputs=[audio_input],
        outputs=[audio_output, conversation_display]
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
# Launch the app — script entry point; the guard prevents launching
# the server when this module is imported.
if __name__ == "__main__":
    demo.launch()
 
requirements.txt CHANGED
@@ -2,5 +2,4 @@ gradio==3.50.0
2
  numpy>=1.19.0
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
- gTTS>=2.3.2
6
- pyaudio>=0.2.11
 
2
  numpy>=1.19.0
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
+ gTTS>=2.3.2