SreekarB committed on
Commit
a191fd1
·
verified ·
1 Parent(s): f4deca2

Upload 4 files

Browse files
Files changed (1) hide show
  1. app.py +78 -66
app.py CHANGED
@@ -4,11 +4,12 @@ import tempfile
4
  import os
5
  import wave
6
  import requests
7
- import json
8
  from gtts import gTTS
9
 
10
  # Conversation state
11
  conversation = []
 
12
 
13
  # Hugging Face API configuration
14
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
@@ -20,20 +21,13 @@ headers = {
20
  }
21
 
22
  def transcribe_audio(audio):
23
- """Transcribe audio to text using Gradio's built-in speech recognition"""
24
  if audio is None:
25
  return None
26
 
27
  # Gradio 3.50.0 passes (sample_rate, audio_data)
28
  sample_rate, audio_data = audio
29
 
30
- if len(audio_data) == 0:
31
- return None
32
-
33
- # Simple energy check to see if there's actually speech
34
- if np.max(np.abs(audio_data)) < 0.05:
35
- return None
36
-
37
  # Create a temporary WAV file
38
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
39
  temp_filename = temp_file.name
@@ -45,8 +39,7 @@ def transcribe_audio(audio):
45
  wf.setframerate(sample_rate)
46
  wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
47
 
48
- # Use Gradio's default transcription
49
- import speech_recognition as sr
50
  recognizer = sr.Recognizer()
51
  with sr.AudioFile(temp_filename) as source:
52
  audio_data = recognizer.record(source)
@@ -56,14 +49,14 @@ def transcribe_audio(audio):
56
  print(f"Error in transcription: {e}")
57
  return None
58
  finally:
59
- # Clean up
60
  if os.path.exists(temp_filename):
61
  os.unlink(temp_filename)
62
 
63
  def get_ai_response(user_text):
64
  """Get AI response from LLM API"""
65
  if not user_text:
66
- return "I didn't catch that. Could you speak again?"
67
 
68
  # Add user message to conversation
69
  conversation.append({"role": "user", "content": user_text})
@@ -123,16 +116,29 @@ def start_conversation():
123
  conversation = []
124
 
125
  # Add welcome message
126
- welcome = "Hello! I'm your AI assistant. Speak into the microphone and I'll respond to you."
127
  conversation.append({"role": "assistant", "content": welcome})
128
 
129
  # Generate speech
130
  welcome_audio = text_to_speech(welcome)
131
 
132
- return welcome_audio, "Conversation started. Speak into the microphone."
 
 
 
133
 
134
- def process_interaction(audio):
135
- """Process a single interaction"""
 
 
 
 
 
 
 
 
 
 
136
  if audio is None:
137
  return None, get_conversation_text()
138
 
@@ -140,18 +146,18 @@ def process_interaction(audio):
140
  user_text = transcribe_audio(audio)
141
 
142
  if not user_text:
143
- return None, get_conversation_text()
144
 
145
  # Get AI response
146
  ai_response = get_ai_response(user_text)
147
 
148
- # Convert to speech
149
  speech_file = text_to_speech(ai_response)
150
 
151
  # Update conversation display
152
- conversation_text = get_conversation_text()
153
 
154
- return speech_file, conversation_text
155
 
156
  def get_conversation_text():
157
  """Format conversation history for display"""
@@ -163,64 +169,70 @@ def get_conversation_text():
163
  return result
164
 
165
  # Create Gradio interface
166
- with gr.Blocks(title="Interactive Voice Assistant") as demo:
167
  with gr.Column():
168
- gr.Markdown("# Interactive Voice Assistant")
169
- gr.Markdown("""
170
- Just click "Start" and begin speaking with the assistant.
171
- The interaction is simple: speak, get a response, speak again.
172
- """)
173
 
174
- # Two-panel layout
175
- with gr.Row():
176
- with gr.Column(scale=1):
177
- start_button = gr.Button("Start New Conversation", variant="primary")
178
-
179
- # Recording component that captures voice
180
- audio_input = gr.Audio(
181
- label="Speak Here",
182
- type="numpy",
183
- sources=None,
184
- interactive=True
185
- )
186
-
187
- with gr.Column(scale=2):
188
- # Display conversation
189
- conversation_display = gr.Textbox(
190
- label="Conversation History",
191
- lines=15,
192
- value=""
193
- )
194
-
195
- # Audio output for assistant responses
196
- audio_output = gr.Audio(
197
- label="Assistant's Voice",
198
- type="filepath",
199
- autoplay=True
200
- )
201
 
202
- gr.Markdown("""
203
- ## How to use
204
- 1. Click "Start New Conversation" to begin
205
- 2. Click the microphone button below "Speak Here" and talk to the assistant
206
- 3. When done speaking, click the stop button
207
- 4. The assistant will respond with voice and text
208
- 5. Continue the conversation by speaking again
209
 
210
- This assistant works like Alexa - just speak, and get a response!
211
- """)
212
-
213
- # Set up the interactions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  start_button.click(
215
  start_conversation,
216
  outputs=[audio_output, conversation_display]
217
  )
218
 
 
 
 
 
 
 
219
  audio_input.change(
220
- process_interaction,
221
  inputs=[audio_input],
222
  outputs=[audio_output, conversation_display]
223
  )
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  # Launch the app
226
  if __name__ == "__main__":
 
4
  import os
5
  import wave
6
  import requests
7
+ import speech_recognition as sr
8
  from gtts import gTTS
9
 
10
  # Conversation state
11
  conversation = []
12
+ is_listening = False
13
 
14
  # Hugging Face API configuration
15
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 
21
  }
22
 
23
  def transcribe_audio(audio):
24
+ """Transcribe audio to text using Google Speech Recognition"""
25
  if audio is None:
26
  return None
27
 
28
  # Gradio 3.50.0 passes (sample_rate, audio_data)
29
  sample_rate, audio_data = audio
30
 
 
 
 
 
 
 
 
31
  # Create a temporary WAV file
32
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
33
  temp_filename = temp_file.name
 
39
  wf.setframerate(sample_rate)
40
  wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
41
 
42
+ # Perform speech recognition
 
43
  recognizer = sr.Recognizer()
44
  with sr.AudioFile(temp_filename) as source:
45
  audio_data = recognizer.record(source)
 
49
  print(f"Error in transcription: {e}")
50
  return None
51
  finally:
52
+ # Clean up temp file
53
  if os.path.exists(temp_filename):
54
  os.unlink(temp_filename)
55
 
56
  def get_ai_response(user_text):
57
  """Get AI response from LLM API"""
58
  if not user_text:
59
+ return "I couldn't hear what you said. Please try speaking again."
60
 
61
  # Add user message to conversation
62
  conversation.append({"role": "user", "content": user_text})
 
116
  conversation = []
117
 
118
  # Add welcome message
119
+ welcome = "Hello! I'm your AI assistant. Press the SPEAK button and start talking to me."
120
  conversation.append({"role": "assistant", "content": welcome})
121
 
122
  # Generate speech
123
  welcome_audio = text_to_speech(welcome)
124
 
125
+ # Format for display
126
+ display_text = "Assistant: " + welcome + "\n\n"
127
+
128
+ return welcome_audio, display_text
129
 
130
def toggle_recording(state):
    """Flip the listening flag and return the new state plus a button label.

    Args:
        state: Current recording state coming from the gr.State component.

    Returns:
        Tuple of (new_state, button_label) used to update the State component
        and the recording Button's caption.
    """
    global is_listening
    # Invert the incoming state and mirror it in the module-level flag.
    is_listening = not state
    label = "RECORDING... CLICK TO STOP" if is_listening else "CLICK TO SPEAK"
    return is_listening, label
+
140
def process_voice(audio):
    """Turn one recorded utterance into a spoken AI reply.

    Args:
        audio: (sample_rate, data) tuple from the Gradio audio component,
            or None when nothing was recorded.

    Returns:
        Tuple of (audio file path or None, text for the conversation box).
    """
    if audio is None:
        return None, get_conversation_text()

    # Transcribe the recording to text first.
    user_text = transcribe_audio(audio)
    if not user_text:
        # Transcription failed; surface a retry prompt instead of the history.
        return None, "I couldn't hear what you said. Please try speaking again."

    # Ask the LLM for a reply, voice it, and refresh the transcript display.
    reply = get_ai_response(user_text)
    spoken_reply = text_to_speech(reply)
    return spoken_reply, get_conversation_text()
161
 
162
  def get_conversation_text():
163
  """Format conversation history for display"""
 
169
  return result
170
 
171
# Create Gradio interface
# NOTE(review): reconstructed from a diff view — exact whitespace inside the
# Markdown string literal could not be recovered and should be confirmed.
with gr.Blocks(title="One-Click Voice Assistant") as demo:
    with gr.Column():
        gr.Markdown("# One-Click Voice Assistant")
        gr.Markdown("Just one button to talk with the AI assistant!")

        # Conversation history display
        conversation_display = gr.Textbox(
            label="Conversation",
            lines=10,
            value="Click 'Start Conversation' below to begin"
        )

        # Audio output for responses; autoplay so the reply is spoken immediately
        audio_output = gr.Audio(
            label="AI Voice Response",
            type="filepath",
            autoplay=True
        )

        # Start conversation button
        start_button = gr.Button("START CONVERSATION", variant="primary", size="lg")

        # Single recording button that toggles state
        with gr.Row():
            recording_state = gr.State(False)
            recording_button = gr.Button("CLICK TO SPEAK", variant="secondary", size="lg")

        # Audio input (hidden) — NOTE(review): `source=` is the Gradio 3.x
        # keyword (renamed `sources=` in 4.x); confirm the pinned version.
        audio_input = gr.Audio(
            label="Voice Input",
            type="numpy",
            visible=False,
            source="microphone",
            streaming=False
        )

        # Connect the components
        # Reset the history, greet the user, and play the welcome audio.
        start_button.click(
            start_conversation,
            outputs=[audio_output, conversation_display]
        )

        # Flip the recording flag and relabel the button on every click.
        recording_button.click(
            toggle_recording,
            inputs=[recording_state],
            outputs=[recording_state, recording_button]
        )

        # When a new clip lands in the hidden input, run the full pipeline.
        audio_input.change(
            process_voice,
            inputs=[audio_input],
            outputs=[audio_output, conversation_display]
        )

        gr.Markdown("""
        ## How to use:

        1. Click "START CONVERSATION" to begin
        2. Click "CLICK TO SPEAK" and speak to the assistant
        3. Click again to stop recording and get a response
        4. Continue the conversation - just click the button again to speak

        This assistant is designed to be as simple as possible - just one button to talk!
        """)
236
 
237
  # Launch the app
238
  if __name__ == "__main__":