Upload 4 files
Browse files
app.py
CHANGED
|
@@ -4,11 +4,12 @@ import tempfile
|
|
| 4 |
import os
|
| 5 |
import wave
|
| 6 |
import requests
|
| 7 |
-
import
|
| 8 |
from gtts import gTTS
|
| 9 |
|
| 10 |
# Conversation state
|
| 11 |
conversation = []
|
|
|
|
| 12 |
|
| 13 |
# Hugging Face API configuration
|
| 14 |
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
|
|
@@ -20,20 +21,13 @@ headers = {
|
|
| 20 |
}
|
| 21 |
|
| 22 |
def transcribe_audio(audio):
|
| 23 |
-
"""Transcribe audio to text using
|
| 24 |
if audio is None:
|
| 25 |
return None
|
| 26 |
|
| 27 |
# Gradio 3.50.0 passes (sample_rate, audio_data)
|
| 28 |
sample_rate, audio_data = audio
|
| 29 |
|
| 30 |
-
if len(audio_data) == 0:
|
| 31 |
-
return None
|
| 32 |
-
|
| 33 |
-
# Simple energy check to see if there's actually speech
|
| 34 |
-
if np.max(np.abs(audio_data)) < 0.05:
|
| 35 |
-
return None
|
| 36 |
-
|
| 37 |
# Create a temporary WAV file
|
| 38 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
| 39 |
temp_filename = temp_file.name
|
|
@@ -45,8 +39,7 @@ def transcribe_audio(audio):
|
|
| 45 |
wf.setframerate(sample_rate)
|
| 46 |
wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
|
| 47 |
|
| 48 |
-
#
|
| 49 |
-
import speech_recognition as sr
|
| 50 |
recognizer = sr.Recognizer()
|
| 51 |
with sr.AudioFile(temp_filename) as source:
|
| 52 |
audio_data = recognizer.record(source)
|
|
@@ -56,14 +49,14 @@ def transcribe_audio(audio):
|
|
| 56 |
print(f"Error in transcription: {e}")
|
| 57 |
return None
|
| 58 |
finally:
|
| 59 |
-
# Clean up
|
| 60 |
if os.path.exists(temp_filename):
|
| 61 |
os.unlink(temp_filename)
|
| 62 |
|
| 63 |
def get_ai_response(user_text):
|
| 64 |
"""Get AI response from LLM API"""
|
| 65 |
if not user_text:
|
| 66 |
-
return "I
|
| 67 |
|
| 68 |
# Add user message to conversation
|
| 69 |
conversation.append({"role": "user", "content": user_text})
|
|
@@ -123,16 +116,29 @@ def start_conversation():
|
|
| 123 |
conversation = []
|
| 124 |
|
| 125 |
# Add welcome message
|
| 126 |
-
welcome = "Hello! I'm your AI assistant.
|
| 127 |
conversation.append({"role": "assistant", "content": welcome})
|
| 128 |
|
| 129 |
# Generate speech
|
| 130 |
welcome_audio = text_to_speech(welcome)
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
-
def
|
| 135 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
if audio is None:
|
| 137 |
return None, get_conversation_text()
|
| 138 |
|
|
@@ -140,18 +146,18 @@ def process_interaction(audio):
|
|
| 140 |
user_text = transcribe_audio(audio)
|
| 141 |
|
| 142 |
if not user_text:
|
| 143 |
-
return None,
|
| 144 |
|
| 145 |
# Get AI response
|
| 146 |
ai_response = get_ai_response(user_text)
|
| 147 |
|
| 148 |
-
#
|
| 149 |
speech_file = text_to_speech(ai_response)
|
| 150 |
|
| 151 |
# Update conversation display
|
| 152 |
-
|
| 153 |
|
| 154 |
-
return speech_file,
|
| 155 |
|
| 156 |
def get_conversation_text():
|
| 157 |
"""Format conversation history for display"""
|
|
@@ -163,64 +169,70 @@ def get_conversation_text():
|
|
| 163 |
return result
|
| 164 |
|
| 165 |
# Create Gradio interface
|
| 166 |
-
with gr.Blocks(title="
|
| 167 |
with gr.Column():
|
| 168 |
-
gr.Markdown("#
|
| 169 |
-
gr.Markdown(""
|
| 170 |
-
Just click "Start" and begin speaking with the assistant.
|
| 171 |
-
The interaction is simple: speak, get a response, speak again.
|
| 172 |
-
""")
|
| 173 |
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
audio_input = gr.Audio(
|
| 181 |
-
label="Speak Here",
|
| 182 |
-
type="numpy",
|
| 183 |
-
sources=None,
|
| 184 |
-
interactive=True
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
with gr.Column(scale=2):
|
| 188 |
-
# Display conversation
|
| 189 |
-
conversation_display = gr.Textbox(
|
| 190 |
-
label="Conversation History",
|
| 191 |
-
lines=15,
|
| 192 |
-
value=""
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
-
# Audio output for assistant responses
|
| 196 |
-
audio_output = gr.Audio(
|
| 197 |
-
label="Assistant's Voice",
|
| 198 |
-
type="filepath",
|
| 199 |
-
autoplay=True
|
| 200 |
-
)
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
5. Continue the conversation by speaking again
|
| 209 |
|
| 210 |
-
|
| 211 |
-
""")
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
start_button.click(
|
| 215 |
start_conversation,
|
| 216 |
outputs=[audio_output, conversation_display]
|
| 217 |
)
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
audio_input.change(
|
| 220 |
-
|
| 221 |
inputs=[audio_input],
|
| 222 |
outputs=[audio_output, conversation_display]
|
| 223 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
# Launch the app
|
| 226 |
if __name__ == "__main__":
|
|
|
|
| 4 |
import os
|
| 5 |
import wave
|
| 6 |
import requests
|
| 7 |
+
import speech_recognition as sr
|
| 8 |
from gtts import gTTS
|
| 9 |
|
| 10 |
# Conversation state
|
| 11 |
conversation = []
|
| 12 |
+
is_listening = False
|
| 13 |
|
| 14 |
# Hugging Face API configuration
|
| 15 |
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
|
|
|
|
| 21 |
}
|
| 22 |
|
| 23 |
def transcribe_audio(audio):
|
| 24 |
+
"""Transcribe audio to text using Google Speech Recognition"""
|
| 25 |
if audio is None:
|
| 26 |
return None
|
| 27 |
|
| 28 |
# Gradio 3.50.0 passes (sample_rate, audio_data)
|
| 29 |
sample_rate, audio_data = audio
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Create a temporary WAV file
|
| 32 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
| 33 |
temp_filename = temp_file.name
|
|
|
|
| 39 |
wf.setframerate(sample_rate)
|
| 40 |
wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
|
| 41 |
|
| 42 |
+
# Perform speech recognition
|
|
|
|
| 43 |
recognizer = sr.Recognizer()
|
| 44 |
with sr.AudioFile(temp_filename) as source:
|
| 45 |
audio_data = recognizer.record(source)
|
|
|
|
| 49 |
print(f"Error in transcription: {e}")
|
| 50 |
return None
|
| 51 |
finally:
|
| 52 |
+
# Clean up temp file
|
| 53 |
if os.path.exists(temp_filename):
|
| 54 |
os.unlink(temp_filename)
|
| 55 |
|
| 56 |
def get_ai_response(user_text):
|
| 57 |
"""Get AI response from LLM API"""
|
| 58 |
if not user_text:
|
| 59 |
+
return "I couldn't hear what you said. Please try speaking again."
|
| 60 |
|
| 61 |
# Add user message to conversation
|
| 62 |
conversation.append({"role": "user", "content": user_text})
|
|
|
|
| 116 |
conversation = []
|
| 117 |
|
| 118 |
# Add welcome message
|
| 119 |
+
welcome = "Hello! I'm your AI assistant. Press the SPEAK button and start talking to me."
|
| 120 |
conversation.append({"role": "assistant", "content": welcome})
|
| 121 |
|
| 122 |
# Generate speech
|
| 123 |
welcome_audio = text_to_speech(welcome)
|
| 124 |
|
| 125 |
+
# Format for display
|
| 126 |
+
display_text = "Assistant: " + welcome + "\n\n"
|
| 127 |
+
|
| 128 |
+
return welcome_audio, display_text
|
| 129 |
|
| 130 |
+
def toggle_recording(state):
    """Flip the recording flag and report the matching button caption.

    Args:
        state: The current recording state held in the Gradio ``gr.State``
            component (bool).

    Returns:
        tuple: ``(new_state, button_label)`` — the inverted state and the
        caption the record button should display for that state.
    """
    global is_listening
    # Mirror the UI state into the module-level flag.
    is_listening = not state

    label = "RECORDING... CLICK TO STOP" if is_listening else "CLICK TO SPEAK"
    return is_listening, label
|
| 139 |
+
|
| 140 |
+
def process_voice(audio):
    """Run one voice interaction: transcribe, get a reply, synthesize speech.

    Args:
        audio: ``(sample_rate, audio_data)`` tuple from the Gradio Audio
            component, or ``None`` when nothing was recorded.

    Returns:
        tuple: ``(speech_file, display_text)`` — path to the synthesized
        reply audio (or ``None`` on failure) and the conversation transcript
        to show in the UI.
    """
    if audio is None:
        return None, get_conversation_text()

    # Transcribe the user's speech to text
    user_text = transcribe_audio(audio)

    if not user_text:
        # Fix: previously this returned the bare error string as the whole
        # display, wiping the visible conversation history. Keep the
        # transcript and append the notice instead, matching the
        # audio-is-None branch above.
        return None, get_conversation_text() + "System: I couldn't hear what you said. Please try speaking again.\n\n"

    # Get AI response
    ai_response = get_ai_response(user_text)

    # Generate speech for the response
    speech_file = text_to_speech(ai_response)

    # Update conversation display
    display_text = get_conversation_text()

    return speech_file, display_text
|
| 161 |
|
| 162 |
def get_conversation_text():
|
| 163 |
"""Format conversation history for display"""
|
|
|
|
| 169 |
return result
|
| 170 |
|
| 171 |
# Create Gradio interface
|
| 172 |
+
# Build the Gradio UI: a single column with the transcript, the assistant's
# audio reply, a start button, a toggle "speak" button, and a hidden
# microphone input wired to process_voice.
with gr.Blocks(title="One-Click Voice Assistant") as demo:
    with gr.Column():
        gr.Markdown("# One-Click Voice Assistant")
        gr.Markdown("Just one button to talk with the AI assistant!")

        # Conversation history display
        conversation_display = gr.Textbox(
            label="Conversation",
            lines=10,
            value="Click 'Start Conversation' below to begin"
        )

        # Audio output for responses (autoplay so the reply is spoken
        # without an extra click)
        audio_output = gr.Audio(
            label="AI Voice Response",
            type="filepath",
            autoplay=True
        )

        # Start conversation button
        start_button = gr.Button("START CONVERSATION", variant="primary", size="lg")

        # Single recording button that toggles state
        with gr.Row():
            # Boolean recording state shared between clicks
            recording_state = gr.State(False)
            recording_button = gr.Button("CLICK TO SPEAK", variant="secondary", size="lg")

        # Audio input (hidden)
        # NOTE(review): this widget is visible=False, and toggle_recording
        # only flips a flag and relabels the button — nothing visible here
        # starts or stops the microphone, so audio_input.change may never
        # fire from user interaction. Confirm the intended wiring.
        # NOTE(review): `source="microphone"` is the Gradio 3.x parameter;
        # elsewhere the file uses `sources=` (4.x style) — verify against
        # the installed Gradio version.
        audio_input = gr.Audio(
            label="Voice Input",
            type="numpy",
            visible=False,
            source="microphone",
            streaming=False
        )

        # Connect the components
        # Start: play the welcome audio and seed the transcript
        start_button.click(
            start_conversation,
            outputs=[audio_output, conversation_display]
        )

        # Toggle: flip recording_state and relabel the button
        recording_button.click(
            toggle_recording,
            inputs=[recording_state],
            outputs=[recording_state, recording_button]
        )

        # New recording -> transcribe, answer, speak, refresh transcript
        audio_input.change(
            process_voice,
            inputs=[audio_input],
            outputs=[audio_output, conversation_display]
        )

        gr.Markdown("""
        ## How to use:
        
        1. Click "START CONVERSATION" to begin
        2. Click "CLICK TO SPEAK" and speak to the assistant
        3. Click again to stop recording and get a response
        4. Continue the conversation - just click the button again to speak
        
        This assistant is designed to be as simple as possible - just one button to talk!
        """)
|
| 236 |
|
| 237 |
# Launch the app
|
| 238 |
if __name__ == "__main__":
|