Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -355,34 +355,24 @@ def vqa_answer(image, question):
|
|
| 355 |
recognizer = sr.Recognizer()
|
| 356 |
mic = sr.Microphone()
|
| 357 |
|
| 358 |
-
|
|
|
|
| 359 |
global scene_context
|
| 360 |
-
#
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
recognizer.adjust_for_ambient_noise(source)
|
| 364 |
-
audio = recognizer.listen(source)
|
| 365 |
-
try:
|
| 366 |
-
trigger_text = recognizer.recognize_google(audio).lower()
|
| 367 |
-
if "ask question" not in trigger_text:
|
| 368 |
-
return "No trigger phrase detected. Say 'Ask question' to ask a question."
|
| 369 |
-
except:
|
| 370 |
-
return "Could not understand trigger phrase. Try again."
|
| 371 |
-
|
| 372 |
-
# Active listening: capture actual question
|
| 373 |
-
print("🎤 Trigger detected! Listening for your question...")
|
| 374 |
-
speak("You can ask your question now")
|
| 375 |
-
with mic as source:
|
| 376 |
-
audio = recognizer.listen(source)
|
| 377 |
try:
|
|
|
|
|
|
|
| 378 |
question = recognizer.recognize_google(audio)
|
| 379 |
except:
|
| 380 |
return "Could not understand your question. Try again."
|
| 381 |
|
| 382 |
# Get answer
|
| 383 |
answer = vqa_answer(image, question)
|
| 384 |
-
|
| 385 |
-
|
|
|
|
| 386 |
|
| 387 |
# ----------------------
|
| 388 |
# Gradio UI
|
|
@@ -401,10 +391,17 @@ with gr.Blocks(title="BLIP Vision App") as demo:
|
|
| 401 |
btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
|
| 402 |
|
| 403 |
with gr.Tab("Voice Question Answering"):
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
|
| 409 |
with gr.Tab("Text VQA (Optional)"):
|
| 410 |
with gr.Row():
|
|
|
|
| 355 |
recognizer = sr.Recognizer()
|
| 356 |
mic = sr.Microphone()
|
| 357 |
|
| 358 |
+
# Voice Q&A function
|
| 359 |
+
def voice_question_answer(image, voice_file):
    """Transcribe a recorded spoken question and answer it about ``image``.

    Args:
        image: Image passed through unchanged to ``vqa_answer``.
        voice_file: Path of the recorded audio file. ``None`` or an empty
            string means nothing was recorded.

    Returns:
        A ``(text, audio)`` pair, one element per output component wired to
        this handler. On success ``text`` is ``"Question: ...\\nAnswer: ..."``
        and ``audio`` is whatever ``speak(answer)`` returns. On failure
        ``text`` is an error message and ``audio`` is ``None``.
    """
    # Every exit path must yield two values: the click handler maps the
    # result onto two output components, so a lone string is the wrong arity.
    failure = ("Could not understand your question. Try again.", None)

    # No recording submitted; there is nothing to transcribe.
    if not voice_file:
        return failure

    recognizer = sr.Recognizer()
    try:
        # Convert the recorded audio to text.
        with sr.AudioFile(voice_file) as source:
            audio = recognizer.record(source)
        question = recognizer.recognize_google(audio)
    except (sr.UnknownValueError, sr.RequestError, ValueError, OSError):
        # UnknownValueError: speech was unintelligible.
        # RequestError: the recognition service could not be reached.
        # ValueError / OSError: the file is unreadable or in an unsupported format.
        return failure

    # Get the answer, then synthesize it so it can be played back.
    answer = vqa_answer(image, question)
    audio_path = speak(answer)
    return f"Question: {question}\nAnswer: {answer}", audio_path
|
| 376 |
|
| 377 |
# ----------------------
|
| 378 |
# Gradio UI
|
|
|
|
| 391 |
btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
|
| 392 |
|
| 393 |
with gr.Tab("Voice Question Answering"):
|
| 394 |
+
img_vqa = gr.Image(type="pil", label="Upload Image for Voice Q&A")
|
| 395 |
+
voice_in = gr.Audio(source="microphone", type="filepath", label="Speak your question")
|
| 396 |
+
voice_out_text = gr.Textbox(label="Voice Q&A Output")
|
| 397 |
+
voice_out_audio = gr.Audio(label="Spoken Answer", type="filepath")
|
| 398 |
+
btn_voice = gr.Button("Get Answer")
|
| 399 |
+
btn_voice.click(
|
| 400 |
+
voice_question_answer,
|
| 401 |
+
inputs=[img_vqa, voice_in],
|
| 402 |
+
outputs=[voice_out_text, voice_out_audio]
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
|
| 406 |
with gr.Tab("Text VQA (Optional)"):
|
| 407 |
with gr.Row():
|