gopalagra committed on
Commit
0888801
·
verified ·
1 Parent(s): aeaa361

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -24
app.py CHANGED
@@ -355,34 +355,24 @@ def vqa_answer(image, question):
355
  recognizer = sr.Recognizer()
356
  mic = sr.Microphone()
357
 
358
- def voice_question_answer(image):
 
359
  global scene_context
360
- # Passive listening: wait for trigger phrase
361
- print("πŸ”Š Listening for trigger phrase 'Ask question'...")
362
- with mic as source:
363
- recognizer.adjust_for_ambient_noise(source)
364
- audio = recognizer.listen(source)
365
- try:
366
- trigger_text = recognizer.recognize_google(audio).lower()
367
- if "ask question" not in trigger_text:
368
- return "No trigger phrase detected. Say 'Ask question' to ask a question."
369
- except:
370
- return "Could not understand trigger phrase. Try again."
371
-
372
- # Active listening: capture actual question
373
- print("🎀 Trigger detected! Listening for your question...")
374
- speak("You can ask your question now")
375
- with mic as source:
376
- audio = recognizer.listen(source)
377
  try:
 
 
378
  question = recognizer.recognize_google(audio)
379
  except:
380
  return "Could not understand your question. Try again."
381
 
382
  # Get answer
383
  answer = vqa_answer(image, question)
384
- speak(answer)
385
- return f"Question: {question}\nAnswer: {answer}"
 
386
 
387
  # ----------------------
388
  # Gradio UI
@@ -401,10 +391,17 @@ with gr.Blocks(title="BLIP Vision App") as demo:
401
  btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
402
 
403
  with gr.Tab("Voice Question Answering"):
404
- img_vqa = gr.Image(type="pil", label="Upload Image for Voice Q&A")
405
- voice_out = gr.Textbox(label="Voice Q&A Output")
406
- btn_voice = gr.Button("Start Voice Q&A")
407
- btn_voice.click(voice_question_answer, inputs=img_vqa, outputs=voice_out)
 
 
 
 
 
 
 
408
 
409
  with gr.Tab("Text VQA (Optional)"):
410
  with gr.Row():
 
355
  recognizer = sr.Recognizer()
356
  mic = sr.Microphone()
357
 
358
+ # Voice Q&A function
359
def voice_question_answer(image, voice_file):
    """Answer a recorded spoken question about an image.

    The recording is transcribed with Google speech recognition, the
    transcript is passed to ``vqa_answer`` together with ``image``, and the
    answer is handed to ``speak``.

    Args:
        image: Image forwarded unchanged to ``vqa_answer``.
        voice_file: Path of the recorded question audio. ``None`` or an
            empty string (nothing recorded) is treated as a failed
            recognition.

    Returns:
        A ``(text, audio)`` pair, one value per UI output component.
        On success ``text`` is ``"Question: ...\\nAnswer: ..."`` and
        ``audio`` is whatever ``speak`` returned (presumably a path to the
        synthesized audio -- confirm against ``speak``). On failure ``text``
        is an error message and ``audio`` is ``None``.
    """
    # Every exit must yield two values: the click handler binds this
    # function to a text output and an audio output.
    failure = ("Could not understand your question. Try again.", None)

    # No recording was supplied, so there is nothing to transcribe.
    if not voice_file:
        return failure

    # Relies on the module-level ``sr`` (speech_recognition) import.
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(voice_file) as source:
            audio = recognizer.record(source)
        question = recognizer.recognize_google(audio)
    except Exception:
        # Best-effort UI boundary: unreadable audio, unintelligible speech
        # and recognition-service errors all surface as the same message.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return failure

    # Get the answer and its spoken rendition.
    answer = vqa_answer(image, question)
    audio_path = speak(answer)
    return f"Question: {question}\nAnswer: {answer}", audio_path
376
 
377
  # ----------------------
378
  # Gradio UI
 
391
  btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
392
 
393
  with gr.Tab("Voice Question Answering"):
394
+ img_vqa = gr.Image(type="pil", label="Upload Image for Voice Q&A")
395
+ voice_in = gr.Audio(source="microphone", type="filepath", label="Speak your question")
396
+ voice_out_text = gr.Textbox(label="Voice Q&A Output")
397
+ voice_out_audio = gr.Audio(label="Spoken Answer", type="filepath")
398
+ btn_voice = gr.Button("Get Answer")
399
+ btn_voice.click(
400
+ voice_question_answer,
401
+ inputs=[img_vqa, voice_in],
402
+ outputs=[voice_out_text, voice_out_audio]
403
+ )
404
+
405
 
406
  with gr.Tab("Text VQA (Optional)"):
407
  with gr.Row():