gopalagra committed on
Commit
d3a4a57
·
verified ·
1 Parent(s): a01c6e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -10
app.py CHANGED
@@ -315,7 +315,6 @@ def generate_caption_translate_speak(image, target_lang):
315
  # Step 1.5: Safety Check
316
  if not is_caption_safe(english_caption):
317
  beep = make_beep_sound()
318
- # Return warning text + auto-playing beep
319
  return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
320
 
321
  # Step 2: Translate
@@ -324,7 +323,7 @@ def generate_caption_translate_speak(image, target_lang):
324
  else:
325
  translated = "Translation not available"
326
 
327
- # Step 3: Generate Speech (English caption for now)
328
  tts = gTTS(english_caption, lang="en")
329
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
330
  tts.save(tmp_file.name)
@@ -341,10 +340,8 @@ def vqa_answer(image, question):
341
  out = vqa_model.generate(**inputs, max_new_tokens=50)
342
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
343
 
344
- # Safety check
345
  if not is_caption_safe(answer):
346
  beep = make_beep_sound()
347
- # Return warning + beep sound
348
  return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
349
 
350
  return answer, None
@@ -354,27 +351,28 @@ def vqa_answer(image, question):
354
  # Gradio UI
355
  # ----------------------
356
  with gr.Blocks(title="BLIP Vision App") as demo:
357
- gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter + Auto Beep Alert)")
358
 
 
359
  with gr.Tab("Caption + Translate + Speak"):
360
  with gr.Row():
361
  img_in = gr.Image(type="pil", label="Upload Image")
362
  lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
363
  eng_out = gr.Textbox(label="English Caption")
364
  trans_out = gr.Textbox(label="Translated Caption")
365
- audio_out = gr.Audio(label="Audio Output", type="filepath", autoplay=True) # autoplay enabled
366
  btn1 = gr.Button("Generate Caption, Translate & Speak")
367
- btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
 
368
 
 
369
  with gr.Tab("Visual Question Answering (VQA)"):
370
  with gr.Row():
371
  img_vqa = gr.Image(type="pil", label="Upload Image")
372
  q_in = gr.Textbox(label="Ask a Question about the Image")
373
  ans_out = gr.Textbox(label="Answer")
374
- beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) # autoplay enabled
375
  btn2 = gr.Button("Ask")
376
  btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
377
 
378
  demo.launch()
379
-
380
-
 
315
  # Step 1.5: Safety Check
316
  if not is_caption_safe(english_caption):
317
  beep = make_beep_sound()
 
318
  return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
319
 
320
  # Step 2: Translate
 
323
  else:
324
  translated = "Translation not available"
325
 
326
+ # Step 3: Generate Speech (English caption)
327
  tts = gTTS(english_caption, lang="en")
328
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
329
  tts.save(tmp_file.name)
 
340
  out = vqa_model.generate(**inputs, max_new_tokens=50)
341
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
342
 
 
343
  if not is_caption_safe(answer):
344
  beep = make_beep_sound()
 
345
  return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
346
 
347
  return answer, None
 
351
  # Gradio UI
352
  # ----------------------
353
  with gr.Blocks(title="BLIP Vision App") as demo:
354
+ gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
355
 
356
+ # --- Caption + Translate + Speak ---
357
  with gr.Tab("Caption + Translate + Speak"):
358
  with gr.Row():
359
  img_in = gr.Image(type="pil", label="Upload Image")
360
  lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
361
  eng_out = gr.Textbox(label="English Caption")
362
  trans_out = gr.Textbox(label="Translated Caption")
363
+ audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True) # Auto-plays TTS or beep
364
  btn1 = gr.Button("Generate Caption, Translate & Speak")
365
+ btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
366
+ outputs=[eng_out, trans_out, audio_out])
367
 
368
+ # --- Visual Question Answering (VQA) ---
369
  with gr.Tab("Visual Question Answering (VQA)"):
370
  with gr.Row():
371
  img_vqa = gr.Image(type="pil", label="Upload Image")
372
  q_in = gr.Textbox(label="Ask a Question about the Image")
373
  ans_out = gr.Textbox(label="Answer")
374
+ beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) # Auto-plays beep
375
  btn2 = gr.Button("Ask")
376
  btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
377
 
378
  demo.launch()