Spaces:

gopalagra
/

blind-image-captioning

Sleeping

App Files Files Community

gopalagra committed on Nov 13

Commit

d3a4a57

verified ·

1 Parent(s): a01c6e8

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -10

app.py CHANGED Viewed

@@ -315,7 +315,6 @@ def generate_caption_translate_speak(image, target_lang):
     # Step 1.5: Safety Check
     if not is_caption_safe(english_caption):
         beep = make_beep_sound()
-        # Return warning text + auto-playing beep
         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
     # Step 2: Translate
@@ -324,7 +323,7 @@ def generate_caption_translate_speak(image, target_lang):
     else:
         translated = "Translation not available"
-    # Step 3: Generate Speech (English caption for now)
     tts = gTTS(english_caption, lang="en")
     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
     tts.save(tmp_file.name)
@@ -341,10 +340,8 @@ def vqa_answer(image, question):
         out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-    # Safety check
     if not is_caption_safe(answer):
         beep = make_beep_sound()
-        # Return warning + beep sound
         return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
     return answer, None
@@ -354,27 +351,28 @@ def vqa_answer(image, question):
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter + Auto Beep Alert)")
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
             img_in = gr.Image(type="pil", label="Upload Image")
             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="Audio Output", type="filepath", autoplay=True)  # autoplay enabled
         btn1 = gr.Button("Generate Caption, Translate & Speak")
-        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
             img_vqa = gr.Image(type="pil", label="Upload Image")
             q_in = gr.Textbox(label="Ask a Question about the Image")
         ans_out = gr.Textbox(label="Answer")
-        beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True)  # autoplay enabled
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
 demo.launch()

     # Step 1.5: Safety Check
     if not is_caption_safe(english_caption):
         beep = make_beep_sound()
         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
     # Step 2: Translate
     else:
         translated = "Translation not available"
+    # Step 3: Generate Speech (English caption)
     tts = gTTS(english_caption, lang="en")
     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
     tts.save(tmp_file.name)
         out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
     if not is_caption_safe(answer):
         beep = make_beep_sound()
         return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
     return answer, None
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
+    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
+    # --- Caption + Translate + Speak ---
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
             img_in = gr.Image(type="pil", label="Upload Image")
             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
+        audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True)  # Auto-plays TTS or beep
         btn1 = gr.Button("Generate Caption, Translate & Speak")
+        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
+                   outputs=[eng_out, trans_out, audio_out])
+    # --- Visual Question Answering (VQA) ---
     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
             img_vqa = gr.Image(type="pil", label="Upload Image")
             q_in = gr.Textbox(label="Ask a Question about the Image")
         ans_out = gr.Textbox(label="Answer")
+        beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True)  # Auto-plays beep
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
 demo.launch()