Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -315,7 +315,6 @@ def generate_caption_translate_speak(image, target_lang):
|
|
| 315 |
# Step 1.5: Safety Check
|
| 316 |
if not is_caption_safe(english_caption):
|
| 317 |
beep = make_beep_sound()
|
| 318 |
-
# Return warning text + auto-playing beep
|
| 319 |
return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
|
| 320 |
|
| 321 |
# Step 2: Translate
|
|
@@ -324,7 +323,7 @@ def generate_caption_translate_speak(image, target_lang):
|
|
| 324 |
else:
|
| 325 |
translated = "Translation not available"
|
| 326 |
|
| 327 |
-
# Step 3: Generate Speech (English caption
|
| 328 |
tts = gTTS(english_caption, lang="en")
|
| 329 |
tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 330 |
tts.save(tmp_file.name)
|
|
@@ -341,10 +340,8 @@ def vqa_answer(image, question):
|
|
| 341 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 342 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 343 |
|
| 344 |
-
# Safety check
|
| 345 |
if not is_caption_safe(answer):
|
| 346 |
beep = make_beep_sound()
|
| 347 |
-
# Return warning + beep sound
|
| 348 |
return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
|
| 349 |
|
| 350 |
return answer, None
|
|
@@ -354,27 +351,28 @@ def vqa_answer(image, question):
|
|
| 354 |
# Gradio UI
|
| 355 |
# ----------------------
|
| 356 |
with gr.Blocks(title="BLIP Vision App") as demo:
|
| 357 |
-
gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (
|
| 358 |
|
|
|
|
| 359 |
with gr.Tab("Caption + Translate + Speak"):
|
| 360 |
with gr.Row():
|
| 361 |
img_in = gr.Image(type="pil", label="Upload Image")
|
| 362 |
lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
|
| 363 |
eng_out = gr.Textbox(label="English Caption")
|
| 364 |
trans_out = gr.Textbox(label="Translated Caption")
|
| 365 |
-
audio_out = gr.Audio(label="
|
| 366 |
btn1 = gr.Button("Generate Caption, Translate & Speak")
|
| 367 |
-
btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
|
|
|
|
| 368 |
|
|
|
|
| 369 |
with gr.Tab("Visual Question Answering (VQA)"):
|
| 370 |
with gr.Row():
|
| 371 |
img_vqa = gr.Image(type="pil", label="Upload Image")
|
| 372 |
q_in = gr.Textbox(label="Ask a Question about the Image")
|
| 373 |
ans_out = gr.Textbox(label="Answer")
|
| 374 |
-
beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) #
|
| 375 |
btn2 = gr.Button("Ask")
|
| 376 |
btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
|
| 377 |
|
| 378 |
demo.launch()
|
| 379 |
-
|
| 380 |
-
|
|
|
|
| 315 |
# Step 1.5: Safety Check
|
| 316 |
if not is_caption_safe(english_caption):
|
| 317 |
beep = make_beep_sound()
|
|
|
|
| 318 |
return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
|
| 319 |
|
| 320 |
# Step 2: Translate
|
|
|
|
| 323 |
else:
|
| 324 |
translated = "Translation not available"
|
| 325 |
|
| 326 |
+
# Step 3: Generate Speech (English caption)
|
| 327 |
tts = gTTS(english_caption, lang="en")
|
| 328 |
tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 329 |
tts.save(tmp_file.name)
|
|
|
|
| 340 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 341 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 342 |
|
|
|
|
| 343 |
if not is_caption_safe(answer):
|
| 344 |
beep = make_beep_sound()
|
|
|
|
| 345 |
return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
|
| 346 |
|
| 347 |
return answer, None
|
|
|
|
| 351 |
# Gradio UI
|
| 352 |
# ----------------------
|
| 353 |
with gr.Blocks(title="BLIP Vision App") as demo:
|
| 354 |
+
gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
|
| 355 |
|
| 356 |
+
# --- Caption + Translate + Speak ---
|
| 357 |
with gr.Tab("Caption + Translate + Speak"):
|
| 358 |
with gr.Row():
|
| 359 |
img_in = gr.Image(type="pil", label="Upload Image")
|
| 360 |
lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
|
| 361 |
eng_out = gr.Textbox(label="English Caption")
|
| 362 |
trans_out = gr.Textbox(label="Translated Caption")
|
| 363 |
+
audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True) # Auto-plays TTS or beep
|
| 364 |
btn1 = gr.Button("Generate Caption, Translate & Speak")
|
| 365 |
+
btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
|
| 366 |
+
outputs=[eng_out, trans_out, audio_out])
|
| 367 |
|
| 368 |
+
# --- Visual Question Answering (VQA) ---
|
| 369 |
with gr.Tab("Visual Question Answering (VQA)"):
|
| 370 |
with gr.Row():
|
| 371 |
img_vqa = gr.Image(type="pil", label="Upload Image")
|
| 372 |
q_in = gr.Textbox(label="Ask a Question about the Image")
|
| 373 |
ans_out = gr.Textbox(label="Answer")
|
| 374 |
+
beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) # Auto-plays beep
|
| 375 |
btn2 = gr.Button("Ask")
|
| 376 |
btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
|
| 377 |
|
| 378 |
demo.launch()
|
|
|
|
|
|