Spaces:

gopalagra
/

blind-image-captioning

Sleeping

App Files Files Community

gopalagra committed on 24 days ago

Commit

b53948f

verified ·

1 Parent(s): 9cf0535

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -58

app.py CHANGED Viewed

@@ -226,7 +226,7 @@ from transformers import (
 from PIL import Image
 import torch
 import tempfile
-import base64
 # ----------------------
 # Device setup
@@ -234,24 +234,29 @@ import base64
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------
-# Simple BEEP sound (base64)
 # ----------------------
-BEEP_BASE64 = """
-SUQzAwAAAAAAFlRFTkMAAAAPAAADdAAAABJBTUFEAAAAGwAAAG1kYXQAAAAA/////wABAAAC
-AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
-gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIC
-AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
-gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgAAAA==
-"""
-def load_beep():
-    audio_bytes = base64.b64decode(BEEP_BASE64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    tmp.write(audio_bytes)
-    tmp.close()
     return tmp.name
 # ----------------------
@@ -282,42 +287,41 @@ print("✅ All models loaded!")
 # ----------------------
 # Safety check
 # ----------------------
-def is_caption_safe(caption):
     try:
-        result = moderation_model(caption)
         if isinstance(result, list) and "label" in result[0]:
             if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
-    # extra simple keyword check
-    unsafe_words = ["gun", "kill", "dead", "weapon", "blood"]
-    return not any(w in caption.lower() for w in unsafe_words)
 # ----------------------
-# Auto Caption + Translate + Optional BEEP
 # ----------------------
-def auto_process(image, target_lang):
     if image is None:
         return "", "", None
-    # Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        output = caption_model.generate(**inputs, max_new_tokens=40)
     caption = caption_processor.decode(output[0], skip_special_tokens=True)
-    # Safety
-    if not is_caption_safe(caption):
-        return "⚠️ Unsafe content detected!", "", load_beep()
     # Translate
     translated = translation_models[target_lang](caption)[0]["translation_text"]
-    # SAFE → No beep
-    return caption, translated, None
 # ----------------------
 # VQA
@@ -329,37 +333,38 @@ def vqa_answer(image, question):
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
         out = vqa_model.generate(**inputs, max_new_tokens=30)
-    ans = vqa_processor.decode(out[0], skip_special_tokens=True)
-    if not is_caption_safe(ans):
         return "⚠️ Unsafe content detected!"
-    return ans
 # ----------------------
 # Gradio UI
 # ----------------------
-# Two-tab interface: automatic captioning/translation and visual question
-# answering. Components are laid out in the order they are created.
-with gr.Blocks(title="BLIP App") as demo:
-    gr.Markdown("## 🖼️ Auto-Caption + Translation + Safety Beep")
-    with gr.Tab("Auto Caption"):
-        img = gr.Image(type="pil", label="Upload Image")
-        lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
-        out_eng = gr.Textbox(label="English Caption")
-        out_trans = gr.Textbox(label="Translated")
-        # Receives the third value returned by auto_process (a file path or
-        # None); autoplay is enabled on the component.
-        out_audio = gr.Audio(label="Audio", type="filepath", autoplay=True)
-        # Auto-run on image or language change
-        img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
-        lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
-    with gr.Tab("VQA"):
-        img_vqa = gr.Image(type="pil", label="Upload Image")
-        q = gr.Textbox(label="Ask a question")
-        ans = gr.Textbox(label="Answer")
-        ask_btn = gr.Button("Ask")
-        # The question is answered only when the button is clicked.
-        ask_btn.click(vqa_answer, inputs=[img_vqa, q], outputs=ans)
 demo.launch()

 from PIL import Image
 import torch
 import tempfile
+import pyttsx3  # offline TTS
 # ----------------------
 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------
+# Offline TTS for safe captions
 # ----------------------
+def offline_tts(text):
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    engine = pyttsx3.init()
+    engine.save_to_file(text, tmp.name)
+    engine.runAndWait()
+    return tmp.name
+# ----------------------
+# Simple BEEP sound
+# ----------------------
+def generate_beep():
+    import numpy as np
+    import soundfile as sf
+    sr = 44100
+    duration = 0.3
+    freq = 880
+    t = np.linspace(0, duration, int(sr*duration), False)
+    wave = 0.5*np.sin(2*np.pi*freq*t)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp.name, wave, sr)
     return tmp.name
 # ----------------------
 # ----------------------
 # Safety check
 # ----------------------
+def is_safe(text):
     try:
+        result = moderation_model(text)
         if isinstance(result, list) and "label" in result[0]:
             if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
+    unsafe_keywords = ["gun", "kill", "dead", "blood", "weapon"]
+    return not any(k in text.lower() for k in unsafe_keywords)
 # ----------------------
+# Caption + Translate + Audio
 # ----------------------
+def generate_caption_translate_speak(image, target_lang):
     if image is None:
         return "", "", None
+    # Generate caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
+        output = caption_model.generate(**inputs, max_new_tokens=50)
     caption = caption_processor.decode(output[0], skip_special_tokens=True)
+    # Safety check
+    if not is_safe(caption):
+        return "⚠️ Unsafe content detected!", "", generate_beep()
     # Translate
     translated = translation_models[target_lang](caption)[0]["translation_text"]
+    # Generate TTS for safe caption
+    audio_file = offline_tts(caption)
+    return caption, translated, audio_file
 # ----------------------
 # VQA
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
         out = vqa_model.generate(**inputs, max_new_tokens=30)
+    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+    if not is_safe(answer):
         return "⚠️ Unsafe content detected!"
+    return answer
 # ----------------------
 # Gradio UI
 # ----------------------
+# Two-tab interface: captioning with translation and audio, and visual
+# question answering. Components are laid out in the order they are created.
+with gr.Blocks(title="BLIP Vision App") as demo:
+    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA")
+    with gr.Tab("Caption + Translate + Speak"):
+        with gr.Row():
+            img_in = gr.Image(type="pil", label="Upload Image")
+            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
+        eng_out = gr.Textbox(label="English Caption")
+        trans_out = gr.Textbox(label="Translated Caption")
+        # Receives the third value returned by the handler (a file path or
+        # None); autoplay is enabled on the component.
+        audio_out = gr.Audio(label="Audio / Beep", type="filepath", autoplay=True)
+        btn = gr.Button("Generate Caption, Translate & Speak")
+        # Processing runs only when the button is clicked.
+        btn.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
+    with gr.Tab("Visual Question Answering (VQA)"):
+        with gr.Row():
+            img_vqa = gr.Image(type="pil")
+            q_in = gr.Textbox(label="Ask About the Image")
+        ans_out = gr.Textbox(label="Answer")
+        btn2 = gr.Button("Ask")
+        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
 demo.launch()