Spaces:

gopalagra
/

blind-image-captioning

Sleeping

App Files Files Community

gopalagra committed 25 days ago

Commit

9cf0535

verified ·

1 Parent(s): 0a27bcd

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -19

app.py CHANGED Viewed

@@ -229,7 +229,7 @@ import tempfile
 import base64
 # ----------------------
-# Device
 # ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -237,10 +237,16 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # Simple BEEP sound (base64)
 # ----------------------
 BEEP_BASE64 = """
-SUQzAwAAAAAAF1RTU0UAAAAPAAADTGF2ZjU4LjMyLjEwNAAAAAAAAAAAAAAA//uQxAADB...
 """
-# Convert base64 to temp mp3 file
 def load_beep():
     audio_bytes = base64.b64decode(BEEP_BASE64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
@@ -248,7 +254,6 @@ def load_beep():
     tmp.close()
     return tmp.name
 # ----------------------
 # Load models
 # ----------------------
@@ -274,7 +279,6 @@ moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 print("✅ All models loaded!")
 # ----------------------
 # Safety check
 # ----------------------
@@ -282,17 +286,17 @@ def is_caption_safe(caption):
     try:
         result = moderation_model(caption)
         if isinstance(result, list) and "label" in result[0]:
-            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
     unsafe_words = ["gun", "kill", "dead", "weapon", "blood"]
     return not any(w in caption.lower() for w in unsafe_words)
 # ----------------------
-# Auto Caption + Translate + BEEP
 # ----------------------
 def auto_process(image, target_lang):
     if image is None:
@@ -302,6 +306,7 @@ def auto_process(image, target_lang):
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         output = caption_model.generate(**inputs, max_new_tokens=40)
     caption = caption_processor.decode(output[0], skip_special_tokens=True)
     # Safety
@@ -311,11 +316,8 @@ def auto_process(image, target_lang):
     # Translate
     translated = translation_models[target_lang](caption)[0]["translation_text"]
-    # Always play BEEP once caption is ready
-    beep_file = load_beep()
-    return caption, translated, beep_file
 # ----------------------
 # VQA
@@ -335,26 +337,25 @@ def vqa_answer(image, question):
     return ans
 # ----------------------
-# UI
 # ----------------------
 with gr.Blocks(title="BLIP App") as demo:
-    gr.Markdown("## 🖼️ Auto-Caption + Translation + Automatic Beep")
     with gr.Tab("Auto Caption"):
         img = gr.Image(type="pil", label="Upload Image")
         lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
         out_eng = gr.Textbox(label="English Caption")
         out_trans = gr.Textbox(label="Translated")
-        out_audio = gr.Audio(label="Beep", autoplay=True)
-        # 🔥 Auto-run when image is uploaded
         img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
         lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
     with gr.Tab("VQA"):
-        img_vqa = gr.Image(type="pil")
         q = gr.Textbox(label="Ask a question")
         ans = gr.Textbox(label="Answer")
         ask_btn = gr.Button("Ask")

 import base64
 # ----------------------
+# Device setup
 # ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Simple BEEP sound (base64)
 # ----------------------
 BEEP_BASE64 = """
+SUQzAwAAAAAAFlRFTkMAAAAPAAADdAAAABJBTUFEAAAAGwAAAG1kYXQAAAAA/////wABAAAC
+AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
+gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIC
+AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
+gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgAAAA==
 """
 def load_beep():
     audio_bytes = base64.b64decode(BEEP_BASE64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
     tmp.close()
     return tmp.name
 # ----------------------
 # Load models
 # ----------------------
 print("✅ All models loaded!")
 # ----------------------
 # Safety check
 # ----------------------
     try:
         result = moderation_model(caption)
         if isinstance(result, list) and "label" in result[0]:
+            if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
+    # extra simple keyword check
     unsafe_words = ["gun", "kill", "dead", "weapon", "blood"]
     return not any(w in caption.lower() for w in unsafe_words)
 # ----------------------
+# Auto Caption + Translate + Optional BEEP
 # ----------------------
 def auto_process(image, target_lang):
     if image is None:
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         output = caption_model.generate(**inputs, max_new_tokens=40)
     caption = caption_processor.decode(output[0], skip_special_tokens=True)
     # Safety
     # Translate
     translated = translation_models[target_lang](caption)[0]["translation_text"]
+    # SAFE → No beep
+    return caption, translated, None
 # ----------------------
 # VQA
     return ans
 # ----------------------
+# Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP App") as demo:
+    gr.Markdown("## 🖼️ Auto-Caption + Translation + Safety Beep")
     with gr.Tab("Auto Caption"):
         img = gr.Image(type="pil", label="Upload Image")
         lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
         out_eng = gr.Textbox(label="English Caption")
         out_trans = gr.Textbox(label="Translated")
+        out_audio = gr.Audio(label="Audio", type="filepath", autoplay=True)
+        # Auto-run on image or language change
         img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
         lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
     with gr.Tab("VQA"):
+        img_vqa = gr.Image(type="pil", label="Upload Image")
         q = gr.Textbox(label="Ask a question")
         ans = gr.Textbox(label="Answer")
         ask_btn = gr.Button("Ask")