Update app.py
app.py CHANGED
@@ -223,146 +223,134 @@ from transformers import (
 BlipForQuestionAnswering,
 pipeline
 )
+from PIL import Image
 import torch
+from gtts import gTTS
 import tempfile
-import numpy as np
-import soundfile as sf
-from TTS.api import TTS
-from PIL import Image

 # ----------------------
-# Device
+# Device setup
 # ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # ----------------------
-# Load Models
+# Load Models Once
 # ----------------------
-print("🔄 Loading
+print("🔄 Loading models...")
+
+# Captioning
 caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-caption_model = BlipForConditionalGeneration.from_pretrained(
-    "Salesforce/blip-image-captioning-large"
-).to(device)
+caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

+# VQA
 vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-vqa_model = BlipForQuestionAnswering.from_pretrained(
-    "Salesforce/blip-vqa-base"
-).to(device)
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

+# Translation
 translation_models = {
     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
 }

+# Safety Moderation Pipeline
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

-# Load TTS model (Hugging Face, offline)
-tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
-
 print("✅ All models loaded!")

 # ----------------------
-# Safety
+# Safety Filter Function
 # ----------------------
 def is_caption_safe(caption):
     try:
+        votes = moderation_model(caption)
+        # If return_all_scores=True, it's [[{label, score}, ...]]
+        if isinstance(votes, list) and isinstance(votes[0], list):
+            votes = votes[0]
+        # Loop through scores
+        for item in votes:
+            if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
                 return False
-    except:
+    except Exception as e:
+        print("⚠️ Moderation failed:", e)
+
+    # Fallback keyword check
+    unsafe_keywords = [
+        "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon", "fire",
+        "murder", "dead", "death", "suicide", "bomb", "explosion", "terrorist", "assault",
+        "stab", "shoot", "pistol", "rifle", "shotgun", "grenade", "horror", "beheaded",
+        "torture", "hostage", "rape", "war", "massacre", "chainsaw", "poison", "strangle",
+        "hang", "drown"
+    ]
+    if any(word in caption.lower() for word in unsafe_keywords):
+        return False
+    return True

 # ----------------------
-#
-# ----------------------
-def generate_beep():
-    sr = 44100
-    duration = 0.4
-    frequency = 880
-    t = np.linspace(0, duration, int(sr * duration), False)
-    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, wave, sr)
-    return tmp.name
-
-# ----------------------
-# TTS
-# ----------------------
-def offline_tts(text):
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    tts_model.tts_to_file(text=text, file_path=tmp.name)
-    return tmp.name
-
-# ----------------------
-# Caption + Translate + TTS
+# Caption + Translate + Speak
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
-        return "", "", None
-
-    # Caption
+    # Step 1: Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         out = caption_model.generate(**inputs, max_new_tokens=50)
-
-    # Safety
-    if not is_caption_safe(
+    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+
+    # Step 1.5: Safety Check
+    if not is_caption_safe(english_caption):
+        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None
+
+    # Step 2: Translate
+    if target_lang in translation_models:
+        translated = translation_models[target_lang](english_caption)[0]['translation_text']
+    else:
+        translated = "Translation not available"
+
+    # Step 3: Generate Speech (English caption for now)
+    tts = gTTS(english_caption, lang="en")
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts.save(tmp_file.name)
+
+    return english_caption, translated, tmp_file.name

 # ----------------------
 # VQA
 # ----------------------
 def vqa_answer(image, question):
-    if image is None or not question:
-        return ""
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
         out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+
+    # Safety filter
     if not is_caption_safe(answer):
-        return "⚠️ Unsafe content detected!"
+        return "⚠️ Warning: Unsafe or inappropriate content detected!"
+
     return answer

 # ----------------------
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP:
-
+    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")
+
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
             img_in = gr.Image(type="pil", label="Upload Image")
-            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"],
-        btn.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
-
+            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
+        eng_out = gr.Textbox(label="English Caption")
+        trans_out = gr.Textbox(label="Translated Caption")
+        audio_out = gr.Audio(label="Spoken Caption", type="filepath")
+        btn1 = gr.Button("Generate Caption, Translate & Speak")
+        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
+
     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
-            img_vqa = gr.Image(type="pil")
-            q_in = gr.Textbox(label="Ask
+            img_vqa = gr.Image(type="pil", label="Upload Image")
+            q_in = gr.Textbox(label="Ask a Question about the Image")
+        ans_out = gr.Textbox(label="Answer")
+        btn2 = gr.Button("Ask")
+        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

 demo.launch()
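A quick way to sanity-check the new is_caption_safe logic is to inspect the label names the moderation pipeline actually returns: the filter only trips on labels "V" and "V2", while unitary/toxic-bert typically reports Jigsaw-style labels such as toxic or threat, so in practice the keyword fallback may be doing most of the filtering. A minimal sketch, not part of the commit; the sample captions are made up for illustration:

from transformers import pipeline

# Ask the pipeline for every label's score, not just the top one.
moderation_model = pipeline("text-classification", model="unitary/toxic-bert", return_all_scores=True)

for caption in ["a dog playing in the park", "a man holding a gun"]:
    scores = moderation_model(caption)[0]  # list of {label, score} dicts for this caption
    print(caption)
    for item in scores:
        print(f"  {item['label']}: {item['score']:.3f}")

If the printed labels never include "V" or "V2", the moderation branch never returns False and only the keyword list blocks content.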