Update app.py
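Summary of the change (as read from the diff below): swap the pyttsx3 speech backend for Coqui TTS (tts_models/en/ljspeech/tacotron2-DDC) loaded once at startup, hoist the numpy/soundfile/PIL imports to module level, rename is_safe to is_caption_safe and harden it with a 0.5 score threshold plus a keyword blocklist, cap caption and VQA generation at max_new_tokens=50, and return a short 880 Hz warning beep instead of speech when a caption is flagged. The audio output is relabeled "Speech / Warning Beep" and autoplays.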
app.py
CHANGED
@@ -223,47 +223,22 @@ from transformers import (
     BlipForQuestionAnswering,
     pipeline
 )
-from PIL import Image
 import torch
 import tempfile
-import pyttsx3
+import numpy as np
+import soundfile as sf
+from TTS.api import TTS
+from PIL import Image
 
 # ----------------------
 # Device
 # ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ----------------------
-# …
-# ----------------------
-def offline_tts(text):
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    engine = pyttsx3.init()
-    engine.save_to_file(text, tmp.name)
-    engine.runAndWait()
-    return tmp.name
-
-# ----------------------
-# Simple BEEP sound
-# ----------------------
-def generate_beep():
-    import numpy as np
-    import soundfile as sf
-
-    sr = 44100
-    duration = 0.3
-    freq = 880
-    t = np.linspace(0, duration, int(sr*duration), False)
-    wave = 0.5*np.sin(2*np.pi*freq*t)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, wave, sr)
-    return tmp.name
-
-# ----------------------
-# Load models
+# Load Models
 # ----------------------
-print("🔄 Loading models...")
-
+print("🔄 Loading BLIP models...")
 caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 caption_model = BlipForConditionalGeneration.from_pretrained(
     "Salesforce/blip-image-captioning-large"
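The new module-level imports assume the Coqui `TTS` package (plus `numpy` and `soundfile`) is installed in the Space. A minimal standalone check of the new speech path (my sketch, not part of the commit; the first call downloads the checkpoint):

# Sketch: verify the Coqui TTS model used by the new offline_tts() loads and speaks.
# Assumes `pip install TTS soundfile`; first run downloads tacotron2-DDC.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
tts.tts_to_file(text="Dependency check passed.", file_path="check.wav")
print("Wrote check.wav")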
@@ -282,43 +257,69 @@ translation_models = {
 
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 
+# Load TTS model (Hugging Face, offline)
+tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+
 print("✅ All models loaded!")
 
 # ----------------------
 # Safety check
 # ----------------------
-def is_safe(text):
+def is_caption_safe(caption):
     try:
-        result = moderation_model(text)
+        result = moderation_model(caption)
         if isinstance(result, list) and "label" in result[0]:
-            if result[0]["label"] == "toxic":
+            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
-
-
+
+    unsafe_words = ["gun", "kill", "dead", "weapon", "blood", "suicide", "bomb"]
+    return not any(w in caption.lower() for w in unsafe_words)
+
+# ----------------------
+# Beep Generator
+# ----------------------
+def generate_beep():
+    sr = 44100
+    duration = 0.4
+    frequency = 880
+    t = np.linspace(0, duration, int(sr * duration), False)
+    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp.name, wave, sr)
+    return tmp.name
 
 # ----------------------
-# …
+# TTS
+# ----------------------
+def offline_tts(text):
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    tts_model.tts_to_file(text=text, file_path=tmp.name)
+    return tmp.name
+
+# ----------------------
+# Caption + Translate + TTS
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
     if image is None:
         return "", "", None
 
-    # …
+    # Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        …
-        caption = caption_processor.decode(…
+        out = caption_model.generate(**inputs, max_new_tokens=50)
+    caption = caption_processor.decode(out[0], skip_special_tokens=True)
 
     # Safety check
-    if not is_safe(caption):
-        …
+    if not is_caption_safe(caption):
+        beep_file = generate_beep()
+        return "⚠️ Unsafe content detected!", "", beep_file
 
-    # …
+    # Translation
     translated = translation_models[target_lang](caption)[0]["translation_text"]
 
-    # …
+    # TTS only for safe caption
     audio_file = offline_tts(caption)
 
     return caption, translated, audio_file
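The new two-stage check (toxic-bert first, keyword blocklist as a fallback when the classifier errors out or misses) can be exercised on its own. A standalone sketch that mirrors the added code:

# Standalone sketch of the commit's two-stage safety check (mirrors the diff;
# the first call downloads unitary/toxic-bert).
from transformers import pipeline

moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

def is_caption_safe(caption):
    try:
        result = moderation_model(caption)
        if isinstance(result, list) and "label" in result[0]:
            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
                return False
    except Exception:
        pass
    unsafe_words = ["gun", "kill", "dead", "weapon", "blood", "suicide", "bomb"]
    return not any(w in caption.lower() for w in unsafe_words)

print(is_caption_safe("a dog playing in a park"))  # expect True
print(is_caption_safe("a man holding a gun"))      # expect False (blocklist)

One caveat worth flagging in review: the blocklist does substring matching, so captions containing "deadline" or "gunmetal" are flagged too; whole-word matching (e.g. a \b-anchored regex) would be closer to the intent.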
@@ -329,13 +330,11 @@ def generate_caption_translate_speak(image, target_lang):
 def vqa_answer(image, question):
     if image is None or not question:
         return ""
-
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = vqa_model.generate(**inputs, max_new_tokens=…
+        out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
-    if not is_safe(answer):
+    if not is_caption_safe(answer):
         return "⚠️ Unsafe content detected!"
     return answer
 
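Continuing the sketch above: because the same checker now also filters VQA answers, a terse but factually correct answer can be suppressed:

# A correct one-word VQA answer can still trip the blocklist:
print(is_caption_safe("gun"))          # False — vqa_answer() returns the warning string
print(is_caption_safe("a water gun"))  # also False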
@@ -343,7 +342,7 @@ def vqa_answer(image, question):
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA")
+    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA (with Safety)")
 
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
@@ -352,7 +351,7 @@ with gr.Blocks(title="BLIP Vision App") as demo:
 
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="…
+        audio_out = gr.Audio(label="Speech / Warning Beep", type="filepath", autoplay=True)
 
         btn = gr.Button("Generate Caption, Translate & Speak")
         btn.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
@@ -361,14 +360,12 @@ with gr.Blocks(title="BLIP Vision App") as demo:
         with gr.Row():
             img_vqa = gr.Image(type="pil")
             q_in = gr.Textbox(label="Ask About the Image")
-
         ans_out = gr.Textbox(label="Answer")
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
 
 demo.launch()
 
-
 
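For a quick local exercise of the full caption path without the UI, something like the following works — hypothetical, since it assumes demo.launch() is moved under an `if __name__ == "__main__":` guard so importing app doesn't start the server, and that "French" is a real key in translation_models (the dict isn't shown in this diff):

# Hypothetical smoke test for generate_caption_translate_speak() (my sketch,
# not from the commit; see the assumptions stated above).
from PIL import Image
from app import generate_caption_translate_speak

img = Image.new("RGB", (384, 384), "white")
caption, translated, audio_path = generate_caption_translate_speak(img, "French")
print(caption, "|", translated, "|", audio_path)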