Update app.py
app.py
CHANGED
@@ -1,217 +1,217 @@
-# # -------------------------------
-# # Load BLIP-base model (lighter version)
-# # -------------------------------
-# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-
-# # -------------------------------
-# # Generate caption function
-# # -------------------------------
-# # def generate_caption_tts(image):
-# #     caption = generate_caption(model, processor, image)
-# #     audio_file = text_to_audio_file(caption)
-# #     return caption, audio_file  # return file path, not BytesIO
-
-# import tempfile
-# import pyttsx3
-
-# def text_to_audio_file(text):
-#     # Create a temporary file
-#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-#     tmp_path = tmp_file.name
-#     tmp_file.close()
-
-#     engine = pyttsx3.init()
-#     engine.save_to_file(text, tmp_path)
-#     engine.runAndWait()
-
-#     return tmp_path
-
-# def generate_caption_from_image(model, processor, image):
-#     # image: PIL.Image
-#     inputs = processor(images=image, return_tensors="pt")
-#     out = model.generate(**inputs)
-#     caption = processor.decode(out[0], skip_special_tokens=True)
-#     return caption
-# # -------------------------------
-# # Gradio interface: Caption + Audio
-# # -------------------------------
 # def generate_caption_tts(image):
-#     caption =
-#
-#     return caption
-
+# app.py
+import gradio as gr
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from gtts import gTTS
+import io
+from PIL import Image
+
+# -------------------------------
+# Load BLIP-base model (lighter version)
+# -------------------------------
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+
+# -------------------------------
+# Generate caption function
+# -------------------------------
 # def generate_caption_tts(image):
+#     caption = generate_caption(model, processor, image)
+#     audio_file = text_to_audio_file(caption)
+#     return caption, audio_file  # return file path, not BytesIO
+
+# -------------------------------
+# Convert text to speech using gTTS
+# -------------------------------
+import tempfile
+import pyttsx3
+
+def text_to_audio_file(text):
+    # Create a temporary file
+    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+    tmp_path = tmp_file.name
+    tmp_file.close()
+
+    engine = pyttsx3.init()
+    engine.save_to_file(text, tmp_path)
+    engine.runAndWait()
+
+    return tmp_path
+
+def generate_caption_from_image(model, processor, image):
+    # image: PIL.Image
+    inputs = processor(images=image, return_tensors="pt")
+    out = model.generate(**inputs)
+    caption = processor.decode(out[0], skip_special_tokens=True)
+    return caption
+
+# -------------------------------
+# Gradio interface: Caption + Audio
+# -------------------------------
+def generate_caption_tts(image):
+    caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
+    # audio_file = text_to_audio_file(caption)
+    return caption
+
+interface = gr.Interface(
+    fn=generate_caption_tts,
+    inputs=gr.Image(type="numpy"),
+    outputs=[gr.Textbox(label="Generated Caption")],
+    title="Image Captioning for Visually Impaired",
+    description="Upload an image, get a caption and audio description."
+)
+
+interface.launch()
+# demo.launch(share=True)
+
+import gradio as gr
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    BlipForQuestionAnswering,
+    pipeline
+)
+moderation_model = pipeline(
+    "text-classification",
+    model="Vrandan/Comment-Moderation",
+    return_all_scores=True
+)
+
+from PIL import Image
+import torch
+from gtts import gTTS
+import tempfile
+
+# ----------------------
+# Device setup
+# ----------------------
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# ----------------------
+# Load Models Once
+# ----------------------
+print("🔄 Loading models...")
+
+# Captioning
+caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+
+# VQA
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+# Translation
+translation_models = {
+    "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
+    "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
+    "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
+}
+
+# Safety Moderation Pipeline
+moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
+
+print("✅ All models loaded!")
+
+# ----------------------
+# Safety Filter Function
+# ----------------------
+def is_caption_safe(caption):
+    try:
+        votes = moderation_model(caption)
+        # If return_all_scores=True, it's [[{label, score}, ...]]
+        if isinstance(votes, list) and isinstance(votes[0], list):
+            votes = votes[0]
+        # Now safe to loop
+        for item in votes:
+            if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
+                return False
+    except Exception as e:
+        print("⚠️ Moderation failed:", e)
+
+    # Fallback keywords
+    unsafe_keywords = [
+        "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
+        "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
+        "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
+        "grenade", "horror", "beheaded", "torture", "hostage", "rape",
+        "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
+    ]
+    if any(word in caption.lower() for word in unsafe_keywords):
+        return False
+    return True
+
+# ----------------------
+# Caption + Translate + Speak
+# ----------------------
+def generate_caption_translate_speak(image, target_lang):
+    # Step 1: Caption
+    inputs = caption_processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = caption_model.generate(**inputs, max_new_tokens=50)
+    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+
+    # Step 1.5: Safety Check
+    if not is_caption_safe(english_caption):
+        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None
+
+    # Step 2: Translate
+    if target_lang in translation_models:
+        translated = translation_models[target_lang](english_caption)[0]['translation_text']
+    else:
+        translated = "Translation not available"
+
+    # Step 3: Generate Speech (English caption for now)
+    tts = gTTS(english_caption, lang="en")
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts.save(tmp_file.name)
+
+    return english_caption, translated, tmp_file.name
+
+# ----------------------
+# VQA
+# ----------------------
+def vqa_answer(image, question):
+    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = vqa_model.generate(**inputs, max_new_tokens=50)
+    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+
+    # Run safety filter on answers too
+    if not is_caption_safe(answer):
+        return "⚠️ Warning: Unsafe or inappropriate content detected!"
+
+    return answer
+
+# ----------------------
+# Gradio UI
+# ----------------------
+with gr.Blocks(title="BLIP Vision App") as demo:
+    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")
+
+    with gr.Tab("Caption + Translate + Speak"):
+        with gr.Row():
+            img_in = gr.Image(type="pil", label="Upload Image")
+            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
+        eng_out = gr.Textbox(label="English Caption")
+        trans_out = gr.Textbox(label="Translated Caption")
+        audio_out = gr.Audio(label="Spoken Caption", type="filepath")
+        btn1 = gr.Button("Generate Caption, Translate & Speak")
+        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
+
+    with gr.Tab("Visual Question Answering (VQA)"):
+        with gr.Row():
+            img_vqa = gr.Image(type="pil", label="Upload Image")
+            q_in = gr.Textbox(label="Ask a Question about the Image")
+        ans_out = gr.Textbox(label="Answer")
+        btn2 = gr.Button("Ask")
+        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
+
+demo.launch()
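As committed, the rebuilt generate_caption_tts() returns only the caption: the pyttsx3 audio path stays commented out (pyttsx3 relies on a system speech engine such as espeak, which a stock Space image may not provide), yet the interface description still promises "a caption and audio description". Note also that interface.launch() normally blocks the script, so the second Gradio app defined below it is unlikely to start. A minimal sketch, not the committed code, of wiring the audio output back in with gTTS, which this file already imports; it assumes the generate_caption_from_image(), model, and processor defined above:

import tempfile

import gradio as gr
from gtts import gTTS

def generate_caption_tts(image):
    caption = generate_caption_from_image(model, processor, image)
    tts = gTTS(caption, lang="en")          # network call to the Google TTS endpoint
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(tmp.name)                      # write the MP3 to a temp file
    return caption, tmp.name                # return a file path, not BytesIO

interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[gr.Textbox(label="Generated Caption"),
             gr.Audio(label="Audio Description", type="filepath")],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and audio description.",
)

gTTS depends on an external Google endpoint, which the SpeechT5 code removed in the next hunk had been introduced to avoid, per its own "Too Many Requests (429)" note.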
@@ -432,250 +432,4 @@
-
-import gradio as gr
-from transformers import (
-    BlipProcessor,
-    BlipForConditionalGeneration,
-    BlipForQuestionAnswering,
-    pipeline,
-    SpeechT5Processor,          # <--- NEW
-    SpeechT5ForTextToSpeech,    # <--- NEW
-    set_seed                    # <--- NEW
-)
-from datasets import load_dataset  # <--- NEW for speaker embedding
-from PIL import Image
-import torch
-# from gtts import gTTS  # <--- REMOVED
-import tempfile
-import numpy as np
-import soundfile as sf
-import librosa
-import tempfile
-import time  # <--- Added for potential cleanup, but mostly for future use
-
-# Set seed for reproducibility in TTS generation
-set_seed(42)
-
-def combine_audio(beep_path, speech_path):
-    """Combine beep + speech audio into one clip."""
-    # ... (Keep this function as is)
-    beep, sr1 = sf.read(beep_path)
-    speech, sr2 = sf.read(speech_path)
-
-    # Resample beep if needed
-    if sr1 != sr2:
-        beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
-        sr1 = sr2
-
-    # Convert multi-channel to mono
-    if len(beep.shape) > 1:
-        beep = beep.mean(axis=1)
-    if len(speech.shape) > 1:
-        # Check if speech is stereo (channels > 1) and has data
-        if speech.ndim > 1:
-            speech = speech.mean(axis=1)
-    # Ensure speech is treated as a 1D array even if it was originally mono 2D
-    # For single channel (mono) soundfile output, it might be 2D with shape (N, 1)
-
-    # Concatenate beep + speech
-    combined = np.concatenate((beep, speech))
-
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    # SpeechT5 generates 16000Hz WAV, so we use sr1 (which is 16000) for the output
-    sf.write(tmp_file.name, combined, sr1)
-    return tmp_file.name
-
-# ----------------------
-# Device setup
-# ----------------------
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# ----------------------
-# Load Models Once
-# ----------------------
-print("🔄 Loading models...")
-
-# Captioning
-caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
-
-# VQA
-vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
-# Translation
-translation_models = {
-    "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
-    "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
-    "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
-}
-
-# Text-to-Speech (TTS) Models  # <--- NEW/MODIFIED
-print("   Loading SpeechT5 TTS model...")
-tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-
-# Load a speaker embedding (required for SpeechT5 to define a voice)
-# Using a sample speaker from the VCTK dataset
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
-
-# Safety Moderation Pipeline
-moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
-print("✅ All models loaded!")
-
-# ----------------------
-# Utility: Generate Local Speech (TTS)  # <--- NEW FUNCTION
-# ----------------------
-def synthesize_speech_local(text, tts_processor, tts_model, speaker_embeddings):
-    """Generates speech using local HuggingFace SpeechT5 model."""
-    inputs = tts_processor(text=text, return_tensors="pt").to(device)
-
-    # Generate speech with the loaded model and speaker embedding
-    speech = tts_model.generate_speech(
-        inputs["input_ids"],
-        speaker_embeddings,
-        do_sample=True  # Use sampling for more natural tone
-    )
-
-    # Convert the Tensor to a NumPy array
-    speech_np = speech.cpu().numpy()
-
-    # Create a temporary WAV file to save the audio
-    # SpeechT5's default sampling rate is 16000Hz
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp_file.name, speech_np, samplerate=16000)
-
-    return tmp_file.name
-
-# ----------------------
-# Utility: Generate a Beep Sound
-# ----------------------
-def make_beep_sound(duration=0.5, freq=1000):
-    """Generate a short beep tone and save as temporary .wav file."""
-    # We use 16000Hz to match SpeechT5's output for combining audio later
-    samplerate = 16000
-    t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
-    wave = 0.5 * np.sin(2 * np.pi * freq * t)
-    tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp_beep.name, wave, samplerate)
-    return tmp_beep.name
-
-# ----------------------
-# Safety Filter Function (Keep as is)
-# ----------------------
-def is_caption_safe(caption):
-    # ... (Keep this function as is)
-    try:
-        votes = moderation_model(caption)
-        if isinstance(votes, list) and isinstance(votes[0], list):
-            votes = votes[0]
-        for item in votes:
-            # Checking for 'V' or 'V2' (Violent, etc.) labels with high confidence
-            if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
-                return False
-    except Exception as e:
-        print("⚠️ Moderation failed:", e)
-
-    unsafe_keywords = [
-        "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
-        "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
-        "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
-        "grenade", "horror", "beheaded", "torture", "hostage", "rape",
-        "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
-    ]
-    if any(word in caption.lower() for word in unsafe_keywords):
-        return False
-    return True
-
-# ----------------------
-# Caption + Translate + Speak  # <--- MODIFIED
-# ----------------------
-def generate_caption_translate_speak(image, target_lang):
-    # Step 1: Caption
-    inputs = caption_processor(images=image, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out = caption_model.generate(**inputs, max_new_tokens=50)
-    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
-
-    # Step 1.5: Safety Check (MODIFIED TTS)
-    if not is_caption_safe(english_caption):
-        # Generate beep (16000Hz WAV)
-        beep = make_beep_sound()
-
-        # Generate warning speech (16000Hz WAV)
-        warning_text = "Warning! Unsafe or inappropriate content detected."
-        speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
-
-        # Combine beep + speech
-        combined_audio = combine_audio(beep, speech_tmp_name)
-
-        # Return combined audio automatically
-        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
-
-    # Step 2: Translate
-    if target_lang in translation_models:
-        translated = translation_models[target_lang](english_caption)[0]['translation_text']
-    else:
-        translated = "Translation not available"
-
-    # Step 3: Generate Speech (English caption) (MODIFIED TTS)
-    tmp_file_name = synthesize_speech_local(english_caption, tts_processor, tts_model, speaker_embeddings)
-
-    # The output is a .wav file now, but Gradio's Audio component is flexible
-    return english_caption, translated, tmp_file_name
-
-# ----------------------
-# VQA  # <--- MODIFIED
-# ----------------------
-def vqa_answer(image, question):
-    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out = vqa_model.generate(**inputs, max_new_tokens=50)
-    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
-    if not is_caption_safe(answer):
-        # Generate beep (16000Hz WAV)
-        beep = make_beep_sound()
-
-        # Generate warning speech (16000Hz WAV)
-        warning_text = "Warning! Unsafe or inappropriate content detected."
-        speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
-
-        combined = combine_audio(beep, speech_tmp_name)
-        return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
-
-    return answer, None
-
-# ----------------------
-# Gradio UI (Keep as is)
-# ----------------------
-with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
-    gr.Markdown("### Note: Text-to-Speech now uses a local HuggingFace model to prevent 'Too Many Requests' (429) errors.")
-
-    # --- Caption + Translate + Speak ---
-    with gr.Tab("Caption + Translate + Speak"):
-        with gr.Row():
-            img_in = gr.Image(type="pil", label="Upload Image")
-            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
-        eng_out = gr.Textbox(label="English Caption")
-        trans_out = gr.Textbox(label="Translated Caption")
-        # Note: We changed the output to WAV but Gradio handles it fine.
-        audio_out = gr.Audio(label="Speech Output (WAV format)", type="filepath", autoplay=True)
-        btn1 = gr.Button("Generate Caption, Translate & Speak")
-        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
-                   outputs=[eng_out, trans_out, audio_out])
-
-    # --- Visual Question Answering (VQA) ---
-    with gr.Tab("Visual Question Answering (VQA)"):
-        with gr.Row():
-            img_vqa = gr.Image(type="pil", label="Upload Image")
-            q_in = gr.Textbox(label="Ask a Question about the Image")
-        ans_out = gr.Textbox(label="Answer")
-        beep_out = gr.Audio(label="Alert Sound (WAV format)", type="filepath", autoplay=True)
-        btn2 = gr.Button("Ask")
-        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
-
-demo.launch()
+
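The removed SpeechT5 path above calls tts_model.generate_speech() without a vocoder, so it returns a mel spectrogram rather than a playable waveform, and it passes a do_sample argument that generate_speech() may not accept, depending on the transformers version. For reference, a self-contained sketch of the standard SpeechT5 recipe, pairing the model with its HiFi-GAN vocoder as on the model card; the warning text is just an example input:

import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# A single x-vector from the CMU ARCTIC set selects the speaker's voice.
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

inputs = processor(text="Warning! Unsafe or inappropriate content detected.",
                   return_tensors="pt")
# With vocoder=... the result is a 1-D 16 kHz waveform tensor, not a spectrogram.
speech = model.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)
sf.write("warning.wav", speech.numpy(), samplerate=16000)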
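One more observation on the new is_caption_safe(): moderation_model is assigned twice, and the second assignment (unitary/toxic-bert) wins, but toxic-bert emits the six Jigsaw labels (toxic, severe_toxic, obscene, threat, insult, identity_hate) rather than the "V"/"V2" codes the loop checks for, which appear to belong to the Vrandan/Comment-Moderation scheme. The model-based check can therefore never trigger, and only the keyword fallback runs. A sketch of the same filter written against toxic-bert's own labels; the 0.5 threshold and the label set are illustrative choices, not part of the commit:

from transformers import pipeline

# top_k=None returns scores for every label (successor to return_all_scores=True).
moderation_model = pipeline("text-classification",
                            model="unitary/toxic-bert", top_k=None)

TOXIC_LABELS = {"toxic", "severe_toxic", "obscene", "threat",
                "insult", "identity_hate"}

def is_caption_safe(caption, threshold=0.5):
    scores = moderation_model(caption)
    if isinstance(scores, list) and scores and isinstance(scores[0], list):
        scores = scores[0]  # unwrap the per-input batch dimension
    return not any(item["label"] in TOXIC_LABELS and item["score"] > threshold
                   for item in scores)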