gopalagra committed
Commit 333179e · verified · 1 Parent(s): 9ec241a

Update app.py

Files changed (1)
  1. app.py +202 -230
app.py CHANGED
@@ -65,6 +65,157 @@
  # interface.launch()
  # # demo.launch(share=True)

+ # import gradio as gr
+ # from transformers import (
+ #     BlipProcessor,
+ #     BlipForConditionalGeneration,
+ #     BlipForQuestionAnswering,
+ #     pipeline
+ # )
+ # moderation_model = pipeline(
+ #     "text-classification",
+ #     model="Vrandan/Comment-Moderation",
+ #     return_all_scores=True
+ # )
+
+ # from PIL import Image
+ # import torch
+ # from gtts import gTTS
+ # import tempfile
+
+ # # ----------------------
+ # # Device setup
+ # # ----------------------
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # # ----------------------
+ # # Load Models Once
+ # # ----------------------
+ # print("🔄 Loading models...")
+
+ # # Captioning
+ # caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ # caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+
+ # # VQA
+ # vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ # vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+ # # Translation
+ # translation_models = {
+ #     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
+ #     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
+ #     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
+ # }
+
+ # # Safety Moderation Pipeline
+ # moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
+
+ # print("✅ All models loaded!")
+
+ # # ----------------------
+ # # Safety Filter Function
+ # # ----------------------
+ # def is_caption_safe(caption):
+ #     try:
+ #         votes = moderation_model(caption)
+ #         # If return_all_scores=True, it's [[{label, score}, ...]]
+ #         if isinstance(votes, list) and isinstance(votes[0], list):
+ #             votes = votes[0]
+ #         # Now safe to loop
+ #         for item in votes:
+ #             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
+ #                 return False
+ #     except Exception as e:
+ #         print("⚠️ Moderation failed:", e)
+
+ #     # Fallback keywords
+ #     unsafe_keywords = [
+ #         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
+ #         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
+ #         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
+ #         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
+ #         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
+ #     ]
+ #     if any(word in caption.lower() for word in unsafe_keywords):
+ #         return False
+ #     return True
+
+
+
+
+ # # ----------------------
+ # # Caption + Translate + Speak
+ # # ----------------------
+ # def generate_caption_translate_speak(image, target_lang):
+ #     # Step 1: Caption
+ #     inputs = caption_processor(images=image, return_tensors="pt").to(device)
+ #     with torch.no_grad():
+ #         out = caption_model.generate(**inputs, max_new_tokens=50)
+ #     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+
+ #     # Step 1.5: Safety Check
+ #     if not is_caption_safe(english_caption):
+ #         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None
+
+ #     # Step 2: Translate
+ #     if target_lang in translation_models:
+ #         translated = translation_models[target_lang](english_caption)[0]['translation_text']
+ #     else:
+ #         translated = "Translation not available"
+
+ #     # Step 3: Generate Speech (English caption for now)
+ #     tts = gTTS(english_caption, lang="en")
+ #     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+ #     tts.save(tmp_file.name)
+
+ #     return english_caption, translated, tmp_file.name
+
+ # # ----------------------
+ # # VQA
+ # # ----------------------
+ # def vqa_answer(image, question):
+ #     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
+ #     with torch.no_grad():
+ #         out = vqa_model.generate(**inputs, max_new_tokens=50)
+ #     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+
+ #     # Run safety filter on answers too
+ #     if not is_caption_safe(answer):
+ #         return "⚠️ Warning: Unsafe or inappropriate content detected!"
+
+ #     return answer
+
+ # # ----------------------
+ # # Gradio UI
+ # # ----------------------
+ # with gr.Blocks(title="BLIP Vision App") as demo:
+ #     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")
+
+ #     with gr.Tab("Caption + Translate + Speak"):
+ #         with gr.Row():
+ #             img_in = gr.Image(type="pil", label="Upload Image")
+ #             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
+ #         eng_out = gr.Textbox(label="English Caption")
+ #         trans_out = gr.Textbox(label="Translated Caption")
+ #         audio_out = gr.Audio(label="Spoken Caption", type="filepath")
+ #         btn1 = gr.Button("Generate Caption, Translate & Speak")
+ #         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
+
+ #     with gr.Tab("Visual Question Answering (VQA)"):
+ #         with gr.Row():
+ #             img_vqa = gr.Image(type="pil", label="Upload Image")
+ #             q_in = gr.Textbox(label="Ask a Question about the Image")
+ #         ans_out = gr.Textbox(label="Answer")
+ #         btn2 = gr.Button("Ask")
+ #         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
+
+ #     demo.launch()
+
+
+
+
+
  import gradio as gr
  from transformers import (
      BlipProcessor,
@@ -72,22 +223,20 @@ from transformers import (
      BlipForQuestionAnswering,
      pipeline
  )
- moderation_model = pipeline(
-     "text-classification",
-     model="Vrandan/Comment-Moderation",
-     return_all_scores=True
- )
-
  from PIL import Image
  import torch
- from gtts import gTTS
+ import pyttsx3
  import tempfile
+ import numpy as np
+ import soundfile as sf
+

  # ----------------------
  # Device setup
  # ----------------------
  device = "cuda" if torch.cuda.is_available() else "cpu"

+
  # ----------------------
  # Load Models Once
  # ----------------------
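Review note: the import swap above is the heart of this commit — gTTS, which synthesizes speech through Google's servers at request time, is dropped for pyttsx3, an offline engine, while numpy and soundfile come in for the new warning beep. For reference, a minimal sketch of the two call patterns (text and file names are illustrative, not from the repo):

```python
# Old path (online): each request hits Google's TTS endpoint.
# from gtts import gTTS
# gTTS("a dog running on the beach", lang="en").save("caption.mp3")

# New path (offline): synthesis runs locally through the platform driver
# (SAPI5 on Windows, NSSpeechSynthesizer on macOS, eSpeak on Linux).
import pyttsx3

engine = pyttsx3.init()
engine.save_to_file("a dog running on the beach", "caption.wav")
engine.runAndWait()  # blocks until the audio file is written
```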
@@ -113,41 +262,69 @@ moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

  print("✅ All models loaded!")

+
+ # ----------------------
+ # Beep Generator
+ # ----------------------
+ def generate_beep():
+     sr = 44100
+     duration = 0.4
+     frequency = 880
+
+     t = np.linspace(0, duration, int(sr * duration), False)
+     wave = 0.5 * np.sin(2 * np.pi * frequency * t)
+
+     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+     sf.write(tmp.name, wave, sr)
+     return tmp.name
+
+
  # ----------------------
  # Safety Filter Function
  # ----------------------
  def is_caption_safe(caption):
      try:
          votes = moderation_model(caption)
-         # If return_all_scores=True, it's [[{label, score}, ...]]
          if isinstance(votes, list) and isinstance(votes[0], list):
              votes = votes[0]
-         # Now safe to loop
+
          for item in votes:
              if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
                  return False
      except Exception as e:
          print("⚠️ Moderation failed:", e)

-     # Fallback keywords
      unsafe_keywords = [
-         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
-         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
-         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
-         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
-         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
+         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
+         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
+         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
+         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
+         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
      ]
      if any(word in caption.lower() for word in unsafe_keywords):
          return False
+
      return True


+ # ----------------------
+ # Offline Text-to-Speech using pyttsx3
+ # ----------------------
+ def offline_tts(text):
+     engine = pyttsx3.init()
+
+     tmp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+     engine.save_to_file(text, tmp_audio.name)
+     engine.runAndWait()
+
+     return tmp_audio.name

  # ----------------------
  # Caption + Translate + Speak
  # ----------------------
  def generate_caption_translate_speak(image, target_lang):
+
      # Step 1: Caption
      inputs = caption_processor(images=image, return_tensors="pt").to(device)
      with torch.no_grad():
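Review note: is_caption_safe still gates on the labels "V" and "V2", which belong to the Vrandan/Comment-Moderation model referenced in the commented-out header, but the pipeline actually loaded is unitary/toxic-bert, whose labels are the Jigsaw set (toxic, severe_toxic, obscene, threat, insult, identity_hate). If that reading is right, the model branch never fires and only the keyword fallback does. A quick sketch to inspect the labels (sample text is illustrative):

```python
# Print the full score list so the label names can be checked against the
# ["V", "V2"] test in is_caption_safe. top_k=None returns all labels,
# matching the nested-list shape the function unwraps.
from transformers import pipeline

moderation_model = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    top_k=None,
)
print(moderation_model("some caption text to score"))
# Expected shape: [[{'label': 'toxic', 'score': ...}, {'label': 'severe_toxic', ...}, ...]]
```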
@@ -156,7 +333,8 @@ def generate_caption_translate_speak(image, target_lang):

      # Step 1.5: Safety Check
      if not is_caption_safe(english_caption):
-         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None
+         beep = generate_beep()
+         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep

      # Step 2: Translate
      if target_lang in translation_models:
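With this change the unsafe branch returns a beep file path as its third value, so the same gr.Audio output plays either the spoken caption or the warning tone. A self-contained round-trip check of the beep helper (duplicating generate_beep so the sketch runs standalone):

```python
import numpy as np
import soundfile as sf
import tempfile

def generate_beep():
    # 0.4 s sine tone at 880 Hz, 44.1 kHz — mirrors the committed helper
    sr, duration, frequency = 44100, 0.4, 880
    t = np.linspace(0, duration, int(sr * duration), False)
    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp.name, wave, sr)
    return tmp.name

# Read the file back and confirm sample rate and duration survived the trip.
data, sr = sf.read(generate_beep())
assert sr == 44100 and abs(len(data) / sr - 0.4) < 1e-3
print("beep OK:", len(data), "samples")
```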
@@ -164,12 +342,11 @@ def generate_caption_translate_speak(image, target_lang):
      else:
          translated = "Translation not available"

-     # Step 3: Generate Speech (English caption for now)
-     tts = gTTS(english_caption, lang="en")
-     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-     tts.save(tmp_file.name)
+     # Step 3: Offline Speech
+     audio_path = offline_tts(english_caption)

-     return english_caption, translated, tmp_file.name
+     return english_caption, translated, audio_path

  # ----------------------
  # VQA
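One caveat with the new Step 3: offline_tts saves to a .mp3-suffixed temp file, but pyttsx3 drivers generally write WAV data (AIFF on macOS) regardless of the extension, so the container may not match its name. A hedged variant that simply requests .wav — gr.Audio serves WAV paths fine:

```python
import pyttsx3
import tempfile

def offline_tts_wav(text: str) -> str:
    # Same flow as the committed offline_tts, but with a .wav suffix so the
    # file extension matches what the driver actually writes.
    engine = pyttsx3.init()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    engine.save_to_file(text, tmp.name)
    engine.runAndWait()
    return tmp.name
```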
@@ -180,17 +357,17 @@ def vqa_answer(image, question):
          out = vqa_model.generate(**inputs, max_new_tokens=50)
      answer = vqa_processor.decode(out[0], skip_special_tokens=True)

-     # Run safety filter on answers too
      if not is_caption_safe(answer):
          return "⚠️ Warning: Unsafe or inappropriate content detected!"

      return answer

+
  # ----------------------
  # Gradio UI
  # ----------------------
  with gr.Blocks(title="BLIP Vision App") as demo:
-     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")
+     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter + Warning Beep)")

      with gr.Tab("Caption + Translate + Speak"):
          with gr.Row():
@@ -198,7 +375,7 @@ with gr.Blocks(title="BLIP Vision App") as demo:
              lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
          eng_out = gr.Textbox(label="English Caption")
          trans_out = gr.Textbox(label="Translated Caption")
-         audio_out = gr.Audio(label="Spoken Caption", type="filepath")
+         audio_out = gr.Audio(label="Spoken Caption / Warning Beep", type="filepath")
          btn1 = gr.Button("Generate Caption, Translate & Speak")
          btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
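The prototype removed below auto-played its audio output; if that behavior is still wanted here, gr.Audio accepts an autoplay flag (a sketch reusing the component from this hunk):

```python
import gradio as gr

audio_out = gr.Audio(label="Spoken Caption / Warning Beep",
                     type="filepath", autoplay=True)
```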
 
@@ -212,212 +389,7 @@ with gr.Blocks(title="BLIP Vision App") as demo:

  demo.launch()

-
-
-
-
-
- # import gradio as gr
- # from transformers import (
- #     BlipProcessor,
- #     BlipForConditionalGeneration,
- #     BlipForQuestionAnswering,
- #     pipeline
- # )
- # from PIL import Image
- # import torch
- # from gtts import gTTS
- # import tempfile
- # import numpy as np
- # import soundfile as sf
- # import librosa
- # import tempfile
-
- # def combine_audio(beep_path, speech_path):
- #     """Combine beep + speech audio into one clip."""
- #     beep, sr1 = sf.read(beep_path)
- #     speech, sr2 = sf.read(speech_path)
-
- #     # Resample beep if needed
- #     if sr1 != sr2:
- #         beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
- #         sr1 = sr2
-
- #     # Convert multi-channel to mono
- #     if len(beep.shape) > 1:
- #         beep = beep.mean(axis=1)
- #     if len(speech.shape) > 1:
- #         speech = speech.mean(axis=1)
-
- #     # Concatenate beep + speech
- #     combined = np.concatenate((beep, speech))
-
- #     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
- #     sf.write(tmp_file.name, combined, sr1)
- #     return tmp_file.name
-
- # # ----------------------
- # # Device setup
- # # ----------------------
- # device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # # ----------------------
- # # Load Models Once
- # # ----------------------
- # print("🔄 Loading models...")
-
- # # Captioning
- # caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- # caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
-
- # # VQA
- # vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
- # vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
- # # Translation
- # translation_models = {
- #     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
- #     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
- #     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
- # }
-
- # # Safety Moderation Pipeline
- # moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
-
- # print("✅ All models loaded!")
-
-
- # # ----------------------
- # # Utility: Generate a Beep Sound
- # # ----------------------
- # def make_beep_sound(duration=0.5, freq=1000):
- #     """Generate a short beep tone and save as temporary .wav file."""
- #     samplerate = 44100
- #     t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
- #     wave = 0.5 * np.sin(2 * np.pi * freq * t)
- #     tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
- #     sf.write(tmp_beep.name, wave, samplerate)
- #     return tmp_beep.name
-
-
- # # ----------------------
- # # Safety Filter Function
- # # ----------------------
- # def is_caption_safe(caption):
- #     try:
- #         votes = moderation_model(caption)
- #         if isinstance(votes, list) and isinstance(votes[0], list):
- #             votes = votes[0]
- #         for item in votes:
- #             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
- #                 return False
- #     except Exception as e:
- #         print("⚠️ Moderation failed:", e)
-
- #     unsafe_keywords = [
- #         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
- #         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
- #         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
- #         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
- #         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
- #     ]
- #     if any(word in caption.lower() for word in unsafe_keywords):
- #         return False
- #     return True
-
-
- # # ----------------------
- # # Caption + Translate + Speak
- # # ----------------------
- # def generate_caption_translate_speak(image, target_lang):
- #     # Step 1: Caption
- #     inputs = caption_processor(images=image, return_tensors="pt").to(device)
- #     with torch.no_grad():
- #         out = caption_model.generate(**inputs, max_new_tokens=50)
- #     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
-
- #     # Step 1.5: Safety Check
- #     if not is_caption_safe(english_caption):
- #         # Generate beep
- #         beep = make_beep_sound()
-
- #         # Generate warning speech
- #         tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
- #         speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
- #         tts.save(speech_tmp.name)
-
- #         # Combine beep + speech
- #         combined_audio = combine_audio(beep, speech_tmp.name)
-
- #         # Return combined audio automatically
- #         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
-
-
-
- #     # Step 2: Translate
- #     if target_lang in translation_models:
- #         translated = translation_models[target_lang](english_caption)[0]['translation_text']
- #     else:
- #         translated = "Translation not available"
-
- #     # Step 3: Generate Speech (English caption)
- #     tts = gTTS(english_caption, lang="en")
- #     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
- #     tts.save(tmp_file.name)
-
- #     return english_caption, translated, tmp_file.name
-
-
- # # ----------------------
- # # VQA
- # # ----------------------
- # def vqa_answer(image, question):
- #     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
- #     with torch.no_grad():
- #         out = vqa_model.generate(**inputs, max_new_tokens=50)
- #     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
- #     if not is_caption_safe(answer):
- #         beep = make_beep_sound()
- #         tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
- #         speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
- #         tts.save(speech_tmp.name)
- #         combined = combine_audio(beep, speech_tmp.name)
- #         return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
-
-
- #     return answer, None
-
-
- # # ----------------------
- # # Gradio UI
- # # ----------------------
- # with gr.Blocks(title="BLIP Vision App") as demo:
- #     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
-
- #     # --- Caption + Translate + Speak ---
- #     with gr.Tab("Caption + Translate + Speak"):
- #         with gr.Row():
- #             img_in = gr.Image(type="pil", label="Upload Image")
- #             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
- #         eng_out = gr.Textbox(label="English Caption")
- #         trans_out = gr.Textbox(label="Translated Caption")
- #         audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True)  # Auto-plays TTS or beep
- #         btn1 = gr.Button("Generate Caption, Translate & Speak")
- #         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
- #                    outputs=[eng_out, trans_out, audio_out])
-
- #     # --- Visual Question Answering (VQA) ---
- #     with gr.Tab("Visual Question Answering (VQA)"):
- #         with gr.Row():
- #             img_vqa = gr.Image(type="pil", label="Upload Image")
- #             q_in = gr.Textbox(label="Ask a Question about the Image")
- #         ans_out = gr.Textbox(label="Answer")
- #         beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True)  # Auto-plays beep
- #         btn2 = gr.Button("Ask")
- #         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
-
- #     demo.launch()
+

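A last review note on the keyword fallback this commit keeps: `any(word in caption.lower() ...)` is substring matching, so innocent words that contain a keyword also trip the filter. A tiny self-contained test (keyword list truncated for brevity):

```python
UNSAFE_KEYWORDS = ["gun", "blood", "knife", "war", "bomb"]  # subset of the committed list

def keyword_safe(caption: str) -> bool:
    # Mirrors the fallback branch of is_caption_safe
    return not any(word in caption.lower() for word in UNSAFE_KEYWORDS)

assert keyword_safe("a dog playing with a ball")
assert not keyword_safe("a man holding a Gun")   # case-insensitive match
assert not keyword_safe("the race has begun")    # false positive: "gun" in "begun"
assert not keyword_safe("a warm summer day")     # false positive: "war" in "warm"
print("substring matching confirmed — word-boundary regex would avoid the false positives")
```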
 
 