Spaces:

gopalagra
/

blind-image-captioning

Sleeping

App Files Files Community

gopalagra committed 24 days ago

Commit

1d32024

verified ·

1 Parent(s): b189448

Update app.py

Browse files

Files changed (1) hide show

app.py +311 -51

app.py CHANGED Viewed

@@ -218,42 +218,271 @@
 import gradio as gr
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
     BlipForQuestionAnswering,
-    pipeline
 )
 from PIL import Image
 import torch
-from gtts import gTTS
 import tempfile
 import numpy as np
 import soundfile as sf
 import librosa
 import tempfile
 def combine_audio(beep_path, speech_path):
     """Combine beep + speech audio into one clip."""
     beep, sr1 = sf.read(beep_path)
     speech, sr2 = sf.read(speech_path)
     # Resample beep if needed
     if sr1 != sr2:
         beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
         sr1 = sr2
     # Convert multi-channel to mono
     if len(beep.shape) > 1:
         beep = beep.mean(axis=1)
     if len(speech.shape) > 1:
-        speech = speech.mean(axis=1)
     # Concatenate beep + speech
     combined = np.concatenate((beep, speech))
     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     sf.write(tmp_file.name, combined, sr1)
     return tmp_file.name
@@ -282,39 +511,73 @@ translation_models = {
     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
 }
 # Safety Moderation Pipeline
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 print("✅ All models loaded!")
 # ----------------------
 # Utility: Generate a Beep Sound
 # ----------------------
 def make_beep_sound(duration=0.5, freq=1000):
     """Generate a short beep tone and save as temporary .wav file."""
-    samplerate = 44100
     t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
     wave = 0.5 * np.sin(2 * np.pi * freq * t)
     tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     sf.write(tmp_beep.name, wave, samplerate)
     return tmp_beep.name
 # ----------------------
-# Safety Filter Function
 # ----------------------
 def is_caption_safe(caption):
     try:
         votes = moderation_model(caption)
         if isinstance(votes, list) and isinstance(votes[0], list):
             votes = votes[0]
         for item in votes:
             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
                 return False
     except Exception as e:
         print("⚠️ Moderation failed:", e)
     unsafe_keywords = [
         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
@@ -326,76 +589,72 @@ def is_caption_safe(caption):
         return False
     return True
 # ----------------------
-# Caption + Translate + Speak
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
     # Step 1: Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         out = caption_model.generate(**inputs, max_new_tokens=50)
-    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
- # Step 1.5: Safety Check
     if not is_caption_safe(english_caption):
-    # Generate beep
         beep = make_beep_sound()
-    # Generate warning speech
-        tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
-        speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-        tts.save(speech_tmp.name)
-    # Combine beep + speech
-        combined_audio = combine_audio(beep, speech_tmp.name)
-    # Return combined audio automatically
         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
     # Step 2: Translate
     if target_lang in translation_models:
         translated = translation_models[target_lang](english_caption)[0]['translation_text']
     else:
         translated = "Translation not available"
-    # Step 3: Generate Speech (English caption)
-    tts = gTTS(english_caption, lang="en")
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    tts.save(tmp_file.name)
-    return english_caption, translated, tmp_file.name
 # ----------------------
-# VQA
 # ----------------------
 def vqa_answer(image, question):
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
         out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
     if not is_caption_safe(answer):
         beep = make_beep_sound()
-        tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
-        speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-        tts.save(speech_tmp.name)
-        combined = combine_audio(beep, speech_tmp.name)
         return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
     return answer, None
 # ----------------------
-# Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
     # --- Caption + Translate + Speak ---
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
@@ -403,18 +662,19 @@ with gr.Blocks(title="BLIP Vision App") as demo:
             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True)  # Auto-plays TTS or beep
         btn1 = gr.Button("Generate Caption, Translate & Speak")
         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
                    outputs=[eng_out, trans_out, audio_out])
     # --- Visual Question Answering (VQA) ---
     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
             img_vqa = gr.Image(type="pil", label="Upload Image")
             q_in = gr.Textbox(label="Ask a Question about the Image")
         ans_out = gr.Textbox(label="Answer")
-        beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True)  # Auto-plays beep
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])

+# import gradio as gr
+# from transformers import (
+#     BlipProcessor,
+#     BlipForConditionalGeneration,
+#     BlipForQuestionAnswering,
+#     pipeline
+# )
+# from PIL import Image
+# import torch
+# from gtts import gTTS
+# import tempfile
+# import numpy as np
+# import soundfile as sf
+# import librosa
+# import tempfile
+# def combine_audio(beep_path, speech_path):
+#     """Combine beep + speech audio into one clip."""
+#     beep, sr1 = sf.read(beep_path)
+#     speech, sr2 = sf.read(speech_path)
+#     # Resample beep if needed
+#     if sr1 != sr2:
+#         beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
+#         sr1 = sr2
+#     # Convert multi-channel to mono
+#     if len(beep.shape) > 1:
+#         beep = beep.mean(axis=1)
+#     if len(speech.shape) > 1:
+#         speech = speech.mean(axis=1)
+#     # Concatenate beep + speech
+#     combined = np.concatenate((beep, speech))
+#     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#     sf.write(tmp_file.name, combined, sr1)
+#     return tmp_file.name
+# # ----------------------
+# # Device setup
+# # ----------------------
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# # ----------------------
+# # Load Models Once
+# # ----------------------
+# print("🔄 Loading models...")
+# # Captioning
+# caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+# # VQA
+# vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+# vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+# # Translation
+# translation_models = {
+#     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
+#     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
+#     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
+# }
+# # Safety Moderation Pipeline
+# moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
+# print("✅ All models loaded!")
+# # ----------------------
+# # Utility: Generate a Beep Sound
+# # ----------------------
+# def make_beep_sound(duration=0.5, freq=1000):
+#     """Generate a short beep tone and save as temporary .wav file."""
+#     samplerate = 44100
+#     t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
+#     wave = 0.5 * np.sin(2 * np.pi * freq * t)
+#     tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#     sf.write(tmp_beep.name, wave, samplerate)
+#     return tmp_beep.name
+# # ----------------------
+# # Safety Filter Function
+# # ----------------------
+# def is_caption_safe(caption):
+#     try:
+#         votes = moderation_model(caption)
+#         if isinstance(votes, list) and isinstance(votes[0], list):
+#             votes = votes[0]
+#         for item in votes:
+#             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
+#                 return False
+#     except Exception as e:
+#         print("⚠️ Moderation failed:", e)
+#     unsafe_keywords = [
+#         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
+#         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
+#         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
+#         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
+#         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
+#     ]
+#     if any(word in caption.lower() for word in unsafe_keywords):
+#         return False
+#     return True
+# # ----------------------
+# # Caption + Translate + Speak
+# # ----------------------
+# def generate_caption_translate_speak(image, target_lang):
+#     # Step 1: Caption
+#     inputs = caption_processor(images=image, return_tensors="pt").to(device)
+#     with torch.no_grad():
+#         out = caption_model.generate(**inputs, max_new_tokens=50)
+#     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+#  # Step 1.5: Safety Check
+#     if not is_caption_safe(english_caption):
+#     # Generate beep
+#         beep = make_beep_sound()
+#     # Generate warning speech
+#         tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
+#         speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+#         tts.save(speech_tmp.name)
+#     # Combine beep + speech
+#         combined_audio = combine_audio(beep, speech_tmp.name)
+#     # Return combined audio automatically
+#         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
+#     # Step 2: Translate
+#     if target_lang in translation_models:
+#         translated = translation_models[target_lang](english_caption)[0]['translation_text']
+#     else:
+#         translated = "Translation not available"
+#     # Step 3: Generate Speech (English caption)
+#     tts = gTTS(english_caption, lang="en")
+#     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+#     tts.save(tmp_file.name)
+#     return english_caption, translated, tmp_file.name
+# # ----------------------
+# # VQA
+# # ----------------------
+# def vqa_answer(image, question):
+#     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
+#     with torch.no_grad():
+#         out = vqa_model.generate(**inputs, max_new_tokens=50)
+#     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+#     if not is_caption_safe(answer):
+#         beep = make_beep_sound()
+#         tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
+#         speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+#         tts.save(speech_tmp.name)
+#         combined = combine_audio(beep, speech_tmp.name)
+#         return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
+#     return answer, None
+# # ----------------------
+# # Gradio UI
+# # ----------------------
+# with gr.Blocks(title="BLIP Vision App") as demo:
+#     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
+#     # --- Caption + Translate + Speak ---
+#     with gr.Tab("Caption + Translate + Speak"):
+#         with gr.Row():
+#             img_in = gr.Image(type="pil", label="Upload Image")
+#             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
+#         eng_out = gr.Textbox(label="English Caption")
+#         trans_out = gr.Textbox(label="Translated Caption")
+#         audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True)  # Auto-plays TTS or beep
+#         btn1 = gr.Button("Generate Caption, Translate & Speak")
+#         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
+#                    outputs=[eng_out, trans_out, audio_out])
+#     # --- Visual Question Answering (VQA) ---
+#     with gr.Tab("Visual Question Answering (VQA)"):
+#         with gr.Row():
+#             img_vqa = gr.Image(type="pil", label="Upload Image")
+#             q_in = gr.Textbox(label="Ask a Question about the Image")
+#         ans_out = gr.Textbox(label="Answer")
+#         beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True)  # Auto-plays beep
+#         btn2 = gr.Button("Ask")
+#         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
+# demo.launch()
 import gradio as gr
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
     BlipForQuestionAnswering,
+    pipeline,
+    SpeechT5Processor,          # <--- NEW
+    SpeechT5ForTextToSpeech,    # <--- NEW
+    set_seed                      # <--- NEW
 )
+from datasets import load_dataset # <--- NEW for speaker embedding
 from PIL import Image
 import torch
+# from gtts import gTTS # <--- REMOVED
 import tempfile
 import numpy as np
 import soundfile as sf
 import librosa
 import tempfile
+import time # <--- Added for potential cleanup, but mostly for future use
+# Set seed for reproducibility in TTS generation
+set_seed(42)
 def combine_audio(beep_path, speech_path):
     """Join a beep clip and a speech clip into one temporary mono WAV file.

     Both clips are read with soundfile and downmixed to mono; the beep is
     then resampled to the speech clip's sample rate when the rates differ.

     Args:
         beep_path: Path of the audio file placed first in the output.
         speech_path: Path of the audio file appended after the beep.

     Returns:
         Path of a new ``.wav`` file holding beep + speech at the speech
         clip's sample rate. The file is created with ``delete=False``, so
         removing it is left to the caller.
     """
     beep, beep_sr = sf.read(beep_path)
     speech, speech_sr = sf.read(speech_path)
     # Downmix BEFORE resampling: soundfile lays multi-channel audio out as
     # (frames, channels), while librosa.resample works along the last axis
     # by default, so resampling a 2-D array here would stretch the channel
     # axis instead of the time axis.
     if beep.ndim > 1:
         beep = beep.mean(axis=1)
     if speech.ndim > 1:
         speech = speech.mean(axis=1)
     if beep_sr != speech_sr:
         beep = librosa.resample(y=beep, orig_sr=beep_sr, target_sr=speech_sr)
     combined = np.concatenate((beep, speech))
     # Reserve a unique path and close the handle right away so it is not
     # leaked; soundfile reopens the path itself when writing.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         out_path = tmp_file.name
     sf.write(out_path, combined, speech_sr)
     return out_path
     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
 }
+# Text-to-Speech (TTS) Models # <--- NEW/MODIFIED
+print("    Loading SpeechT5 TTS model...")
+tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+# Load a speaker embedding (required for SpeechT5 to define a voice)
+# Using a sample speaker x-vector from the CMU ARCTIC embeddings dataset loaded below
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 # Safety Moderation Pipeline
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 print("✅ All models loaded!")
+# ----------------------
+# Utility: Generate Local Speech (TTS) # <--- NEW FUNCTION
+# ----------------------
+def synthesize_speech_local(text, tts_processor, tts_model, speaker_embeddings):
+    """Generates speech using local HuggingFace SpeechT5 model."""
+    inputs = tts_processor(text=text, return_tensors="pt").to(device)
+    # Generate speech with the loaded model and speaker embedding
+    speech = tts_model.generate_speech(
+        inputs["input_ids"],
+        speaker_embeddings,
+        do_sample=True # Use sampling for more natural tone
+    )
+    # Convert the Tensor to a NumPy array
+    speech_np = speech.cpu().numpy()
+    # Create a temporary WAV file to save the audio
+    # SpeechT5's default sampling rate is 16000Hz
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp_file.name, speech_np, samplerate=16000)
+    return tmp_file.name
 # ----------------------
 # Utility: Generate a Beep Sound
 # ----------------------
 def make_beep_sound(duration=0.5, freq=1000, samplerate=16000):
     """Write a sine-wave beep to a temporary WAV file and return its path.

     Args:
         duration: Length of the tone in seconds.
         freq: Tone frequency in Hz.
         samplerate: Output sample rate in Hz. The default of 16000 matches
             the rate the speech clips in this file are written at, so the
             beep can be concatenated with them without resampling.

     Returns:
         Path of a new ``.wav`` file. It is created with ``delete=False``,
         so removing it is left to the caller.
     """
     sample_count = int(samplerate * duration)
     t = np.linspace(0, duration, sample_count, endpoint=False)
     # Half amplitude keeps the tone well below clipping.
     wave = 0.5 * np.sin(2 * np.pi * freq * t)
     # Reserve a unique path and close the handle before soundfile writes to it.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_beep:
         beep_path = tmp_beep.name
     sf.write(beep_path, wave, samplerate)
     return beep_path
 # ----------------------
+# Safety Filter Function (Keep as is)
 # ----------------------
 def is_caption_safe(caption):
+    # ... (Keep this function as is)
     try:
         votes = moderation_model(caption)
         if isinstance(votes, list) and isinstance(votes[0], list):
             votes = votes[0]
         for item in votes:
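+            # NOTE(review): only labels 'V'/'V2' with score > 0.5 are rejected here; unitary/toxic-bert's
+            # model card lists toxic/severe_toxic/obscene/threat/insult/identity_hate - confirm this can ever match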
             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
                 return False
     except Exception as e:
         print("⚠️ Moderation failed:", e)
     unsafe_keywords = [
         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
         return False
     return True
 # ----------------------
+# Caption + Translate + Speak # <--- MODIFIED
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
     """Caption an image in English, translate the caption, and speak it.

     Args:
         image: The uploaded image, or ``None`` when nothing was uploaded.
         target_lang: Key into ``translation_models`` naming the target
             language.

     Returns:
         A 3-tuple ``(english_text, translated_text, audio_path)``.
         When the caption fails ``is_caption_safe``, the first item is a
         warning message, the translation is empty, and the audio is a beep
         followed by a spoken warning. When no image was supplied, the first
         item asks for one and the audio is ``None``.
     """
     # The click handler can fire with no image uploaded; answer with a
     # message instead of failing inside the processor.
     if image is None:
         return "Please upload an image first.", "", None
     # Step 1: Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         out = caption_model.generate(**inputs, max_new_tokens=50)
     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
     # Step 1.5: Safety check - replace flagged captions with an audible warning
     if not is_caption_safe(english_caption):
         beep = make_beep_sound()
         warning_text = "Warning! Unsafe or inappropriate content detected."
         speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
         combined_audio = combine_audio(beep, speech_tmp_name)
         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
     # Step 2: Translate
     if target_lang in translation_models:
         translated = translation_models[target_lang](english_caption)[0]['translation_text']
     else:
         translated = "Translation not available"
     # Step 3: Speak the English caption (the audio is a WAV file path)
     tmp_file_name = synthesize_speech_local(english_caption, tts_processor, tts_model, speaker_embeddings)
     return english_caption, translated, tmp_file_name
 # ----------------------
+# VQA # <--- MODIFIED
 # ----------------------
 def vqa_answer(image, question):
     """Answer a free-text question about an image.

     Args:
         image: The uploaded image, or ``None`` when nothing was uploaded.
         question: The question text typed by the user.

     Returns:
         A 2-tuple ``(answer_text, audio_path)``. ``audio_path`` is ``None``
         for a normal answer. When the answer fails ``is_caption_safe``, the
         text is a warning message and the audio is a beep followed by a
         spoken warning. When the image or the question is missing, the text
         asks for both and the audio is ``None``.
     """
     # The click handler can fire with an empty form; answer with a message
     # instead of failing inside the processor.
     if image is None or not (question or "").strip():
         return "Please upload an image and enter a question.", None
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
         out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
     if not is_caption_safe(answer):
         beep = make_beep_sound()
         warning_text = "Warning! Unsafe or inappropriate content detected."
         speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
         combined = combine_audio(beep, speech_tmp_name)
         return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
     return answer, None
 # ----------------------
+# Gradio UI (Keep as is)
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
+    gr.Markdown("### Note: Text-to-Speech now uses a local HuggingFace model to prevent 'Too Many Requests' (429) errors.")
     # --- Caption + Translate + Speak ---
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
+        # Note: We changed the output to WAV but Gradio handles it fine.
+        audio_out = gr.Audio(label="Speech Output (WAV format)", type="filepath", autoplay=True)
         btn1 = gr.Button("Generate Caption, Translate & Speak")
         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
                    outputs=[eng_out, trans_out, audio_out])
     # --- Visual Question Answering (VQA) ---
     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
             img_vqa = gr.Image(type="pil", label="Upload Image")
             q_in = gr.Textbox(label="Ask a Question about the Image")
         ans_out = gr.Textbox(label="Answer")
+        beep_out = gr.Audio(label="Alert Sound (WAV format)", type="filepath", autoplay=True)
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])