Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -231,6 +231,31 @@ from gtts import gTTS
|
|
| 231 |
import tempfile
|
| 232 |
import numpy as np
|
| 233 |
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
# ----------------------
|
| 236 |
# Device setup
|
|
@@ -312,10 +337,22 @@ def generate_caption_translate_speak(image, target_lang):
|
|
| 312 |
out = caption_model.generate(**inputs, max_new_tokens=50)
|
| 313 |
english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
|
| 314 |
|
| 315 |
-
|
| 316 |
if not is_caption_safe(english_caption):
|
|
|
|
| 317 |
beep = make_beep_sound()
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# Step 2: Translate
|
| 321 |
if target_lang in translation_models:
|
|
@@ -340,9 +377,14 @@ def vqa_answer(image, question):
|
|
| 340 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 341 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 342 |
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
return answer, None
|
| 348 |
|
|
|
|
| 231 |
import tempfile
|
| 232 |
import numpy as np
|
| 233 |
import soundfile as sf
|
| 234 |
+
import librosa
|
| 235 |
+
import tempfile
|
| 236 |
+
|
| 237 |
+
def combine_audio(beep_path, speech_path):
    """Combine beep + speech audio into one clip.

    Parameters
    ----------
    beep_path : str
        Path to the beep audio file on disk.
    speech_path : str
        Path to the speech audio file (gTTS output in the callers).
        NOTE(review): callers save this as .mp3; sf.read of mp3 needs
        libsndfile >= 1.1 — confirm the runtime has it.

    Returns
    -------
    str
        Path to a temporary .wav file containing beep followed by speech.
    """
    beep, sr1 = sf.read(beep_path)
    speech, sr2 = sf.read(speech_path)

    # Fold multi-channel audio to mono BEFORE any resampling:
    # soundfile returns (frames, channels), while librosa.resample
    # expects mono (or channel-first) data — resampling the raw
    # (frames, channels) array would operate on the wrong axis.
    if beep.ndim > 1:
        beep = beep.mean(axis=1)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Resample the beep to match the speech sample rate if needed.
    # librosa >= 0.10 removed the positional-sr API; the old
    # `librosa.resample(y, sr1, sr2)` form raises TypeError, so the
    # sample rates must be passed as keyword arguments.
    if sr1 != sr2:
        beep = librosa.resample(beep, orig_sr=sr1, target_sr=sr2)
        sr1 = sr2

    # Concatenate beep + speech into one continuous clip.
    combined = np.concatenate((beep, speech))

    # delete=False so the path remains valid after this function returns;
    # the caller (e.g. Gradio) is responsible for serving/cleaning it up.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp_file.name, combined, sr1)
    return tmp_file.name
|
| 259 |
|
| 260 |
# ----------------------
|
| 261 |
# Device setup
|
|
|
|
| 337 |
out = caption_model.generate(**inputs, max_new_tokens=50)
|
| 338 |
english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
|
| 339 |
|
| 340 |
+
# Step 1.5: Safety Check
|
| 341 |
if not is_caption_safe(english_caption):
|
| 342 |
+
# Generate beep
|
| 343 |
beep = make_beep_sound()
|
| 344 |
+
|
| 345 |
+
# Generate warning speech
|
| 346 |
+
tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
|
| 347 |
+
speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 348 |
+
tts.save(speech_tmp.name)
|
| 349 |
+
|
| 350 |
+
# Combine beep + speech
|
| 351 |
+
combined_audio = combine_audio(beep, speech_tmp.name)
|
| 352 |
+
|
| 353 |
+
# Return combined audio automatically
|
| 354 |
+
return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
|
| 355 |
+
|
| 356 |
|
| 357 |
# Step 2: Translate
|
| 358 |
if target_lang in translation_models:
|
|
|
|
| 377 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 378 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 379 |
|
| 380 |
+
if not is_caption_safe(answer):
|
| 381 |
+
beep = make_beep_sound()
|
| 382 |
+
tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
|
| 383 |
+
speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 384 |
+
tts.save(speech_tmp.name)
|
| 385 |
+
combined = combine_audio(beep, speech_tmp.name)
|
| 386 |
+
return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
|
| 387 |
+
|
| 388 |
|
| 389 |
return answer, None
|
| 390 |
|