Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -231,6 +231,31 @@ from gtts import gTTS
|
|
| 231 |
import tempfile
|
| 232 |
import numpy as np
|
| 233 |
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
# ----------------------
|
| 236 |
# Device setup
|
|
@@ -312,10 +337,22 @@ def generate_caption_translate_speak(image, target_lang):
|
|
| 312 |
out = caption_model.generate(**inputs, max_new_tokens=50)
|
| 313 |
english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
|
| 314 |
|
| 315 |
-
|
| 316 |
if not is_caption_safe(english_caption):
|
|
|
|
| 317 |
beep = make_beep_sound()
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# Step 2: Translate
|
| 321 |
if target_lang in translation_models:
|
|
@@ -340,9 +377,14 @@ def vqa_answer(image, question):
|
|
| 340 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 341 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 342 |
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
return answer, None
|
| 348 |
|
|
|
|
| 231 |
import tempfile
|
| 232 |
import numpy as np
|
| 233 |
import soundfile as sf
|
| 234 |
+
import librosa
|
| 235 |
+
import tempfile
|
| 236 |
+
|
| 237 |
+
def combine_audio(beep_path, speech_path):
    """Combine beep + speech audio into one clip.

    Parameters
    ----------
    beep_path : str
        Path to the beep audio file on disk.
    speech_path : str
        Path to the speech audio file (gTTS output in the callers).
        NOTE(review): callers save this as .mp3; sf.read of mp3 needs
        libsndfile >= 1.1 — confirm the runtime has it.

    Returns
    -------
    str
        Path to a temporary .wav file containing beep followed by speech.
    """
    beep, sr1 = sf.read(beep_path)
    speech, sr2 = sf.read(speech_path)

    # Fold multi-channel audio to mono BEFORE any resampling:
    # soundfile returns (frames, channels), while librosa.resample
    # expects mono (or channel-first) data — resampling the raw
    # (frames, channels) array would operate on the wrong axis.
    if beep.ndim > 1:
        beep = beep.mean(axis=1)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Resample the beep to match the speech sample rate if needed.
    # librosa >= 0.10 removed the positional-sr API; the old
    # `librosa.resample(y, sr1, sr2)` form raises TypeError, so the
    # sample rates must be passed as keyword arguments.
    if sr1 != sr2:
        beep = librosa.resample(beep, orig_sr=sr1, target_sr=sr2)
        sr1 = sr2

    # Concatenate beep + speech into one continuous clip.
    combined = np.concatenate((beep, speech))

    # delete=False so the path remains valid after this function returns;
    # the caller (e.g. Gradio) is responsible for serving/cleaning it up.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp_file.name, combined, sr1)
    return tmp_file.name
|
| 259 |
|
| 260 |
# ----------------------
|
| 261 |
# Device setup
|
|
|
|
| 337 |
out = caption_model.generate(**inputs, max_new_tokens=50)
|
| 338 |
english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
|
| 339 |
|
| 340 |
+
# Step 1.5: Safety Check
|
| 341 |
if not is_caption_safe(english_caption):
|
| 342 |
+
# Generate beep
|
| 343 |
beep = make_beep_sound()
|
| 344 |
+
|
| 345 |
+
# Generate warning speech
|
| 346 |
+
tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
|
| 347 |
+
speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 348 |
+
tts.save(speech_tmp.name)
|
| 349 |
+
|
| 350 |
+
# Combine beep + speech
|
| 351 |
+
combined_audio = combine_audio(beep, speech_tmp.name)
|
| 352 |
+
|
| 353 |
+
# Return combined audio automatically
|
| 354 |
+
return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
|
| 355 |
+
|
| 356 |
|
| 357 |
# Step 2: Translate
|
| 358 |
if target_lang in translation_models:
|
|
|
|
| 377 |
out = vqa_model.generate(**inputs, max_new_tokens=50)
|
| 378 |
answer = vqa_processor.decode(out[0], skip_special_tokens=True)
|
| 379 |
|
| 380 |
+
if not is_caption_safe(answer):
|
| 381 |
+
beep = make_beep_sound()
|
| 382 |
+
tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
|
| 383 |
+
speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
| 384 |
+
tts.save(speech_tmp.name)
|
| 385 |
+
combined = combine_audio(beep, speech_tmp.name)
|
| 386 |
+
return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
|
| 387 |
+
|
| 388 |
|
| 389 |
return answer, None
|
| 390 |
|