gopalagra committed on
Commit
5f94238
·
verified ·
1 Parent(s): d3a4a57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -5
app.py CHANGED
@@ -231,6 +231,31 @@ from gtts import gTTS
231
  import tempfile
232
  import numpy as np
233
  import soundfile as sf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  # ----------------------
236
  # Device setup
@@ -312,10 +337,22 @@ def generate_caption_translate_speak(image, target_lang):
312
  out = caption_model.generate(**inputs, max_new_tokens=50)
313
  english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
314
 
315
- # Step 1.5: Safety Check
316
  if not is_caption_safe(english_caption):
 
317
  beep = make_beep_sound()
318
- return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # Step 2: Translate
321
  if target_lang in translation_models:
@@ -340,9 +377,14 @@ def vqa_answer(image, question):
340
  out = vqa_model.generate(**inputs, max_new_tokens=50)
341
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
342
 
343
- if not is_caption_safe(answer):
344
- beep = make_beep_sound()
345
- return "⚠️ Warning: Unsafe or inappropriate content detected!", beep
 
 
 
 
 
346
 
347
  return answer, None
348
 
 
231
  import tempfile
232
  import numpy as np
233
  import soundfile as sf
234
+ import librosa
235
+ import tempfile
236
+
237
def combine_audio(beep_path, speech_path):
    """Combine beep + speech audio into one clip.

    Both inputs are read with ``soundfile``, down-mixed to mono, and the beep
    is resampled to the speech sample rate when the two rates differ. The
    beep is then placed in front of the speech and the result is written to a
    new temporary ``.wav`` file.

    Args:
        beep_path: Path of the beep audio file, readable by ``soundfile``.
        speech_path: Path of the speech audio file, readable by ``soundfile``.

    Returns:
        Path of the combined ``.wav`` file. It is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    beep, beep_sr = sf.read(beep_path)
    speech, speech_sr = sf.read(speech_path)

    # Down-mix BEFORE resampling: soundfile yields (frames, channels) arrays,
    # while librosa.resample operates on the last axis by default, so
    # resampling a 2-D array would stretch the channel axis instead of time.
    if beep.ndim > 1:
        beep = beep.mean(axis=1)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Resample beep if needed. The rates are passed by keyword because
    # librosa >= 0.10 rejects them as positional arguments.
    if beep_sr != speech_sr:
        beep = librosa.resample(beep, orig_sr=beep_sr, target_sr=speech_sr)

    # Concatenate beep + speech
    combined = np.concatenate((beep, speech))

    # Reserve a unique file name, then close the handle before writing by
    # name: this avoids leaking the descriptor and works on platforms that
    # refuse to reopen a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        out_path = tmp_file.name
    sf.write(out_path, combined, speech_sr)
    return out_path
259
 
260
  # ----------------------
261
  # Device setup
 
337
  out = caption_model.generate(**inputs, max_new_tokens=50)
338
  english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
339
 
340
+ # Step 1.5: Safety Check
341
  if not is_caption_safe(english_caption):
342
+ # Generate beep
343
  beep = make_beep_sound()
344
+
345
+ # Generate warning speech
346
+ tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
347
+ speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
348
+ tts.save(speech_tmp.name)
349
+
350
+ # Combine beep + speech
351
+ combined_audio = combine_audio(beep, speech_tmp.name)
352
+
353
+ # Return combined audio automatically
354
+ return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
355
+
356
 
357
  # Step 2: Translate
358
  if target_lang in translation_models:
 
377
  out = vqa_model.generate(**inputs, max_new_tokens=50)
378
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
379
 
380
+ if not is_caption_safe(answer):
381
+ beep = make_beep_sound()
382
+ tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
383
+ speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
384
+ tts.save(speech_tmp.name)
385
+ combined = combine_audio(beep, speech_tmp.name)
386
+ return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
387
+
388
 
389
  return answer, None
390