gopalagra committed on
Commit
1d32024
·
verified ·
1 Parent(s): b189448

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +311 -51
app.py CHANGED
@@ -218,42 +218,271 @@
218
 
219
 
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  import gradio as gr
222
  from transformers import (
223
  BlipProcessor,
224
  BlipForConditionalGeneration,
225
  BlipForQuestionAnswering,
226
- pipeline
 
 
 
227
  )
 
228
  from PIL import Image
229
  import torch
230
- from gtts import gTTS
231
  import tempfile
232
  import numpy as np
233
  import soundfile as sf
234
  import librosa
235
  import tempfile
 
 
 
 
236
 
237
  def combine_audio(beep_path, speech_path):
238
  """Combine beep + speech audio into one clip."""
 
239
  beep, sr1 = sf.read(beep_path)
240
  speech, sr2 = sf.read(speech_path)
241
-
242
  # Resample beep if needed
243
  if sr1 != sr2:
244
  beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
245
  sr1 = sr2
246
-
247
  # Convert multi-channel to mono
248
  if len(beep.shape) > 1:
249
  beep = beep.mean(axis=1)
250
  if len(speech.shape) > 1:
251
- speech = speech.mean(axis=1)
 
 
 
 
252
 
253
  # Concatenate beep + speech
254
  combined = np.concatenate((beep, speech))
255
-
256
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
 
257
  sf.write(tmp_file.name, combined, sr1)
258
  return tmp_file.name
259
 
@@ -282,39 +511,73 @@ translation_models = {
282
  "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
283
  }
284
 
 
 
 
 
 
 
 
 
 
 
285
  # Safety Moderation Pipeline
286
  moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
287
-
288
  print("✅ All models loaded!")
289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
  # ----------------------
292
  # Utility: Generate a Beep Sound
293
  # ----------------------
294
  def make_beep_sound(duration=0.5, freq=1000):
295
  """Generate a short beep tone and save as temporary .wav file."""
296
- samplerate = 44100
 
297
  t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
298
  wave = 0.5 * np.sin(2 * np.pi * freq * t)
299
  tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
300
  sf.write(tmp_beep.name, wave, samplerate)
301
  return tmp_beep.name
302
 
303
-
304
  # ----------------------
305
- # Safety Filter Function
306
  # ----------------------
307
  def is_caption_safe(caption):
 
308
  try:
309
  votes = moderation_model(caption)
310
  if isinstance(votes, list) and isinstance(votes[0], list):
311
  votes = votes[0]
312
  for item in votes:
 
313
  if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
314
  return False
315
  except Exception as e:
316
  print("⚠️ Moderation failed:", e)
317
-
318
  unsafe_keywords = [
319
  "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
320
  "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
@@ -326,76 +589,72 @@ def is_caption_safe(caption):
326
  return False
327
  return True
328
 
329
-
330
  # ----------------------
331
- # Caption + Translate + Speak
332
  # ----------------------
333
  def generate_caption_translate_speak(image, target_lang):
334
  # Step 1: Caption
335
  inputs = caption_processor(images=image, return_tensors="pt").to(device)
336
  with torch.no_grad():
337
  out = caption_model.generate(**inputs, max_new_tokens=50)
338
- english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
339
-
340
- # Step 1.5: Safety Check
341
  if not is_caption_safe(english_caption):
342
- # Generate beep
343
  beep = make_beep_sound()
344
-
345
- # Generate warning speech
346
- tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
347
- speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
348
- tts.save(speech_tmp.name)
349
-
350
- # Combine beep + speech
351
- combined_audio = combine_audio(beep, speech_tmp.name)
352
-
353
- # Return combined audio automatically
354
  return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
355
 
356
-
357
-
358
  # Step 2: Translate
359
  if target_lang in translation_models:
360
  translated = translation_models[target_lang](english_caption)[0]['translation_text']
361
  else:
362
  translated = "Translation not available"
363
-
364
- # Step 3: Generate Speech (English caption)
365
- tts = gTTS(english_caption, lang="en")
366
- tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
367
- tts.save(tmp_file.name)
368
-
369
- return english_caption, translated, tmp_file.name
370
-
371
 
372
  # ----------------------
373
- # VQA
374
  # ----------------------
375
  def vqa_answer(image, question):
376
  inputs = vqa_processor(image, question, return_tensors="pt").to(device)
377
  with torch.no_grad():
378
  out = vqa_model.generate(**inputs, max_new_tokens=50)
379
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
380
-
381
  if not is_caption_safe(answer):
 
382
  beep = make_beep_sound()
383
- tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
384
- speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
385
- tts.save(speech_tmp.name)
386
- combined = combine_audio(beep, speech_tmp.name)
 
 
387
  return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
388
-
389
-
390
  return answer, None
391
 
392
-
393
  # ----------------------
394
- # Gradio UI
395
  # ----------------------
396
  with gr.Blocks(title="BLIP Vision App") as demo:
397
  gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
398
-
 
399
  # --- Caption + Translate + Speak ---
400
  with gr.Tab("Caption + Translate + Speak"):
401
  with gr.Row():
@@ -403,18 +662,19 @@ with gr.Blocks(title="BLIP Vision App") as demo:
403
  lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
404
  eng_out = gr.Textbox(label="English Caption")
405
  trans_out = gr.Textbox(label="Translated Caption")
406
- audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True) # Auto-plays TTS or beep
 
407
  btn1 = gr.Button("Generate Caption, Translate & Speak")
408
  btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
409
  outputs=[eng_out, trans_out, audio_out])
410
-
411
  # --- Visual Question Answering (VQA) ---
412
  with gr.Tab("Visual Question Answering (VQA)"):
413
  with gr.Row():
414
  img_vqa = gr.Image(type="pil", label="Upload Image")
415
  q_in = gr.Textbox(label="Ask a Question about the Image")
416
  ans_out = gr.Textbox(label="Answer")
417
- beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) # Auto-plays beep
418
  btn2 = gr.Button("Ask")
419
  btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
420
 
 
218
 
219
 
220
 
221
+ # import gradio as gr
222
+ # from transformers import (
223
+ # BlipProcessor,
224
+ # BlipForConditionalGeneration,
225
+ # BlipForQuestionAnswering,
226
+ # pipeline
227
+ # )
228
+ # from PIL import Image
229
+ # import torch
230
+ # from gtts import gTTS
231
+ # import tempfile
232
+ # import numpy as np
233
+ # import soundfile as sf
234
+ # import librosa
235
+ # import tempfile
236
+
237
+ # def combine_audio(beep_path, speech_path):
238
+ # """Combine beep + speech audio into one clip."""
239
+ # beep, sr1 = sf.read(beep_path)
240
+ # speech, sr2 = sf.read(speech_path)
241
+
242
+ # # Resample beep if needed
243
+ # if sr1 != sr2:
244
+ # beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
245
+ # sr1 = sr2
246
+
247
+ # # Convert multi-channel to mono
248
+ # if len(beep.shape) > 1:
249
+ # beep = beep.mean(axis=1)
250
+ # if len(speech.shape) > 1:
251
+ # speech = speech.mean(axis=1)
252
+
253
+ # # Concatenate beep + speech
254
+ # combined = np.concatenate((beep, speech))
255
+
256
+ # tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
257
+ # sf.write(tmp_file.name, combined, sr1)
258
+ # return tmp_file.name
259
+
260
+ # # ----------------------
261
+ # # Device setup
262
+ # # ----------------------
263
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
264
+
265
+ # # ----------------------
266
+ # # Load Models Once
267
+ # # ----------------------
268
+ # print("🔄 Loading models...")
269
+
270
+ # # Captioning
271
+ # caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
272
+ # caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
273
+
274
+ # # VQA
275
+ # vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
276
+ # vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
277
+
278
+ # # Translation
279
+ # translation_models = {
280
+ # "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
281
+ # "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
282
+ # "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
283
+ # }
284
+
285
+ # # Safety Moderation Pipeline
286
+ # moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
287
+
288
+ # print("✅ All models loaded!")
289
+
290
+
291
+ # # ----------------------
292
+ # # Utility: Generate a Beep Sound
293
+ # # ----------------------
294
+ # def make_beep_sound(duration=0.5, freq=1000):
295
+ # """Generate a short beep tone and save as temporary .wav file."""
296
+ # samplerate = 44100
297
+ # t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
298
+ # wave = 0.5 * np.sin(2 * np.pi * freq * t)
299
+ # tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
300
+ # sf.write(tmp_beep.name, wave, samplerate)
301
+ # return tmp_beep.name
302
+
303
+
304
+ # # ----------------------
305
+ # # Safety Filter Function
306
+ # # ----------------------
307
+ # def is_caption_safe(caption):
308
+ # try:
309
+ # votes = moderation_model(caption)
310
+ # if isinstance(votes, list) and isinstance(votes[0], list):
311
+ # votes = votes[0]
312
+ # for item in votes:
313
+ # if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
314
+ # return False
315
+ # except Exception as e:
316
+ # print("⚠️ Moderation failed:", e)
317
+
318
+ # unsafe_keywords = [
319
+ # "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
320
+ # "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
321
+ # "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
322
+ # "grenade", "horror", "beheaded", "torture", "hostage", "rape",
323
+ # "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
324
+ # ]
325
+ # if any(word in caption.lower() for word in unsafe_keywords):
326
+ # return False
327
+ # return True
328
+
329
+
330
+ # # ----------------------
331
+ # # Caption + Translate + Speak
332
+ # # ----------------------
333
+ # def generate_caption_translate_speak(image, target_lang):
334
+ # # Step 1: Caption
335
+ # inputs = caption_processor(images=image, return_tensors="pt").to(device)
336
+ # with torch.no_grad():
337
+ # out = caption_model.generate(**inputs, max_new_tokens=50)
338
+ # english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
339
+
340
+ # # Step 1.5: Safety Check
341
+ # if not is_caption_safe(english_caption):
342
+ # # Generate beep
343
+ # beep = make_beep_sound()
344
+
345
+ # # Generate warning speech
346
+ # tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
347
+ # speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
348
+ # tts.save(speech_tmp.name)
349
+
350
+ # # Combine beep + speech
351
+ # combined_audio = combine_audio(beep, speech_tmp.name)
352
+
353
+ # # Return combined audio automatically
354
+ # return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
355
+
356
+
357
+
358
+ # # Step 2: Translate
359
+ # if target_lang in translation_models:
360
+ # translated = translation_models[target_lang](english_caption)[0]['translation_text']
361
+ # else:
362
+ # translated = "Translation not available"
363
+
364
+ # # Step 3: Generate Speech (English caption)
365
+ # tts = gTTS(english_caption, lang="en")
366
+ # tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
367
+ # tts.save(tmp_file.name)
368
+
369
+ # return english_caption, translated, tmp_file.name
370
+
371
+
372
+ # # ----------------------
373
+ # # VQA
374
+ # # ----------------------
375
+ # def vqa_answer(image, question):
376
+ # inputs = vqa_processor(image, question, return_tensors="pt").to(device)
377
+ # with torch.no_grad():
378
+ # out = vqa_model.generate(**inputs, max_new_tokens=50)
379
+ # answer = vqa_processor.decode(out[0], skip_special_tokens=True)
380
+
381
+ # if not is_caption_safe(answer):
382
+ # beep = make_beep_sound()
383
+ # tts = gTTS("Warning! Unsafe or inappropriate content detected.", lang="en")
384
+ # speech_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
385
+ # tts.save(speech_tmp.name)
386
+ # combined = combine_audio(beep, speech_tmp.name)
387
+ # return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
388
+
389
+
390
+ # return answer, None
391
+
392
+
393
+ # # ----------------------
394
+ # # Gradio UI
395
+ # # ----------------------
396
+ # with gr.Blocks(title="BLIP Vision App") as demo:
397
+ # gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
398
+
399
+ # # --- Caption + Translate + Speak ---
400
+ # with gr.Tab("Caption + Translate + Speak"):
401
+ # with gr.Row():
402
+ # img_in = gr.Image(type="pil", label="Upload Image")
403
+ # lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
404
+ # eng_out = gr.Textbox(label="English Caption")
405
+ # trans_out = gr.Textbox(label="Translated Caption")
406
+ # audio_out = gr.Audio(label="Speech Output", type="filepath", autoplay=True) # Auto-plays TTS or beep
407
+ # btn1 = gr.Button("Generate Caption, Translate & Speak")
408
+ # btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
409
+ # outputs=[eng_out, trans_out, audio_out])
410
+
411
+ # # --- Visual Question Answering (VQA) ---
412
+ # with gr.Tab("Visual Question Answering (VQA)"):
413
+ # with gr.Row():
414
+ # img_vqa = gr.Image(type="pil", label="Upload Image")
415
+ # q_in = gr.Textbox(label="Ask a Question about the Image")
416
+ # ans_out = gr.Textbox(label="Answer")
417
+ # beep_out = gr.Audio(label="Alert Sound", type="filepath", autoplay=True) # Auto-plays beep
418
+ # btn2 = gr.Button("Ask")
419
+ # btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
420
+
421
+ # demo.launch()
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
  import gradio as gr
437
  from transformers import (
438
  BlipProcessor,
439
  BlipForConditionalGeneration,
440
  BlipForQuestionAnswering,
441
+ pipeline,
442
+ SpeechT5Processor, # <--- NEW
443
+ SpeechT5ForTextToSpeech, # <--- NEW
444
+ set_seed # <--- NEW
445
  )
446
+ from datasets import load_dataset # <--- NEW for speaker embedding
447
  from PIL import Image
448
  import torch
449
+ # from gtts import gTTS # <--- REMOVED
450
  import tempfile
451
  import numpy as np
452
  import soundfile as sf
453
  import librosa
454
  import tempfile
455
+ import time # <--- Added for potential cleanup, but mostly for future use
456
+
457
+ # Set seed for reproducibility in TTS generation
458
+ set_seed(42)
459
 
460
  def combine_audio(beep_path, speech_path):
461
  """Combine beep + speech audio into one clip."""
462
+ # ... (Keep this function as is)
463
  beep, sr1 = sf.read(beep_path)
464
  speech, sr2 = sf.read(speech_path)
465
+
466
  # Resample beep if needed
467
  if sr1 != sr2:
468
  beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
469
  sr1 = sr2
470
+
471
  # Convert multi-channel to mono
472
  if len(beep.shape) > 1:
473
  beep = beep.mean(axis=1)
474
  if len(speech.shape) > 1:
475
+ # Check if speech is stereo (channels > 1) and has data
476
+ if speech.ndim > 1:
477
+ speech = speech.mean(axis=1)
478
+ # Ensure speech is treated as a 1D array even if it was originally mono 2D
479
+ # For single channel (mono) soundfile output, it might be 2D with shape (N, 1)
480
 
481
  # Concatenate beep + speech
482
  combined = np.concatenate((beep, speech))
483
+
484
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
485
+ # SpeechT5 generates 16000Hz WAV, so we use sr1 (which is 16000) for the output
486
  sf.write(tmp_file.name, combined, sr1)
487
  return tmp_file.name
488
 
 
511
  "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
512
  }
513
 
514
+ # Text-to-Speech (TTS) Models # <--- NEW/MODIFIED
515
+ print(" Loading SpeechT5 TTS model...")
516
+ tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
517
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
518
+
519
+ # Load a speaker embedding (required for SpeechT5 to define a voice)
520
+ # Using a sample speaker x-vector from the CMU ARCTIC x-vectors dataset
521
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
522
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
523
+
524
  # Safety Moderation Pipeline
525
  moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 
526
  print("✅ All models loaded!")
527
 
528
+ # ----------------------
529
+ # Utility: Generate Local Speech (TTS) # <--- NEW FUNCTION
530
+ # ----------------------
531
def synthesize_speech_local(text, tts_processor, tts_model, speaker_embeddings, vocoder=None):
    """Synthesize speech for ``text`` with a local SpeechT5 model.

    Args:
        text: text to speak.
        tts_processor: SpeechT5 processor used to tokenize ``text``.
        tts_model: SpeechT5 text-to-speech model.
        speaker_embeddings: speaker x-vector tensor that selects the voice.
        vocoder: optional vocoder module that turns the predicted
            spectrogram into a waveform. When omitted, the
            ``microsoft/speecht5_hifigan`` vocoder is loaded once and reused.

    Returns:
        Path of a temporary 16 kHz .wav file containing the speech. The file
        is not deleted automatically.
    """
    if vocoder is None:
        vocoder = _default_speecht5_vocoder()

    inputs = tts_processor(text=text, return_tensors="pt").to(device)

    # generate_speech() has no sampling option (the previous ``do_sample``
    # keyword is not part of its signature), and it only returns a waveform
    # when a vocoder is supplied; otherwise the result is a mel spectrogram.
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder,
    )
    waveform = speech.cpu().numpy()

    # Only the unique name is needed; close the handle before writing so the
    # descriptor is not leaked.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        out_path = tmp_file.name
    sf.write(out_path, waveform, samplerate=16000)

    return out_path


# Single-slot cache so the vocoder weights are loaded at most once per process.
_SPEECHT5_VOCODER_CACHE = {}


def _default_speecht5_vocoder():
    """Return the shared SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    if "vocoder" not in _SPEECHT5_VOCODER_CACHE:
        # Imported here so this block does not depend on the file-level
        # import list being extended.
        from transformers import SpeechT5HifiGan

        _SPEECHT5_VOCODER_CACHE["vocoder"] = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        ).to(device)
    return _SPEECHT5_VOCODER_CACHE["vocoder"]
551
 
552
  # ----------------------
553
  # Utility: Generate a Beep Sound
554
  # ----------------------
555
  def make_beep_sound(duration=0.5, freq=1000):
556
  """Generate a short beep tone and save as temporary .wav file."""
557
+ # We use 16000Hz to match SpeechT5's output for combining audio later
558
+ samplerate = 16000
559
  t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
560
  wave = 0.5 * np.sin(2 * np.pi * freq * t)
561
  tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
562
  sf.write(tmp_beep.name, wave, samplerate)
563
  return tmp_beep.name
564
 
 
565
  # ----------------------
566
+ # Safety Filter Function (Keep as is)
567
  # ----------------------
568
  def is_caption_safe(caption):
569
+ # ... (Keep this function as is)
570
  try:
571
  votes = moderation_model(caption)
572
  if isinstance(votes, list) and isinstance(votes[0], list):
573
  votes = votes[0]
574
  for item in votes:
575
+ # Checking for 'V' or 'V2' (Violent, etc.) labels with high confidence
576
  if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
577
  return False
578
  except Exception as e:
579
  print("⚠️ Moderation failed:", e)
580
+
581
  unsafe_keywords = [
582
  "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
583
  "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
 
589
  return False
590
  return True
591
 
 
592
  # ----------------------
593
+ # Caption + Translate + Speak # <--- MODIFIED
594
  # ----------------------
595
  def generate_caption_translate_speak(image, target_lang):
596
  # Step 1: Caption
597
  inputs = caption_processor(images=image, return_tensors="pt").to(device)
598
  with torch.no_grad():
599
  out = caption_model.generate(**inputs, max_new_tokens=50)
600
+ english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
601
+
602
+ # Step 1.5: Safety Check (MODIFIED TTS)
603
  if not is_caption_safe(english_caption):
604
+ # Generate beep (16000Hz WAV)
605
  beep = make_beep_sound()
606
+
607
+ # Generate warning speech (16000Hz WAV)
608
+ warning_text = "Warning! Unsafe or inappropriate content detected."
609
+ speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
610
+
611
+ # Combine beep + speech
612
+ combined_audio = combine_audio(beep, speech_tmp_name)
613
+
614
+ # Return combined audio automatically
 
615
  return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
616
 
 
 
617
  # Step 2: Translate
618
  if target_lang in translation_models:
619
  translated = translation_models[target_lang](english_caption)[0]['translation_text']
620
  else:
621
  translated = "Translation not available"
622
+
623
+ # Step 3: Generate Speech (English caption) (MODIFIED TTS)
624
+ tmp_file_name = synthesize_speech_local(english_caption, tts_processor, tts_model, speaker_embeddings)
625
+
626
+ # The output is a .wav file now, but Gradio's Audio component is flexible
627
+ return english_caption, translated, tmp_file_name
 
 
628
 
629
  # ----------------------
630
+ # VQA # <--- MODIFIED
631
  # ----------------------
632
  def vqa_answer(image, question):
633
  inputs = vqa_processor(image, question, return_tensors="pt").to(device)
634
  with torch.no_grad():
635
  out = vqa_model.generate(**inputs, max_new_tokens=50)
636
  answer = vqa_processor.decode(out[0], skip_special_tokens=True)
637
+
638
  if not is_caption_safe(answer):
639
+ # Generate beep (16000Hz WAV)
640
  beep = make_beep_sound()
641
+
642
+ # Generate warning speech (16000Hz WAV)
643
+ warning_text = "Warning! Unsafe or inappropriate content detected."
644
+ speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
645
+
646
+ combined = combine_audio(beep, speech_tmp_name)
647
  return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
648
+
 
649
  return answer, None
650
 
 
651
  # ----------------------
652
+ # Gradio UI (Keep as is)
653
  # ----------------------
654
  with gr.Blocks(title="BLIP Vision App") as demo:
655
  gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
656
+ gr.Markdown("### Note: Text-to-Speech now uses a local HuggingFace model to prevent 'Too Many Requests' (429) errors.")
657
+
658
  # --- Caption + Translate + Speak ---
659
  with gr.Tab("Caption + Translate + Speak"):
660
  with gr.Row():
 
662
  lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
663
  eng_out = gr.Textbox(label="English Caption")
664
  trans_out = gr.Textbox(label="Translated Caption")
665
+ # Note: We changed the output to WAV but Gradio handles it fine.
666
+ audio_out = gr.Audio(label="Speech Output (WAV format)", type="filepath", autoplay=True)
667
  btn1 = gr.Button("Generate Caption, Translate & Speak")
668
  btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
669
  outputs=[eng_out, trans_out, audio_out])
670
+
671
  # --- Visual Question Answering (VQA) ---
672
  with gr.Tab("Visual Question Answering (VQA)"):
673
  with gr.Row():
674
  img_vqa = gr.Image(type="pil", label="Upload Image")
675
  q_in = gr.Textbox(label="Ask a Question about the Image")
676
  ans_out = gr.Textbox(label="Answer")
677
+ beep_out = gr.Audio(label="Alert Sound (WAV format)", type="filepath", autoplay=True)
678
  btn2 = gr.Button("Ask")
679
  btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
680