gopalagra committed on
Commit d2ac3ec · verified · 1 Parent(s): b53948f

Update app.py

Files changed (1)
  1. app.py +50 -53
app.py CHANGED
@@ -223,47 +223,22 @@ from transformers import (
     BlipForQuestionAnswering,
     pipeline
 )
-from PIL import Image
 import torch
 import tempfile
-import pyttsx3  # offline TTS
+import numpy as np
+import soundfile as sf
+from TTS.api import TTS
+from PIL import Image
 
 # ----------------------
-# Device setup
+# Device
 # ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ----------------------
-# Offline TTS for safe captions
-# ----------------------
-def offline_tts(text):
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    engine = pyttsx3.init()
-    engine.save_to_file(text, tmp.name)
-    engine.runAndWait()
-    return tmp.name
-
-# ----------------------
-# Simple BEEP sound
-# ----------------------
-def generate_beep():
-    import numpy as np
-    import soundfile as sf
-
-    sr = 44100
-    duration = 0.3
-    freq = 880
-    t = np.linspace(0, duration, int(sr*duration), False)
-    wave = 0.5*np.sin(2*np.pi*freq*t)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, wave, sr)
-    return tmp.name
-
-# ----------------------
-# Load models
+# Load Models
 # ----------------------
-print("🔄 Loading models...")
-
+print("🔄 Loading BLIP models...")
 caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 caption_model = BlipForConditionalGeneration.from_pretrained(
     "Salesforce/blip-image-captioning-large"
@@ -282,43 +257,69 @@ translation_models = {
 
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 
+# Load TTS model (Hugging Face, offline)
+tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+
 print("✅ All models loaded!")
 
 # ----------------------
 # Safety check
 # ----------------------
-def is_safe(text):
+def is_caption_safe(caption):
     try:
-        result = moderation_model(text)
+        result = moderation_model(caption)
         if isinstance(result, list) and "label" in result[0]:
-            if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
+            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
                 return False
     except:
         pass
-    unsafe_keywords = ["gun", "kill", "dead", "blood", "weapon"]
-    return not any(k in text.lower() for k in unsafe_keywords)
+
+    unsafe_words = ["gun", "kill", "dead", "weapon", "blood", "suicide", "bomb"]
+    return not any(w in caption.lower() for w in unsafe_words)
+
+# ----------------------
+# Beep Generator
+# ----------------------
+def generate_beep():
+    sr = 44100
+    duration = 0.4
+    frequency = 880
+    t = np.linspace(0, duration, int(sr * duration), False)
+    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp.name, wave, sr)
+    return tmp.name
 
 # ----------------------
-# Caption + Translate + Audio
+# TTS
+# ----------------------
+def offline_tts(text):
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    tts_model.tts_to_file(text=text, file_path=tmp.name)
+    return tmp.name
+
+# ----------------------
+# Caption + Translate + TTS
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
     if image is None:
         return "", "", None
 
-    # Generate caption
+    # Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        output = caption_model.generate(**inputs, max_new_tokens=50)
-    caption = caption_processor.decode(output[0], skip_special_tokens=True)
+        out = caption_model.generate(**inputs, max_new_tokens=50)
+    caption = caption_processor.decode(out[0], skip_special_tokens=True)
 
     # Safety check
-    if not is_safe(caption):
-        return "⚠️ Unsafe content detected!", "", generate_beep()
+    if not is_caption_safe(caption):
+        beep_file = generate_beep()
+        return "⚠️ Unsafe content detected!", "", beep_file
 
-    # Translate
+    # Translation
     translated = translation_models[target_lang](caption)[0]["translation_text"]
 
-    # Generate TTS for safe caption
+    # TTS only for safe caption
     audio_file = offline_tts(caption)
 
     return caption, translated, audio_file
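Note: is_safe becomes is_caption_safe, keeping the two-stage check (unitary/toxic-bert classifier, then a keyword fallback), and the beep/TTS helpers now live beside it. A quick sketch for inspecting the raw classifier output that the label == "toxic" / score > 0.5 threshold acts on; the sample strings and the printed score are illustrative:

from transformers import pipeline

# The pipeline returns a list of {"label": ..., "score": ...} dicts for each input text.
moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
for text in ["a child playing with a dog", "a man pointing a gun at the camera"]:
    print(text, "->", moderation_model(text))  # e.g. [{'label': 'toxic', 'score': 0.98}]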
@@ -329,13 +330,11 @@ def generate_caption_translate_speak(image, target_lang):
 def vqa_answer(image, question):
     if image is None or not question:
         return ""
-
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = vqa_model.generate(**inputs, max_new_tokens=30)
+        out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
-    if not is_safe(answer):
+    if not is_caption_safe(answer):
         return "⚠️ Unsafe content detected!"
     return answer
 
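Note: vqa_answer now allows up to 50 new tokens and filters answers through the same safety check. For reference, a standalone sketch of the underlying BLIP-VQA call; the checkpoint id is an assumption, since vqa_processor and vqa_model are defined in an unchanged part of app.py:

from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# Assumed checkpoint for illustration; app.py may load a different BLIP-VQA model.
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

image = Image.open("example.jpg").convert("RGB")
inputs = vqa_processor(image, "what animal is this?", return_tensors="pt")
out = vqa_model.generate(**inputs, max_new_tokens=50)
print(vqa_processor.decode(out[0], skip_special_tokens=True))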
@@ -343,7 +342,7 @@ def vqa_answer(image, question):
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA")
+    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA (with Safety)")
 
     with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
@@ -352,7 +351,7 @@ with gr.Blocks(title="BLIP Vision App") as demo:
 
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="Audio / Beep", type="filepath", autoplay=True)
+        audio_out = gr.Audio(label="Speech / Warning Beep", type="filepath", autoplay=True)
 
         btn = gr.Button("Generate Caption, Translate & Speak")
         btn.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
@@ -361,14 +360,12 @@ with gr.Blocks(title="BLIP Vision App") as demo:
         with gr.Row():
             img_vqa = gr.Image(type="pil")
             q_in = gr.Textbox(label="Ask About the Image")
-
         ans_out = gr.Textbox(label="Answer")
         btn2 = gr.Button("Ask")
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
 
 demo.launch()
 
-
 
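Note: with pyttsx3 gone, the Space's dependencies change as well. A hypothetical requirements.txt consistent with the imports now in app.py; it is not part of this commit, and versions should be pinned as needed:

gradio
torch
transformers
Pillow
numpy
soundfile
TTS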