gopalagra committed on
Commit 1b286f6 · verified · 1 Parent(s): 5bed2ca

Update app.py

Files changed (1): app.py (+72 -84)
app.py CHANGED
@@ -223,146 +223,134 @@ from transformers import (

Before:

    BlipForQuestionAnswering,
    pipeline
)
import torch
import tempfile
import numpy as np
import soundfile as sf
from TTS.api import TTS
from PIL import Image

# ----------------------
# Device
# ----------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------
# Load Models
# ----------------------
print("🔄 Loading BLIP models...")
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base"
).to(device)
translation_models = {
    "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
    "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
    "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
}
moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
# Load TTS model (Hugging Face, offline)
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")

print("✅ All models loaded!")

# ----------------------
# Safety check
# ----------------------
def is_caption_safe(caption):
    try:
        result = moderation_model(caption)
        if isinstance(result, list) and "label" in result[0]:
            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
                return False
    except:
        pass

    unsafe_words = ["gun", "kill", "dead", "weapon", "blood", "suicide", "bomb"]
    return not any(w in caption.lower() for w in unsafe_words)

# ----------------------
# Beep Generator
# ----------------------
def generate_beep():
    sr = 44100
    duration = 0.4
    frequency = 880
    t = np.linspace(0, duration, int(sr * duration), False)
    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp.name, wave, sr)
    return tmp.name

# ----------------------
# TTS
# ----------------------
def offline_tts(text):
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tts_model.tts_to_file(text=text, file_path=tmp.name)
    return tmp.name

# ----------------------
# Caption + Translate + TTS
# ----------------------
def generate_caption_translate_speak(image, target_lang):
    if image is None:
        return "", "", None

    # Caption
    inputs = caption_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = caption_model.generate(**inputs, max_new_tokens=50)
    caption = caption_processor.decode(out[0], skip_special_tokens=True)

    # Safety check
    if not is_caption_safe(caption):
        beep_file = generate_beep()
        return "⚠️ Unsafe content detected!", "", beep_file

    # Translation
    translated = translation_models[target_lang](caption)[0]["translation_text"]

    # TTS only for safe caption
    audio_file = offline_tts(caption)

    return caption, translated, audio_file

# ----------------------
# VQA
# ----------------------
def vqa_answer(image, question):
    if image is None or not question:
        return ""
    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
    with torch.no_grad():
        out = vqa_model.generate(**inputs, max_new_tokens=50)
    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
    if not is_caption_safe(answer):
        return "⚠️ Unsafe content detected!"
    return answer

# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="BLIP Vision App") as demo:
    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA (with Safety)")

    with gr.Tab("Caption + Translate + Speak"):
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image")
            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")

        eng_out = gr.Textbox(label="English Caption")
        trans_out = gr.Textbox(label="Translated Caption")
        audio_out = gr.Audio(label="Speech / Warning Beep", type="filepath", autoplay=True)

        btn = gr.Button("Generate Caption, Translate & Speak")
        btn.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])

    with gr.Tab("Visual Question Answering (VQA)"):
        with gr.Row():
            img_vqa = gr.Image(type="pil")
            q_in = gr.Textbox(label="Ask About the Image")
        ans_out = gr.Textbox(label="Answer")
        btn2 = gr.Button("Ask")
        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

demo.launch()
 
After:

    BlipForQuestionAnswering,
    pipeline
)
from PIL import Image
import torch
from gtts import gTTS
import tempfile

# ----------------------
# Device setup
# ----------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------
# Load Models Once
# ----------------------
print("🔄 Loading models...")

# Captioning
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

# VQA
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# Translation
translation_models = {
    "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
    "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
    "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
}

# Safety Moderation Pipeline
moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

print("✅ All models loaded!")

# ----------------------
# Safety Filter Function
# ----------------------
def is_caption_safe(caption):
    try:
        votes = moderation_model(caption)
        # If return_all_scores=True, it's [[{label, score}, ...]]
        if isinstance(votes, list) and isinstance(votes[0], list):
            votes = votes[0]
        # Loop through scores
        for item in votes:
            if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
                return False
    except Exception as e:
        print("⚠️ Moderation failed:", e)

    # Fallback keyword check
    unsafe_keywords = [
        "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon", "fire",
        "murder", "dead", "death", "suicide", "bomb", "explosion", "terrorist", "assault",
        "stab", "shoot", "pistol", "rifle", "shotgun", "grenade", "horror", "beheaded",
        "torture", "hostage", "rape", "war", "massacre", "chainsaw", "poison", "strangle",
        "hang", "drown"
    ]
    if any(word in caption.lower() for word in unsafe_keywords):
        return False
    return True

# ----------------------
# Caption + Translate + Speak
# ----------------------
def generate_caption_translate_speak(image, target_lang):
    # Step 1: Caption
    inputs = caption_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = caption_model.generate(**inputs, max_new_tokens=50)
    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)

    # Step 1.5: Safety Check
    if not is_caption_safe(english_caption):
        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None

    # Step 2: Translate
    if target_lang in translation_models:
        translated = translation_models[target_lang](english_caption)[0]['translation_text']
    else:
        translated = "Translation not available"

    # Step 3: Generate Speech (English caption for now)
    tts = gTTS(english_caption, lang="en")
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(tmp_file.name)

    return english_caption, translated, tmp_file.name

# ----------------------
# VQA
# ----------------------
def vqa_answer(image, question):
    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
    with torch.no_grad():
        out = vqa_model.generate(**inputs, max_new_tokens=50)
    answer = vqa_processor.decode(out[0], skip_special_tokens=True)

    # Safety filter
    if not is_caption_safe(answer):
        return "⚠️ Warning: Unsafe or inappropriate content detected!"

    return answer

# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="BLIP Vision App") as demo:
    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")

    with gr.Tab("Caption + Translate + Speak"):
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image")
            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
        eng_out = gr.Textbox(label="English Caption")
        trans_out = gr.Textbox(label="Translated Caption")
        audio_out = gr.Audio(label="Spoken Caption", type="filepath")
        btn1 = gr.Button("Generate Caption, Translate & Speak")
        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])

    with gr.Tab("Visual Question Answering (VQA)"):
        with gr.Row():
            img_vqa = gr.Image(type="pil", label="Upload Image")
            q_in = gr.Textbox(label="Ask a Question about the Image")
        ans_out = gr.Textbox(label="Answer")
        btn2 = gr.Button("Ask")
        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

demo.launch()
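
A few review notes on the new code. First, the updated is_caption_safe matches moderation labels "V" and "V2", but unitary/toxic-bert is normally reported to emit the Jigsaw labels (toxic, severe_toxic, obscene, threat, insult, identity_hate), so that branch may never trigger and every caption would fall through to the keyword list. A minimal sketch for inspecting the labels the pipeline actually returns (same model as in the commit; top_k=None is the current replacement for the deprecated return_all_scores=True):

# Sketch: dump every label/score pair toxic-bert returns for a sample caption,
# so the threshold check can target labels that really exist (e.g. "toxic").
from transformers import pipeline

moderation_model = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    top_k=None,  # score all labels instead of only the top one
)

out = moderation_model("a man holding a gun")
scores = out[0] if isinstance(out[0], list) else out  # handle both return shapes
for item in scores:
    print(item["label"], round(item["score"], 3))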
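Second, the previous generate_caption_translate_speak returned early when no image was supplied; the new version passes None straight into caption_processor, so an empty click now raises instead of returning blanks (vqa_answer likewise dropped its image/question check). A minimal sketch that restores the old guard around the new handler (safe_generate is a hypothetical wrapper, not part of the commit):

# Sketch: re-add the old None-image guard; generate_caption_translate_speak
# is the handler defined in app.py above.
def safe_generate(image, target_lang):
    if image is None:  # empty click in the Gradio UI
        return "", "", None  # matches the (caption, translation, audio) outputs
    return generate_caption_translate_speak(image, target_lang)

Wiring btn1.click to safe_generate instead of the raw handler would keep the old version's behavior in the UI.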
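Finally, the commit swaps the offline Coqui TTS model for gTTS, which fetches audio from Google's TTS endpoint over the network, so speech generation now fails without internet access. A sketch of a fallback in the same style as the commit's moderation error handling (speak_or_none is a hypothetical helper):

# Sketch: wrap the gTTS call so a network failure degrades to "no audio"
# instead of crashing the Gradio handler.
import tempfile
from gtts import gTTS

def speak_or_none(text):
    try:
        tts = gTTS(text, lang="en")
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tts.save(tmp.name)
        return tmp.name  # filepath for gr.Audio
    except Exception as err:  # gTTS raises gTTSError on request failures
        print("⚠️ TTS failed:", err)
        return None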