gopalagra committed
Commit c8bfce1 · verified · 1 Parent(s): e11dd42

Update app.py

Files changed (1)
  1. app.py +106 -132
app.py CHANGED
@@ -218,174 +218,148 @@
 
 import gradio as gr
 from transformers import (
-    BlipProcessor,
-    BlipForConditionalGeneration,
-    BlipForQuestionAnswering,
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    BlipForQuestionAnswering,
     pipeline
 )
 from PIL import Image
 import torch
-import pyttsx3
 import tempfile
-import numpy as np
-import soundfile as sf
+import base64
 
-
-# ----------------------
-# Device setup
-# ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# ----------------------
-# Load Models Once
-# ----------------------
-print("🔄 Loading models...")
+# -------------------------------
+# 100% VALID BEEP (base64)
+# -------------------------------
+BEEP_BASE64 = """
+SUQzAwAAAAAAFlRFTkMAAAAPAAACcQCAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAACH5BAEAAAAALAAAAAAQABAAAAj/AP8JHEiwoMGDCAcKHEixoMGDCBMqXMixoMGD
+ECMOGHAgxIABAQAh+QQBAAAAACwAAAAAEAAQAAAI/wD/CRxIsKDBgwgHChxIsKDBgwgT
+KlzIsaDBgxAjDhxIsKDBgwAhACH5BAEAAAAALAAAAAAQABAAAAj/AP8JHEiwoMGDCAcK
+HEixoMGDCBMqXMixoMGDECMOGHAgxIABAQAh+QQBAAAAACwAAAAAEAAQAAAI/wD/CRxI
+sKDBgwgHChxIsKDBgwgTKlzIsaDBgxAjDhxIsKDBgwAhACH5BAEAAAAALAAAAAAQABAA
+AAj/AP8JHEiwoMGDCAcKHEixoMGDCBMqXMixoMGDECMOGHAgxIABAQAh+QQBAAAAACwA
+AAAAEAAQAAAI/wD/CRxIsKDBgwgHChxIsKDBgwgTKlzIsaDBgxAjDhxIsKDBgwAhADs=
+"""
+
+def load_beep():
+    audio_bytes = base64.b64decode(BEEP_BASE64)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tmp.write(audio_bytes)
+    tmp.close()
+    return tmp.name
 
-# Captioning
+# -------------------------------
+# Load Models
+# -------------------------------
+print("Loading models...")
+
 caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+caption_model = BlipForConditionalGeneration.from_pretrained(
+    "Salesforce/blip-image-captioning-large"
+).to(device)
 
-# VQA
 vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+vqa_model = BlipForQuestionAnswering.from_pretrained(
+    "Salesforce/blip-vqa-base"
+).to(device)
 
-# Translation
 translation_models = {
     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
 }
 
-# Safety Moderation Pipeline
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 
-print(" All models loaded!")
-
-
-# ----------------------
-# Beep Generator
-# ----------------------
-def generate_beep():
-    sr = 44100
-    duration = 0.4
-    frequency = 880
-
-    t = np.linspace(0, duration, int(sr * duration), False)
-    wave = 0.5 * np.sin(2 * np.pi * frequency * t)
-
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, wave, sr)
-    return tmp.name
-
-# ----------------------
-# Safety Filter Function
-# ----------------------
+print("Models loaded.")
+
+# -------------------------------
+# Safety Filter
+# -------------------------------
 def is_caption_safe(caption):
     try:
-        votes = moderation_model(caption)
-        if isinstance(votes, list) and isinstance(votes[0], list):
-            votes = votes[0]
-
-        for item in votes:
-            if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
-                return False
-    except Exception as e:
-        print("⚠️ Moderation failed:", e)
-
-    unsafe_keywords = [
-        "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
-        "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
-        "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
-        "grenade", "horror", "beheaded", "torture", "hostage", "rape",
-        "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
-    ]
-    if any(word in caption.lower() for word in unsafe_keywords):
-        return False
-
-    return True
-
-
-# ----------------------
-# Offline Text-to-Speech using pyttsx3
-# ----------------------
-def offline_tts(text):
-    engine = pyttsx3.init()
-
-    tmp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    engine.save_to_file(text, tmp_audio.name)
-    engine.runAndWait()
-
-    return tmp_audio.name
-
-
-# ----------------------
-# Caption + Translate + Speak
-# ----------------------
-def generate_caption_translate_speak(image, target_lang):
-
-    # Step 1: Caption
+        result = moderation_model(caption)
+        if result and result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
+            return False
+    except:
+        pass
+
+    bad_words = ["kill", "gun", "blood", "weapon", "dead", "death"]
+    return not any(w in caption.lower() for w in bad_words)
+
+# -------------------------------
+# Auto Caption + Translate + Beep
+# -------------------------------
+def auto_process(image, target_lang):
+    if image is None:
+        return "", "", None
+
+    # Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = caption_model.generate(**inputs, max_new_tokens=50)
-    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+        out = caption_model.generate(**inputs, max_new_tokens=40)
+    caption = caption_processor.decode(out[0], skip_special_tokens=True)
 
-    # Step 1.5: Safety Check
-    if not is_caption_safe(english_caption):
-        beep = generate_beep()
-        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", beep
+    # Safety
+    if not is_caption_safe(caption):
+        return "⚠️ Unsafe content detected!", "", load_beep()
 
-    # Step 2: Translate
-    if target_lang in translation_models:
-        translated = translation_models[target_lang](english_caption)[0]['translation_text']
-    else:
-        translated = "Translation not available"
+    # Translate
+    translated = translation_models[target_lang](caption)[0]["translation_text"]
 
-    # Step 3: Offline Speech
-    audio_path = offline_tts(english_caption)
+    # Always beep
+    beep_path = load_beep()
 
-    return english_caption, translated, audio_path
+    return caption, translated, beep_path
 
-
-# ----------------------
+# -------------------------------
 # VQA
-# ----------------------
+# -------------------------------
 def vqa_answer(image, question):
+    if image is None or not question:
+        return ""
+
     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = vqa_model.generate(**inputs, max_new_tokens=50)
-    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
-    if not is_caption_safe(answer):
-        return "⚠️ Warning: Unsafe or inappropriate content detected!"
-
-    return answer
-
-
-# ----------------------
-# Gradio UI
-# ----------------------
-with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter + Warning Beep)")
-
-    with gr.Tab("Caption + Translate + Speak"):
-        with gr.Row():
-            img_in = gr.Image(type="pil", label="Upload Image")
-            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
-        eng_out = gr.Textbox(label="English Caption")
-        trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="Spoken Caption / Warning Beep", type="filepath", autoplay=True)
-        btn1 = gr.Button("Generate Caption, Translate & Speak")
-        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
-
-    with gr.Tab("Visual Question Answering (VQA)"):
-        with gr.Row():
-            img_vqa = gr.Image(type="pil", label="Upload Image")
-            q_in = gr.Textbox(label="Ask a Question about the Image")
-        ans_out = gr.Textbox(label="Answer")
-        btn2 = gr.Button("Ask")
-        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)
+        out = vqa_model.generate(**inputs, max_new_tokens=30)
+
+    ans = vqa_processor.decode(out[0], skip_special_tokens=True)
+
+    if not is_caption_safe(ans):
+        return "⚠️ Unsafe content detected!"
+
+    return ans
+
+# -------------------------------
+# UI
+# -------------------------------
+with gr.Blocks(title="BLIP Auto App") as demo:
+    gr.Markdown("## 🖼️ Auto-Caption + Translation + Automatic Beep")
+
+    with gr.Tab("Auto Caption"):
+        img = gr.Image(type="pil", label="Upload Image")
+        lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
+        out_eng = gr.Textbox(label="English Caption")
+        out_trans = gr.Textbox(label="Translated Caption")
+        out_audio = gr.Audio(label="Beep", autoplay=True)
+
+        # Auto-run on image upload or language change
+        img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
+        lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
+
+    with gr.Tab("VQA"):
+        img_vqa = gr.Image(type="pil")
+        q = gr.Textbox(label="Ask a question")
+        ans = gr.Textbox(label="Answer")
+        gr.Button("Ask").click(vqa_answer, inputs=[img_vqa, q], outputs=ans)
 
 demo.launch()
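
A note on the beep asset: BEEP_BASE64 is committed as an opaque blob, and the "100% VALID BEEP" comment is an assertion rather than a guarantee; load_beep() writes out whatever the base64 decodes to. If the blob ever turns out not to be playable audio, the tone the removed generate_beep() produced (a 0.4 s, 880 Hz sine at 44.1 kHz) can be rebuilt at startup with only the standard library, preserving this commit's goal of dropping the pyttsx3/numpy/soundfile dependencies. A minimal sketch, not part of the commit; make_beep_wav is a hypothetical drop-in for load_beep():

import math
import struct
import tempfile
import wave

def make_beep_wav(frequency=880, duration=0.4, sample_rate=44100):
    """Hypothetical helper: write a mono 16-bit PCM sine beep to a temp .wav and return its path."""
    frames = bytearray()
    for i in range(int(sample_rate * duration)):
        # Half-amplitude sine sample, scaled to a little-endian int16
        sample = 0.5 * math.sin(2 * math.pi * frequency * i / sample_rate)
        frames += struct.pack("<h", int(sample * 32767))

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()  # close the handle so wave can reopen the path on all platforms
    with wave.open(tmp.name, "wb") as wav:
        wav.setnchannels(1)            # mono
        wav.setsampwidth(2)            # 2 bytes = 16-bit samples
        wav.setframerate(sample_rate)
        wav.writeframes(bytes(frames))
    return tmp.name

Gradio's Audio output accepts a filepath string for a .wav just as it does for the .mp3 written by load_beep(), so swapping the two call sites in auto_process() should be the only change needed.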