gopalagra committed on
Commit
b53948f
·
verified ·
1 Parent(s): 9cf0535

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -58
app.py CHANGED
@@ -226,7 +226,7 @@ from transformers import (
226
  from PIL import Image
227
  import torch
228
  import tempfile
229
- import base64
230
 
231
  # ----------------------
232
  # Device setup
@@ -234,24 +234,29 @@ import base64
234
  device = "cuda" if torch.cuda.is_available() else "cpu"
235
 
236
  # ----------------------
237
- # Simple BEEP sound (base64)
238
  # ----------------------
239
- BEEP_BASE64 = """
240
- SUQzAwAAAAAAFlRFTkMAAAAPAAADdAAAABJBTUFEAAAAGwAAAG1kYXQAAAAA/////wABAAAC
241
- AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
242
- ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
243
- gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIC
244
- AgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
245
- ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
246
- gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgAAAA==
247
- """
248
-
249
-
250
- def load_beep():
251
- audio_bytes = base64.b64decode(BEEP_BASE64)
252
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
253
- tmp.write(audio_bytes)
254
- tmp.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  return tmp.name
256
 
257
  # ----------------------
@@ -282,42 +287,41 @@ print("✅ All models loaded!")
282
  # ----------------------
283
  # Safety check
284
  # ----------------------
285
- def is_caption_safe(caption):
286
  try:
287
- result = moderation_model(caption)
288
  if isinstance(result, list) and "label" in result[0]:
289
  if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
290
  return False
291
  except:
292
  pass
293
-
294
- # extra simple keyword check
295
- unsafe_words = ["gun", "kill", "dead", "weapon", "blood"]
296
- return not any(w in caption.lower() for w in unsafe_words)
297
 
298
  # ----------------------
299
- # Auto Caption + Translate + Optional BEEP
300
  # ----------------------
301
- def auto_process(image, target_lang):
302
  if image is None:
303
  return "", "", None
304
 
305
- # Caption
306
  inputs = caption_processor(images=image, return_tensors="pt").to(device)
307
  with torch.no_grad():
308
- output = caption_model.generate(**inputs, max_new_tokens=40)
309
-
310
  caption = caption_processor.decode(output[0], skip_special_tokens=True)
311
 
312
- # Safety
313
- if not is_caption_safe(caption):
314
- return "⚠️ Unsafe content detected!", "", load_beep()
315
 
316
  # Translate
317
  translated = translation_models[target_lang](caption)[0]["translation_text"]
318
 
319
- # SAFE No beep
320
- return caption, translated, None
 
 
321
 
322
  # ----------------------
323
  # VQA
@@ -329,37 +333,38 @@ def vqa_answer(image, question):
329
  inputs = vqa_processor(image, question, return_tensors="pt").to(device)
330
  with torch.no_grad():
331
  out = vqa_model.generate(**inputs, max_new_tokens=30)
 
332
 
333
- ans = vqa_processor.decode(out[0], skip_special_tokens=True)
334
-
335
- if not is_caption_safe(ans):
336
  return "⚠️ Unsafe content detected!"
337
-
338
- return ans
339
 
340
  # ----------------------
341
  # Gradio UI
342
  # ----------------------
343
- with gr.Blocks(title="BLIP App") as demo:
344
- gr.Markdown("## 🖼️ Auto-Caption + Translation + Safety Beep")
345
-
346
- with gr.Tab("Auto Caption"):
347
- img = gr.Image(type="pil", label="Upload Image")
348
- lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
349
- out_eng = gr.Textbox(label="English Caption")
350
- out_trans = gr.Textbox(label="Translated")
351
- out_audio = gr.Audio(label="Audio", type="filepath", autoplay=True)
352
-
353
- # Auto-run on image or language change
354
- img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
355
- lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
356
-
357
- with gr.Tab("VQA"):
358
- img_vqa = gr.Image(type="pil", label="Upload Image")
359
- q = gr.Textbox(label="Ask a question")
360
- ans = gr.Textbox(label="Answer")
361
- ask_btn = gr.Button("Ask")
362
- ask_btn.click(vqa_answer, inputs=[img_vqa, q], outputs=ans)
 
 
 
363
 
364
  demo.launch()
365
 
 
226
  from PIL import Image
227
  import torch
228
  import tempfile
229
+ import pyttsx3 # offline TTS
230
 
231
  # ----------------------
232
  # Device setup
 
234
  device = "cuda" if torch.cuda.is_available() else "cpu"
235
 
236
  # ----------------------
237
+ # Offline TTS for safe captions
238
  # ----------------------
239
def offline_tts(text):
    """Synthesize ``text`` to a temporary audio file using pyttsx3.

    Args:
        text: The text to be spoken.

    Returns:
        Path of the temporary file pyttsx3 was asked to write. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    # Only the path is needed. Close our own handle before pyttsx3 writes to
    # it: leaving it open leaks a descriptor on every call, and an open named
    # temporary file cannot be opened a second time on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name

    # NOTE(review): the container format pyttsx3 writes depends on the active
    # driver; confirm that the ".mp3" suffix matches what is produced.
    engine = pyttsx3.init()
    engine.save_to_file(text, audio_path)
    engine.runAndWait()
    return audio_path
245
+
246
+ # ----------------------
247
+ # Simple BEEP sound
248
+ # ----------------------
249
def generate_beep():
    """Write a short sine-wave beep to a temporary WAV file.

    The tone is 880 Hz, 0.3 seconds long, at half amplitude, sampled at
    44100 Hz.

    Returns:
        Path of the temporary ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    import numpy as np
    import soundfile as sf

    sr = 44100
    duration = 0.3
    freq = 880
    # endpoint=False keeps the sample spacing at exactly 1/sr.
    t = np.linspace(0, duration, int(sr * duration), False)
    wave = 0.5 * np.sin(2 * np.pi * freq * t)

    # Reserve a path, then release our handle before soundfile opens the same
    # file; otherwise the descriptor leaks and the write fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        beep_path = tmp.name

    sf.write(beep_path, wave, sr)
    return beep_path
261
 
262
  # ----------------------
 
287
  # ----------------------
288
  # Safety check
289
  # ----------------------
290
def is_safe(text):
    """Return ``True`` when ``text`` passes both safety checks.

    The text is first passed to ``moderation_model``; a top result labelled
    "toxic" with a score above 0.5 marks it unsafe. Regardless of whether the
    model could be consulted, the text is then checked against a small list
    of unsafe keywords using case-insensitive substring matching.

    Args:
        text: The caption or answer to check.

    Returns:
        ``False`` if the model flags the text or any keyword occurs in it,
        otherwise ``True``.
    """
    try:
        result = moderation_model(text)
        if isinstance(result, list) and result and "label" in result[0]:
            if result[0]["label"].lower() == "toxic" and result[0]["score"] > 0.5:
                return False
    except Exception:
        # The model check is best effort: fall through to the keyword filter,
        # but leave a trace instead of failing silently. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        import logging

        logging.getLogger(__name__).warning(
            "Moderation model check failed; using keyword filter only",
            exc_info=True,
        )

    # Substring matching is intentional so that variants such as "guns" or
    # "shotgun" are also caught.
    unsafe_keywords = ("gun", "kill", "dead", "blood", "weapon")
    lowered = text.lower()
    return not any(k in lowered for k in unsafe_keywords)
 
 
300
 
301
  # ----------------------
302
+ # Caption + Translate + Audio
303
  # ----------------------
304
def _audio_or_none(make_audio, *args):
    """Return ``make_audio(*args)``, or ``None`` if audio generation fails."""
    try:
        return make_audio(*args)
    except Exception:
        import logging

        logging.getLogger(__name__).warning(
            "Audio generation failed; returning no audio", exc_info=True
        )
        return None


def generate_caption_translate_speak(image, target_lang):
    """Caption an image, translate the caption, and produce audio for it.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        target_lang: Key into ``translation_models`` selecting the translator.

    Returns:
        A ``(caption, translation, audio_path)`` tuple:

        * ``("", "", None)`` when ``image`` is ``None``;
        * the warning text, an empty translation, and a beep file when the
          caption fails ``is_safe``;
        * otherwise the English caption, its translation, and a spoken
          rendering of the caption.

        ``audio_path`` is ``None`` whenever the audio step itself fails, so
        that an audio problem does not discard the text results.
    """
    if image is None:
        return "", "", None

    # Generate caption
    inputs = caption_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = caption_model.generate(**inputs, max_new_tokens=50)
    caption = caption_processor.decode(output[0], skip_special_tokens=True)

    # Safety check: unsafe captions are replaced by a warning plus a beep.
    if not is_safe(caption):
        return "⚠️ Unsafe content detected!", "", _audio_or_none(generate_beep)

    # Translate
    translated = translation_models[target_lang](caption)[0]["translation_text"]

    # Speak the safe caption; audio is auxiliary, so tolerate TTS failures.
    audio_file = _audio_or_none(offline_tts, caption)

    return caption, translated, audio_file
325
 
326
  # ----------------------
327
  # VQA
 
333
  inputs = vqa_processor(image, question, return_tensors="pt").to(device)
334
  with torch.no_grad():
335
  out = vqa_model.generate(**inputs, max_new_tokens=30)
336
+ answer = vqa_processor.decode(out[0], skip_special_tokens=True)
337
 
338
+ if not is_safe(answer):
 
 
339
  return "⚠️ Unsafe content detected!"
340
+ return answer
 
341
 
342
  # ----------------------
343
  # Gradio UI
344
  # ----------------------
345
# Two-tab interface: the captioning pipeline and visual question answering.
with gr.Blocks(title="BLIP Vision App") as demo:
    gr.Markdown("## 🖼️ BLIP: Caption + Translation + TTS + VQA")

    # Tab 1: image + target language in, caption / translation / audio out.
    with gr.Tab("Caption + Translate + Speak"):
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image")
            lang_in = gr.Dropdown(
                ["Hindi", "French", "Spanish"],
                value="Hindi",
                label="Translate To",
            )

        eng_out = gr.Textbox(label="English Caption")
        trans_out = gr.Textbox(label="Translated Caption")
        audio_out = gr.Audio(label="Audio / Beep", type="filepath", autoplay=True)

        btn = gr.Button("Generate Caption, Translate & Speak")
        btn.click(
            generate_caption_translate_speak,
            inputs=[img_in, lang_in],
            outputs=[eng_out, trans_out, audio_out],
        )

    # Tab 2: image + free-text question in, answer out.
    with gr.Tab("Visual Question Answering (VQA)"):
        with gr.Row():
            img_vqa = gr.Image(type="pil")
            q_in = gr.Textbox(label="Ask About the Image")

        ans_out = gr.Textbox(label="Answer")
        btn2 = gr.Button("Ask")
        btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

demo.launch()
370