gopalagra committed
Commit 0a27bcd · verified · 1 Parent(s): c8bfce1

Update app.py

Files changed (1)
  1. app.py +43 -44
app.py CHANGED
@@ -228,27 +228,19 @@ import torch
 import tempfile
 import base64
 
+# ----------------------
+# Device
+# ----------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# -------------------------------
-# 100% VALID BEEP (base64)
-# -------------------------------
+# ----------------------
+# Simple BEEP sound (base64)
+# ----------------------
 BEEP_BASE64 = """
-SUQzAwAAAAAAFlRFTkMAAAAPAAACcQCAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-AAAAACH5BAEAAAAALAAAAAAQABAAAAj/AP8JHEiwoMGDCAcKHEixoMGDCBMqXMixoMGD
-ECMOGHAgxIABAQAh+QQBAAAAACwAAAAAEAAQAAAI/wD/CRxIsKDBgwgHChxIsKDBgwgT
-KlzIsaDBgxAjDhxIsKDBgwAhACH5BAEAAAAALAAAAAAQABAAAAj/AP8JHEiwoMGDCAcK
-HEixoMGDCBMqXMixoMGDECMOGHAgxIABAQAh+QQBAAAAACwAAAAAEAAQAAAI/wD/CRxI
-sKDBgwgHChxIsKDBgwgTKlzIsaDBgxAjDhxIsKDBgwAhACH5BAEAAAAALAAAAAAQABAA
-AAj/AP8JHEiwoMGDCAcKHEixoMGDCBMqXMixoMGDECMOGHAgxIABAQAh+QQBAAAAACwA
-AAAAEAAQAAAI/wD/CRxIsKDBgwgHChxIsKDBgwgTKlzIsaDBgxAjDhxIsKDBgwAhADs=
+SUQzAwAAAAAAF1RTU0UAAAAPAAADTGF2ZjU4LjMyLjEwNAAAAAAAAAAAAAAA//uQxAADB...
 """
 
+# Convert base64 to temp mp3 file
 def load_beep():
     audio_bytes = base64.b64decode(BEEP_BASE64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
@@ -256,10 +248,11 @@ def load_beep():
     tmp.close()
     return tmp.name
 
-# -------------------------------
-# Load Models
-# -------------------------------
-print("Loading models...")
+
+# ----------------------
+# Load models
+# ----------------------
+print("🔄 Loading models...")
 
 caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 caption_model = BlipForConditionalGeneration.from_pretrained(
@@ -279,25 +272,28 @@ translation_models = {
 
 moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
 
-print("Models loaded.")
+print(" All models loaded!")
+
 
-# -------------------------------
-# Safety Filter
-# -------------------------------
+# ----------------------
+# Safety check
+# ----------------------
 def is_caption_safe(caption):
     try:
         result = moderation_model(caption)
-        if result and result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
-            return False
+        if isinstance(result, list) and "label" in result[0]:
+            if result[0]["label"] == "toxic" and result[0]["score"] > 0.5:
+                return False
     except:
         pass
 
-    bad_words = ["kill", "gun", "blood", "weapon", "dead", "death"]
-    return not any(w in caption.lower() for w in bad_words)
+    unsafe_words = ["gun", "kill", "dead", "weapon", "blood"]
+    return not any(w in caption.lower() for w in unsafe_words)
+
 
-# -------------------------------
-# Auto Caption + Translate + Beep
-# -------------------------------
+# ----------------------
+# Auto Caption + Translate + BEEP
+# ----------------------
 def auto_process(image, target_lang):
     if image is None:
         return "", "", None
@@ -305,8 +301,8 @@ def auto_process(image, target_lang):
     # Caption
     inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = caption_model.generate(**inputs, max_new_tokens=40)
-    caption = caption_processor.decode(out[0], skip_special_tokens=True)
+        output = caption_model.generate(**inputs, max_new_tokens=40)
+    caption = caption_processor.decode(output[0], skip_special_tokens=True)
 
     # Safety
     if not is_caption_safe(caption):
@@ -315,14 +311,15 @@ def auto_process(image, target_lang):
     # Translate
     translated = translation_models[target_lang](caption)[0]["translation_text"]
 
-    # Always beep
-    beep_path = load_beep()
+    # Always play BEEP once caption is ready
+    beep_file = load_beep()
 
-    return caption, translated, beep_path
+    return caption, translated, beep_file
 
-# -------------------------------
+
+# ----------------------
 # VQA
-# -------------------------------
+# ----------------------
 def vqa_answer(image, question):
     if image is None or not question:
         return ""
@@ -338,20 +335,21 @@
 
     return ans
 
-# -------------------------------
+
+# ----------------------
 # UI
-# -------------------------------
-with gr.Blocks(title="BLIP Auto App") as demo:
+# ----------------------
+with gr.Blocks(title="BLIP App") as demo:
     gr.Markdown("## 🖼️ Auto-Caption + Translation + Automatic Beep")
 
     with gr.Tab("Auto Caption"):
         img = gr.Image(type="pil", label="Upload Image")
         lang = gr.Dropdown(["Hindi", "French", "Spanish"], value="Hindi", label="Translate To")
         out_eng = gr.Textbox(label="English Caption")
-        out_trans = gr.Textbox(label="Translated Caption")
+        out_trans = gr.Textbox(label="Translated")
         out_audio = gr.Audio(label="Beep", autoplay=True)
 
-        # Auto-run on image upload or language change
+        # 🔥 Auto-run when image is uploaded
         img.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
         lang.change(auto_process, inputs=[img, lang], outputs=[out_eng, out_trans, out_audio])
 
@@ -359,7 +357,8 @@ with gr.Blocks(title="BLIP Auto App") as demo:
         img_vqa = gr.Image(type="pil")
         q = gr.Textbox(label="Ask a question")
        ans = gr.Textbox(label="Answer")
-        gr.Button("Ask").click(vqa_answer, inputs=[img_vqa, q], outputs=ans)
+        ask_btn = gr.Button("Ask")
+        ask_btn.click(vqa_answer, inputs=[img_vqa, q], outputs=ans)
 
 demo.launch()
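The `@@ -256,10 +248,11 @@` hunk opens at `tmp.close()`, so the line that writes the decoded bytes into the temp file sits just outside the diff context. A minimal sketch of the complete helper, assuming that elided line is a plain `tmp.write` (the `BEEP_BASE64` payload is the module-level constant shown above, truncated here):

import base64
import tempfile

BEEP_BASE64 = "..."  # base64-encoded MP3 payload from app.py (truncated)

def load_beep():
    # Decode the embedded audio and persist it as a temporary .mp3 file.
    audio_bytes = base64.b64decode(BEEP_BASE64)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.write(audio_bytes)  # assumed: the line elided from the hunk context
    tmp.close()
    return tmp.name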
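Only the opening brace of `translation_models` appears, in the `@@ -279,25 +272,28 @@` hunk header; its entries lie outside the changed lines. Given the dropdown choices and the `[0]["translation_text"]` access in `auto_process`, a plausible sketch follows; the Helsinki-NLP model names are assumptions, not taken from the commit:

from transformers import pipeline

# Hypothetical definition; the diff confirms only that the dict exists and is
# keyed by the dropdown's language names.
translation_models = {
    "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
    "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
    "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
}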
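Likewise, the body of `vqa_answer` between the empty-input guard and `return ans` falls between hunks, so the model call itself is never shown. A sketch of how the answer is plausibly produced, assuming a BLIP VQA checkpoint; `Salesforce/blip-vqa-base`, `vqa_processor`, and `vqa_model` are assumptions, while `device` is the module-level constant from app.py:

import torch
from transformers import BlipProcessor, BlipForQuestionAnswering

# Assumed VQA components; how app.py actually loads them is outside the diff.
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

def vqa_answer(image, question):
    if image is None or not question:
        return ""
    # Encode the image/question pair and generate a short answer string.
    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
    with torch.no_grad():
        out = vqa_model.generate(**inputs, max_new_tokens=20)
    ans = vqa_processor.decode(out[0], skip_special_tokens=True)
    return ans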