gopalagra committed
Commit 1add24e · verified · 1 Parent(s): 4b9cede

Update app.py

Files changed (1)
  1. app.py +59 -22
app.py CHANGED
@@ -67,37 +67,74 @@
 # # demo.launch(share=True)
 
 import gradio as gr
-from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    BlipForQuestionAnswering,
+    pipeline
+)
 from PIL import Image
 import torch
 from gtts import gTTS
 import tempfile
-import os
 
 # ----------------------
-# Load BLIP (Large) for Captioning
+# Device setup
 # ----------------------
-caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ----------------------
-# Translation pipelines
+# Load Models Once
 # ----------------------
+print("🔄 Loading models...")
+
+# Captioning
+caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+
+# VQA
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+# Translation
 translation_models = {
     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
 }
 
+# Safety Moderation Pipeline
+moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
+
+print("✅ All models loaded!")
+
+# ----------------------
+# Safety Filter Function
+# ----------------------
+def is_caption_safe(caption):
+    result = moderation_model(caption)[0]
+    label = result["label"]
+    score = result["score"]
+
+    # toxic-bert gives "toxic" or "non-toxic"
+    if label.lower() == "toxic" and score > 0.7:
+        return False
+    return True
+
 # ----------------------
-# Caption + Translate + Speak Function
+# Caption + Translate + Speak
 # ----------------------
 def generate_caption_translate_speak(image, target_lang):
     # Step 1: Caption
-    inputs = caption_processor(images=image, return_tensors="pt")
-    out = caption_model.generate(**inputs, max_new_tokens=50)
+    inputs = caption_processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = caption_model.generate(**inputs, max_new_tokens=50)
     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
 
+    # Step 1.5: Safety Check
+    if not is_caption_safe(english_caption):
+        return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None
+
     # Step 2: Translate
     if target_lang in translation_models:
         translated = translation_models[target_lang](english_caption)[0]['translation_text']
@@ -108,37 +145,37 @@ def generate_caption_translate_speak(image, target_lang):
     tts = gTTS(english_caption, lang="en")
     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
     tts.save(tmp_file.name)
-    audio_file = tmp_file.name
 
-    return english_caption, translated, audio_file
+    return english_caption, translated, tmp_file.name
 
 # ----------------------
-# VQA Function (using BLIP VQA)
+# VQA
 # ----------------------
-from transformers import BlipProcessor, BlipForQuestionAnswering
-
-vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda" if torch.cuda.is_available() else "cpu")
-
 def vqa_answer(image, question):
-    inputs = vqa_processor(image, question, return_tensors="pt").to(vqa_model.device)
-    out = vqa_model.generate(**inputs, max_new_tokens=50)
+    inputs = vqa_processor(image, question, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = vqa_model.generate(**inputs, max_new_tokens=50)
     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+
+    # Run safety filter on answers too
+    if not is_caption_safe(answer):
+        return "⚠️ Warning: Unsafe or inappropriate content detected!"
+
     return answer
 
 # ----------------------
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA")
+    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")
 
     with gr.Tab("Caption + Translate + Speak"):
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image")
-            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To")
+            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
        eng_out = gr.Textbox(label="English Caption")
        trans_out = gr.Textbox(label="Translated Caption")
-        audio_out = gr.Audio(label="Spoken Caption")
+        audio_out = gr.Audio(label="Spoken Caption", type="filepath")
        btn1 = gr.Button("Generate Caption, Translate & Speak")
        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])
 
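
For reference, a minimal sketch of how the updated module could be exercised locally. It assumes the file above is saved as app.py in the working directory, that gradio, transformers, torch, Pillow, and gTTS are installed, and that "example.jpg" is a placeholder image path; since the diff keeps the launch line commented out, the Gradio server is started explicitly here. This is an illustrative usage sketch, not part of the commit.

# Hypothetical smoke test for the updated app.py (not part of the commit)
from PIL import Image
import app  # importing runs the module-level model loading shown in the diff

img = Image.open("example.jpg")  # placeholder path, assumed for illustration

# Caption -> safety filter -> translation -> gTTS audio file path
caption, translation, audio_path = app.generate_caption_translate_speak(img, "Hindi")
print(caption, translation, audio_path)

# Visual question answering with the shared BLIP VQA model
print(app.vqa_answer(img, "What is in the picture?"))

# Serve the Gradio UI defined in the gr.Blocks block
app.demo.launch()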