gopalagra committed
Commit 4b9cede · verified · 1 Parent(s): f5fa8f7

Update app.py

Files changed (1)
  1. app.py +35 -33
app.py CHANGED
@@ -70,12 +70,15 @@ import gradio as gr
 from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
 from PIL import Image
 import torch
-from transformers import pipeline
+from gtts import gTTS
+import tempfile
+import os
+
 # ----------------------
-# Load BLIP (Large) for Captioning + VQA
+# Load BLIP (Large) for Captioning
 # ----------------------
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

 # ----------------------
 # Translation pipelines
@@ -87,61 +90,57 @@ translation_models = {
 }

 # ----------------------
-# Caption + Translate Function
+# Caption + Translate + Speak Function
 # ----------------------
-def generate_caption_translate(image, target_lang):
-    inputs = processor(images=image, return_tensors="pt")
-    out = model.generate(**inputs, max_new_tokens=50)
-    english_caption = processor.decode(out[0], skip_special_tokens=True)
+def generate_caption_translate_speak(image, target_lang):
+    # Step 1: Caption
+    inputs = caption_processor(images=image, return_tensors="pt")
+    out = caption_model.generate(**inputs, max_new_tokens=50)
+    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)

-    # Translate if chosen
+    # Step 2: Translate
     if target_lang in translation_models:
         translated = translation_models[target_lang](english_caption)[0]['translation_text']
     else:
         translated = "Translation not available"

-    return english_caption, translated
+    # Step 3: Generate Speech (English caption for now)
+    tts = gTTS(english_caption, lang="en")
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts.save(tmp_file.name)
+    audio_file = tmp_file.name
+
+    return english_caption, translated, audio_file

 # ----------------------
-# VQA Function (using same BLIP model)
+# VQA Function (using BLIP VQA)
 # ----------------------
-
-
 from transformers import BlipProcessor, BlipForQuestionAnswering
-from PIL import Image
-import torch

-# Load BLIP VQA
-processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda" if torch.cuda.is_available() else "cpu")
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda" if torch.cuda.is_available() else "cpu")

-# Function
 def vqa_answer(image, question):
-    # image is already a PIL Image (no need to open again)
-    inputs = processor(image, question, return_tensors="pt").to(model.device)
-    out = model.generate(**inputs, max_new_tokens=50)
-    answer = processor.decode(out[0], skip_special_tokens=True)
+    inputs = vqa_processor(image, question, return_tensors="pt").to(vqa_model.device)
+    out = vqa_model.generate(**inputs, max_new_tokens=50)
+    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
     return answer

-
-# Example
-# print(vqa_answer("baby.jpg", "What is the baby eating?"))
-
-
 # ----------------------
 # Gradio UI
 # ----------------------
 with gr.Blocks(title="BLIP Vision App") as demo:
-    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Question Answering")
+    gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA")

-    with gr.Tab("Caption + Translate"):
+    with gr.Tab("Caption + Translate + Speak"):
         with gr.Row():
             img_in = gr.Image(type="pil", label="Upload Image")
             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To")
         eng_out = gr.Textbox(label="English Caption")
         trans_out = gr.Textbox(label="Translated Caption")
-        btn1 = gr.Button("Generate Caption & Translate")
-        btn1.click(generate_caption_translate, inputs=[img_in, lang_in], outputs=[eng_out, trans_out])
+        audio_out = gr.Audio(label="Spoken Caption")
+        btn1 = gr.Button("Generate Caption, Translate & Speak")
+        btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])

     with gr.Tab("Visual Question Answering (VQA)"):
         with gr.Row():
@@ -152,3 +151,6 @@ with gr.Blocks(title="BLIP Vision App") as demo:
         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

 demo.launch()
+
+
+
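
A possible follow-up, sketched below and not part of this commit: Step 3 speaks only the English caption (the code comment says "English caption for now"), while gTTS also covers the three dropdown languages. GTTS_LANG_CODES and speak_text are hypothetical names introduced here for illustration; "hi", "fr", and "es" are gTTS's standard codes for Hindi, French, and Spanish.

# Sketch only, not part of this commit: speak the translated caption.
from gtts import gTTS
import tempfile

# Dropdown labels from the app mapped to gTTS language codes (assumed mapping).
GTTS_LANG_CODES = {"Hindi": "hi", "French": "fr", "Spanish": "es"}

def speak_text(text, target_lang):
    """Render text to an MP3 and return its path for gr.Audio."""
    lang = GTTS_LANG_CODES.get(target_lang, "en")  # fall back to English
    tts = gTTS(text, lang=lang)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(tmp.name)
    return tmp.name

Step 3 could then call speak_text(translated, target_lang) whenever a translation pipeline matched, and fall back to the English caption otherwise.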
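One more hypothetical sketch: the new imports pull in os, which the committed code never uses, and every click leaves behind a delete=False MP3. A minimal cleanup, assuming a single-user Space, could remove the previous clip before saving the next; _last_audio and save_speech are invented names.

# Sketch only: best-effort cleanup of the previously generated temp MP3.
import os
import tempfile

_last_audio = None  # path of the most recently generated clip (hypothetical)

def save_speech(tts):
    """Save a gTTS object to a fresh temp MP3, removing the previous one."""
    global _last_audio
    if _last_audio and os.path.exists(_last_audio):
        os.remove(_last_audio)  # drop the prior clip before writing a new one
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(tmp.name)
    _last_audio = tmp.name
    return _last_audio

In a multi-user deployment this is racy, since Gradio may still be serving the previous file; per-session temporary directories would be a safer choice.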