gopalagra committed
Commit d18aa71 · verified · 1 Parent(s): cd4d77a

Update app.py

Files changed (1)
  1. app.py +57 -36
app.py CHANGED
@@ -69,59 +69,80 @@
  import gradio as gr
  from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
  from PIL import Image
+ import torch

- # Load BLIP2 for captioning
- processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
- blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
-
+ # ----------------------
+ # Load BLIP2 for Captioning
+ # ----------------------
+ caption_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ caption_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
+
+ # ----------------------
+ # Load BLIP2 for VQA
+ # ----------------------
+ vqa_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ vqa_model = Blip2ForConditionalGeneration.from_pretrained(
+     "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16, device_map="auto"
+ )
+
+ # ----------------------
  # Translation pipelines
+ # ----------------------
  translation_models = {
      "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
      "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
      "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
  }

- # Language model for reasoning/Q&A
- qa_model = pipeline("text2text-generation", model="google/flan-t5-large")
-
- def caption_translate_vqa(image, target_lang, question):
-     # Step 1: Generate English caption
-     inputs = processor(image, return_tensors="pt")
-     out = blip_model.generate(**inputs, max_new_tokens=50)
-     english_caption = processor.decode(out[0], skip_special_tokens=True)
-
-     # Step 2: Translate caption
+ # ----------------------
+ # Caption + Translate Function
+ # ----------------------
+ def generate_caption_translate(image, target_lang):
+     inputs = caption_processor(image, return_tensors="pt")
+     out = caption_model.generate(**inputs, max_new_tokens=50)
+     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
+
+     # Translate if chosen
      if target_lang in translation_models:
          translated = translation_models[target_lang](english_caption)[0]['translation_text']
      else:
          translated = "Translation not available"

-     # Step 3: Image Q&A using caption + question
-     if question and len(question.strip()) > 0:
-         prompt = f"Image description: {english_caption}\nQuestion: {question}\nAnswer:"
-         answer = qa_model(prompt, max_length=100)[0]['generated_text']
-     else:
-         answer = "No question asked."
-
-     return english_caption, translated, answer
+     return english_caption, translated
+
+ # ----------------------
+ # VQA Function
+ # ----------------------
+ def vqa(image, question):
+     inputs = vqa_processor(image, question, return_tensors="pt").to(vqa_model.device)
+     out = vqa_model.generate(**inputs, max_new_tokens=100)
+     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
+     return answer

+ # ----------------------
  # Gradio UI
- interface = gr.Interface(
-     fn=caption_translate_vqa,
-     inputs=[
-         gr.Image(type="pil", label="Upload Image"),
-         gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To"),
-         gr.Textbox(label="Ask a Question about the Image")
-     ],
-     outputs=[
-         gr.Textbox(label="English Caption"),
-         gr.Textbox(label="Translated Caption"),
-         gr.Textbox(label="VQA Answer")
-     ],
-     title="BLIP2 + Translation + Visual Q&A"
- )
-
- interface.launch()
+ # ----------------------
+ with gr.Blocks(title="BLIP2 Vision App") as demo:
+     gr.Markdown("## 🖼️ BLIP2: Image Captioning + Translation + Question Answering")
+
+     with gr.Tab("Caption + Translate"):
+         with gr.Row():
+             img_in = gr.Image(type="pil", label="Upload Image")
+             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To")
+         eng_out = gr.Textbox(label="English Caption")
+         trans_out = gr.Textbox(label="Translated Caption")
+         btn1 = gr.Button("Generate Caption & Translate")
+         btn1.click(generate_caption_translate, inputs=[img_in, lang_in], outputs=[eng_out, trans_out])
+
+     with gr.Tab("Visual Question Answering (VQA)"):
+         with gr.Row():
+             img_vqa = gr.Image(type="pil", label="Upload Image")
+             q_in = gr.Textbox(label="Ask a Question about the Image")
+         ans_out = gr.Textbox(label="Answer")
+         btn2 = gr.Button("Ask")
+         btn2.click(vqa, inputs=[img_vqa, q_in], outputs=ans_out)
+
+ demo.launch()
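
For reference, the new VQA path follows the usual BLIP-2 generate pattern from the transformers docs (processor on image + question, generate, decode). A minimal standalone sketch of the same captioning and VQA calls is below; it assumes a CUDA-capable machine, the same Salesforce/blip2-flan-t5-xl checkpoint the commit loads, and a hypothetical local image file "example.jpg" (none of these are part of the commit itself). Unlike the committed vqa(), it also casts the pixel values to float16 to match the half-precision weights, as in the model card examples.

# Standalone sketch of the captioning and VQA calls made by the updated app.py.
# Assumptions (not from the commit): CUDA machine, local "example.jpg".
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16, device_map="auto"
)

image = Image.open("example.jpg").convert("RGB")

# Captioning: image only, no text prompt.
inputs = processor(images=image, return_tensors="pt").to(model.device, torch.float16)
caption_ids = model.generate(**inputs, max_new_tokens=50)
print("Caption:", processor.decode(caption_ids[0], skip_special_tokens=True).strip())

# VQA: image plus the question as the text prompt.
inputs = processor(images=image, text="What is shown in the picture?",
                   return_tensors="pt").to(model.device, torch.float16)
answer_ids = model.generate(**inputs, max_new_tokens=100)
print("Answer:", processor.decode(answer_ids[0], skip_special_tokens=True).strip())

Note that in app.py only the VQA checkpoint is loaded in float16 with device_map="auto"; the opt-2.7b captioning model stays in the default full-precision CPU configuration.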