gopalagra committed on
Commit
cd4d77a
·
verified ·
1 Parent(s): 3981a40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -40
app.py CHANGED
@@ -69,15 +69,10 @@
69
  import gradio as gr
70
  from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
71
  from PIL import Image
72
- import torch
73
 
74
- # Load BLIP2 model
75
  processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
76
- model = Blip2ForConditionalGeneration.from_pretrained(
77
- "Salesforce/blip2-opt-2.7b",
78
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
79
- device_map="auto" if torch.cuda.is_available() else None
80
- )
81
 
82
  # Translation pipelines
83
  translation_models = {
@@ -86,46 +81,48 @@ translation_models = {
86
  "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
87
  }
88
 
89
- # ---- Caption + Translation ----
90
- def generate_caption_translate(image, target_lang):
91
- inputs = processor(image, return_tensors="pt").to(model.device)
92
- out = model.generate(**inputs, max_new_tokens=50)
 
 
 
93
  english_caption = processor.decode(out[0], skip_special_tokens=True)
94
 
 
95
  if target_lang in translation_models:
96
  translated = translation_models[target_lang](english_caption)[0]['translation_text']
97
  else:
98
  translated = "Translation not available"
99
 
100
- return english_caption, translated
101
-
102
- # ---- Visual Question Answering ----
103
- def answer_question(image, question):
104
- inputs = processor(image, text=question, return_tensors="pt").to(model.device)
105
- out = model.generate(**inputs, max_new_tokens=50)
106
- answer = processor.decode(out[0], skip_special_tokens=True)
107
- return answer
108
-
109
- # ---- Gradio Interface ----
110
- with gr.Blocks() as demo:
111
- gr.Markdown("## 🖼️ BLIP2: Image Captioning + Translation + VQA")
112
-
113
- with gr.Tab("Caption + Translation"):
114
- img1 = gr.Image(type="pil")
115
- lang = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To")
116
- eng_cap = gr.Textbox(label="English Caption")
117
- trans_cap = gr.Textbox(label="Translated Caption")
118
- btn1 = gr.Button("Generate Caption + Translate")
119
- btn1.click(generate_caption_translate, inputs=[img1, lang], outputs=[eng_cap, trans_cap])
120
-
121
- with gr.Tab("Visual Question Answering"):
122
- img2 = gr.Image(type="pil")
123
- question = gr.Textbox(label="Ask a Question about the Image")
124
- answer = gr.Textbox(label="Answer")
125
- btn2 = gr.Button("Get Answer")
126
- btn2.click(answer_question, inputs=[img2, question], outputs=answer)
127
-
128
- demo.launch()
129
 
130
 
131
 
 
69
  import gradio as gr
70
  from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
71
  from PIL import Image
 
72
 
73
+ # Load BLIP2 for captioning
74
  processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
75
+ blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
 
 
 
 
76
 
77
  # Translation pipelines
78
  translation_models = {
 
81
  "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
82
  }
83
 
84
+ # Language model for reasoning/Q&A
85
+ qa_model = pipeline("text2text-generation", model="google/flan-t5-large")
86
+
87
def caption_translate_vqa(image, target_lang, question):
    """Caption an image, translate the caption, and answer a question about it.

    The answer step is text-only: the Q&A model is given the generated
    caption plus the question, never the image itself.

    Args:
        image: PIL image from the Gradio input, or None when nothing was
            uploaded.
        target_lang: Key into ``translation_models``; any other value yields
            "Translation not available".
        question: Optional free-text question. None or a blank string skips
            the Q&A step.

    Returns:
        A ``(english_caption, translated_caption, answer)`` tuple of strings.
    """
    # Gradio passes None when the form is submitted without an image; the
    # processor would raise on it, so report the problem in every output.
    if image is None:
        no_image = "No image provided."
        return no_image, no_image, no_image

    # Step 1: Generate English caption
    inputs = processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs, max_new_tokens=50)
    # The Hugging Face BLIP-2 examples strip the decoded text; doing so keeps
    # stray whitespace out of the translation input and the Q&A prompt.
    english_caption = processor.decode(out[0], skip_special_tokens=True).strip()

    # Step 2: Translate caption
    translator = translation_models.get(target_lang)
    if translator is not None:
        translated = translator(english_caption)[0]['translation_text']
    else:
        translated = "Translation not available"

    # Step 3: Image Q&A using caption + question
    question = question.strip() if question else ""
    if question:
        prompt = f"Image description: {english_caption}\nQuestion: {question}\nAnswer:"
        answer = qa_model(prompt, max_length=100)[0]['generated_text']
    else:
        answer = "No question asked."

    return english_caption, translated, answer
107
+
108
# Gradio UI: a single form whose three inputs are passed, in order, to
# caption_translate_vqa and whose three outputs receive its returned tuple.
_ui_inputs = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To"),
    gr.Textbox(label="Ask a Question about the Image"),
]
_ui_outputs = [
    gr.Textbox(label="English Caption"),
    gr.Textbox(label="Translated Caption"),
    gr.Textbox(label="VQA Answer"),
]

interface = gr.Interface(
    fn=caption_translate_vqa,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
    title="BLIP2 + Translation + Visual Q&A",
)

interface.launch()
125
+
 
 
126
 
127
 
128