gopalagra committed
Commit 496853e · verified · 1 Parent(s): b723741

Update app.py

Files changed (1):
1. app.py (+12 -28)
app.py CHANGED
@@ -108,33 +108,17 @@ def generate_caption_translate(image, target_lang):
 
 
 # small text LM (runs on CPU okay)
-qa_text_model = pipeline("text2text-generation", model="google/flan-t5-large")
-
-def vqa_with_fallback(image, question):
-    # 1) try direct VQA
-    prompt = f"Question: {question} Answer:"
-    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
-    out = model.generate(**inputs, max_new_tokens=40, num_beams=4, early_stopping=True)
-    direct_answer = processor.decode(out[0], skip_special_tokens=True)
-    if direct_answer.lower().startswith(prompt.lower()):
-        direct_answer = direct_answer[len(prompt):].strip()
-
-    # simple heuristics to detect bad/echo answers
-    q_clean = question.strip().lower().rstrip("?.")
-    a_clean = direct_answer.strip().lower().rstrip("?.")
-    bad = (a_clean == "" or a_clean == question.strip().lower() or len(a_clean.split()) <= 2)
-
-    if not bad:
-        return direct_answer
-
-    # 2) fallback: get a caption then use LLM for reasoning
-    cap_inputs = processor(images=image, return_tensors="pt").to(model.device)
-    cap_out = model.generate(**cap_inputs, max_new_tokens=40, num_beams=4)
-    caption = processor.decode(cap_out[0], skip_special_tokens=True)
-
-    # Compose prompt for the text model with grounding
-    text_prompt = f"Image description: {caption}\nQuestion: {question}\nAnswer:"
-    answer = qa_text_model(text_prompt, max_length=80)[0]["generated_text"]
+from transformers import BlipProcessor, BlipForQuestionAnswering
+from PIL import Image
+import torch
+
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda" if torch.cuda.is_available() else "cpu")
+
+def vqa_proper(image, question):
+    inputs = vqa_processor(image, question, return_tensors="pt").to(vqa_model.device)
+    out = vqa_model.generate(**inputs, max_new_tokens=50, num_beams=5)
+    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
     return answer
 
 
@@ -159,6 +143,6 @@ with gr.Blocks(title="BLIP Vision App") as demo:
     q_in = gr.Textbox(label="Ask a Question about the Image")
     ans_out = gr.Textbox(label="Answer")
     btn2 = gr.Button("Ask")
-    btn2.click(vqa_with_fallback, inputs=[img_vqa, q_in], outputs=ans_out)
+    btn2.click(vqa_proper, inputs=[img_vqa, q_in], outputs=ans_out)
 
 demo.launch()
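
For reference, the new vqa_proper path added by this commit can be smoke-tested outside the Gradio UI with a minimal sketch like the one below. This is not part of the commit; "example.jpg" is a placeholder image path, and the model loading simply mirrors the diff above.

# Standalone check of the BLIP-VQA path introduced in this commit (sketch).
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder path, any RGB image works
inputs = processor(image, "What is in the picture?", return_tensors="pt").to(device)
out = model.generate(**inputs, max_new_tokens=50, num_beams=5)
print(processor.decode(out[0], skip_special_tokens=True))

On a CPU-only machine the .to(device) calls are no-ops, matching the "runs on CPU okay" intent of the original comment.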