Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,13 +11,13 @@ from transformers import Blip2Processor, Blip2ForConditionalGeneration, Instruct
|
|
| 11 |
def load_caption_model(blip2=False, instructblip=True):
    """Load a captioning/VQA model and its matching processor.

    Loads either BLIP-2 (OPT-2.7b) or InstructBLIP (Vicuna-7b) in 8-bit
    precision (fp16 compute) on CUDA.  If both flags are True, InstructBLIP
    wins because it is loaded last; callers should set exactly one flag.

    Args:
        blip2: load ``Salesforce/blip2-opt-2.7b``.
        instructblip: load ``Salesforce/instructblip-vicuna-7b`` (default).

    Returns:
        ``(model, processor)`` ready for inference.

    Raises:
        ValueError: if neither flag is True (previously this surfaced as an
            ``UnboundLocalError`` at the return statement).
    """
    if not (blip2 or instructblip):
        raise ValueError("Select at least one of blip2 or instructblip.")
    if blip2:
        # NOTE(review): these lines were truncated at `device_map="` in the
        # diff rendering; "cuda" is taken from the updated revision — confirm.
        processor = Blip2Processor.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        # model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto")
    if instructblip:
        model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-vicuna-7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        processor = InstructBlipProcessor.from_pretrained(
            "Salesforce/instructblip-vicuna-7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
    return model, processor
|
| 23 |
|
|
@@ -26,13 +26,13 @@ def load_caption_model(blip2=False, instructblip=True):
|
|
| 26 |
def answer_question(image, question, model, processor):
    """Answer a free-form question about an image.

    Args:
        image: path or file-like object accepted by ``PIL.Image.open``.
        question: natural-language question (passed as the text prompt).
        model: a BLIP-2 / InstructBLIP generation model on CUDA.
        processor: the processor matching ``model``.

    Returns:
        The decoded answer string, stripped of surrounding whitespace.
    """
    image = Image.open(image)
    inputs = processor(image, question, return_tensors="pt").to("cuda", torch.float16)
    # NOTE(review): the original generate() call was truncated in the diff
    # ("... min_length=20,"); reconstructed as a complete call.  generate()
    # returns integer token ids — they must NOT be cast to float16.
    out = model.generate(**inputs, max_length=200, min_length=20)
    answer = processor.decode(out[0], skip_special_tokens=True).strip()
    return answer
|
|
|
|
| 11 |
def load_caption_model(blip2=False, instructblip=True):
    """Load a captioning/VQA model and its matching processor.

    Loads either BLIP-2 (OPT-2.7b) or InstructBLIP (Vicuna-7b) in 8-bit
    precision (fp16 compute) with ``device_map="cuda"``.  If both flags are
    True, InstructBLIP wins because it is loaded last; callers should set
    exactly one flag.

    Args:
        blip2: load ``Salesforce/blip2-opt-2.7b``.
        instructblip: load ``Salesforce/instructblip-vicuna-7b`` (default).

    Returns:
        ``(model, processor)`` ready for inference.

    Raises:
        ValueError: if neither flag is True.  Without this guard the
            original code hit ``UnboundLocalError`` at the return statement.
    """
    if not (blip2 or instructblip):
        raise ValueError("Select at least one of blip2 or instructblip.")
    if blip2:
        processor = Blip2Processor.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        # model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto")
    if instructblip:
        model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-vicuna-7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
        processor = InstructBlipProcessor.from_pretrained(
            "Salesforce/instructblip-vicuna-7b",
            load_in_8bit=True, torch_dtype=torch.float16, device_map="cuda",
        )
    return model, processor
|
| 23 |
|
|
|
|
| 26 |
def answer_question(image, question, model, processor):
    """Answer a free-form question about an image.

    Args:
        image: path or file-like object accepted by ``PIL.Image.open``.
        question: natural-language question (passed as the text prompt).
        model: a BLIP-2 / InstructBLIP generation model on CUDA.
        processor: the processor matching ``model``.

    Returns:
        The decoded answer string, stripped of surrounding whitespace.
    """
    image = Image.open(image)
    inputs = processor(image, question, return_tensors="pt").to("cuda", torch.float16)
    # BUG FIX: the previous code appended ``.to("cuda", torch.float16)`` to
    # generate()'s result.  generate() returns a LongTensor of token ids;
    # casting it to float16 corrupts the ids and breaks processor.decode,
    # which expects integers.  The cast is removed.
    out = model.generate(**inputs, max_length=200, min_length=20)
    answer = processor.decode(out[0], skip_special_tokens=True).strip()
    return answer
|