Spaces:
Runtime error
Runtime error
Upload predict.py
Browse files- BLIP/predict.py +98 -0
BLIP/predict.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Download the weights into ./checkpoints beforehand for fast inference:
|
| 3 |
+
wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth
|
| 4 |
+
wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth
|
| 5 |
+
wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import torch
|
| 12 |
+
from torchvision import transforms
|
| 13 |
+
from torchvision.transforms.functional import InterpolationMode
|
| 14 |
+
import cog
|
| 15 |
+
|
| 16 |
+
from models.blip import blip_decoder
|
| 17 |
+
from models.blip_vqa import blip_vqa
|
| 18 |
+
from models.blip_itm import blip_itm
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Predictor(cog.Predictor):
    """Cog predictor that serves three BLIP tasks through one ``predict`` call.

    The supported tasks are image captioning, visual question answering and
    image-text matching. One model per task is built in ``setup`` and looked
    up by task name in ``predict``.
    """

    def setup(self):
        """Select the compute device and build one BLIP model per task.

        Each model constructor is given a path under ``checkpoints/`` as its
        ``pretrained`` argument, so the weights are expected to be present
        there before the predictor starts.
        """
        # Fall back to CPU so the predictor still runs on hosts without a GPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self.models = {
            'image_captioning': blip_decoder(pretrained='checkpoints/model*_base_caption.pth',
                                             image_size=384, vit='base'),
            'visual_question_answering': blip_vqa(pretrained='checkpoints/model*_vqa.pth',
                                                  image_size=480, vit='base'),
            'image_text_matching': blip_itm(pretrained='checkpoints/model_base_retrieval_coco.pth',
                                            image_size=384, vit='base'),
        }

    @cog.input(
        "image",
        type=Path,
        help="input image",
    )
    @cog.input(
        "task",
        type=str,
        default='image_captioning',
        options=['image_captioning', 'visual_question_answering', 'image_text_matching'],
        help="Choose a task.",
    )
    @cog.input(
        "question",
        type=str,
        default=None,
        help="Type question for the input image for visual question answering task.",
    )
    @cog.input(
        "caption",
        type=str,
        default=None,
        help="Type caption for the input image for image text matching task.",
    )
    def predict(self, image, task, question, caption):
        """Run the selected BLIP task on ``image`` and return a text result.

        Args:
            image: Path of the input image file.
            task: One of ``'image_captioning'``,
                ``'visual_question_answering'`` or ``'image_text_matching'``.
            question: Question about the image; required when ``task`` is
                ``'visual_question_answering'``, ignored otherwise.
            caption: Caption to compare with the image; required when
                ``task`` is ``'image_text_matching'``, ignored otherwise.

        Returns:
            A string: ``'Caption: ...'`` for captioning, ``'Answer: ...'``
            for question answering, or a two-line message with the matching
            probability and the cosine similarity for image-text matching.

        Raises:
            AssertionError: If the text input required by ``task`` is None.
        """
        # Explicit raises instead of ``assert`` so the checks survive
        # ``python -O``; AssertionError is kept so existing callers that
        # catch it keep working.
        if task == 'visual_question_answering' and question is None:
            raise AssertionError('Please type a question for visual question answering task.')
        if task == 'image_text_matching' and caption is None:
            raise AssertionError('Please type a caption for image text matching task.')

        # The VQA model is built with image_size=480, the other two with 384.
        image_size = 480 if task == 'visual_question_answering' else 384
        im = load_image(image, image_size=image_size, device=self.device)

        model = self.models[task]
        model.eval()
        model = model.to(self.device)

        # Inference only: no branch needs gradients, so none should build an
        # autograd graph.
        with torch.no_grad():
            if task == 'image_captioning':
                caption = model.generate(im, sample=False, num_beams=3, max_length=20, min_length=5)
                return 'Caption: ' + caption[0]

            if task == 'visual_question_answering':
                answer = model(im, question, train=False, inference='generate')
                return 'Answer: ' + answer[0]

            # image_text_matching
            itm_output = model(im, caption, match_head='itm')
            # Index 1 along dim 1 is reported as the "matched" probability.
            itm_score = torch.nn.functional.softmax(itm_output, dim=1)[:, 1]
            itc_score = model(im, caption, match_head='itc')

        return f'The image and text is matched with a probability of {itm_score.item():.4f}.\n' \
               f'The image feature and text feature has a cosine similarity of {itc_score.item():.4f}.'
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def load_image(image, image_size, device):
    """Load an image file into a normalized single-image batch tensor.

    Args:
        image: Path of the image file; it is passed through ``str`` before
            being opened.
        image_size: Target side length in pixels. The image is resized to a
            square of ``image_size`` x ``image_size``, so the original
            aspect ratio is not preserved.
        device: Device the resulting tensor is moved to.

    Returns:
        The transformed image as a tensor on ``device`` with a leading batch
        dimension of 1 added by ``unsqueeze(0)``.
    """
    # The context manager closes the file handle; ``convert`` returns a
    # separate in-memory image that stays usable after the file is closed.
    with Image.open(str(image)) as opened:
        raw_image = opened.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel (mean, std) used to normalize the RGB tensor.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform(raw_image).unsqueeze(0).to(device)
|