yfyangd commited on
Commit
058b6e6
·
1 Parent(s): cc87a56

Upload predict.py

Browse files
Files changed (1) hide show
  1. BLIP/predict.py +98 -0
BLIP/predict.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Download the weights in ./checkpoints beforehand for fast inference
3
+ wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth
4
+ wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth
5
+ wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth
6
+ """
7
+
8
+ from pathlib import Path
9
+
10
+ from PIL import Image
11
+ import torch
12
+ from torchvision import transforms
13
+ from torchvision.transforms.functional import InterpolationMode
14
+ import cog
15
+
16
+ from models.blip import blip_decoder
17
+ from models.blip_vqa import blip_vqa
18
+ from models.blip_itm import blip_itm
19
+
20
+
21
class Predictor(cog.Predictor):
    """Cog predictor that serves three BLIP tasks through a single ``predict`` call.

    The supported tasks are ``image_captioning``, ``visual_question_answering``
    and ``image_text_matching``; one model per task is built in ``setup``.
    """

    def setup(self):
        """Instantiate one BLIP model per task from the local ``checkpoints`` directory."""
        # Use the first GPU when available; fall back to CPU so the predictor
        # still works on hosts without CUDA instead of failing on `.to(device)`.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self.models = {
            'image_captioning': blip_decoder(
                pretrained='checkpoints/model*_base_caption.pth',
                image_size=384,
                vit='base',
            ),
            'visual_question_answering': blip_vqa(
                pretrained='checkpoints/model*_vqa.pth',
                image_size=480,
                vit='base',
            ),
            'image_text_matching': blip_itm(
                pretrained='checkpoints/model_base_retrieval_coco.pth',
                image_size=384,
                vit='base',
            ),
        }

    @cog.input(
        "image",
        type=Path,
        help="input image",
    )
    @cog.input(
        "task",
        type=str,
        default='image_captioning',
        options=['image_captioning', 'visual_question_answering', 'image_text_matching'],
        help="Choose a task.",
    )
    @cog.input(
        "question",
        type=str,
        default=None,
        help="Type question for the input image for visual question answering task.",
    )
    @cog.input(
        "caption",
        type=str,
        default=None,
        help="Type caption for the input image for image text matching task.",
    )
    def predict(self, image, task, question, caption):
        """Run the selected BLIP task on ``image`` and return a human-readable string.

        Args:
            image: Path of the input image.
            task: One of ``'image_captioning'``, ``'visual_question_answering'``
                or ``'image_text_matching'``.
            question: Question text; required for visual question answering.
            caption: Caption text; required for image text matching.

        Returns:
            A string starting with ``'Caption: '`` or ``'Answer: '``, or a
            two-line summary of the matching scores for image text matching.

        Raises:
            ValueError: If the selected task needs ``question`` or ``caption``
                and it was not supplied.
        """
        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would let a missing input reach the model.
        if task == 'visual_question_answering' and question is None:
            raise ValueError('Please type a question for visual question answering task.')
        if task == 'image_text_matching' and caption is None:
            raise ValueError('Please type a caption for image text matching task.')

        # The VQA model is built with image_size=480 in setup(); the other two use 384.
        image_size = 480 if task == 'visual_question_answering' else 384
        im = load_image(image, image_size=image_size, device=self.device)

        model = self.models[task]
        model.eval()
        model = model.to(self.device)

        # Inference only: no branch needs gradients, so disable autograd for
        # all of them (the matching branch previously ran with it enabled).
        with torch.no_grad():
            if task == 'image_captioning':
                caption = model.generate(im, sample=False, num_beams=3, max_length=20, min_length=5)
                return 'Caption: ' + caption[0]

            if task == 'visual_question_answering':
                answer = model(im, question, train=False, inference='generate')
                return 'Answer: ' + answer[0]

            # image_text_matching: query both heads of the model.
            itm_output = model(im, caption, match_head='itm')
            # Softmax over dim 1, then column 1 is reported as the match probability.
            itm_score = torch.nn.functional.softmax(itm_output, dim=1)[:, 1]
            itc_score = model(im, caption, match_head='itc')

        return (
            f'The image and text is matched with a probability of {itm_score.item():.4f}.\n'
            f'The image feature and text feature has a cosine similarity of {itc_score.item():.4f}.'
        )
85
+
86
+
87
def load_image(image, image_size, device):
    """Load an image file and turn it into a normalized, batched tensor on ``device``.

    Args:
        image: Path (or path-like) of the image file; it is passed through ``str()``.
        image_size: Target edge length; the image is resized to
            ``(image_size, image_size)`` with bicubic interpolation.
        device: Device the resulting tensor is moved to.

    Returns:
        The transformed image with a leading batch dimension added via
        ``unsqueeze(0)``, placed on ``device``.
    """
    # Open inside a context manager so the file handle is released right away;
    # convert() returns a new RGB image that stays usable after the file closes.
    with Image.open(str(image)) as img:
        raw_image = img.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel (mean, std) normalization constants.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform(raw_image).unsqueeze(0).to(device)