Commit · 0546112
Parent(s): db4cf02

Modified handler to load BLIP directly from transformers

- handler.py       +21 -26
- requirements.txt  +1 -5
handler.py CHANGED

@@ -1,3 +1,4 @@
+
 from typing import Dict, List, Any
 from PIL import Image
 import requests
@@ -5,32 +6,26 @@ import torch
 import base64
 import os
 from io import BytesIO
+
+from transformers import BlipForConditionalGeneration, BlipProcessor
 from models.blip_decoder import blip_decoder
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
+
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class EndpointHandler():
     def __init__(self, path=""):
         # load the optimized model
-
-        self.
-
-
-
-            med_config=os.path.join(path, 'configs/med_config.json')
-        )
+
+        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+        self.model = BlipForConditionalGeneration.from_pretrained(
+            "Salesforce/blip-image-captioning-base"
+        ).to(device)
         self.model.eval()
         self.model = self.model.to(device)
 
-        image_size = 384
-        self.transform = transforms.Compose([
-            transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
-            transforms.ToTensor(),
-            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
-        ])
-
 
 
     def __call__(self, data: Any) -> Dict[str, Any]:
@@ -39,22 +34,22 @@
             data (:obj:):
                 includes the input data and the parameters for the inference.
         Return:
-            A :obj:`dict`:. The object returned should be a dict of one list like {"
+            A :obj:`dict`:. The object returned should be a dict of one list like {"captions": ["A hugging face at the office"]} containing :
                - "caption": A string corresponding to the generated caption.
         """
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
-
-
-
-
+
+        raw_images = [Image.open(BytesIO(_img)) for _img in inputs]
+
+        processed_image = self.processor(images=raw_images, return_tensors="pt")
+        processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
+        processed_image = {**processed_image, **parameters}
+
         with torch.no_grad():
-
-
-                sample=parameters.get('sample',True),
-                top_p=parameters.get('top_p',0.9),
-                max_length=parameters.get('max_length',20),
-                min_length=parameters.get('min_length',5)
+            out = self.model.generate(
+                **processed_image
             )
+        captions = self.processor.batch_decode(out, skip_special_tokens=True)
         # postprocess the prediction
-        return {"
+        return {"captions": captions}
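With the handler now delegating to BlipProcessor / BlipForConditionalGeneration, the new __call__ path can be exercised locally before deploying. The sketch below is a minimal smoke test, not part of the commit: it assumes handler.py is importable from the working directory, that the Salesforce/blip-image-captioning-base weights can be downloaded, and that "sample.jpg" stands in for any local image. Note that "parameters" is merged into the processor output and forwarded verbatim to model.generate(), so its keys must be valid generate() arguments (do_sample, top_p, max_length, ...); the old blip_decoder keyword sample has no direct equivalent.

# Minimal local smoke test for the updated EndpointHandler (sketch, not in the repo).
# Assumes handler.py is on the import path and "sample.jpg" is a placeholder image path.
from handler import EndpointHandler

handler = EndpointHandler(path=".")

with open("sample.jpg", "rb") as f:
    image_bytes = f.read()

payload = {
    # one entry of raw image bytes per image to caption
    "inputs": [image_bytes],
    # forwarded to model.generate(); keys must be valid generate() arguments
    "parameters": {"do_sample": True, "top_p": 0.9, "max_length": 20, "min_length": 5},
}

result = handler(payload)
print(result)  # e.g. {"captions": ["a photo of ..."]}

On a deployed Inference Endpoint the serving toolkit builds this data dict from the HTTP request, so the payload shape above mirrors what the handler receives after deserialization (an assumption about the toolkit's behavior, not something this commit changes).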
requirements.txt CHANGED

@@ -1,5 +1 @@
-
-transformers==4.15.0
-fairscale==0.4.4
-requests
-Pillow
+git+https://github.com/huggingface/transformers.git@main
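requirements.txt now tracks transformers main rather than a tagged release, presumably because the BLIP classes used in handler.py had not yet shipped in a stable version when this commit was made. handler.py still imports requests and PIL, which are no longer listed, so those are assumed to come from the endpoint's base image. Once a release containing BlipProcessor / BlipForConditionalGeneration is available, a pinned alternative could look like the sketch below (the exact version number is an assumption, not something stated in this commit):

# requirements.txt (alternative sketch, not the committed version)
transformers>=4.26.0   # assumed first release shipping the BLIP classes; verify before pinning
requests
Pillow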