from typing import Dict, Any
from PIL import Image
import base64
import torch
from io import BytesIO
from transformers import BlipForConditionalGeneration, BlipProcessor

# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class EndpointHandler:
    def __init__(self, path=""):
        # Load the BLIP processor and captioning model once at startup.
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference:
                - "inputs": a list of base64-encoded images.
                - "parameters" (optional): generation kwargs forwarded to ``model.generate``.
        Return:
            A :obj:`dict` of one list like {"captions": ["A hugging face at the office"]} containing:
            - "captions": one generated caption string per input image.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

        # Decode each base64 payload into a PIL image.
        raw_images = [Image.open(BytesIO(base64.b64decode(_img))) for _img in inputs]

        # Preprocess the batch, move pixel values to the target device,
        # and merge in any user-supplied generation parameters.
        processed_images = self.processor(images=raw_images, return_tensors="pt")
        processed_images["pixel_values"] = processed_images["pixel_values"].to(device)
        processed_images = {**processed_images, **parameters}

        # Generate caption token ids without tracking gradients, then decode to text.
        with torch.no_grad():
            out = self.model.generate(**processed_images)
        captions = self.processor.batch_decode(out, skip_special_tokens=True)

        return {"captions": captions}
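

# A minimal local smoke test, as a sketch: it assumes an image file named
# "cats.jpg" sits next to this script and calls the handler directly rather
# than through a deployed endpoint. The filename, the __main__ guard, and the
# max_new_tokens value are illustrative assumptions, not part of the endpoint API.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("cats.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    payload = {"inputs": [encoded], "parameters": {"max_new_tokens": 30}}
    print(handler(payload))  # e.g. {"captions": ["two cats lying on a couch"]}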