from io import BytesIO
from typing import Any, Dict

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

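# Run on the GPU when one is available; fall back to CPU otherwise.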
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EndpointHandler:
    def __init__(self, path: str = ""):
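        # Load the fine-tuned BLIP captioning model and its processor once at
        # startup; `path` is unused here since the weights come from the Hub.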
        self.processor = BlipProcessor.from_pretrained(
            "quadranttechnologies/qhub-blip-image-captioning-finetuned"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "quadranttechnologies/qhub-blip-image-captioning-finetuned"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
| """ | |
| Args: | |
| data (:obj:): | |
| includes the input data and the parameters for the inference. | |
| Return: | |
| A :obj:`dict`:. The object returned should be a dict of one list like {"descriptions": ["Description of the image"]} containing : | |
| - "description": A string corresponding to the generated description. | |
| """ | |
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

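        # Decode each raw byte payload into a PIL image; converting to RGB keeps
        # mixed input modes (RGBA, grayscale, ...) from tripping up the processor.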
        raw_images = [Image.open(BytesIO(_img)).convert("RGB") for _img in inputs]

        processed_image = self.processor(images=raw_images, return_tensors="pt")
        processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
        processed_image = {**processed_image, **parameters}

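        # Generate caption token ids without tracking gradients.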
        with torch.no_grad():
            out = self.model.generate(**processed_image)

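        # Decode the generated token ids back into caption strings.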
        description = self.processor.batch_decode(out, skip_special_tokens=True)
        return {"description": description}