| | from typing import Dict, List, Any |
| | from PIL import Image |
| | import base64 |
| | import torch |
| | import os |
| | from io import BytesIO |
| | from transformers import BlipForConditionalGeneration, BlipProcessor |
| | import requests |
| | from PIL import Image |
| | from transformers import Blip2Processor, Blip2ForConditionalGeneration |
| | |
| |
|
class EndpointHandler():
    """Inference endpoint handler that captions images with BLIP-2.

    Loads an 8-bit quantized, sharded BLIP-2 checkpoint once at startup
    and serves captioning requests through ``__call__``.
    """

    def __init__(self, path=""):
        # NOTE(review): the checkpoint id "blip2/sharded" is hard-coded and
        # `path` is ignored — confirm this matches the deployment layout.
        print(" $$$$ Model Loading $$$$")
        self.processor = Blip2Processor.from_pretrained("blip2/sharded")
        # device_map="auto" + load_in_8bit shards the model across available
        # devices and quantizes weights so the checkpoint fits in GPU memory.
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "blip2/sharded", device_map="auto", load_in_8bit=True
        )
        print(" $$$$ model loaded $$$$")

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference:
                - "inputs": a PIL image, or a base64-encoded image string.
                - "prompt" (optional): text used to condition generation.
        Return:
            A :obj:`dict` like ``{"captions": "A hugging face at the office"}``
            where the value is the generated caption string.
        """
        print("********* Helllo ***********")
        print(data)
        img_data = data.pop("inputs", data)
        prompt = data.pop("prompt", "")
        print("#########")

        if isinstance(img_data, Image.Image):
            raw_image = img_data
        else:
            # Anything else is assumed to be a base64-encoded image payload.
            raw_image = Image.open(BytesIO(base64.b64decode(img_data)))

        # BUG FIX: the original assigned the processor output (pixel values)
        # directly to `generated_ids` and decoded it without ever calling
        # model.generate(), so no caption was actually produced. The optional
        # prompt was also popped but never used; it now conditions generation.
        if prompt:
            inputs = self.processor(raw_image, text=prompt, return_tensors="pt").to("cuda", torch.float16)
        else:
            inputs = self.processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
        generated_ids = self.model.generate(**inputs)
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print("@@@@@@ generated_text @@@@@@@")
        print(generated_text)

        return {"captions": generated_text}
| |
|