| | from transformers import AutoModelForCausalLM, AutoProcessor |
| | from PIL import Image |
| | import requests |
| | import torch |
| |
|
class EndpointHandler:
    """Inference handler that captions images with a vision-language model.

    Loads a causal LM and its processor from a local model directory and
    serves requests of the form ``{"image_url": ..., "task_prompt": ...}``,
    returning ``{"caption": <generated text>}``.
    """

    def __init__(self, model_dir):
        """Load the model and processor from *model_dir* onto GPU if available.

        Args:
            model_dir: Path to a directory containing the pretrained model
                and processor. Loaded with ``trust_remote_code=True``, which
                executes code shipped with the checkpoint — the directory
                must be trusted.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # eval() disables dropout etc.; the handler is inference-only.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
        ).eval().to(device)

        self.processor = AutoProcessor.from_pretrained(
            model_dir,
            trust_remote_code=True,
        )
        self.device = device

    def __call__(self, data):
        """Generate a caption for the image referenced in *data*.

        Args:
            data: Mapping with keys:
                - ``"image_url"`` (required): URL of the image to caption.
                - ``"task_prompt"`` (optional): task token / prompt string;
                  defaults to ``"<MORE_DETAILED_CAPTION>"``.

        Returns:
            dict: ``{"caption": <decoded generation for the first sequence>}``.

        Raises:
            ValueError: If ``"image_url"`` is missing or empty.
            requests.HTTPError: If the image cannot be fetched.
        """
        task_prompt = data.get("task_prompt", "<MORE_DETAILED_CAPTION>")
        image_url = data.get("image_url")

        # Fail fast with a clear message instead of a cryptic failure
        # deep inside requests/PIL when the URL is absent.
        if not image_url:
            raise ValueError("Request payload must include 'image_url'.")

        image = self.load_image(image_url)

        inputs = self.processor(
            text=task_prompt,
            images=image,
            return_tensors="pt",
        ).to(self.device)

        # inference_mode avoids building autograd state for pure inference.
        with torch.inference_mode():
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                num_beams=3,
            )

        generated_text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]

        return {"caption": generated_text}

    def load_image(self, image_url):
        """Fetch *image_url* and return it as an RGB PIL image.

        Raises:
            requests.HTTPError: On a non-2xx response.
            requests.Timeout: If the download stalls past the timeout.
        """
        # timeout keeps a stalled download from hanging the endpoint;
        # raise_for_status surfaces HTTP errors clearly instead of letting
        # PIL fail on the body of an error page.
        response = requests.get(image_url, stream=True, timeout=30)
        response.raise_for_status()
        return Image.open(response.raw).convert("RGB")