import base64
import io

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


class EndpointHandler:
    """Zero-shot image-classification endpoint backed by a CLIP checkpoint.

    Loads the model/processor once at startup and scores a base64-encoded
    image against caller-supplied candidate labels on each request.
    """

    def __init__(self, path: str = ""):
        """Load the CLIP model and processor from *path* onto GPU if available.

        Args:
            path: Local directory or hub id of the CLIP checkpoint.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = CLIPModel.from_pretrained(path).to(device)
        # Inference-only service: switch off dropout / training-mode behavior.
        self.model.eval()
        self.processor = CLIPProcessor.from_pretrained(path)
        self.device = device

    def __call__(self, data: dict) -> list:
        """Classify one image against candidate labels.

        Expects JSON of the form
            {"image": "<base64>", "candidate_labels": ["cat", "dog"]}.

        Returns:
            A list of {"label": str, "score": float} dicts, one per label,
            in the same order as the input labels. Empty list if no labels
            were supplied.

        Raises:
            KeyError: if "image" is missing from *data*.
        """
        img_b64 = data["image"]
        labels = data.get("candidate_labels", [])
        if not labels:
            # The processor cannot tokenize an empty text batch; short-circuit.
            return []
        image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        inputs = self.processor(
            text=labels, images=image, return_tensors="pt", padding=True
        ).to(self.device)
        # no_grad: don't build an autograd graph for pure inference — saves
        # memory and time on every request.
        with torch.no_grad():
            logits = self.model(**inputs).logits_per_image
        probs = logits.softmax(dim=-1)[0].tolist()
        return [{"label": l, "score": float(p)} for l, p in zip(labels, probs)]