import io
import json

import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor


class EndpointHandler:
    """Hugging Face Inference Endpoint handler for SigLIP embeddings.

    Loads a SigLIP checkpoint once at startup and, per request, returns the
    image embedding plus one embedding per input text.
    """

    def __init__(self, path: str = "") -> None:
        # NOTE(review): `path` is the model directory the endpoint runtime
        # passes in, but this handler pins a fixed hub repo and ignores it —
        # confirm that is intentional.
        model_name_or_path = "oliverlabs/siglip256"
        self.processor = AutoProcessor.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.model.eval()  # inference only: disable dropout etc.

    def _load_image(self, image_input) -> Image.Image:
        """Load image from URL or bytes.

        Args:
            image_input: a URL string, or the raw image bytes.

        Returns:
            The decoded image converted to RGB.

        Raises:
            requests.HTTPError: if the URL fetch returns an error status.
            ValueError: if the input is neither a string nor bytes.
        """
        if isinstance(image_input, str):
            # URL fetch: bound the wait and fail fast on HTTP errors instead
            # of handing an error page to the image decoder.
            response = requests.get(image_input, timeout=30)
            response.raise_for_status()
            raw = response.content
        elif isinstance(image_input, (bytes, bytearray)):
            # raw bytes
            raw = image_input
        else:
            raise ValueError("Unsupported image input format")
        return Image.open(io.BytesIO(raw)).convert("RGB")

    def __call__(self, data: dict) -> dict:
        """
        Hugging Face Inference Endpoint calls this method with JSON input.

        Expected payload: {"image": <url or bytes>, "texts": [str, ...]}.
        Returns the image embedding, one embedding per text, and the text
        count — or an {"error": ...} dict when the payload is incomplete.
        """
        image_input = data.get("image")
        texts = data.get("texts", [])
        if not image_input or not texts:
            return {"error": "Missing image or texts in payload."}

        image = self._load_image(image_input)
        inputs = self.processor(
            text=texts, images=image, return_tensors="pt", padding=True
        )
        with torch.no_grad():  # no gradients needed for embedding extraction
            outputs = self.model(**inputs)

        return {
            "image_embedding": outputs.image_embeds[0].tolist(),
            "text_embeddings": [emb.tolist() for emb in outputs.text_embeds],
            "num_texts": len(texts),
        }


if __name__ == "__main__":
    # Smoke test against a public COCO image; truncate the (large) JSON dump.
    handler = EndpointHandler()
    test_payload = {
        "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
        "texts": ["a photo of 2 cats", "a photo of 2 dogs"],
    }
    result = handler(test_payload)
    print(json.dumps(result, indent=2)[:1000] + "\n... (truncated)")