import io
import json

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
|
|
class EndpointHandler:
    """Hugging Face Inference Endpoint handler producing SigLIP embeddings.

    Accepts a JSON payload with an image (URL or raw bytes) and a list of
    texts, and returns the model's image and text embeddings.
    """

    # Fallback hub checkpoint used when the endpoint runtime does not
    # supply a local model directory.
    DEFAULT_MODEL = "oliverlabs/siglip256"

    def __init__(self, path=""):
        """Load the processor and model once at endpoint startup.

        Args:
            path: Local model directory injected by the Inference Endpoints
                runtime. When empty (the default), fall back to the hub
                checkpoint — previously this argument was silently ignored.
        """
        model_name_or_path = path or self.DEFAULT_MODEL
        self.processor = AutoProcessor.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.model.eval()  # inference only — disable dropout/batch-norm updates

    def _load_image(self, image_input):
        """Load an RGB PIL image from a URL string or raw bytes.

        Args:
            image_input: An http(s) URL (str) or encoded image bytes.

        Returns:
            A PIL.Image.Image in RGB mode.

        Raises:
            ValueError: If the input is neither str nor bytes.
            requests.HTTPError: If the URL fetch returns a 4xx/5xx status.
        """
        if isinstance(image_input, str):
            # Bounded timeout so an unresponsive host cannot hang the worker.
            response = requests.get(image_input, timeout=10)
            # Fail loudly on HTTP errors instead of handing an HTML error
            # page to the image decoder.
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content)).convert("RGB")
        elif isinstance(image_input, bytes):
            image = Image.open(io.BytesIO(image_input)).convert("RGB")
        else:
            raise ValueError("Unsupported image input format")
        return image

    def __call__(self, data):
        """Handle one inference request.

        Hugging Face Inference Endpoints call this with the decoded JSON
        payload. Expected keys: "image" (URL str or bytes) and "texts"
        (list of str).

        Returns:
            A dict with "image_embedding" (list of floats),
            "text_embeddings" (list of lists of floats) and "num_texts",
            or an {"error": ...} dict when the payload is incomplete.
        """
        image_input = data.get("image")
        texts = data.get("texts", [])

        # Guard clause: reject incomplete payloads before any model work.
        if not image_input or not texts:
            return {"error": "Missing image or texts in payload."}

        image = self._load_image(image_input)
        inputs = self.processor(text=texts, images=image, return_tensors="pt", padding=True)

        # No gradients needed at inference time — saves memory and compute.
        with torch.no_grad():
            outputs = self.model(**inputs)

        image_emb = outputs.image_embeds[0].tolist()
        text_embs = [emb.tolist() for emb in outputs.text_embeds]

        return {
            "image_embedding": image_emb,
            "text_embeddings": text_embs,
            "num_texts": len(texts)
        }
|
|
|
|
if __name__ == "__main__":
    # Local smoke test: run the handler against a public COCO image.
    sample_request = {
        "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
        "texts": ["a photo of 2 cats", "a photo of 2 dogs"],
    }

    endpoint = EndpointHandler()
    response = endpoint(sample_request)

    rendered = json.dumps(response, indent=2)
    print(rendered[:1000] + "\n... (truncated)")
|
|