Mohammad Amin Kateb Saber committed on
Commit
a465973
·
1 Parent(s): c145c39

feat(handler): implement handler and add requirements.txt

Browse files
Files changed (2) hide show
  1. handler.py +128 -0
  2. requirements.txt +4 -0
handler.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SigLIP2 embedding handler for Hugging Face Inference Endpoints.
3
+ Supports image and text embeddings via get_image_features and get_text_features.
4
+ """
5
+
6
+ import base64
7
+ from io import BytesIO
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ import torch
11
+ from PIL import Image
12
+ from transformers import AutoModel, AutoProcessor
13
+ from transformers.image_utils import load_image
14
+
15
+
16
def _load_image_from_input(image_input: Union[str, bytes]) -> Image.Image:
    """Load a PIL Image from raw bytes, a data URL, a base64 string, a URL, or a file path.

    Args:
        image_input: One of:
            - raw image bytes,
            - a ``data:image/...;base64,<data>`` data URL,
            - a bare base64-encoded image string (detected heuristically),
            - an http(s) URL or local file path (delegated to ``load_image``).

    Returns:
        The decoded image, converted to RGB.

    Raises:
        ValueError: If the input type is unsupported or a data URL is malformed.
    """
    if isinstance(image_input, bytes):
        return Image.open(BytesIO(image_input)).convert("RGB")

    if not isinstance(image_input, str):
        raise ValueError(f"Image input must be str or bytes, got {type(image_input)}")

    # Data URL, format: data:image/jpeg;base64,<b64data>
    if image_input.startswith("data:"):
        if "," not in image_input:
            # Without the comma the payload boundary is unknown. b64decode
            # silently discards non-alphabet characters, so decoding the whole
            # URI would produce corrupted bytes and an opaque PIL failure —
            # fail loudly instead.
            raise ValueError(
                "Malformed data URL: missing ',' separator before base64 payload."
            )
        b64_data = image_input.split(",", 1)[1]
        return Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")

    if image_input.startswith("/9j/") or len(image_input) > 500:
        # Heuristic for raw base64 without a data: prefix — "/9j/" is the
        # base64 encoding of the JPEG SOI marker, and strings longer than 500
        # chars are unlikely to be URLs or paths. Best-effort: fall through to
        # load_image below if decoding fails, so very long URLs still work.
        try:
            return Image.open(BytesIO(base64.b64decode(image_input))).convert("RGB")
        except Exception:
            pass

    # URL or file path
    return load_image(image_input)
38
+
39
+
40
class EndpointHandler:
    """Hugging Face Inference Endpoints handler for SigLIP2 image and text embeddings.

    Exposes a single ``__call__`` entry point that accepts a JSON-style payload
    and returns (by default L2-normalized) embeddings for images and/or texts.
    """

    def __init__(self, path: str = ""):
        """Load model and processor from the given path (repo root when deployed).

        Args:
            path: Model directory or repo id; Inference Endpoints passes the
                repository root.
        """
        # fp16 + device_map="auto" lets accelerate place the model on the
        # available accelerator(s), falling back to CPU.
        self.model = (
            AutoModel.from_pretrained(
                path,
                device_map="auto",
                torch_dtype=torch.float16,
            )
            .eval()
        )
        self.processor = AutoProcessor.from_pretrained(path)

    @staticmethod
    def _l2_normalize(embeddings: torch.Tensor) -> torch.Tensor:
        """Scale each row of *embeddings* to unit L2 norm."""
        return embeddings / embeddings.norm(p=2, dim=-1, keepdim=True)

    def _embed_images(
        self, images: List[Union[str, bytes]], normalize: bool
    ) -> List[List[float]]:
        """Embed a batch of images given as URLs, paths, base64 strings, or bytes."""
        pil_images = [_load_image_from_input(img) for img in images]
        inputs = self.processor(
            images=pil_images,
            return_tensors="pt",
            max_num_patches=256,  # SigLIP2 per-image patch budget
        ).to(self.model.device)
        embeddings = self.model.get_image_features(**inputs)
        if normalize:
            embeddings = self._l2_normalize(embeddings)
        return embeddings.cpu().tolist()

    def _embed_texts(self, texts: List[str], normalize: bool) -> List[List[float]]:
        """Embed a batch of text strings."""
        inputs = self.processor(
            text=texts,
            return_tensors="pt",
        ).to(self.model.device)
        embeddings = self.model.get_text_features(**inputs)
        if normalize:
            embeddings = self._l2_normalize(embeddings)
        return embeddings.cpu().tolist()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a request containing images and/or texts and return embeddings.

        Args:
            data: Request payload with "inputs" key. Expected shape:
                {
                    "inputs": {
                        "images": ["url1", "url2"] | ["data:image/jpeg;base64,...", ...],
                        "texts": ["text1", "text2"]
                    },
                    "normalize": true  # optional, default True
                }
                At least one of "images" or "texts" must be provided.

        Returns:
            {
                "image_embeddings": [[...], [...]] | null,
                "text_embeddings": [[...], [...]] | null
            }

        Raises:
            ValueError: If the payload is not a dict, provides neither images
                nor texts, or provides them as non-lists.
        """
        # Accept both {"inputs": {...}} and a bare {...} payload.
        payload = data.get("inputs", data)
        normalize = data.get("normalize", True)

        if not isinstance(payload, dict):
            raise ValueError(
                "inputs must be a dict with 'images' and/or 'texts' keys. "
                f"Got {type(payload)}."
            )

        images = payload.get("images")
        texts = payload.get("texts")

        if not images and not texts:
            raise ValueError("At least one of 'images' or 'texts' must be provided.")

        if images is not None and not isinstance(images, list):
            raise ValueError("'images' must be a list.")
        if texts is not None and not isinstance(texts, list):
            raise ValueError("'texts' must be a list.")

        result: Dict[str, Optional[List[List[float]]]] = {
            "image_embeddings": None,
            "text_embeddings": None,
        }

        with torch.no_grad():
            if images:
                result["image_embeddings"] = self._embed_images(images, normalize)
            if texts:
                result["text_embeddings"] = self._embed_texts(texts, normalize)

        return result
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers>=4.49.0
2
+ torch
3
+ Pillow
4
+ accelerate