basiliskan
/

slig

@@ -1,3 +1,8 @@
 from typing import Any, Dict, List, Union
 import torch
 from PIL import Image
@@ -9,243 +14,108 @@ from transformers import AutoProcessor, AutoModel
 class EndpointHandler:
     def __init__(self, path: str = ""):
-        """
-        Initialize the handler by loading the SigLIP2 model and processor.
-        Args:
-            path: Path to the model directory (provided by HF Inference Endpoints)
-        """
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model = AutoModel.from_pretrained(path, trust_remote_code=True).to(self.device)
         self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
         self.model.eval()
     def _load_image(self, image_data: Any) -> Image.Image:
-        """
-        Load an image from various input formats.
-        Args:
-            image_data: Can be a URL string, base64 string, or raw bytes
-        Returns:
-            PIL Image object
-        """
         if isinstance(image_data, str):
-            # Check if it's a URL
             if image_data.startswith(("http://", "https://")):
                 response = requests.get(image_data, timeout=10)
                 response.raise_for_status()
                 return Image.open(BytesIO(response.content)).convert("RGB")
-            # Otherwise assume base64
             else:
-                # Handle data URI format
                 if "," in image_data:
                     image_data = image_data.split(",")[1]
                 image_bytes = base64.b64decode(image_data)
                 return Image.open(BytesIO(image_bytes)).convert("RGB")
         elif isinstance(image_data, bytes):
             return Image.open(BytesIO(image_data)).convert("RGB")
-        else:
-            raise ValueError(f"Unsupported image format: {type(image_data)}")
-    def _text_embedding(self, inputs: Union[str, List[str]]) -> List[Dict[str, Any]]:
-        """
-        Extract text embeddings.
-        Args:
-            inputs: Single text string or list of text strings
-        Returns:
-            List of dictionaries with normalized embeddings
-        """
-        texts = [inputs] if isinstance(inputs, str) else inputs
-        processed = self.processor(
-            text=texts,
-            padding="max_length",
-            return_tensors="pt"
-        ).to(self.device)
-        with torch.no_grad():
-            text_features = self.model.get_text_features(**processed)
-        # Normalize embeddings
-        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-        return [{"embedding": emb.cpu().tolist()} for emb in text_features]
-    def _image_embedding(self, inputs: Any) -> List[Dict[str, Any]]:
-        """
-        Extract image embeddings.
-        Args:
-            inputs: Single image or list of images (URL, base64, or bytes)
-        Returns:
-            List of dictionaries with normalized embeddings
-        """
-        # Handle single image or list of images
-        if isinstance(inputs, list):
-            images = [self._load_image(img) for img in inputs]
-        else:
-            images = [self._load_image(inputs)]
-        processed = self.processor(
-            images=images,
-            return_tensors="pt"
-        ).to(self.device)
-        with torch.no_grad():
-            image_features = self.model.get_image_features(**processed)
-        # Normalize embeddings
-        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-        return [{"embedding": emb.cpu().tolist()} for emb in image_features]
-    def _zero_shot(self, inputs: Any, candidate_labels: List[str]) -> List[Dict[str, Any]]:
-        """
-        Perform zero-shot image classification.
-        Args:
-            inputs: Image data (URL, base64, or bytes)
-            candidate_labels: List of text labels to classify against
-        Returns:
-            List of dictionaries with label and score, sorted by score descending
-        """
-        image = self._load_image(inputs)
-        processed = self.processor(
-            text=candidate_labels,
-            images=image,
-            padding="max_length",
-            return_tensors="pt"
-        ).to(self.device)
         with torch.no_grad():
-            outputs = self.model(**processed)
-            # Get image and text embeddings
-            image_embeds = outputs.image_embeds
-            text_embeds = outputs.text_embeds
-            # Normalize embeddings
-            image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
-            text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
-            # Compute similarity scores
-            logits_per_image = torch.matmul(image_embeds, text_embeds.t())
-            # Apply softmax to get probabilities
-            probs = torch.softmax(logits_per_image, dim=-1)
-        # Format results
-        scores = probs[0].cpu().tolist()
-        results = [
-            {"label": label, "score": score}
-            for label, score in zip(candidate_labels, scores)
-        ]
-        # Sort by score descending
-        results.sort(key=lambda x: x["score"], reverse=True)
-        return results
-    def _similarity(self, image_input: Any, text_input: Union[str, List[str]]) -> Dict[str, Any]:
-        """
-        Compute similarity between image(s) and text(s).
-        Args:
-            image_input: Image data
-            text_input: Text string or list of strings
-        Returns:
-            Dictionary with similarity scores
-        """
-        image = self._load_image(image_input)
-        texts = [text_input] if isinstance(text_input, str) else text_input
-        processed = self.processor(
-            text=texts,
-            images=image,
-            padding="max_length",
-            return_tensors="pt"
-        ).to(self.device)
         with torch.no_grad():
-            outputs = self.model(**processed)
-            image_embeds = outputs.image_embeds
-            text_embeds = outputs.text_embeds
-            # Normalize
-            image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
-            text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
-            # Compute cosine similarities
-            similarities = torch.matmul(image_embeds, text_embeds.t())
-        scores = similarities[0].cpu().tolist()
-        return {
-            "similarities": [
-                {"text": text, "score": score}
-                for text, score in zip(texts, scores)
-            ]
-        }
-    def __call__(self, data: Dict[str, Any]) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
-        """
-        Process inference requests with auto-detection of mode.
-        Args:
-            data: Dictionary containing:
-                - "inputs": Image data, text, or list thereof
-                - "parameters": Optional dict with:
-                    - "mode": One of "auto", "text_embedding", "image_embedding",
-                              "zero_shot", "similarity"
-                    - "candidate_labels": List of labels (for zero_shot mode)
-                    - "text": Text input (for similarity mode)
-        Returns:
-            Results based on the mode selected
-        """
         inputs = data.get("inputs", data)
         parameters = data.get("parameters", {})
         mode = parameters.get("mode", "auto")
-        # Auto-detect mode based on inputs and parameters
         if mode == "auto":
-            if "candidate_labels" in parameters:
-                mode = "zero_shot"
-            elif "text" in parameters and inputs:
                 mode = "similarity"
-            elif isinstance(inputs, str) and len(inputs) < 500 and not inputs.startswith(("http://", "https://", "data:")):
                 mode = "text_embedding"
             else:
                 mode = "image_embedding"
-        # Route to appropriate handler
-        if mode == "text_embedding":
-            return self._text_embedding(inputs)
         elif mode == "image_embedding":
             return self._image_embedding(inputs)
-        elif mode == "zero_shot":
-            candidate_labels = parameters.get("candidate_labels", [])
-            if isinstance(candidate_labels, str):
-                candidate_labels = [label.strip() for label in candidate_labels.split(",")]
-            if not candidate_labels:
-                raise ValueError("candidate_labels required for zero_shot mode")
-            return self._zero_shot(inputs, candidate_labels)
         elif mode == "similarity":
-            text = parameters.get("text")
-            if not text:
-                raise ValueError("text parameter required for similarity mode")
-            return self._similarity(inputs, text)
         else:
-            raise ValueError(f"Unknown mode: {mode}. Supported: auto, text_embedding, image_embedding, zero_shot, similarity")

+"""
+Custom Inference Handler for SigLIP2-base-patch16-512
+Supports: zero_shot, image_embedding, text_embedding, similarity
+Returns 768D embeddings.
+"""
 from typing import Any, Dict, List, Union
 import torch
 from PIL import Image
 class EndpointHandler:
     def __init__(self, path: str = ""):
+        """Load the model and its processor from ``path`` and switch the model to eval mode.
+
+        Args:
+            path: Location handed to both ``from_pretrained`` calls.
+        """
+        # Prefer CUDA when torch reports it as available, otherwise run on CPU.
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # NOTE(review): trust_remote_code=True lets the checkpoint ship its own
+        # Python code; confirm that `path` always points at a trusted repository.
         self.model = AutoModel.from_pretrained(path, trust_remote_code=True).to(self.device)
         self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
         self.model.eval()
     def _load_image(self, image_data: Any) -> Image.Image:
+        """Turn a URL, a base64 string, or raw bytes into an RGB PIL image.
+
+        Args:
+            image_data: A string starting with ``http://`` or ``https://`` is
+                downloaded; any other string is decoded as base64; ``bytes``
+                are opened directly.
+
+        Returns:
+            The decoded image converted to RGB.
+
+        Raises:
+            ValueError: If ``image_data`` is neither ``str`` nor ``bytes``.
+        """
         if isinstance(image_data, str):
             if image_data.startswith(("http://", "https://")):
+                # NOTE(review): the URL comes straight from the request payload;
+                # confirm that fetching arbitrary hosts is acceptable here.
                 response = requests.get(image_data, timeout=10)
+                # Surface HTTP error statuses instead of decoding an error page.
                 response.raise_for_status()
                 return Image.open(BytesIO(response.content)).convert("RGB")
             else:
+                # A comma means a header precedes the payload (e.g. a data URI
+                # such as "data:image/png;base64,...."); keep the part after it.
                 if "," in image_data:
                     image_data = image_data.split(",")[1]
                 image_bytes = base64.b64decode(image_data)
                 return Image.open(BytesIO(image_bytes)).convert("RGB")
         elif isinstance(image_data, bytes):
             return Image.open(BytesIO(image_data)).convert("RGB")
+        # Reached only when the input is neither str nor bytes.
+        raise ValueError(f"Unsupported image format: {type(image_data)}")
+    def _get_image_embeddings(self, images: List[Image.Image]) -> torch.Tensor:
+        """Run the images through the model and scale the features to unit norm.
+
+        Args:
+            images: Images handed to the processor as one batch.
+
+        Returns:
+            The model's image features divided by their norm along the last
+            dimension.
+        """
+        batch = self.processor(images=images, return_tensors="pt")
+        batch = batch.to(self.device)
+        # Inference only: no autograd graph is needed.
         with torch.no_grad():
+            raw = self.model.get_image_features(**batch)
+        lengths = raw.norm(dim=-1, keepdim=True)
+        return raw / lengths
+    def _get_text_embeddings(self, texts: List[str]) -> torch.Tensor:
+        """Run the texts through the model and scale the features to unit norm.
+
+        Args:
+            texts: Strings handed to the processor as one batch, padded with
+                ``padding="max_length"``.
+
+        Returns:
+            The model's text features divided by their norm along the last
+            dimension.
+        """
+        batch = self.processor(text=texts, padding="max_length", return_tensors="pt")
+        batch = batch.to(self.device)
+        # Inference only: no autograd graph is needed.
         with torch.no_grad():
+            raw = self.model.get_text_features(**batch)
+        lengths = raw.norm(dim=-1, keepdim=True)
+        return raw / lengths
+    @staticmethod
+    def _looks_like_text(value: Any) -> bool:
+        """Return True for a short string that is not an http(s) URL or a data URI."""
+        # Use the same URL prefixes as _load_image, so plain text that merely
+        # starts with "http" (e.g. "httpd logo") is not mistaken for an image.
+        return (
+            isinstance(value, str)
+            and len(value) < 500
+            and not value.startswith(("http://", "https://", "data:"))
+        )
+    def __call__(self, data: Dict[str, Any]) -> Any:
+        """Dispatch a request to the handler for the requested (or detected) mode.
+
+        Args:
+            data: Request payload. ``data["inputs"]`` holds the input (the whole
+                payload is used when the key is absent); ``data["parameters"]``
+                may carry ``mode`` and ``candidate_labels``.
+
+        Returns:
+            Whatever the selected mode handler returns.
+
+        Raises:
+            ValueError: If ``mode`` is not one of ``auto``, ``zero_shot``,
+                ``image_embedding``, ``text_embedding`` or ``similarity``.
+        """
         inputs = data.get("inputs", data)
+        # `or {}` also covers an explicit `"parameters": null` in the payload.
+        parameters = data.get("parameters") or {}
         mode = parameters.get("mode", "auto")
+        # Auto-detect mode
         if mode == "auto":
+            if isinstance(inputs, dict) and ("image" in inputs or "images" in inputs):
                 mode = "similarity"
+            elif "candidate_labels" in parameters:
+                mode = "zero_shot"
+            elif self._looks_like_text(inputs) or (
+                isinstance(inputs, list) and all(self._looks_like_text(i) for i in inputs)
+            ):
                 mode = "text_embedding"
             else:
                 mode = "image_embedding"
+        if mode == "zero_shot":
+            return self._zero_shot(inputs, parameters)
         elif mode == "image_embedding":
             return self._image_embedding(inputs)
+        elif mode == "text_embedding":
+            return self._text_embedding(inputs)
         elif mode == "similarity":
+            return self._similarity(inputs)
         else:
+            raise ValueError(f"Unknown mode: {mode}")
+    def _zero_shot(self, inputs, parameters):
+        """Score one or more images against a set of candidate labels.
+
+        Args:
+            inputs: A single image or a list of images, each in any form
+                accepted by ``_load_image``.
+            parameters: Request parameters. ``candidate_labels`` may be a list
+                of strings or one comma-separated string; when the key is
+                absent, ``["photo", "illustration", "diagram"]`` is used.
+
+        Returns:
+            For a single image, a list of ``{"label", "score"}`` dicts sorted by
+            descending score; for several images, one such list per image.
+
+        Raises:
+            ValueError: If no usable candidate label remains.
+        """
+        candidate_labels = parameters.get("candidate_labels", ["photo", "illustration", "diagram"])
+        if isinstance(candidate_labels, str):
+            # Tolerate stray or trailing commas ("a, b,") by dropping blank entries.
+            pieces = (piece.strip() for piece in candidate_labels.split(","))
+            candidate_labels = [piece for piece in pieces if piece]
+        # Fail early with a clear message, before any image is fetched or decoded.
+        if not candidate_labels:
+            raise ValueError("candidate_labels required for zero_shot mode")
+        candidate_labels = list(candidate_labels)
+        if isinstance(inputs, list):
+            images = [self._load_image(item) for item in inputs]
+        else:
+            images = [self._load_image(inputs)]
+        image_embeds = self._get_image_embeddings(images)
+        text_embeds = self._get_text_embeddings(candidate_labels)
+        logits = image_embeds @ text_embeds.T
+        # NOTE(review): both embedding sets are unit-normalised, so these logits
+        # are plain cosine similarities; a softmax over them without the model's
+        # learned scale tends to give fairly flat scores. Confirm this is intended.
+        probs = torch.softmax(logits, dim=-1)
+        results = [
+            [
+                {"label": label, "score": score}
+                for label, score in sorted(
+                    zip(candidate_labels, row.cpu().tolist()), key=lambda pair: -pair[1]
+                )
+            ]
+            for row in probs
+        ]
+        # A single image keeps the flat list shape; batches return a list per image.
+        return results[0] if len(results) == 1 else results
+    def _image_embedding(self, inputs):
+        """Embed one image or a list of images.
+
+        Args:
+            inputs: A single image or a list of images, each in any form
+                accepted by ``_load_image``.
+
+        Returns:
+            One ``{"embedding": [...]}`` dict per image, in input order.
+        """
+        if isinstance(inputs, list):
+            loaded = [self._load_image(item) for item in inputs]
+        else:
+            loaded = [self._load_image(inputs)]
+        vectors = self._get_image_embeddings(loaded)
+        payload = []
+        for vector in vectors:
+            payload.append({"embedding": vector.cpu().tolist()})
+        return payload
+    def _text_embedding(self, inputs):
+        """Embed one string or a list of strings.
+
+        Args:
+            inputs: A single string, which is wrapped in a one-element list, or
+                an already-batched collection that is passed through unchanged.
+
+        Returns:
+            One ``{"embedding": [...]}`` dict per text, in input order.
+        """
+        batch = inputs if not isinstance(inputs, str) else [inputs]
+        vectors = self._get_text_embeddings(batch)
+        return [{"embedding": vector.cpu().tolist()} for vector in vectors]
+    def _similarity(self, inputs):
+        """Compute the image-to-text cosine similarity matrix.
+
+        Args:
+            inputs: Dict holding the image(s) under ``"image"`` or ``"images"``
+                and the text(s) under ``"text"`` or ``"texts"``. Each side may
+                be a single item or a list.
+
+        Returns:
+            Dict with ``similarity_scores`` (one row per image, one column per
+            text), ``image_count`` and ``text_count``.
+
+        Raises:
+            ValueError: If ``inputs`` is not a dict or lacks images or texts.
+        """
+        # An explicit mode="similarity" can reach this method with any payload
+        # shape, so validate up front instead of failing inside the processor.
+        if not isinstance(inputs, dict):
+            raise ValueError("similarity mode expects inputs to be a dict with image(s) and text(s)")
+        image_input = inputs.get("image") or inputs.get("images")
+        text_input = inputs.get("text") or inputs.get("texts")
+        if not image_input:
+            raise ValueError("similarity mode requires 'image' or 'images' in inputs")
+        if not text_input:
+            raise ValueError("similarity mode requires 'text' or 'texts' in inputs")
+        if isinstance(image_input, list):
+            images = [self._load_image(item) for item in image_input]
+        else:
+            images = [self._load_image(image_input)]
+        texts = [text_input] if isinstance(text_input, str) else text_input
+        image_embeds = self._get_image_embeddings(images)
+        text_embeds = self._get_text_embeddings(texts)
+        # Both sides are unit-normalised, so the matrix product is cosine similarity.
+        similarity = (image_embeds @ text_embeds.T).cpu().tolist()
+        return {"similarity_scores": similarity, "image_count": len(images), "text_count": len(texts)}