Zero-Shot Image Classification
Transformers
Safetensors
siglip
vision
basiliskan committed on
Commit
d7068bc
·
verified ·
1 Parent(s): f8ac65e

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +159 -26
handler.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Any, Dict, List
2
  import torch
3
  from PIL import Image
4
  import requests
@@ -48,48 +48,83 @@ class EndpointHandler:
48
  else:
49
  raise ValueError(f"Unsupported image format: {type(image_data)}")
50
 
51
- def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
52
  """
53
- Process inference requests for zero-shot image classification.
54
 
55
  Args:
56
- data: Dictionary containing:
57
- - "inputs": Image data (URL, base64, or bytes)
58
- - "parameters": Optional dict with:
59
- - "candidate_labels": List of text labels to classify against
60
-
61
  Returns:
62
- List of dictionaries with "label" and "score" for each candidate
63
  """
64
- # Extract inputs
65
- inputs = data.get("inputs")
66
- parameters = data.get("parameters", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # Get candidate labels (required for zero-shot classification)
69
- candidate_labels = parameters.get("candidate_labels", [])
 
 
70
 
71
- if not candidate_labels:
72
- # Default labels if none provided
73
- candidate_labels = ["a photo", "an illustration", "a diagram"]
 
 
74
 
75
- # Ensure candidate_labels is a list
76
- if isinstance(candidate_labels, str):
77
- candidate_labels = [label.strip() for label in candidate_labels.split(",")]
 
 
78
 
79
- # Load the image
 
 
 
 
 
 
80
  image = self._load_image(inputs)
81
 
82
- # Process inputs
83
- processed_inputs = self.processor(
84
  text=candidate_labels,
85
  images=image,
86
  padding="max_length",
87
  return_tensors="pt"
88
  ).to(self.device)
89
 
90
- # Run inference
91
  with torch.no_grad():
92
- outputs = self.model(**processed_inputs)
93
 
94
  # Get image and text embeddings
95
  image_embeds = outputs.image_embeds
@@ -115,4 +150,102 @@ class EndpointHandler:
115
  # Sort by score descending
116
  results.sort(key=lambda x: x["score"], reverse=True)
117
 
118
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Union
2
  import torch
3
  from PIL import Image
4
  import requests
 
48
  else:
49
  raise ValueError(f"Unsupported image format: {type(image_data)}")
50
 
51
def _text_embedding(self, inputs: Union[str, List[str]]) -> List[Dict[str, Any]]:
    """Compute L2-normalized text embeddings.

    Args:
        inputs: A single text string or a list of text strings.

    Returns:
        One ``{"embedding": [...]}`` dict per input text, in order.
    """
    # Promote a lone string to a one-element batch.
    if isinstance(inputs, str):
        batch = [inputs]
    else:
        batch = inputs

    encoded = self.processor(
        text=batch,
        padding="max_length",
        return_tensors="pt"
    ).to(self.device)

    with torch.no_grad():
        features = self.model.get_text_features(**encoded)

    # Unit-length rows so downstream dot products act as cosine similarity.
    features = features / features.norm(dim=-1, keepdim=True)

    results = []
    for row in features:
        results.append({"embedding": row.cpu().tolist()})
    return results
76
+
77
def _image_embedding(self, inputs: Any) -> List[Dict[str, Any]]:
    """Compute L2-normalized image embeddings.

    Args:
        inputs: One image or a list of images (URL, base64, or bytes);
            each item is decoded via ``self._load_image``.

    Returns:
        One ``{"embedding": [...]}`` dict per input image, in order.
    """
    # Normalize the payload to a list of decoded images.
    raw_items = inputs if isinstance(inputs, list) else [inputs]
    images = [self._load_image(item) for item in raw_items]

    encoded = self.processor(
        images=images,
        return_tensors="pt"
    ).to(self.device)

    with torch.no_grad():
        features = self.model.get_image_features(**encoded)

    # Unit-length rows so downstream dot products act as cosine similarity.
    features = features / features.norm(dim=-1, keepdim=True)

    return [{"embedding": row.cpu().tolist()} for row in features]
105
+
106
+ def _zero_shot(self, inputs: Any, candidate_labels: List[str]) -> List[Dict[str, Any]]:
107
+ """
108
+ Perform zero-shot image classification.
109
 
110
+ Args:
111
+ inputs: Image data (URL, base64, or bytes)
112
+ candidate_labels: List of text labels to classify against
113
+
114
+ Returns:
115
+ List of dictionaries with label and score, sorted by score descending
116
+ """
117
  image = self._load_image(inputs)
118
 
119
+ processed = self.processor(
 
120
  text=candidate_labels,
121
  images=image,
122
  padding="max_length",
123
  return_tensors="pt"
124
  ).to(self.device)
125
 
 
126
  with torch.no_grad():
127
+ outputs = self.model(**processed)
128
 
129
  # Get image and text embeddings
130
  image_embeds = outputs.image_embeds
 
150
  # Sort by score descending
151
  results.sort(key=lambda x: x["score"], reverse=True)
152
 
153
+ return results
154
+
155
def _similarity(self, image_input: Any, text_input: Union[str, List[str]]) -> Dict[str, Any]:
    """Score one image against one or more texts by cosine similarity.

    Args:
        image_input: Image data (URL, base64, or bytes).
        text_input: A text string or a list of text strings.

    Returns:
        ``{"similarities": [{"text": ..., "score": ...}, ...]}`` with one
        entry per input text, in input order.
    """
    image = self._load_image(image_input)
    # Promote a lone string to a one-element batch.
    texts = [text_input] if isinstance(text_input, str) else text_input

    encoded = self.processor(
        text=texts,
        images=image,
        padding="max_length",
        return_tensors="pt"
    ).to(self.device)

    with torch.no_grad():
        outputs = self.model(**encoded)

    # L2-normalize both embedding sets so the matmul below is cosine similarity.
    img_vecs = outputs.image_embeds
    txt_vecs = outputs.text_embeds
    img_vecs = img_vecs / img_vecs.norm(p=2, dim=-1, keepdim=True)
    txt_vecs = txt_vecs / txt_vecs.norm(p=2, dim=-1, keepdim=True)

    # (num_images x num_texts); a single image is loaded, so take row 0.
    cosine = torch.matmul(img_vecs, txt_vecs.t())
    scores = cosine[0].cpu().tolist()

    pairs = [{"text": text, "score": score} for text, score in zip(texts, scores)]
    return {"similarities": pairs}
197
+
198
def __call__(self, data: Dict[str, Any]) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Process inference requests with auto-detection of mode.

    Args:
        data: Dictionary containing:
            - "inputs": Image data, text, or list thereof
            - "parameters": Optional dict with:
                - "mode": One of "auto", "text_embedding", "image_embedding",
                  "zero_shot", "similarity"
                - "candidate_labels": List of labels (for zero_shot mode)
                - "text": Text input (for similarity mode)

    Returns:
        Results based on the mode selected.

    Raises:
        ValueError: If a required parameter for the resolved mode is missing
            or the mode name is unknown.
    """
    # Fall back to the raw payload when no "inputs" key is present.
    inputs = data.get("inputs", data)
    parameters = data.get("parameters", {})
    mode = parameters.get("mode", "auto")

    # Auto-detect mode based on inputs and parameters.
    if mode == "auto":
        if "candidate_labels" in parameters:
            mode = "zero_shot"
        elif "text" in parameters and inputs:
            mode = "similarity"
        elif isinstance(inputs, str) and len(inputs) < 500 and not inputs.startswith(("http://", "https://", "data:")):
            # Heuristic: short non-URL, non-data-URI strings are treated as
            # text; base64 image payloads are normally far longer than 500 chars.
            mode = "text_embedding"
        else:
            mode = "image_embedding"

    # Route to appropriate handler.
    if mode == "text_embedding":
        return self._text_embedding(inputs)

    elif mode == "image_embedding":
        return self._image_embedding(inputs)

    elif mode == "zero_shot":
        candidate_labels = parameters.get("candidate_labels", [])
        if isinstance(candidate_labels, str):
            # Split a comma-separated string, dropping empty fragments so
            # inputs like "cat,,dog", "cat," or "" cannot produce "" labels
            # that would slip past the emptiness check below.
            candidate_labels = [
                label.strip()
                for label in candidate_labels.split(",")
                if label.strip()
            ]
        if not candidate_labels:
            raise ValueError("candidate_labels required for zero_shot mode")
        return self._zero_shot(inputs, candidate_labels)

    elif mode == "similarity":
        text = parameters.get("text")
        if not text:
            raise ValueError("text parameter required for similarity mode")
        return self._similarity(inputs, text)

    else:
        raise ValueError(f"Unknown mode: {mode}. Supported: auto, text_embedding, image_embedding, zero_shot, similarity")