manevamarija
/

clip-vit-base-custom-handler

Model card Files Files and versions

manevamarija committed on May 18, 2025

Commit

eef716f

·

verified ·

1 Parent(s): 28cb177

Update handler.py

Files changed (1) hide show

handler.py +36 -14

handler.py CHANGED Viewed

@@ -1,27 +1,49 @@
 from typing import Dict, List, Any
 from PIL import Image
 from io import BytesIO
-from transformers import pipeline
 import base64
 class EndpointHandler():
     """Endpoint handler that runs zero-shot image classification through a pipeline."""

     def __init__(self, path=""):
         """Build the zero-shot image classification pipeline.

         Args:
             path: Not used by this handler; the checkpoint name is hard-coded.
         """
         self.pipeline = pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14-336")

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Classify one image against the supplied candidate labels.

         Args:
             data: Request payload, either ``{"inputs": {...}}`` or the inner
                 dict itself, with the keys:

                 * ``"image"``: base64-encoded image bytes.
                 * ``"candiates"``: list of candidate label strings (the key
                   is spelled this way in the request format).

         Returns:
             A list of dicts like ``{"label": "XXX", "score": 0.82}``.

         Raises:
             KeyError: If ``"image"`` or ``"candiates"`` is missing.
         """
         # Read with get() rather than pop() so the caller's dict is not mutated.
         inputs = data.get("inputs", data)
         # Look both keys up before decoding so a malformed request fails early.
         encoded_image = inputs["image"]
         candidate_labels = inputs["candiates"]
         # Decode the base64 payload into a PIL image.
         image = Image.open(BytesIO(base64.b64decode(encoded_image)))
         # Run prediction with the provided candidate labels.
         prediction = self.pipeline(images=[image], candidate_labels=candidate_labels)
         # A single-image batch was submitted, so return its (first) result.
         return prediction[0]

 from typing import Dict, List, Any
 from PIL import Image
 from io import BytesIO
 import base64
+import torch
+from transformers import CLIPProcessor, CLIPModel
 class EndpointHandler():
     """Endpoint handler that scores one image against text labels with CLIP.

     Each request carries a base64-encoded image and a list of candidate
     labels. The response ranks the labels by the cosine similarity between
     the image embedding and each label's text embedding.
     """

     # Checkpoint used for both the model weights and the processor config.
     MODEL_ID = "openai/clip-vit-base-patch32"

     def __init__(self, path=""):
         """Load the CLIP model and its processor.

         Args:
             path: Not used by this handler; ``MODEL_ID`` is always loaded.
         """
         self.model = CLIPModel.from_pretrained(self.MODEL_ID)
         self.processor = CLIPProcessor.from_pretrained(self.MODEL_ID)
         # The handler only runs inference, so put the model in eval mode.
         self.model.eval()

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Rank the candidate labels by cosine similarity to the image.

         Args:
             data: Request payload, either ``{"inputs": {...}}`` or the inner
                 dict itself, with the keys:

                 * ``"image"``: base64-encoded image bytes.
                 * ``"candiates"``: list of label strings; a single string is
                   treated as one label. The misspelled key is the existing
                   request format and is checked first; ``"candidates"`` is
                   accepted as a fallback.

         Returns:
             A list of ``{"label": str, "score": float}`` dicts sorted by
             descending score. Scores are raw cosine similarities, not
             softmax probabilities.

         Raises:
             KeyError: If the image or the label list is missing.
             ValueError: If the label list is empty.
         """
         inputs = data.get("inputs", data)
         encoded_image = inputs["image"]
         if "candiates" in inputs:
             categories = inputs["candiates"]
         else:
             categories = inputs["candidates"]
         # A bare string would otherwise be zipped character by character below.
         if isinstance(categories, str):
             categories = [categories]
         else:
             categories = list(categories)
         # Reject an empty label list up front, before any image/model work.
         if not categories:
             raise ValueError("at least one candidate label is required")

         # Convert to RGB so the processor always receives a three-channel image.
         image = Image.open(BytesIO(base64.b64decode(encoded_image))).convert("RGB")
         processed = self.processor(text=categories, images=image, return_tensors="pt", padding=True)

         with torch.no_grad():
             image_features = self.model.get_image_features(pixel_values=processed["pixel_values"])
             text_features = self.model.get_text_features(
                 input_ids=processed["input_ids"],
                 attention_mask=processed["attention_mask"],
             )
             # L2-normalise so the dot product below is the cosine similarity.
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
             # One image per request: row 0 holds one score per label.
             scores = (image_features @ text_features.T)[0].tolist()

         # Format output with raw cosine scores, best match first.
         ranked = [{"label": label, "score": score} for label, score in zip(categories, scores)]
         ranked.sort(key=lambda item: item["score"], reverse=True)
         return ranked