Added handler.py to support running the model in Inference Endpoints.
The call to the handler takes a single text and image as input.
It returns the embeddings of the text and image as well as their cosine similarity.
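As a sketch of how the deployed endpoint could be invoked over HTTP: the JSON body carries a "text" key plus either an "image_url" or an "image" key, matching what the handler below reads. The endpoint URL and token here are placeholders, not part of this commit.

import requests

# Placeholder values; substitute your deployment's URL and an access token.
API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HEADERS = {"Authorization": "Bearer <HF_TOKEN>", "Content-Type": "application/json"}

payload = {
    "text": "a photo of a cat",
    "image_url": "https://example.com/cat.jpg",  # placeholder image URL
}

response = requests.post(API_URL, headers=HEADERS, json=payload)
result = response.json()
print(result["embedding_similarity"])  # cosine similarity of the two embeddings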
- handler.py +27 -0
handler.py (ADDED)
@@ -0,0 +1,27 @@
+from typing import Dict
+import requests
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        self.processor = CLIPProcessor.from_pretrained(path)
+        self.model = CLIPModel.from_pretrained(path)
+
+    def __call__(self, data: Dict) -> Dict:
+        text = data.pop("text")
+        if "image_url" in data:
+            image_url = data.pop("image_url")
+            image = Image.open(requests.get(image_url, stream=True).raw)
+        else:
+            image = data.pop("image")
+        inputs = self.processor(text=text, images=image,
+                                return_tensors="pt", padding=True, truncation=True)
+        outputs = self.model(**inputs)
+        embedding_similarity = cosine_similarity(outputs.text_embeds.detach().numpy(),
+                                                 outputs.image_embeds.detach().numpy())[0][0].item()
+        return {"text_embedding": outputs.text_embeds[0].tolist(),
+                "image_embedding": outputs.image_embeds[0].tolist(),
+                "embedding_similarity": embedding_similarity}
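For local verification before deploying, the handler can be exercised directly. A minimal smoke test might look like the following, where the checkpoint and image URL are placeholders (any CLIP checkpoint with a processor works, e.g. openai/clip-vit-base-patch32):

from handler import EndpointHandler

# Placeholder checkpoint; Inference Endpoints passes the repository path itself.
handler = EndpointHandler(path="openai/clip-vit-base-patch32")

# Mirror the request body the endpoint would receive.
result = handler({
    "text": "a photo of a cat",
    "image_url": "https://example.com/cat.jpg",  # placeholder image URL
})

print(len(result["text_embedding"]))   # dimensionality of the projected text embedding
print(result["embedding_similarity"])  # cosine similarity between text and image

Since scikit-learn's cosine_similarity normalizes its inputs, the similarity lands in [-1, 1] regardless of the scale of the embeddings the model returns.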