import io, base64, torch, open_clip
from PIL import Image


class EndpointHandler:
    """
    MobileCLIP-B ('datacompdr') zero-shot classifier with a per-process
    text-embedding cache.

    Client JSON must look like:
        {
            "inputs": {
                "image": "<base64 PNG/JPEG>",
                "candidate_labels": ["a photo of a cat", ...]
            }
        }
    """

    # ---------- init (runs once per container) ----------
    def __init__(self, path=""):
        # open_clip registers this model as "MobileCLIP-B"; the lowercase
        # "mobileclip_b" spelling belongs to Apple's `mobileclip` package.
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            "MobileCLIP-B", pretrained="datacompdr"
        )
        self.model.eval()
        self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.cache: dict[str, torch.Tensor] = {}  # prompt -> normalized embedding
    # ----------------- inference ------------------------
    def __call__(self, data):
        payload = data.get("inputs", data)
        img_b64 = payload["image"]
        labels = payload.get("candidate_labels", [])
        if not labels:
            return {"error": "candidate_labels list is empty"}

        # Image -> tensor
        img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_t = self.preprocess(img).unsqueeze(0).to(self.device)

        # Text embeddings (computed once per prompt, then cached)
        new = [l for l in labels if l not in self.cache]
        if new:
            tok = self.tokenizer(new).to(self.device)
            with torch.no_grad():
                emb = self.model.encode_text(tok)
                emb = emb / emb.norm(dim=-1, keepdim=True)
            for l, e in zip(new, emb):
                self.cache[l] = e
        txt_t = torch.stack([self.cache[l] for l in labels])

        # Forward pass; autocast is enabled only when a GPU is present
        with torch.no_grad(), torch.autocast(self.device, enabled=self.device == "cuda"):
            img_f = self.model.encode_image(img_t)
            img_f = img_f / img_f.norm(dim=-1, keepdim=True)
            probs = (100 * img_f @ txt_t.T).softmax(dim=-1)[0].tolist()

        return [
            {"label": l, "score": float(p)}
            for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
        ]
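

# ------------------------------------------------------------------ #
# Minimal local smoke test (not part of the endpoint contract).      #
# It builds the JSON payload documented in the class docstring and   #
# calls the handler directly. "test.jpg" is a placeholder path; any  #
# small RGB image works.                                              #
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    with open("test.jpg", "rb") as f:  # hypothetical sample image
        b64 = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler()
    result = handler(
        {
            "inputs": {
                "image": b64,
                "candidate_labels": [
                    "a photo of a cat",
                    "a photo of a dog",
                    "a photo of a car",
                ],
            }
        }
    )
    print(result)  # highest-scoring label first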
# ------------------------------------------------------------------ #
# Earlier variant, kept for reference: loads weights from a local    #
# checkpoint and recomputes text embeddings on every call (no        #
# per-process cache).                                                #
# ------------------------------------------------------------------ #
# # handler.py (repo root)
# import io, base64, torch
# from PIL import Image
# import open_clip
#
#
# class EndpointHandler:
#     """
#     Zero-shot classifier for MobileCLIP-B (OpenCLIP).
#
#     Expected client JSON *to the endpoint*:
#         {
#             "inputs": {
#                 "image": "<base64 PNG/JPEG>",
#                 "candidate_labels": ["cat", "dog", ...]
#             }
#         }
#     """
#
#     def __init__(self, path: str = ""):
#         weights = f"{path}/mobileclip_b.pt"
#         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
#             "MobileCLIP-B", pretrained=weights
#         )
#         self.model.eval()
#         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
#         self.model.to(self.device)
#
#     def __call__(self, data):
#         # Unwrap Hugging Face's `inputs` envelope
#         payload = data.get("inputs", data)
#         img_b64 = payload["image"]
#         labels = payload.get("candidate_labels", [])
#         if not labels:
#             return {"error": "candidate_labels list is empty"}
#
#         # Decode & preprocess image
#         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
#
#         # Tokenise labels (recomputed on every request in this variant)
#         text_tokens = self.tokenizer(labels).to(self.device)
#
#         # Forward pass; autocast only when a GPU is present
#         with torch.no_grad(), torch.autocast(self.device, enabled=self.device == "cuda"):
#             img_feat = self.model.encode_image(img_tensor)
#             txt_feat = self.model.encode_text(text_tokens)
#             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
#             txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
#
#         # Sorted output
#         return [
#             {"label": l, "score": float(p)}
#             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
#         ]
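
# ------------------------------------------------------------------ #
# Hedged sketch of a client-side call to the deployed endpoint,      #
# kept commented out like the reference variant above. ENDPOINT_URL, #
# HF_TOKEN, and "test.jpg" are placeholders you must supply; the     #
# payload shape matches the docstring contract.                      #
# ------------------------------------------------------------------ #
# import base64, requests
#
# ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
# HF_TOKEN = "hf_..."  # placeholder access token
#
# with open("test.jpg", "rb") as f:  # hypothetical sample image
#     b64 = base64.b64encode(f.read()).decode("utf-8")
#
# response = requests.post(
#     ENDPOINT_URL,
#     headers={"Authorization": f"Bearer {HF_TOKEN}"},
#     json={"inputs": {"image": b64, "candidate_labels": ["cat", "dog"]}},
# )
# print(response.json())  # [{"label": ..., "score": ...}, ...] sorted by score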