File size: 7,273 Bytes

233acb0
aa10251
566678a
9188b68
5ca6bfd
9188b68
 
35037e4
5ca6bfd
 
 
 
 
35037e4
5ca6bfd
35037e4
08ce2dc
566678a
5ca6bfd
35037e4
566678a
35037e4
5ca6bfd
35037e4
566678a
825b375
35037e4
9188b68
5ca6bfd
aa10251
9188b68
825b375
e1369ab
 
08ce2dc
 
9188b68
5ca6bfd
88b442b
 
35037e4
5ca6bfd
 
 
 
aa10251
566678a
aa10251
5ca6bfd
88b442b
 
08ce2dc
5ca6bfd
35037e4
88b442b
 
 
35037e4
 
08ce2dc
 
35037e4
aa10251
566678a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa10251

# handler.py  (repo root)

import io, base64, torch, open_clip
from PIL import Image
from mobileclip.modules.common.mobileone import reparameterize_model             # optional

class EndpointHandler:
    """Zero-shot image classifier on MobileCLIP-B with a text-embedding cache.

    The model is created through ``open_clip`` as ``"mobileclip_b"`` with the
    ``"datacompdr"`` pretrained tag. L2-normalised text embeddings are cached
    per label string so that a label seen before is not encoded again.

    Expected request::

        {
          "inputs": {
            "image": "<base64>",
            "candidate_labels": ["a photo of a cat", ...]
          }
        }

    The ``"inputs"`` envelope is optional: when it is absent the top-level
    mapping itself is used as the payload.
    """

    # Upper bound on the number of cached label embeddings. Labels come from
    # clients, so without a bound the cache would grow for the lifetime of
    # the process. The oldest entries are evicted first.
    max_cache_size = 10_000

    # Factor applied to the cosine similarities before the softmax.
    _LOGIT_SCALE = 100

    def __init__(self, path=""):
        """Load the model, preprocessing transform and tokenizer.

        Args:
            path: Accepted for interface compatibility; not used, because the
                weights are resolved through the ``pretrained`` tag.
        """
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            "mobileclip_b", pretrained="datacompdr"
        )
        # Re-parameterise and switch to eval mode (kept from the original
        # code, which notes that this matches the local pipeline).
        self.model = reparameterize_model(self.model).eval()

        self.tokenizer = open_clip.get_tokenizer("mobileclip_b")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        # label -> normalised text embedding, in insertion order
        self.cache: dict[str, torch.Tensor] = {}

    @staticmethod
    def _decode_image(img_b64):
        """Decode a base64 string into an RGB PIL image."""
        # Decode first, in its own statement, so malformed base64 is reported
        # before any image parsing is attempted.
        raw = base64.b64decode(img_b64)
        return Image.open(io.BytesIO(raw)).convert("RGB")

    def _text_features(self, labels):
        """Return stacked normalised text embeddings for ``labels``, in order."""
        # dict.fromkeys de-duplicates while preserving order, so a label that
        # is repeated within one request is encoded only once.
        missing = [
            label for label in dict.fromkeys(labels) if label not in self.cache
        ]

        fresh = {}
        if missing:
            tokens = self.tokenizer(missing).to(self.device)
            with torch.no_grad():
                emb = self.model.encode_text(tokens)
                emb = emb / emb.norm(dim=-1, keepdim=True)
            fresh = dict(zip(missing, emb))

        # Build the result before touching the cache so that eviction can
        # never drop an embedding the current request still needs.
        features = torch.stack(
            [fresh[label] if label in fresh else self.cache[label]
             for label in labels]
        )

        self.cache.update(fresh)
        overflow = len(self.cache) - self.max_cache_size
        if overflow > 0:
            # Dicts iterate in insertion order, so these are the oldest keys.
            for stale in list(self.cache)[:overflow]:
                del self.cache[stale]

        return features

    def __call__(self, data):
        """Classify one image against the candidate labels.

        Args:
            data: Request mapping, optionally wrapped in an ``"inputs"`` key,
                holding ``"image"`` (base64 string) and ``"candidate_labels"``
                (list of strings; a single string is treated as one label).

        Returns:
            A list of ``{"label": ..., "score": ...}`` dicts sorted by score,
            highest first, or an ``{"error": ...}`` dict when the labels are
            empty or the image is missing or cannot be decoded.
        """
        payload = data.get("inputs", data)

        labels = payload.get("candidate_labels", [])
        if not labels:
            return {"error": "candidate_labels list is empty"}
        if isinstance(labels, str):
            # A bare string would otherwise be iterated character by character.
            labels = [labels]

        img_b64 = payload.get("image")
        if not img_b64:
            return {"error": "image is missing"}

        # -------- image preprocessing --------------------------------------
        try:
            img = self._decode_image(img_b64)
        except (ValueError, TypeError, OSError):
            # ValueError covers malformed base64 (binascii.Error), TypeError a
            # non-string payload, OSError bytes that PIL cannot read.
            return {"error": "image could not be decoded"}
        img_t = self.preprocess(img).unsqueeze(0).to(self.device)

        # -------- text embeddings with cache -------------------------------
        txt_t = self._text_features(labels)

        # -------- forward & softmax ----------------------------------------
        # Mixed precision is only enabled on CUDA; on CPU the context is a
        # no-op instead of warning on every request.
        device_type = torch.device(self.device).type
        with torch.no_grad(), torch.autocast(
            device_type=device_type, enabled=device_type == "cuda"
        ):
            img_f = self.model.encode_image(img_t)
            img_f = img_f / img_f.norm(dim=-1, keepdim=True)
            logits = self._LOGIT_SCALE * img_f @ txt_t.T
            probs = logits.softmax(dim=-1)[0].tolist()

        ranked = sorted(
            zip(labels, probs), key=lambda pair: pair[1], reverse=True
        )
        return [{"label": label, "score": float(p)} for label, p in ranked]

# import io, base64, torch
# from PIL import Image
# import open_clip


# class EndpointHandler:
#     """
#     Zero‑shot classifier for MobileCLIP‑B (OpenCLIP) with a text‑embedding cache.

#     Client JSON:
#     {
#       "inputs": {
#         "image": "<base64 PNG/JPEG>",
#         "candidate_labels": ["cat", "dog", ...]
#       }
#     }
#     """

#     # ------------------------------------------------- #
#     #                 INITIALISATION                    #
#     # ------------------------------------------------- #
#     def __init__(self, path: str = ""):
#         weights = f"{path}/mobileclip_b.pt"

#         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
#             "MobileCLIP-B", pretrained=weights
#         )
#         self.model.eval()

#         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
#         self.model.to(self.device)

#         # cache: {prompt -> 1×512 tensor on device}
#         self.label_cache: dict[str, torch.Tensor] = {}

#     # ------------------------------------------------- #
#     #                    INFERENCE                      #
#     # ------------------------------------------------- #
#     def __call__(self, data):
#         payload = data.get("inputs", data)

#         img_b64 = payload["image"]
#         labels  = payload.get("candidate_labels", [])
#         if not labels:
#             return {"error": "candidate_labels list is empty"}

#         # --- image ----
#         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

#         # --- text (with cache) ----
#         missing = [l for l in labels if l not in self.label_cache]
#         if missing:
#             tokens = self.tokenizer(missing).to(self.device)
#             with torch.no_grad():
#                 emb = self.model.encode_text(tokens)
#                 emb = emb / emb.norm(dim=-1, keepdim=True)
#             for l, e in zip(missing, emb):
#                 self.label_cache[l] = e
#         txt_feat = torch.stack([self.label_cache[l] for l in labels])

#         # --- forward & softmax ----
#         with torch.no_grad(), torch.cuda.amp.autocast():
#             img_feat = self.model.encode_image(img_tensor)
#             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()

#         # --- sorted output ----
#         return [
#             {"label": l, "score": float(p)}
#             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
#         ]

# # handler.py  (repo root)
# import io, base64, torch
# from PIL import Image
# import open_clip

# class EndpointHandler:
#     """
#     Zero‑shot classifier for MobileCLIP‑B (OpenCLIP).

#     Expected client JSON *to the endpoint*:
#     {
#       "inputs": {
#         "image": "<base64 PNG/JPEG>",
#         "candidate_labels": ["cat", "dog", ...]
#       }
#     }
#     """

#     def __init__(self, path: str = ""):
#         weights = f"{path}/mobileclip_b.pt"
#         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
#             "MobileCLIP-B", pretrained=weights
#         )
#         self.model.eval()

#         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
#         self.model.to(self.device)

#     def __call__(self, data):
#         # ── unwrap Hugging Face's `inputs` envelope ───────────
#         payload = data.get("inputs", data)

#         img_b64 = payload["image"]
#         labels  = payload.get("candidate_labels", [])
#         if not labels:
#             return {"error": "candidate_labels list is empty"}

#         # Decode & preprocess image
#         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

#         # Tokenise labels
#         text_tokens = self.tokenizer(labels).to(self.device)

#         # Forward pass
#         with torch.no_grad(), torch.cuda.amp.autocast():
#             img_feat = self.model.encode_image(img_tensor)
#             txt_feat = self.model.encode_text(text_tokens)
#             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
#             txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()

#         # Sorted output
#         return [
#             {"label": l, "score": float(p)}
#             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
#         ]