finhdev
/

testmobileclip

mobileclip

Model card Files Files and versions

xet

Community

finhdev committed on Jul 29, 2025

Commit

08ce2dc

verified ·

1 Parent(s): 52af22a

Update handler.py

Browse files

Files changed (1) hide show

handler.py +38 -22

handler.py CHANGED Viewed

@@ -1,56 +1,72 @@
-# handler.py  – place in repo root
-import io, base64, torch
 from PIL import Image
 import open_clip
-from mobileclip.modules.common.mobileone import reparameterize_model
 class EndpointHandler:
     """
-    Zero‑shot image classifier for MobileCLIP‑B using OpenCLIP.
-    Expects JSON:
       {
         "image": "<base64‑encoded PNG/JPEG>",
         "candidate_labels": ["cat", "dog", ...]
       }
     """
     def __init__(self, path: str = ""):
-        # Hugging Face Endpoints clones the repo into `path`.
-        # The weights file is mobileclip_b.pt (already in the repo).
         weights = f"{path}/mobileclip_b.pt"
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained=weights
         )
-        # Re‑parameterize once for faster inference (as per MobileCLIP docs)
-        self.model = reparameterize_model(self.model)
-        self.model.eval()
-        # OpenCLIP tokenizer (same as CLIP)
         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
     def __call__(self, data):
-        # Decode input
-        img_b64 = data["image"]
-        labels  = data.get("candidate_labels", [])
-        image   = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
-        # Preprocess
         image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
-        text_tokens  = self.tokenizer(labels).to(self.device)
         with torch.no_grad(), torch.cuda.amp.autocast():
             img_feat = self.model.encode_image(image_tensor)
             txt_feat = self.model.encode_text(text_tokens)
-            img_feat /= img_feat.norm(dim=-1, keepdim=True)
-            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
         return [
-            {"label": l, "score": float(p)} for l, p in sorted(
-                zip(labels, probs), key=lambda x: x[1], reverse=True
-            )
         ]

+# handler.py
+import io
+import base64
+import torch
 from PIL import Image
 import open_clip
+from open_clip import fuse_conv_bn_sequential
 class EndpointHandler:
     """
     Zero-shot image classifier for MobileCLIP-B (OpenCLIP).
     Expects a JSON payload, either at the top level of the request or
     nested under an "inputs" key:
       {
         "image": "<base64-encoded PNG/JPEG>",
         "candidate_labels": ["cat", "dog", ...]
       }
     Returns the labels sorted by descending score:
       [
         {"label": "cat", "score": 0.91},
         {"label": "dog", "score": 0.05},
         ...
       ]
     or {"error": "<message>"} when the image or the labels are missing.
     """
     def __init__(self, path: str = ""):
         """
         Load the model, preprocessing transform and tokenizer.
         Args:
             path: directory that contains mobileclip_b.pt. The default ""
                 resolves the weights relative to the working directory.
         """
         # Local import keeps this class self-contained; the file-level
         # import block does not bring `os` into scope.
         import os
         # os.path.join avoids turning the default path "" into the
         # filesystem-root path "/mobileclip_b.pt".
         weights = os.path.join(path, "mobileclip_b.pt")
         # Load model + inference transform from OpenCLIP (the training
         # transform, second element of the tuple, is not needed here).
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained=weights
         )
         # Fuse conv + BN for faster inference, then switch to eval mode.
         # NOTE(review): confirm that the installed open_clip exports
         # fuse_conv_bn_sequential; the file-level import fails otherwise.
         self.model = fuse_conv_bn_sequential(self.model).eval()
         # Tokenizer for label prompts
         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         # Device selection
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
     def __call__(self, data):
         """
         Score one image against the candidate labels.
         Args:
             data: request dict holding "image" and "candidate_labels",
                 either directly or under an "inputs" key.
         Returns:
             A list of {"label", "score"} dicts sorted by descending score,
             or {"error": "<message>"} when a required field is missing.
         """
         # 1. Parse request; accept the fields nested under "inputs" as well
         #    as at the top level.
         payload = data.get("inputs", data)
         if not isinstance(payload, dict):
             payload = data
         img_b64 = payload.get("image")
         if not img_b64:
             return {"error": "image is missing or empty"}
         labels = payload.get("candidate_labels", [])
         if isinstance(labels, str):
             # A bare string would otherwise be zipped character by character.
             labels = [labels]
         if not labels:
             return {"error": "candidate_labels list is empty"}
         labels = list(labels)
         # 2. Decode & preprocess image
         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
         image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
         # 3. Tokenize labels
         text_tokens = self.tokenizer(labels).to(self.device)
         # 4. Forward pass. Mixed precision is only enabled on CUDA; the
         #    device-agnostic torch.autocast avoids the per-request warning
         #    that the CUDA-specific context manager emits on CPU-only hosts.
         use_amp = self.device == "cuda"
         with torch.no_grad(), torch.autocast(
             device_type=self.device, enabled=use_amp
         ):
             img_feat = self.model.encode_image(image_tensor)
             txt_feat = self.model.encode_text(text_tokens)
             # L2-normalize so the dot product is a cosine similarity.
             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
             txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
             # NOTE(review): 100 is a fixed scale applied before the softmax;
             # confirm it matches the checkpoint's learned logit scale.
             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
         # 5. Return list sorted by descending score
         ranked = sorted(zip(labels, probs), key=lambda pair: pair[1], reverse=True)
         return [{"label": label, "score": float(score)} for label, score in ranked]