File size: 5,457 Bytes

cd0f6ce
270502a
88c8e02
 
 
 
cd0f6ce
270502a
 
 
88c8e02
270502a
88c8e02
270502a
88c8e02
270502a
 
 
 
 
 
88c8e02
270502a
 
88c8e02
 
cd0f6ce
 
88c8e02
cd0f6ce
 
 
88c8e02
 
270502a
88c8e02
270502a
 
88c8e02
 
270502a
 
 
 
88c8e02
 
cd0f6ce
270502a
 
 
 
 
88c8e02
270502a
88c8e02
270502a
 
 
 
 
01f2a47
270502a
88c8e02
cd0f6ce
 
88c8e02
270502a

# handler.py
import io, base64, time, torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

class EndpointHandler:
    """Zero-shot image classifier built on a CLIP checkpoint.

    Expected request (optionally wrapped under an ``"inputs"`` key)::

        {
            "image": "<base64-encoded image bytes>",
            "candidate_labels": ["prompt 1", "prompt 2", ...]
        }

    L2-normalized text embeddings are cached per handler instance, keyed by
    the prompt string, so repeated prompts skip the text encoder.
    """

    def __init__(self, path="", max_cache_size=4096):
        """Load the model and processor and move the model to the device.

        Args:
            path: Checkpoint location handed to ``from_pretrained``.
            max_cache_size: Upper bound on the number of cached prompt
                embeddings. Prompts come from callers, so the key space is
                unbounded; the oldest entries are evicted past this limit.
        """
        self.model = CLIPModel.from_pretrained(path)
        self.processor = CLIPProcessor.from_pretrained(path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device).eval()
        self.max_cache_size = max_cache_size
        self.cache: dict[str, torch.Tensor] = {}  # prompt -> normalized emb

    def _remember(self, fresh):
        """Store new prompt embeddings, evicting oldest entries if over limit."""
        self.cache.update(fresh)
        overflow = len(self.cache) - self.max_cache_size
        # dicts iterate in insertion order, so the leading keys are the oldest.
        for stale in list(self.cache)[:max(overflow, 0)]:
            del self.cache[stale]

    # -------------------------------------------------------
    def __call__(self, data):
        """Score an image against candidate text prompts.

        Args:
            data: Request dict; the payload is ``data["inputs"]`` when that
                key exists, otherwise ``data`` itself. The payload must hold
                ``"image"`` (base64 string) and ``"candidate_labels"`` (a
                list of prompts, or a single prompt string).

        Returns:
            A list of ``{"label": str, "score": float}`` dicts sorted by
            descending score. Scores are a softmax over the candidates.

        Raises:
            KeyError: If ``"image"`` or ``"candidate_labels"`` is missing.
            ValueError: If ``candidate_labels`` is empty.
        """
        timings = {}                       # stage name -> elapsed ms
        started = time.perf_counter()

        payload = data.get("inputs", data)
        img_b64 = payload["image"]
        prompts = payload["candidate_labels"]
        if isinstance(prompts, str):
            # A bare string would otherwise be iterated character by character.
            prompts = [prompts]
        if not prompts:
            # Fail early with a clear message instead of an opaque
            # torch.stack error on an empty tensor list.
            raise ValueError("candidate_labels list is empty")

        # —— text embeddings (cache) ————————————————
        # NOTE: on CUDA no explicit synchronization is performed, so the
        # per-stage timings below are approximate.
        t = time.perf_counter()
        # dict.fromkeys de-duplicates while keeping request order, so each
        # uncached prompt is encoded only once.
        missing = [p for p in dict.fromkeys(prompts) if p not in self.cache]
        fresh = {}
        if missing:
            tok = self.processor(text=missing, return_tensors="pt",
                                 padding=True).to(self.device)
            with torch.no_grad():
                emb = self.model.get_text_features(**tok)
                emb = emb / emb.norm(dim=-1, keepdim=True)
            fresh = dict(zip(missing, emb))
        # Gather this request's embeddings before touching the cache so that
        # eviction can never drop a prompt that is still needed here.
        txt_feat = torch.stack(
            [fresh[p] if p in fresh else self.cache[p] for p in prompts]
        )
        self._remember(fresh)
        timings["encode_text"] = (time.perf_counter() - t) * 1000  # ms

        # —— image preprocessing ————————————————
        t = time.perf_counter()
        img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_in = self.processor(images=img, return_tensors="pt").to(self.device)
        timings["decode_resize"] = (time.perf_counter() - t) * 1000

        # —— image embedding ————————————————
        t = time.perf_counter()
        # Mixed precision only on CUDA; torch.autocast replaces the
        # deprecated torch.cuda.amp.autocast and is a no-op when disabled.
        use_amp = self.device == "cuda"
        with torch.no_grad(), torch.autocast(device_type=self.device,
                                             enabled=use_amp):
            img_feat = self.model.get_image_features(**img_in)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        # Bring both sides to float32 so the matmul dtypes agree.
        img_feat = img_feat.float()
        txt_feat = txt_feat.float()
        timings["encode_image"] = (time.perf_counter() - t) * 1000

        # —— similarity & softmax ————————————————
        t = time.perf_counter()
        # Both sides are L2-normalized, so the product is cosine similarity,
        # scaled by a fixed factor of 100 before the softmax.
        probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
        timings["similarity_softmax"] = (time.perf_counter() - t) * 1000

        # —— log timings ————————————————
        total = (time.perf_counter() - started) * 1000
        print(f"[CLIP timings] total={total:.1f} ms | " +
              " | ".join(f"{k}={v:.1f}" for k, v in timings.items()),
              flush=True)

        # —— build response ————————————————
        ranked = sorted(zip(prompts, probs), key=lambda x: x[1], reverse=True)
        return [{"label": p, "score": float(s)} for p, s in ranked]

# import io, base64, torch
# from PIL import Image
# from transformers import CLIPModel, CLIPProcessor

# class EndpointHandler:
#     """
#     CLIP ViT‑L/14 zero‑shot classifier.
#     Expects JSON: {
#       "inputs": {
#         "image": "<base64>",
#         "candidate_labels": ["prompt‑1", "prompt‑2", ...]
#       }
#     }
#     """

#     def __init__(self, path=""):
#         self.model = CLIPModel.from_pretrained(path)
#         self.processor = CLIPProcessor.from_pretrained(path)
#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
#         self.model.to(self.device).eval()
#         self.cache: dict[str, torch.Tensor] = {}          # prompt -> emb

#     def __call__(self, data):
#         payload = data.get("inputs", data)
#         img_b64 = payload["image"]
#         prompts = payload.get("candidate_labels", [])
#         if not prompts:
#             return {"error": "candidate_labels list is empty"}

#         # --- text embeddings with per‑process cache ----------
#         missing = [p for p in prompts if p not in self.cache]
#         if missing:
#             tok = self.processor(text=missing, return_tensors="pt",
#                                  padding=True).to(self.device)
#             with torch.no_grad():
#                 emb = self.model.get_text_features(**tok)
#                 emb = emb / emb.norm(dim=-1, keepdim=True)
#             for p, e in zip(missing, emb):
#                 self.cache[p] = e
#         txt_feat = torch.stack([self.cache[p] for p in prompts])

#         # --- image embedding ---------------------------------
#         img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
#         img_in = self.processor(images=img, return_tensors="pt").to(self.device)
       
#         with torch.no_grad(), torch.cuda.amp.autocast():
#             img_feat = self.model.get_image_features(**img_in)
        
#         img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
#         # txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
        
#         img_feat = img_feat.float()   #  ← add these two lines
#         txt_feat = txt_feat.float()   #  ←
        
#         probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()

       

#         return [
#             {"label": p, "score": float(s)}
#             for p, s in sorted(zip(prompts, probs), key=lambda x: x[1], reverse=True)
#         ]