"""Gradio app exposing an OpenCLIP text encoder and image encoder.

Both encoders use the same checkpoint (MODEL_NAME / PRETRAINED), so the
text and image embeddings they return live in the same vector space and
can be compared with cosine similarity.
"""

import gradio as gr
import open_clip
import torch
from PIL import Image

MODEL_NAME = "ViT-B-32"
PRETRAINED = "laion2b_s34b_b79k"

# The model is loaded once at import time so every request reuses it.
print(f"Loading {MODEL_NAME} / {PRETRAINED}...")
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAINED
)
model.eval()
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
print("Model loaded.")


def _embedding_payload(features: torch.Tensor) -> dict:
    """L2-normalize a batch of features and package row 0 as a JSON dict.

    Shared by encode() and encode_image() so both endpoints always return
    the same response schema.

    Args:
        features: Encoder output with the batch on the first axis; only the
            first row is returned.

    Returns:
        A dict with the model name, the pretrained tag, the embedding
        dimension (read from the tensor's last axis) and the normalized
        embedding as a plain list of floats.
    """
    normalized = features / features.norm(dim=-1, keepdim=True)
    return {
        "model": MODEL_NAME,
        "pretrained": PRETRAINED,
        "dim": normalized.shape[-1],
        "embedding": normalized[0].tolist(),
    }


def encode(text: str) -> dict:
    """Encode a text query into a 512-d L2-normalized CLIP embedding.

    Args:
        text: The query string.

    Returns:
        On success, a dict with "model", "pretrained", "dim" and
        "embedding". If the text is empty or whitespace-only, a dict with
        an "error" message and "embedding" set to None.
    """
    if not text or not text.strip():
        return {"error": "empty text", "embedding": None}
    with torch.no_grad():
        tokens = tokenizer([text])
        return _embedding_payload(model.encode_text(tokens))


def encode_image(image) -> dict:
    """Encode an image into a 512-d L2-normalized CLIP embedding.

    Same vector space as encode(text), so text-image cosine similarity works.
    Same vector space as the dataset features baked into the static site
    (ViT-B-32 / laion2b_s34b_b79k), so user-uploaded image cues can be
    compared directly to those features for compositional retrieval.

    Args:
        image: A PIL image, or any object Image.fromarray() accepts
            (for example a numpy array).

    Returns:
        On success, a dict with "model", "pretrained", "dim" and
        "embedding". If the image is missing or cannot be converted, a dict
        with an "error" message and "embedding" set to None.
    """
    if image is None:
        return {"error": "no image", "embedding": None}
    if not isinstance(image, Image.Image):
        # The UI component delivers PIL images, but other callers may pass
        # array-like data; try to convert and report failures as a payload
        # instead of raising.
        try:
            image = Image.fromarray(image)
        except Exception as e:
            return {
                "error": f"unsupported image input: {type(image).__name__}: {e}",
                "embedding": None,
            }
    image = image.convert("RGB")
    with torch.no_grad():
        batch = preprocess(image).unsqueeze(0)  # add the batch axis
        return _embedding_payload(model.encode_image(batch))


# api_name is pinned on both interfaces so the endpoints really are the
# /encode and /encode_image advertised in the descriptions, instead of
# depending on the installed Gradio version's default endpoint naming.
text_iface = gr.Interface(
    fn=encode,
    inputs=gr.Textbox(
        label="Query",
        placeholder="a hotel bathroom with a walk-in shower",
    ),
    outputs=gr.JSON(label="CLIP text embedding"),
    title="CLIP text encoder",
    description=(
        "Returns a 512-d L2-normalized text embedding (ViT-B-32 / LAION-2B). "
        "API endpoint: /encode"
    ),
    api_name="encode",
)

image_iface = gr.Interface(
    fn=encode_image,
    inputs=gr.Image(type="pil", label="Image"),
    outputs=gr.JSON(label="CLIP image embedding"),
    title="CLIP image encoder",
    description=(
        "Returns a 512-d L2-normalized image embedding (ViT-B-32 / LAION-2B). "
        "API endpoint: /encode_image"
    ),
    api_name="encode_image",
)

demo = gr.TabbedInterface(
    [text_iface, image_iface],
    ["Text", "Image"],
    title="CLIP encoder (ViT-B-32 / LAION-2B)",
)

# Launched unconditionally at module level, as in the original script.
demo.launch()