Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import open_clip | |
| import torch | |
| from PIL import Image | |
# Checkpoint served by this app: the open_clip architecture name and the
# pretrained-weights tag. Both are echoed back in every successful response.
MODEL_NAME = "ViT-B-32"
PRETRAINED = "laion2b_s34b_b79k"

print(f"Loading {MODEL_NAME} / {PRETRAINED}...")
# create_model_and_transforms returns three values; the middle one is not
# used anywhere in this file, so it is bound to the throwaway name `_`.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAINED
)
# The model is only used for inference here, so switch it to eval mode once
# at start-up rather than per request.
model.eval()
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
print("Model loaded.")
def encode(text: str):
    """Encode a text query into a 512-d L2-normalized CLIP embedding.

    Args:
        text: Query string. ``None``, empty, and whitespace-only values are
            rejected without touching the model.

    Returns:
        On success, a dict with keys ``model``, ``pretrained``, ``dim`` (size
        of the last axis of the feature tensor) and ``embedding`` (the first
        row of the normalized features, as a plain list). For rejected
        input, ``{"error": "empty text", "embedding": None}``.
    """
    # Guard first so blank queries never reach the tokenizer or the model.
    if not text or not text.strip():
        return {"error": "empty text", "embedding": None}

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        tokens = tokenizer([text])
        feat = model.encode_text(tokens)
        # Divide by the per-row L2 norm so the embedding has unit length.
        feat = feat / feat.norm(dim=-1, keepdim=True)

    return {
        "model": MODEL_NAME,
        "pretrained": PRETRAINED,
        "dim": feat.shape[-1],
        # The batch holds exactly one query; return its row as a list.
        "embedding": feat[0].tolist(),
    }
def encode_image(image):
    """Encode an image into a 512-d L2-normalized CLIP embedding.

    Same vector space as encode(text), so text-image cosine similarity works.
    Same vector space as the dataset features baked into the static site
    (ViT-B-32 / laion2b_s34b_b79k), so user-uploaded image cues can be
    compared directly to those features for compositional retrieval.

    Args:
        image: A ``PIL.Image.Image``, or any other object that
            ``Image.fromarray`` accepts. ``None`` is rejected.

    Returns:
        On success, a dict with keys ``model``, ``pretrained``, ``dim`` (size
        of the last axis of the feature tensor) and ``embedding`` (the first
        row of the normalized features, as a plain list). On failure, a dict
        with an ``error`` message and ``"embedding": None``.
    """
    if image is None:
        return {"error": "no image", "embedding": None}

    if not isinstance(image, Image.Image):
        # Anything that is not already a PIL image is handed to
        # Image.fromarray; a failed conversion is reported to the caller as
        # an error payload instead of being raised.
        try:
            image = Image.fromarray(image)
        except Exception as e:
            return {"error": f"unsupported image input: {type(image).__name__}: {e}", "embedding": None}

    # Convert every input mode to RGB before preprocessing.
    image = image.convert("RGB")

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        # preprocess yields a single image tensor; add a leading batch axis.
        x = preprocess(image).unsqueeze(0)
        feat = model.encode_image(x)
        # Divide by the per-row L2 norm so the embedding has unit length.
        feat = feat / feat.norm(dim=-1, keepdim=True)

    return {
        "model": MODEL_NAME,
        "pretrained": PRETRAINED,
        "dim": feat.shape[-1],
        # The batch holds exactly one image; return its row as a list.
        "embedding": feat[0].tolist(),
    }
# Text tab. api_name pins the endpoint to the path advertised in the
# description instead of relying on Gradio's default endpoint naming.
text_iface = gr.Interface(
    fn=encode,
    inputs=gr.Textbox(label="Query", placeholder="a hotel bathroom with a walk-in shower"),
    outputs=gr.JSON(label="CLIP text embedding"),
    title="CLIP text encoder",
    description="Returns a 512-d L2-normalized text embedding (ViT-B-32 / LAION-2B). API endpoint: /encode",
    api_name="encode",
)

# Image tab. type="pil" asks Gradio to hand encode_image a PIL image.
# api_name is pinned for the same reason as on the text tab.
image_iface = gr.Interface(
    fn=encode_image,
    inputs=gr.Image(type="pil", label="Image"),
    outputs=gr.JSON(label="CLIP image embedding"),
    title="CLIP image encoder",
    description="Returns a 512-d L2-normalized image embedding (ViT-B-32 / LAION-2B). API endpoint: /encode_image",
    api_name="encode_image",
)

# Single app exposing both encoders as tabs; tab labels are matched to the
# interfaces by position.
demo = gr.TabbedInterface(
    [text_iface, image_iface],
    ["Text", "Image"],
    title="CLIP encoder (ViT-B-32 / LAION-2B)",
)

# Launched unconditionally at module level, as in the original file.
demo.launch()