robertpless committed on
Commit
a4d5a67
·
1 Parent(s): fca4620

Add /encode_image endpoint for compositional image cues

Browse files

The new endpoint uses the same model and preprocessing as the dataset
features (ViT-B-32 / laion2b_s34b_b79k), so user-uploaded images can be
compared directly to the static site's image features in the same vector space.

Switched to gr.TabbedInterface so both /encode and /encode_image
endpoints are exposed.

Files changed (1) hide show
  1. app.py +48 -5
app.py CHANGED
@@ -1,12 +1,13 @@
1
  import gradio as gr
2
  import open_clip
3
  import torch
 
4
 
5
  MODEL_NAME = "ViT-B-32"
6
  PRETRAINED = "laion2b_s34b_b79k"
7
 
8
  print(f"Loading {MODEL_NAME} / {PRETRAINED}...")
9
- model, _, _ = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
10
  model.eval()
11
  tokenizer = open_clip.get_tokenizer(MODEL_NAME)
12
  print("Model loaded.")
@@ -28,12 +29,54 @@ def encode(text: str):
28
  }
29
 
30
 
31
- demo = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  fn=encode,
33
  inputs=gr.Textbox(label="Query", placeholder="a hotel bathroom with a walk-in shower"),
34
  outputs=gr.JSON(label="CLIP text embedding"),
35
- title="CLIP text encoder (ViT-B-32 / LAION-2B)",
36
- description="Returns a 512-d L2-normalized text embedding for CLIP-guided image search.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  )
38
 
39
- demo.launch()
 
1
  import gradio as gr
2
  import open_clip
3
  import torch
4
+ from PIL import Image
5
 
6
  MODEL_NAME = "ViT-B-32"
7
  PRETRAINED = "laion2b_s34b_b79k"
8
 
9
  print(f"Loading {MODEL_NAME} / {PRETRAINED}...")
10
+ model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
11
  model.eval()
12
  tokenizer = open_clip.get_tokenizer(MODEL_NAME)
13
  print("Model loaded.")
 
29
  }
30
 
31
 
32
+ def encode_image(image):
33
+ """Encode an image into a 512-d L2-normalized CLIP embedding.
34
+
35
+ Same vector space as encode(text), so text-image cosine similarity works.
36
+ Same vector space as the dataset features baked into the static site
37
+ (ViT-B-32 / laion2b_s34b_b79k), so user-uploaded image cues can be
38
+ compared directly to those features for compositional retrieval.
39
+ """
40
+ if image is None:
41
+ return {"error": "no image", "embedding": None}
42
+ if not isinstance(image, Image.Image):
43
+ try:
44
+ image = Image.fromarray(image)
45
+ except Exception as e:
46
+ return {"error": f"unsupported image input: {type(image).__name__}: {e}", "embedding": None}
47
+ image = image.convert("RGB")
48
+ with torch.no_grad():
49
+ x = preprocess(image).unsqueeze(0)
50
+ feat = model.encode_image(x)
51
+ feat = feat / feat.norm(dim=-1, keepdim=True)
52
+ return {
53
+ "model": MODEL_NAME,
54
+ "pretrained": PRETRAINED,
55
+ "dim": feat.shape[-1],
56
+ "embedding": feat[0].tolist(),
57
+ }
58
+
59
+
60
+ text_iface = gr.Interface(
61
  fn=encode,
62
  inputs=gr.Textbox(label="Query", placeholder="a hotel bathroom with a walk-in shower"),
63
  outputs=gr.JSON(label="CLIP text embedding"),
64
+ title="CLIP text encoder",
65
+ description="Returns a 512-d L2-normalized text embedding (ViT-B-32 / LAION-2B). API endpoint: /encode",
66
+ )
67
+
68
+ image_iface = gr.Interface(
69
+ fn=encode_image,
70
+ inputs=gr.Image(type="pil", label="Image"),
71
+ outputs=gr.JSON(label="CLIP image embedding"),
72
+ title="CLIP image encoder",
73
+ description="Returns a 512-d L2-normalized image embedding (ViT-B-32 / LAION-2B). API endpoint: /encode_image",
74
+ )
75
+
76
+ demo = gr.TabbedInterface(
77
+ [text_iface, image_iface],
78
+ ["Text", "Image"],
79
+ title="CLIP encoder (ViT-B-32 / LAION-2B)",
80
  )
81
 
82
+ demo.launch()