Barvero committed on
Commit
5adb389
·
verified ·
1 Parent(s): 720ee1b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -36
app.py CHANGED
@@ -11,28 +11,21 @@ from datasets import load_dataset
11
  from transformers import CLIPModel, CLIPProcessor
12
 
13
 
14
- # Select device (GPU if available)
 
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
-
18
- # Load pretrained CLIP model
19
  MODEL_NAME = "openai/clip-vit-base-patch32"
20
- model = CLIPModel.from_pretrained(MODEL_NAME)
21
  processor = CLIPProcessor.from_pretrained(MODEL_NAME)
22
-
23
- # Move model to device and set evaluation mode
24
- model = model.to(device)
25
  model.eval()
26
 
27
-
28
- # Load precomputed embeddings from file
29
  emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
30
-
31
- # Extract normalized embeddings matrix
32
  embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)
33
 
34
-
35
- # Load sampled indices (required to fetch the same 3000 images)
36
  sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()
37
 
38
  # Load dataset and select the sampled subset
@@ -40,41 +33,100 @@ ds = load_dataset("JamieSJS/stanford-online-products", "corpus", split="corpus")
40
  sampled_dataset = ds.select(sampled_indices)
41
 
42
 
43
- # Convert a user image into a normalized CLIP embedding
44
- def embed_image(image):
45
- # Preprocess image for CLIP
 
 
 
 
 
 
46
  inputs = processor(images=[image], return_tensors="pt")
47
  inputs = {k: v.to(device) for k, v in inputs.items()}
48
 
49
- # Extract image features without gradients
50
  with torch.no_grad():
51
- features = model.get_image_features(**inputs)
 
 
 
 
52
 
53
- # Convert embedding to numpy and normalize
54
- vec = features.cpu().numpy().reshape(-1).astype(np.float32)
55
- vec = vec / (np.linalg.norm(vec) + 1e-12)
 
 
 
 
 
56
 
57
- return vec
 
58
 
59
 
60
- # Recommend top-3 visually similar images
61
- def recommend(image):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  try:
63
- # Embed user input image
64
- user_vec = embed_image(image)
 
 
 
 
 
 
 
65
 
66
- # Compute cosine similarity scores
 
 
 
 
 
 
 
 
 
67
  scores = embeddings @ user_vec
68
 
69
- # Get Top-3 indices
70
  top_idx = np.argsort(scores)[::-1][:3]
71
  top_scores = scores[top_idx]
72
 
73
- # Fetch images directly from the sampled dataset
74
  results = [sampled_dataset[int(i)]["image"] for i in top_idx]
75
 
76
- # Return a short message for visibility
 
 
 
 
 
 
 
77
  msg = (
 
 
78
  f"Top-3 cosine similarity scores: "
79
  f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
80
  )
@@ -85,17 +137,22 @@ def recommend(image):
85
  return [], f"Error: {str(e)}"
86
 
87
 
88
- # Define Gradio interface
 
 
89
  demo = gr.Interface(
90
  fn=recommend,
91
- inputs=gr.Image(type="pil", label="Upload an image"),
 
 
 
 
92
  outputs=[
93
  gr.Gallery(label="Top-3 Recommended Images"),
94
  gr.Textbox(label="Details"),
95
  ],
96
- title="CLIP Image Recommendation System",
97
- description="Upload an image and receive visually similar product recommendations."
98
  )
99
 
100
- # Launch the application
101
- demo.launch(show_error=True, ssr_mode=False)
 
11
  from transformers import CLIPModel, CLIPProcessor
12
 
13
 
14
+ # -----------------------------
15
+ # Setup
16
+ # -----------------------------
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
18
 
 
 
19
  MODEL_NAME = "openai/clip-vit-base-patch32"
20
+ model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
21
  processor = CLIPProcessor.from_pretrained(MODEL_NAME)
 
 
 
22
  model.eval()
23
 
24
+ # Load precomputed embeddings (image embeddings for the sampled subset)
 
25
  emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
 
 
26
  embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)
27
 
28
+ # Load sampled indices (to fetch the same 3000 images)
 
29
  sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()
30
 
31
  # Load dataset and select the sampled subset
 
33
  sampled_dataset = ds.select(sampled_indices)
34
 
35
 
36
+ # -----------------------------
37
+ # Embedding helpers
38
+ # -----------------------------
39
def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Divide ``vec`` by its Euclidean norm plus a small epsilon.

    The ``1e-12`` term keeps the division defined for an all-zero input,
    which comes back as all zeros instead of NaN.
    """
    magnitude = np.linalg.norm(vec)
    return vec / (magnitude + 1e-12)
41
+
42
+
43
def embed_image(image) -> np.ndarray:
    """Embed one image with CLIP and return it via ``l2_normalize``.

    The image is run through the module-level ``processor`` and ``model``;
    the resulting features are flattened to 1-D and cast to float32 before
    normalization.
    """
    # The processor is called with a batch, so wrap the single image in a list.
    batch = processor(images=[image], return_tensors="pt")
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        image_features = model.get_image_features(**batch)

    flat = image_features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
54
+
55
 
56
def embed_text(text: str) -> np.ndarray:
    """Embed one text string with CLIP and return it via ``l2_normalize``.

    The text is tokenized by the module-level ``processor`` (with padding
    and truncation enabled) and passed to ``model``; the resulting features
    are flattened to 1-D and cast to float32 before normalization.
    """
    # The processor is called with a batch, so wrap the single string in a list.
    batch = processor(
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        text_features = model.get_text_features(**batch)

    flat = text_features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
67
 
68
 
69
def combine_embeddings(
    image_vec: "np.ndarray | None",
    text_vec: "np.ndarray | None",
    alpha: float,
) -> "np.ndarray | None":
    """Blend an image embedding and a text embedding into one query vector.

    Args:
        image_vec: Image embedding, or ``None`` when no image was supplied.
        text_vec: Text embedding, or ``None`` when no text was supplied.
        alpha: Weight given to the image embedding; the text embedding
            gets ``1 - alpha``. Only used when both embeddings are present.

    Returns:
        ``None`` when both inputs are ``None``; the single available
        embedding, returned as-is, when only one is given; otherwise the
        weighted sum cast to float32 and passed through ``l2_normalize``.
    """
    if image_vec is None:
        # Text-only query, or nothing at all (text_vec is then None as well).
        return text_vec
    if text_vec is None:
        return image_vec

    blended = alpha * image_vec + (1.0 - alpha) * text_vec
    # Re-normalize: a weighted sum of two vectors is generally not unit length.
    return l2_normalize(blended.astype(np.float32))
84
+
85
+
86
+ # -----------------------------
87
+ # Recommendation function
88
+ # -----------------------------
89
+ def recommend(image, text, alpha):
90
  try:
91
+ # Handle empty inputs
92
+ if image is None and (text is None or str(text).strip() == ""):
93
+ return [], "Please upload an image and/or enter a text description."
94
+
95
+ image_vec = None
96
+ text_vec = None
97
+
98
+ if image is not None:
99
+ image_vec = embed_image(image)
100
 
101
+ if text is not None and str(text).strip() != "":
102
+ text_vec = embed_text(str(text).strip())
103
+
104
+ # Combine
105
+ user_vec = combine_embeddings(image_vec, text_vec, float(alpha))
106
+
107
+ if user_vec is None:
108
+ return [], "Could not compute an embedding from the given inputs."
109
+
110
+ # Cosine similarity (because vectors are normalized)
111
  scores = embeddings @ user_vec
112
 
113
+ # Top-3
114
  top_idx = np.argsort(scores)[::-1][:3]
115
  top_scores = scores[top_idx]
116
 
 
117
  results = [sampled_dataset[int(i)]["image"] for i in top_idx]
118
 
119
+ # Details message
120
+ mode = []
121
+ if image is not None:
122
+ mode.append("Image")
123
+ if text is not None and str(text).strip() != "":
124
+ mode.append("Text")
125
+ mode_str = " + ".join(mode)
126
+
127
  msg = (
128
+ f"Mode: {mode_str}\n"
129
+ f"Alpha (image weight): {float(alpha):.2f}\n"
130
  f"Top-3 cosine similarity scores: "
131
  f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
132
  )
 
137
  return [], f"Error: {str(e)}"
138
 
139
 
140
# -----------------------------
# Gradio UI
# -----------------------------
# Input widgets, in the positional order recommend(image, text, alpha) takes.
_input_widgets = [
    gr.Image(type="pil", label="Upload an image (optional)"),
    gr.Textbox(
        label="Text description (optional)",
        placeholder="e.g., 'small handheld vacuum'",
    ),
    gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.7,
        step=0.05,
        label="Alpha (image vs text weight)",
    ),
]

# Output widgets: a gallery for the returned images and a text box for the
# accompanying message.
_output_widgets = [
    gr.Gallery(label="Top-3 Recommended Images"),
    gr.Textbox(label="Details"),
]

demo = gr.Interface(
    fn=recommend,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="Hybrid CLIP Recommender (Image + Text)",
    description=(
        "Upload an image, type a description, or combine both. "
        "Recommendations are based on CLIP embeddings + cosine similarity."
    ),
)

demo.launch(show_error=True, ssr_mode=False)