Spaces:

Barvero
/

visual-image-recommender

Sleeping

App Files Files Community

Barvero committed on Dec 14, 2025

Commit

5adb389

verified ·

1 Parent(s): 720ee1b

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -36

app.py CHANGED Viewed

@@ -11,28 +11,21 @@ from datasets import load_dataset
 from transformers import CLIPModel, CLIPProcessor
-# Select device (GPU if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load pretrained CLIP model
 MODEL_NAME = "openai/clip-vit-base-patch32"
-model = CLIPModel.from_pretrained(MODEL_NAME)
 processor = CLIPProcessor.from_pretrained(MODEL_NAME)
-# Move model to device and set evaluation mode
-model = model.to(device)
 model.eval()
-# Load precomputed embeddings from file
 emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
-# Extract normalized embeddings matrix
 embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)
-# Load sampled indices (required to fetch the same 3000 images)
 sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()
 # Load dataset and select the sampled subset
@@ -40,41 +33,100 @@ ds = load_dataset("JamieSJS/stanford-online-products", "corpus", split="corpus")
 sampled_dataset = ds.select(sampled_indices)
-# Convert a user image into a normalized CLIP embedding
-def embed_image(image):
-    # Preprocess image for CLIP
     inputs = processor(images=[image], return_tensors="pt")
     inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Extract image features without gradients
     with torch.no_grad():
-        features = model.get_image_features(**inputs)
-    # Convert embedding to numpy and normalize
-    vec = features.cpu().numpy().reshape(-1).astype(np.float32)
-    vec = vec / (np.linalg.norm(vec) + 1e-12)
-    return vec
-# Recommend top-3 visually similar images
-def recommend(image):
     try:
-        # Embed user input image
-        user_vec = embed_image(image)
-        # Compute cosine similarity scores
         scores = embeddings @ user_vec
-        # Get Top-3 indices
         top_idx = np.argsort(scores)[::-1][:3]
         top_scores = scores[top_idx]
-        # Fetch images directly from the sampled dataset
         results = [sampled_dataset[int(i)]["image"] for i in top_idx]
-        # Return a short message for visibility
         msg = (
             f"Top-3 cosine similarity scores: "
             f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
         )
@@ -85,17 +137,22 @@ def recommend(image):
         return [], f"Error: {str(e)}"
-# Define Gradio interface
 demo = gr.Interface(
     fn=recommend,
-    inputs=gr.Image(type="pil", label="Upload an image"),
     outputs=[
         gr.Gallery(label="Top-3 Recommended Images"),
         gr.Textbox(label="Details"),
     ],
-    title="CLIP Image Recommendation System",
-    description="Upload an image and receive visually similar product recommendations."
 )
-# Launch the application
-demo.launch(show_error=True, ssr_mode=False)

 from transformers import CLIPModel, CLIPProcessor
+# -----------------------------
+# Setup
+# -----------------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_NAME = "openai/clip-vit-base-patch32"
+model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
 processor = CLIPProcessor.from_pretrained(MODEL_NAME)
 model.eval()
+# Load precomputed embeddings (image embeddings for the sampled subset)
 emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
 embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)
+# Load sampled indices (to fetch the same 3000 images)
 sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()
 # Load dataset and select the sampled subset
 sampled_dataset = ds.select(sampled_indices)
+# -----------------------------
+# Embedding helpers
+# -----------------------------
def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Scale ``vec`` to (approximately) unit Euclidean length.

    A small constant (1e-12) is added to the norm before dividing, so an
    all-zero vector yields zeros instead of a division by zero.
    """
    magnitude = np.linalg.norm(vec)
    return vec / (magnitude + 1e-12)
def embed_image(image) -> np.ndarray:
    """Encode a single image with CLIP and return its normalized embedding.

    The image features are flattened to a 1-D float32 array and then passed
    through ``l2_normalize``.
    """
    # The processor receives a one-element batch; every tensor it returns is
    # moved to the same device as the model.
    batch = processor(images=[image], return_tensors="pt")
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        image_features = model.get_image_features(**batch)

    flat = image_features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
def embed_text(text: str) -> np.ndarray:
    """Encode a single text string with CLIP and return its normalized embedding.

    The text features are flattened to a 1-D float32 array and then passed
    through ``l2_normalize``.
    """
    # Tokenize a one-element batch (padding and truncation enabled) and move
    # every resulting tensor to the same device as the model.
    batch = processor(
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        text_features = model.get_text_features(**batch)

    flat = text_features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
def combine_embeddings(
    image_vec: "np.ndarray | None",
    text_vec: "np.ndarray | None",
    alpha: float,
) -> "np.ndarray | None":
    """Blend an image embedding and a text embedding into one query vector.

    Args:
        image_vec: Image embedding, or None when no image was supplied.
        text_vec: Text embedding, or None when no text was supplied.
        alpha: Weight given to the image embedding; the text embedding
            receives ``1 - alpha``.

    Returns:
        None when both inputs are None; the single available vector,
        returned as-is, when only one is given; otherwise the weighted sum
        cast to float32 and passed through ``l2_normalize``.
    """
    # Annotations are strings so the "| None" form does not need to be
    # evaluated at definition time, whatever the Python version.
    if image_vec is None:
        # Covers "text only" as well as "nothing supplied" (text_vec is None).
        return text_vec
    if text_vec is None:
        return image_vec

    blended = alpha * image_vec + (1.0 - alpha) * text_vec
    return l2_normalize(blended.astype(np.float32))
+# -----------------------------
+# Recommendation function
+# -----------------------------
+def recommend(image, text, alpha):
     try:
+        # Handle empty inputs
+        if image is None and (text is None or str(text).strip() == ""):
+            return [], "Please upload an image and/or enter a text description."
+        image_vec = None
+        text_vec = None
+        if image is not None:
+            image_vec = embed_image(image)
+        if text is not None and str(text).strip() != "":
+            text_vec = embed_text(str(text).strip())
+        # Combine
+        user_vec = combine_embeddings(image_vec, text_vec, float(alpha))
+        if user_vec is None:
+            return [], "Could not compute an embedding from the given inputs."
+        # Cosine similarity (because vectors are normalized)
         scores = embeddings @ user_vec
+        # Top-3
         top_idx = np.argsort(scores)[::-1][:3]
         top_scores = scores[top_idx]
         results = [sampled_dataset[int(i)]["image"] for i in top_idx]
+        # Details message
+        mode = []
+        if image is not None:
+            mode.append("Image")
+        if text is not None and str(text).strip() != "":
+            mode.append("Text")
+        mode_str = " + ".join(mode)
         msg = (
+            f"Mode: {mode_str}\n"
+            f"Alpha (image weight): {float(alpha):.2f}\n"
             f"Top-3 cosine similarity scores: "
             f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
         )
         return [], f"Error: {str(e)}"
+# -----------------------------
+# Gradio UI
+# -----------------------------
 demo = gr.Interface(
     fn=recommend,
+    inputs=[
+        gr.Image(type="pil", label="Upload an image (optional)"),
+        gr.Textbox(label="Text description (optional)", placeholder="e.g., 'small handheld vacuum'"),
+        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Alpha (image vs text weight)"),
+    ],
     outputs=[
         gr.Gallery(label="Top-3 Recommended Images"),
         gr.Textbox(label="Details"),
     ],
+    title="Hybrid CLIP Recommender (Image + Text)",
+    description="Upload an image, type a description, or combine both. Recommendations are based on CLIP embeddings + cosine similarity."
 )
+demo.launch(show_error=True, ssr_mode=False)