Spaces:
Sleeping
Sleeping
File size: 4,813 Bytes
2904b62 89e8538 2904b62 5adb389 2904b62 89e8538 5adb389 89e8538 2904b62 5adb389 2904b62 89e8538 2904b62 5adb389 89e8538 2e83694 89e8538 2904b62 5adb389 89e8538 2904b62 5adb389 2904b62 5adb389 2904b62 5adb389 2904b62 5adb389 2904b62 5adb389 89e8538 5adb389 89e8538 5adb389 89e8538 2904b62 5adb389 89e8538 2904b62 89e8538 2904b62 5adb389 89e8538 5adb389 89e8538 2904b62 89e8538 2904b62 89e8538 2904b62 5adb389 2904b62 5adb389 89e8538 5adb389 2904b62 5adb389 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# Import core libraries
import numpy as np
import pandas as pd
import torch
import gradio as gr
# Import dataset loader
from datasets import load_dataset
# Import CLIP model and processor
from transformers import CLIPModel, CLIPProcessor
# -----------------------------
# Setup
# -----------------------------
# Prefer GPU when available; all model inputs are moved to this device later.
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
# Inference-only: eval() disables dropout/batch-norm training behavior.
model.eval()
# Load precomputed embeddings (image embeddings for the sampled subset)
emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
# Drop the id column so only the embedding dimensions remain; float32 to
# match the dtype produced by the embed_* helpers below.
embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)
# Load sampled indices (to fetch the same 3000 images)
sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()
# Load dataset and select the sampled subset
ds = load_dataset("JamieSJS/stanford-online-products", "corpus", split="corpus")
# NOTE(review): row i of `embeddings` is assumed to correspond to
# sampled_dataset[i] — i.e. the parquet was written in sampled_indices
# order. Confirm against the script that produced the parquet.
sampled_dataset = ds.select(sampled_indices)
# -----------------------------
# Embedding helpers
# -----------------------------
def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Scale *vec* to unit L2 length; the tiny epsilon avoids division by zero."""
    magnitude = np.linalg.norm(vec)
    return vec / (magnitude + 1e-12)
def embed_image(image) -> np.ndarray:
    """Encode one image into a unit-length CLIP embedding (1-D float32)."""
    # Tokenize/preprocess for CLIP and move every tensor onto the model's device.
    batch = processor(images=[image], return_tensors="pt")
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients needed for inference.
    with torch.no_grad():
        features = model.get_image_features(**batch)
    flat = features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
def embed_text(text: str) -> np.ndarray:
    """Encode one text string into a unit-length CLIP embedding (1-D float32)."""
    # Tokenize with padding/truncation and move tensors onto the model's device.
    batch = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients needed for inference.
    with torch.no_grad():
        features = model.get_text_features(**batch)
    flat = features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)
def combine_embeddings(image_vec, text_vec, alpha: float) -> np.ndarray:
    """Blend image and text embeddings into one unit-length query vector.

    ``alpha`` weights the image vector and ``1 - alpha`` the text vector.
    If only one vector is given it is returned unchanged (it is already
    normalized by the embed_* helpers); if both are missing, returns None.
    """
    # Guard clauses for the three degenerate cases.
    if image_vec is None and text_vec is None:
        return None
    if image_vec is None:
        return text_vec
    if text_vec is None:
        return image_vec
    blended = (alpha * image_vec + (1.0 - alpha) * text_vec).astype(np.float32)
    return l2_normalize(blended)
# -----------------------------
# Recommendation function
# -----------------------------
def recommend(image, text, alpha):
    """Return (top-3 gallery images, details string) for the given query.

    Accepts an optional PIL image, an optional text description, and a
    blending weight ``alpha`` (image weight). Errors are caught at this
    boundary and reported back through the details textbox.
    """
    try:
        query_text = "" if text is None else str(text).strip()
        # Reject a completely empty query up front.
        if image is None and query_text == "":
            return [], "Please upload an image and/or enter a text description."
        image_vec = embed_image(image) if image is not None else None
        text_vec = embed_text(query_text) if query_text != "" else None
        # Blend (or pass through) the available embeddings.
        user_vec = combine_embeddings(image_vec, text_vec, float(alpha))
        if user_vec is None:
            return [], "Could not compute an embedding from the given inputs."
        # All vectors are L2-normalized, so the dot product is cosine similarity.
        scores = embeddings @ user_vec
        top_idx = np.argsort(scores)[::-1][:3]
        top_scores = scores[top_idx]
        results = [sampled_dataset[int(i)]["image"] for i in top_idx]
        # Describe which modalities contributed to the query.
        modes = []
        if image is not None:
            modes.append("Image")
        if query_text != "":
            modes.append("Text")
        msg = (
            f"Mode: {' + '.join(modes)}\n"
            f"Alpha (image weight): {float(alpha):.2f}\n"
            f"Top-3 cosine similarity scores: "
            f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
        )
        return results, msg
    except Exception as e:
        # UI boundary: surface the error in the Details box instead of crashing.
        return [], f"Error: {str(e)}"
# -----------------------------
# Gradio UI
# -----------------------------
# Declarative UI: inputs map positionally onto recommend(image, text, alpha),
# outputs onto its (gallery_images, details_message) return tuple.
demo = gr.Interface(
    fn=recommend,
    inputs=[
        gr.Image(type="pil", label="Upload an image (optional)"),
        gr.Textbox(label="Text description (optional)", placeholder="e.g., 'small handheld vacuum'"),
        # alpha=1.0 means image-only weighting, 0.0 means text-only; default leans toward the image.
        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Alpha (image vs text weight)"),
    ],
    outputs=[
        gr.Gallery(label="Top-3 Recommended Images"),
        gr.Textbox(label="Details"),
    ],
    title="Hybrid CLIP Recommender (Image + Text)",
    description="Upload an image, type a description, or combine both. Recommendations are based on CLIP embeddings + cosine similarity."
)
# show_error surfaces server-side exceptions in the browser; ssr_mode=False
# disables server-side rendering (blocking call — script ends when server stops).
demo.launch(show_error=True, ssr_mode=False)
|