File size: 4,813 Bytes
2904b62
 
 
 
 
 
89e8538
 
 
2904b62
 
 
 
5adb389
 
 
2904b62
 
89e8538
5adb389
89e8538
2904b62
 
5adb389
2904b62
89e8538
2904b62
5adb389
89e8538
 
 
2e83694
89e8538
2904b62
 
5adb389
 
 
 
 
 
 
 
 
89e8538
2904b62
 
5adb389
2904b62
5adb389
 
 
 
 
2904b62
5adb389
 
 
 
 
 
 
 
2904b62
5adb389
 
2904b62
 
5adb389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89e8538
5adb389
 
 
 
 
 
 
 
 
89e8538
5adb389
 
 
 
 
 
 
 
 
 
89e8538
2904b62
5adb389
89e8538
 
2904b62
89e8538
2904b62
5adb389
 
 
 
 
 
 
 
89e8538
5adb389
 
89e8538
 
 
2904b62
89e8538
2904b62
89e8538
 
2904b62
 
5adb389
 
 
2904b62
 
5adb389
 
 
 
 
89e8538
 
 
 
5adb389
 
2904b62
 
5adb389
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Import core libraries
import numpy as np
import pandas as pd
import torch
import gradio as gr

# Import dataset loader
from datasets import load_dataset

# Import CLIP model and processor
from transformers import CLIPModel, CLIPProcessor


# -----------------------------
# Setup
# -----------------------------
# Prefer GPU when available; every CLIP forward pass runs on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model.eval()  # inference only — disables dropout/batch-norm training behavior

# Load precomputed embeddings (image embeddings for the sampled subset).
# NOTE(review): rows are assumed to be L2-normalized already, since similarity
# below is a plain dot product — confirm when regenerating the parquet file.
emb_df = pd.read_parquet("clip_embeddings_3000.parquet")
embeddings = emb_df.drop(columns=["image_id"]).values.astype(np.float32)

# Load sampled indices (to fetch the same 3000 images the embeddings were
# computed from; order must match the embedding rows).
sampled_indices = np.load("sampled_indices_3000.npy").astype(int).tolist()

# Load dataset and select the sampled subset
ds = load_dataset("JamieSJS/stanford-online-products", "corpus", split="corpus")
sampled_dataset = ds.select(sampled_indices)


# -----------------------------
# Embedding helpers
# -----------------------------
def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Scale *vec* to unit L2 length; the epsilon guards against a zero norm."""
    norm = np.linalg.norm(vec)
    return vec / (norm + 1e-12)


def embed_image(image) -> np.ndarray:
    """Encode one image into a unit-length float32 CLIP feature vector."""
    # Preprocess for CLIP and move every tensor to the active device.
    batch = processor(images=[image], return_tensors="pt")
    batch = {key: tensor.to(device) for key, tensor in batch.items()}

    # Run the vision tower without tracking gradients (inference only).
    with torch.no_grad():
        features = model.get_image_features(**batch)

    flat = features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)


def embed_text(text: str) -> np.ndarray:
    """Encode a text query into a unit-length float32 CLIP feature vector."""
    # Tokenize for CLIP (padded/truncated) and move tensors to the device.
    batch = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    batch = {key: tensor.to(device) for key, tensor in batch.items()}

    # Run the text tower without tracking gradients (inference only).
    with torch.no_grad():
        features = model.get_text_features(**batch)

    flat = features.cpu().numpy().reshape(-1).astype(np.float32)
    return l2_normalize(flat)


def combine_embeddings(image_vec, text_vec, alpha: float) -> np.ndarray:
    """
    alpha = weight for image
    (1-alpha) = weight for text
    """
    # Single-modality (or empty) queries pass straight through untouched.
    if image_vec is None:
        return None if text_vec is None else text_vec
    if text_vec is None:
        return image_vec

    # Weighted blend of both modalities, renormalized so the downstream
    # dot-product similarity remains a valid cosine similarity.
    blended = (alpha * image_vec + (1.0 - alpha) * text_vec).astype(np.float32)
    return l2_normalize(blended)


# -----------------------------
# Recommendation function
# -----------------------------
def recommend(image, text, alpha):
    """Return (top-3 gallery images, details string) for an image/text query."""
    try:
        query_text = "" if text is None else str(text).strip()

        # Nothing to search with — ask the user for at least one input.
        if image is None and query_text == "":
            return [], "Please upload an image and/or enter a text description."

        image_vec = embed_image(image) if image is not None else None
        text_vec = embed_text(query_text) if query_text != "" else None

        # Blend the modality embeddings according to the image weight alpha.
        user_vec = combine_embeddings(image_vec, text_vec, float(alpha))
        if user_vec is None:
            return [], "Could not compute an embedding from the given inputs."

        # Dot product equals cosine similarity because vectors are unit-norm.
        scores = embeddings @ user_vec

        # Indices of the three best-scoring catalog items, best first.
        top_idx = np.argsort(scores)[::-1][:3]
        top_scores = scores[top_idx]
        results = [sampled_dataset[int(i)]["image"] for i in top_idx]

        # Describe which modalities actually contributed to the query.
        mode_parts = []
        if image is not None:
            mode_parts.append("Image")
        if query_text != "":
            mode_parts.append("Text")
        mode_str = " + ".join(mode_parts)

        msg = (
            f"Mode: {mode_str}\n"
            f"Alpha (image weight): {float(alpha):.2f}\n"
            f"Top-3 cosine similarity scores: "
            f"{top_scores[0]:.3f}, {top_scores[1]:.3f}, {top_scores[2]:.3f}"
        )

        return results, msg

    except Exception as e:
        # Surface any failure in the Details box instead of crashing the UI.
        return [], f"Error: {str(e)}"


# -----------------------------
# Gradio UI
# -----------------------------
# Single-function interface: both inputs are optional; the slider sets the
# image weight (alpha) used when blending the two query embeddings.
demo = gr.Interface(
    fn=recommend,
    inputs=[
        gr.Image(type="pil", label="Upload an image (optional)"),
        gr.Textbox(label="Text description (optional)", placeholder="e.g., 'small handheld vacuum'"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Alpha (image vs text weight)"),
    ],
    outputs=[
        gr.Gallery(label="Top-3 Recommended Images"),
        gr.Textbox(label="Details"),
    ],
    title="Hybrid CLIP Recommender (Image + Text)",
    description="Upload an image, type a description, or combine both. Recommendations are based on CLIP embeddings + cosine similarity."
)

# show_error surfaces tracebacks in the browser; ssr_mode=False disables
# server-side rendering (avoids issues in some hosting environments).
demo.launch(show_error=True, ssr_mode=False)