File size: 2,971 Bytes
3eaabcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import json
import requests
import io
import faiss
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # progress bar

# ---------------------------------------------------
# Resolve input/output paths next to this script
# ---------------------------------------------------
# Both files live in the directory that contains this script, regardless of
# the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Input: product metadata (each entry is read for its "image_url" key).
PRODUCTS_FILE = os.path.join(BASE_DIR, "products.json")
# Output: serialized FAISS index.
INDEX_FILE = os.path.join(BASE_DIR, "products.index")

# ---------------------------------------------------
# Load product metadata
# ---------------------------------------------------
# Open the file directly rather than testing os.path.exists() first: the
# check-then-open sequence can race with the file disappearing, and open()
# reports a missing file on its own.
try:
    with open(PRODUCTS_FILE, "r", encoding="utf-8") as f:
        products = json.load(f)
except FileNotFoundError:
    # Keep the script's own message; `from None` drops the redundant
    # low-level traceback of the failed open().
    raise FileNotFoundError(f"❌ Could not find {PRODUCTS_FILE}") from None

# An empty product list cannot yield a 2-D embedding matrix, so stop here
# with a clear message instead of failing later while building the index.
if not products:
    raise ValueError(f"❌ No products found in {PRODUCTS_FILE}; nothing to index")

print(f"📦 Loaded {len(products)} products from {PRODUCTS_FILE}")

# ---------------------------------------------------
# Instantiate the CLIP image encoder
# ---------------------------------------------------
# Identifier passed to SentenceTransformer to select the model.
CLIP_MODEL_NAME = "clip-ViT-B-32"

# Announce before constructing the model; the message itself warns that
# loading may take a few seconds.
print("🧠 Loading CLIP model (this may take a few seconds)...")
model = SentenceTransformer(CLIP_MODEL_NAME)

# ---------------------------------------------------
# Collect unique image URLs (avoid redundant downloads)
# ---------------------------------------------------
# dict.fromkeys() de-duplicates while keeping first-seen order, so URLs are
# processed in the same order on every run. A set of strings iterates in an
# order that can change between interpreter runs (hash randomization), which
# makes progress output and failures harder to compare across runs.
unique_urls = list(dict.fromkeys(p["image_url"] for p in products))
print(f"🔗 Found {len(unique_urls)} unique image URLs")

# ---------------------------------------------------
# Compute embeddings for unique URLs
# ---------------------------------------------------
# Length of the placeholder vector stored for images that cannot be
# downloaded, decoded or encoded (512 matches the CLIP embedding size this
# script expects).
FALLBACK_EMBEDDING_DIM = 512

# Maps each unique image URL to its embedding vector. Every URL receives an
# entry, even on failure, so the per-product lookup that follows never
# raises KeyError.
url_to_emb = {}

# A single Session is shared by all downloads so connections to the same
# host are reused instead of being re-established for every image.
with requests.Session() as session:
    for url in tqdm(unique_urls, desc="Embedding unique images"):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()  # treat HTTP 4xx/5xx as failures
            # Decode from memory and force 3-channel RGB before encoding.
            img = Image.open(io.BytesIO(response.content)).convert("RGB")
            emb = model.encode(img, convert_to_numpy=True,
                               normalize_embeddings=True)
            url_to_emb[url] = emb
        except Exception as e:
            # Deliberately broad: one bad image (network error, invalid
            # image data, encoder failure) must not abort the whole run.
            # The failure is reported and an all-zero vector is stored.
            print(f"⚠️ Error processing {url}: {e}")
            url_to_emb[url] = np.zeros(FALLBACK_EMBEDDING_DIM,
                                       dtype=np.float32)  # fallback embedding

# ---------------------------------------------------
# Build embeddings array for all products
# ---------------------------------------------------
# One row per product, in the order the products were loaded; products that
# share an image URL reuse the same vector. The array is created as float32
# directly, which avoids the extra copy of a separate astype() call.
embeddings = np.array(
    [url_to_emb[p["image_url"]] for p in products],
    dtype="float32",
)

print(f"✅ Built embeddings array: {embeddings.shape}")

# ---------------------------------------------------
# Create FAISS index (cosine similarity via inner product)
# ---------------------------------------------------
# The vectors are requested with normalize_embeddings=True when encoded, so
# the inner product of two of them equals their cosine similarity.
dim = embeddings.shape[1]  # 512 for CLIP
index = faiss.IndexFlatIP(dim)
# Rows are added in array order, so index position i holds row i of
# `embeddings`.
index.add(embeddings)

# ---------------------------------------------------
# Save FAISS index
# ---------------------------------------------------
faiss.write_index(index, INDEX_FILE)
print(f"🎉 Saved FAISS index with {index.ntotal} vectors → {INDEX_FILE}")