|
|
import os |
|
|
import json |
|
|
import requests |
|
|
import io |
|
|
import faiss |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Anchor every data path to the directory containing this script, so the
# script works no matter which working directory it is launched from.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

# The product catalogue (JSON) and the FAISS index file both sit next to
# the script.
PRODUCTS_FILE, INDEX_FILE = (
    os.path.join(BASE_DIR, filename)
    for filename in ("products.json", "products.index")
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _read_products(path):
    """Return the parsed JSON content of *path*.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    raise FileNotFoundError(f"β Could not find {path}")


# Load the product catalogue; later steps read each entry's "image_url".
products = _read_products(PRODUCTS_FILE)
print(f"π¦ Loaded {len(products)} products from {PRODUCTS_FILE}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Name of the pretrained checkpoint handed to SentenceTransformer.
CLIP_MODEL_NAME = "clip-ViT-B-32"

# Loading is announced up front because it is the slow part of start-up.
print("π§ Loading CLIP model (this may take a few seconds)...")
model = SentenceTransformer(CLIP_MODEL_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Collapse the catalogue to its distinct image URLs so that an image shared
# by several products is only processed once.
distinct_urls = set(product["image_url"] for product in products)
unique_urls = [*distinct_urls]
print(f"π Found {len(unique_urls)} unique image URLs")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Download and embed each unique image exactly once, keyed by URL.
#
# Processing is best-effort per image: any failure (network error, HTTP
# error status, undecodable image, encoder error) is reported and the URL
# receives an all-zero placeholder vector, so every URL still has an entry
# when the embeddings are looked up per product afterwards.

# Placeholder width used only when no image at all could be embedded.
FALLBACK_EMBEDDING_DIM = 512

url_to_emb = {}
failed_urls = []

# A single Session lets downloads from the same host reuse connections.
with requests.Session() as session:
    for url in tqdm(unique_urls, desc="Embedding unique images"):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content)).convert("RGB")
            url_to_emb[url] = model.encode(
                img,
                convert_to_numpy=True,
                normalize_embeddings=True,
            )
        except Exception as e:
            # Deliberately broad: one bad image must not abort the whole run.
            print(f"β οΈ Error processing {url}: {e}")
            failed_urls.append(url)

# Size the placeholders from a real embedding rather than a hard-coded
# constant, so all vectors share one width even if the model is changed.
if url_to_emb:
    placeholder_dim = next(iter(url_to_emb.values())).shape[-1]
else:
    placeholder_dim = FALLBACK_EMBEDDING_DIM

for url in failed_urls:
    url_to_emb[url] = np.zeros(placeholder_dim, dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Expand the per-URL embeddings back to one row per product, in catalogue
# order, so that row i of the matrix belongs to products[i]. The matrix is
# created directly as float32.
embeddings = np.array(
    [url_to_emb[p["image_url"]] for p in products],
    dtype="float32",
)

print(f"✅ Built embeddings array: {embeddings.shape}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build a flat inner-product FAISS index over the product embeddings and
# write it to INDEX_FILE.
#
# An empty catalogue produces a 1-D, zero-length array; fail with a clear
# message instead of an opaque IndexError on `shape[1]`, and before any
# index file is written.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        "Cannot build a FAISS index: expected a non-empty 2-D embeddings "
        f"array, got shape {embeddings.shape}"
    )

dim = embeddings.shape[1]

# IndexFlatIP scores by inner product. Vectors are added in row order, so
# index position i corresponds to row i of `embeddings`.
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

faiss.write_index(index, INDEX_FILE)
print(f"π Saved FAISS index with {index.ntotal} vectors β {INDEX_FILE}")
|
|
|