github-actions[bot] committed on
Commit
041edd5
·
1 Parent(s): 19363d6

chore: sync from GitHub 2026-03-05 15:48:45 UTC

Browse files
models.py CHANGED
@@ -29,7 +29,7 @@ def get_image_pipeline():
29
  print("Loading image model (quantized)...")
30
  _image_pipeline = pipeline(
31
  task="zero-shot-image-classification",
32
- model="openai/clip-vit-base-patch32",
33
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
34
  )
35
  # Apply dynamic quantization on CPU to reduce memory ~2-3x
 
29
  print("Loading image model (quantized)...")
30
  _image_pipeline = pipeline(
31
  task="zero-shot-image-classification",
32
+ model="google/siglip-base-patch16-224",
33
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
34
  )
35
  # Apply dynamic quantization on CPU to reduce memory ~2-3x
recommenders/content_based.py CHANGED
@@ -2,10 +2,9 @@
2
  Content-Based Recommender (Embedding-Based)
3
  ============================================
4
 
5
- Uses the 'products_recommend' ChromaDB collection (title + description + tags)
6
  and sentence-transformer embeddings to build user profiles and recommend
7
- similar products. Descriptions are kept here (unlike search) so that
8
- related accessories/peripherals surface as cross-sell recommendations.
9
 
10
  How it works:
11
  1. Gather all user interactions grouped by user_id from 4 tables:
 
2
  Content-Based Recommender (Embedding-Based)
3
  ============================================
4
 
5
+ Uses the ChromaDB 'products' collection (title + description + tags)
6
  and sentence-transformer embeddings to build user profiles and recommend
7
+ similar products.
 
8
 
9
  How it works:
10
  1. Gather all user interactions grouped by user_id from 4 tables:
smart_search/Documentation.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Smart Search — Technical Documentation
2
+
3
+ Multi-modal product search system supporting **text**, **image**, and **audio** queries. Uses a two-stage pipeline: tag-based filtering followed by semantic similarity search.
4
+
5
+ ---
6
+
7
+ ## Architecture
8
+
9
+ ```
10
+ routes.py → FastAPI endpoints (text / image / audio / product details)
11
+ smart_search.py → Core search logic (tag filter, semantic search, data helpers)
12
+ categories.txt → Category labels for zero-shot image classification
13
+ whisper_finetuned_ct2/ → Fine-tuned Faster-Whisper model for Arabic/English audio
14
+ utils.py → Shared helpers (Supabase clients, ChromaDB, vector DB management)
15
+ models.py → Lazy-loaded ML model singletons (embedder, SigLIP, Whisper)
16
+ ```
17
+
18
+ ---
19
+
20
+ ## Search Pipeline
21
+
22
+ ```
23
+ User Query (text / image / audio)
24
+
25
+ ├── [Image] Zero-shot SigLIP classification → predicted category label
26
+ ├── [Audio] Faster-Whisper transcription → text caption
27
+ └── [Text] Used directly
28
+
29
+
30
+ ┌─────────────────────────────────────────────────┐
31
+ │ Stage 1 — Tag Filter (Supabase) │
32
+ │ Query: WHERE tags && ['token1', 'token2', ...] │
33
+ │ Returns: list of product IDs │
34
+ │ │
35
+ │ • Tokenizes query into individual words │
36
+ │ • Uses Supabase .overlaps() on the tags column │
37
+ │ • Hard-filters to only categorically relevant │
38
+ │ products, eliminating cross-category bleed │
39
+ └─────────────────────┬───────────────────────────┘
40
+
41
+ Has matches?
42
+ ╱ ╲
43
+ Yes No (fallback)
44
+ │ │
45
+ ▼ ▼
46
+ ┌───────────────────┐ ┌───────────────────────┐
47
+ │ Scoped Semantic │ │ Global Semantic Search │
48
+ │ Search (ChromaDB) │ │ (ChromaDB, full k) │
49
+ │ filter={id: $in} │ │ No filter applied │
50
+ │ k=min(top_k, n) │ │ │
51
+ └─────────┬─────────┘ └──────────┬──────────────┘
52
+ │ │
53
+ └───────────┬───────────┘
54
+
55
+
56
+ Ranked Results
57
+ (product_ids, titles, distances)
58
+ ```
59
+
60
+ ### Why Two Stages?
61
+
62
+ Pure semantic search on `title + description + tags` embeddings causes **cross-category bleed** — a "smartphone" query returns phone cases and chargers because their descriptions mention "smartphone". The tag filter eliminates this:
63
+
64
+ | Query | Without Tag Filter | With Tag Filter |
65
+ |-------|--------------------|-----------------|
66
+ | "smartphone" | Smartphones, phone cases, chargers, screen protectors | Only actual smartphones |
67
+ | "laptop bag" | Laptop bags, laptops, backpacks | Only products tagged "laptop bag" |
68
+
69
+ The **fallback** ensures specific brand queries like "Samsung Galaxy A15 ceramic white" still work — if no tags match, global semantic search handles it.
70
+
71
+ ---
72
+
73
+ ## Search Modalities
74
+
75
+ ### 1. Text Search (`POST /search/text`)
76
+
77
+ Direct text-to-product search.
78
+
79
+ | Parameter | Type | Default | Description |
80
+ |-----------|------|---------|-------------|
81
+ | `query` | string | required | Search query text |
82
+ | `top_k` | int | 100 | Max results to return |
83
+
84
+ ### 2. Image Search (`POST /search/image`)
85
+
86
+ Zero-shot image classification → text search pipeline.
87
+
88
+ 1. User uploads an image
89
+ 2. SigLIP model (`google/siglip-base-patch16-224`) classifies it against category labels from `categories.txt`
90
+ 3. The predicted category becomes the text query for the search pipeline
91
+
92
+ | Parameter | Type | Default | Description |
93
+ |-----------|------|---------|-------------|
94
+ | `image` | file | required | Product image (JPEG/PNG) |
95
+ | `top_k` | int | 100 | Max results to return |
96
+
97
+ **Response** includes `predicted_category` and `confidence_score` alongside results.
98
+
99
+ ### 3. Audio Search (`POST /search/audio`)
100
+
101
+ Speech-to-text → text search pipeline.
102
+
103
+ 1. User uploads an audio clip
104
+ 2. Fine-tuned Faster-Whisper model transcribes it (supports Arabic and English)
105
+ 3. The transcription becomes the text query for the search pipeline
106
+
107
+ | Parameter | Type | Default | Description |
108
+ |-----------|------|---------|-------------|
109
+ | `audio` | file | required | Audio file (WAV/MP3/etc.) |
110
+ | `language` | string | "en" | Language code ("en" or "ar") |
111
+ | `top_k` | int | 100 | Max results to return |
112
+
113
+ **Response** includes `caption` (transcription) alongside results.
114
+
115
+ ---
116
+
117
+ ## Vector Database
118
+
119
+ ### Embedding Model
120
+
121
+ - **Model**: `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` (384-dim)
122
+ - **Storage**: ChromaDB (persisted at `src/chroma_db/`)
123
+ - **Collection**: `products` — each document is `title + description + tags`
124
+
125
+ ### Document Content
126
+
127
+ Each product is embedded as:
128
+ ```
129
+ "{title} {description} {tag1 tag2 tag3 ...}"
130
+ ```
131
+
132
+ Metadata stored per document:
133
+ | Field | Description |
134
+ |-------|-------------|
135
+ | `id` | Product UUID |
136
+ | `title` | Product title |
137
+ | `tags` | Space-separated tags string |
138
+
139
+ ### Adding Products
140
+
141
+ Products are indexed in two ways:
142
+
143
+ 1. **Bulk at startup** — `update_vectordb()` in `app.py` syncs all Supabase products to ChromaDB on server start. Only new products (not already indexed) are added.
144
+
145
+ 2. **Single product via API** — `POST /vectordb/add?product_id=<uuid>` adds one product's embedding without restarting the server. Useful when a new product is created.
146
+
147
+ ---
148
+
149
+ ## Other Endpoints
150
+
151
+ ### Random Products (`GET /products/random`)
152
+
153
+ Returns products for initial display before the user searches.
154
+
155
+ | Parameter | Type | Default | Description |
156
+ |-----------|------|---------|-------------|
157
+ | `limit` | int | 20 | Number of products |
158
+
159
+ ### Product Details (`GET /product/{product_id}`)
160
+
161
+ Returns full product info: title, description, price, old_price, sku, stock, seller name, and all images.
162
+
163
+ ---
164
+
165
+ ## Database Tables Used
166
+
167
+ | Table | Fields Used | Purpose |
168
+ |-------|-------------|---------|
169
+ | `products` | id, title, description, tags, price, old_price, sku, stock, store_id, status | Product catalog + tag filter |
170
+ | `product_images` | product_id, url | Product images |
171
+ | `stores` | id, name | Store/seller name |
172
+
173
+ ---
174
+
175
+ ## Models Used
176
+
177
+ | Component | Model | Size | Purpose |
178
+ |-----------|-------|------|---------|
179
+ | Text Embeddings | `paraphrase-multilingual-MiniLM-L12-v2` | ~120 MB | Semantic similarity (384-dim vectors) |
180
+ | Image Classification | `google/siglip-base-patch16-224` | ~600 MB | Zero-shot image → category |
181
+ | Speech-to-Text | Fine-tuned Faster-Whisper (CTranslate2) | ~150 MB | Arabic/English audio transcription |
182
+
183
+ All models run on CPU with no GPU requirement.
smart_search/routes.py CHANGED
@@ -3,7 +3,8 @@ import uvicorn
3
  from PIL import Image
4
  from fastapi import FastAPI, UploadFile, File
5
  from models import IMAGE_PIPELINE, AUDIO_MODEL
6
- from utils import similarity_search, load_categories, load_audio_bytes_ffmpeg, get_product_images, get_product_prices, get_product_details, get_random_products
 
7
 
8
  def register_search_routes(app: FastAPI):
9
 
@@ -106,3 +107,10 @@ def register_search_routes(app: FastAPI):
106
  if product:
107
  return product
108
  return {"error": "Product not found"}
 
 
 
 
 
 
 
 
3
  from PIL import Image
4
  from fastapi import FastAPI, UploadFile, File
5
  from models import IMAGE_PIPELINE, AUDIO_MODEL
6
+ from utils import get_product_images, add_product_to_vectordb, get_product_prices, get_product_details, get_random_products, load_categories, load_audio_bytes_ffmpeg
7
+ from smart_search.smart_search import similarity_search
8
 
9
  def register_search_routes(app: FastAPI):
10
 
 
107
  if product:
108
  return product
109
  return {"error": "Product not found"}
110
+
111
+ ## Add Product Embedding (called when a new product is created)
112
+ @app.post('/vectordb/add')
113
+ def add_product_embedding(product_id: str):
114
+ """Add a single product's embedding to ChromaDB without restarting the server."""
115
+ result = add_product_to_vectordb(product_id)
116
+ return result
smart_search/smart_search.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smart Search — Core Search Functions
3
+ ======================================
4
+
5
+ Two-stage search pipeline:
6
+ 1. Tag filter (Supabase) — restrict to products whose tags overlap with query tokens.
7
+ 2. Semantic search (ChromaDB) — vector similarity within the filtered set.
8
+ Fallback: if no tag matches, run unrestricted semantic search.
9
+ """
10
+
11
+ from utils import supabase, get_vector_db
12
+
13
+
14
+ # ═══════════════════════ Search Pipeline ════════════════════════
15
+
16
+ def _tag_search(query_tokens: list) -> list:
17
+ """
18
+ Stage 1 — Tag filter.
19
+ Query Supabase for products whose tags array overlaps with any query token.
20
+ Returns a list of matching product IDs, or [] if none / on error.
21
+ """
22
+ if supabase is None or not query_tokens:
23
+ return []
24
+ try:
25
+ response = (
26
+ supabase.table("products")
27
+ .select("id")
28
+ .overlaps("tags", query_tokens)
29
+ .execute()
30
+ )
31
+ return [row["id"] for row in response.data]
32
+ except Exception as e:
33
+ print(f"⚠️ Tag search failed, falling back to pure semantic: {e}")
34
+ return []
35
+
36
+
37
+ def similarity_search(query, top_k):
38
+ """
39
+ Two-stage search pipeline:
40
+ 1. Tag filter — restrict to products whose tags overlap with query tokens
41
+ 2. Semantic search — vector similarity within the filtered set
42
+ 3. Gap fill — if tag-filtered results < top_k, pad with global semantic results
43
+ Fallback: if no tag matches at all, run unrestricted semantic search.
44
+ """
45
+ query_tokens = [t.lower() for t in query.split()]
46
+ tag_filtered_ids = _tag_search(query_tokens)
47
+
48
+ if tag_filtered_ids:
49
+ # Semantic search scoped to tag-matched products only
50
+ k = min(top_k, len(tag_filtered_ids))
51
+ where_filter = {"id": {"$in": tag_filtered_ids}}
52
+ primary_results = get_vector_db().similarity_search_with_score(
53
+ query, k=k, filter=where_filter
54
+ )
55
+
56
+ # Gap fill: if we got fewer than top_k, pad with global semantic results
57
+ if len(primary_results) < top_k:
58
+ gap = top_k - len(primary_results)
59
+ seen_ids = {doc.metadata['id'] for doc, _ in primary_results}
60
+ fallback_results = get_vector_db().similarity_search_with_score(query, k=top_k)
61
+ extras = [
62
+ (doc, dist) for doc, dist in fallback_results
63
+ if doc.metadata['id'] not in seen_ids
64
+ ][:gap]
65
+ relevant_products = primary_results + extras
66
+ else:
67
+ relevant_products = primary_results
68
+
69
+ else:
70
+ # Fallback: no tag matches (e.g. brand-only query) → global semantic search
71
+ relevant_products = get_vector_db().similarity_search_with_score(query, k=top_k)
72
+
73
+ product_ids = [doc.metadata['id'] for doc, _ in relevant_products]
74
+ titles = [doc.metadata['title'] for doc, _ in relevant_products]
75
+ distances = [dist for _, dist in relevant_products]
76
+ return product_ids, titles, distances
utils.py CHANGED
@@ -31,38 +31,22 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
31
  else:
32
  supabase_service: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
33
 
34
- ## Loading the Vector Databases (lazy — created on first use)
35
  CHROMA_DB_PATH = str(BASE_DIR / "chroma_db")
36
- _search_db = None # title + tags only (precise search)
37
- _recommend_db = None # title + description + tags (cross-sell recommendations)
38
 
39
- def get_search_db():
40
- """ChromaDB collection for search — title + tags only."""
41
- global _search_db
42
- if _search_db is None:
43
- from models import get_embedder
44
- _search_db = Chroma(
45
- collection_name='products_search',
46
- embedding_function=get_embedder(),
47
- persist_directory=CHROMA_DB_PATH
48
- )
49
- return _search_db
50
-
51
- def get_recommend_db():
52
- """ChromaDB collection for recommendations — title + description + tags."""
53
- global _recommend_db
54
- if _recommend_db is None:
55
  from models import get_embedder
56
- _recommend_db = Chroma(
57
- collection_name='products_recommend',
58
  embedding_function=get_embedder(),
59
  persist_directory=CHROMA_DB_PATH
60
  )
61
- return _recommend_db
62
 
63
- # Backward-compat alias used by content_based recommender
64
- def get_vector_db():
65
- return get_recommend_db()
66
 
67
  def update_vectordb():
68
 
@@ -73,95 +57,61 @@ def update_vectordb():
73
  print("Fetching products from Supabase...")
74
  products = supabase.table("products").select("id, title, description, tags").execute().data
75
 
76
- # --- Determine which products are already indexed in each collection ---
77
- search_existing = {m["id"] for m in get_search_db().get(include=["metadatas"])["metadatas"]}
78
- recommend_existing = {m["id"] for m in get_recommend_db().get(include=["metadatas"])["metadatas"]}
79
 
80
- search_contents, search_metas = [], []
81
- recommend_contents, recommend_metas = [], []
82
 
83
  for product in products:
84
  pid = product['id']
85
- tags = product.get('tags') or []
86
- tags_str = ' '.join(tags)
87
- title = product.get('title') or ''
88
- description = product.get('description') or ''
89
-
90
- meta = {"id": pid, "title": title, "tags": tags_str}
91
-
92
- # Search collection: title + tags only (precise)
93
- if pid not in search_existing:
94
- search_contents.append(f"{title} {tags_str}")
95
- search_metas.append(meta)
96
-
97
- # Recommend collection: title + description + tags (cross-sell)
98
- if pid not in recommend_existing:
99
- recommend_contents.append(f"{title} {description} {tags_str}")
100
- recommend_metas.append(meta)
101
-
102
- # --- Persist search collection ---
103
- if search_contents:
104
- get_search_db().add_texts(texts=search_contents, metadatas=search_metas)
105
- get_search_db().persist()
106
- print(f"✅ Added {len(search_contents)} products to search collection")
107
  else:
108
- print("✅ Search collection is up to date")
109
-
110
- # --- Persist recommend collection ---
111
- if recommend_contents:
112
- get_recommend_db().add_texts(texts=recommend_contents, metadatas=recommend_metas)
113
- get_recommend_db().persist()
114
- print(f"✅ Added {len(recommend_contents)} products to recommend collection")
115
- else:
116
- print("✅ Recommend collection is up to date")
117
 
118
 
119
- def _tag_search(query_tokens: list) -> list:
120
  """
121
- Stage 1 Tag filter.
122
- Query Supabase for products whose tags array overlaps with any query token.
123
- Returns a list of matching product IDs, or [] if none / on error.
124
  """
125
- if supabase is None or not query_tokens:
126
- return []
127
- try:
128
- response = (
129
- supabase.table("products")
130
- .select("id")
131
- .overlaps("tags", query_tokens)
132
- .execute()
133
- )
134
- return [row["id"] for row in response.data]
135
- except Exception as e:
136
- print(f"⚠️ Tag search failed, falling back to pure semantic: {e}")
137
- return []
138
 
 
 
 
 
139
 
140
- def similarity_search(query, top_k):
141
- """
142
- Two-stage search pipeline:
143
- 1. Tag filter — restrict to products whose tags overlap with query tokens
144
- 2. Semantic search — vector similarity within the filtered set
145
- Fallback: if no tag matches, run unrestricted semantic search.
146
- """
147
- query_tokens = [t.lower() for t in query.split()]
148
- tag_filtered_ids = _tag_search(query_tokens)
149
-
150
- if tag_filtered_ids:
151
- # Semantic search scoped to tag-matched products only
152
- k = min(top_k, len(tag_filtered_ids))
153
- where_filter = {"id": {"$in": tag_filtered_ids}}
154
- relevant_products = get_search_db().similarity_search_with_score(
155
- query, k=k, filter=where_filter
156
- )
157
- else:
158
- # Fallback: no tag matches (e.g. brand-only query) → global semantic search
159
- relevant_products = get_search_db().similarity_search_with_score(query, k=top_k)
160
 
161
- product_ids = [doc.metadata['id'] for doc, _ in relevant_products]
162
- titles = [doc.metadata['title'] for doc, _ in relevant_products]
163
- distances = [dist for _, dist in relevant_products]
164
- return product_ids, titles, distances
165
 
166
 
167
  def get_product_images(product_ids: list) -> dict:
@@ -198,19 +148,19 @@ def get_product_prices(product_ids: list) -> dict:
198
  """
199
  if not product_ids:
200
  return {}
201
-
202
  try:
203
  response = supabase.table("products").select("id, price").in_("id", list(product_ids)).execute()
204
-
205
  prices_map = {}
206
  for row in response.data:
207
  pid = row.get("id")
208
  price = row.get("price")
209
  if pid:
210
  prices_map[pid] = price
211
-
212
  return prices_map
213
-
214
  except Exception as e:
215
  print(f"Error fetching product prices: {e}")
216
  return {}
@@ -222,18 +172,17 @@ def get_product_details(product_id: str) -> dict:
222
  Returns product info including title, description, price, old_price, sku, stock, store name, etc.
223
  """
224
  try:
225
- # Get product details
226
  response = supabase.table("products").select("*").eq("id", product_id).execute()
227
-
228
  if not response.data:
229
  return None
230
-
231
  product = response.data[0]
232
-
233
  # Get product images
234
  images_response = supabase.table("product_images").select("url").eq("product_id", product_id).execute()
235
  images = [img.get("url") for img in images_response.data if img.get("url")]
236
-
237
  # Get store name
238
  store_name = None
239
  store_id = product.get("store_id")
@@ -241,7 +190,7 @@ def get_product_details(product_id: str) -> dict:
241
  store_response = supabase.table("stores").select("name").eq("id", store_id).execute()
242
  if store_response.data:
243
  store_name = store_response.data[0].get("name")
244
-
245
  return {
246
  "id": product.get("id"),
247
  "title": product.get("title"),
@@ -253,7 +202,7 @@ def get_product_details(product_id: str) -> dict:
253
  "sold_by": store_name,
254
  "images": images,
255
  }
256
-
257
  except Exception as e:
258
  print(f"Error fetching product details: {e}")
259
  return None
@@ -265,18 +214,15 @@ def get_random_products(limit: int = 10) -> list:
265
  Returns a list of products with id, title, price, and image_url.
266
  """
267
  try:
268
- # Get first N products
269
  response = supabase.table("products").select("id, title, price").limit(limit).execute()
270
-
271
  if not response.data:
272
  return []
273
-
274
  products = response.data
275
  product_ids = [p.get("id") for p in products]
276
-
277
- # Get images for these products
278
  images_map = get_product_images(product_ids)
279
-
280
  return [
281
  {
282
  "id": p.get("id"),
@@ -286,26 +232,25 @@ def get_random_products(limit: int = 10) -> list:
286
  }
287
  for p in products
288
  ]
289
-
290
  except Exception as e:
291
  print(f"Error fetching random products: {e}")
292
  return []
293
 
294
 
295
- def load_categories(file_name = None):
 
296
  if file_name is None:
297
- file_name = str(BASE_DIR / "smart_search" / "categories.txt")
298
  try:
299
  with open(file_name, 'r') as file:
300
  return [line.strip() for line in file.readlines() if line.strip()]
301
-
302
  except FileNotFoundError:
303
  print("Categories.txt file is not found")
304
- return ["Product", "Electronics", "Fashion", "Home"]
305
 
306
 
307
  def load_audio_bytes_ffmpeg(audio_bytes):
308
-
309
  process = subprocess.Popen(
310
  [
311
  "ffmpeg", "-i", "pipe:0",
@@ -314,11 +259,9 @@ def load_audio_bytes_ffmpeg(audio_bytes):
314
  "-ar", "16000",
315
  "pipe:1"
316
  ],
317
- stdin = subprocess.PIPE,
318
- stdout = subprocess.PIPE,
319
- stderr = subprocess.PIPE
320
  )
321
-
322
  out, _ = process.communicate(input=audio_bytes)
323
- audio_np = np.frombuffer(out, dtype=np.float32)
324
- return audio_np
 
31
  else:
32
  supabase_service: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
33
 
34
+ ## Loading the Vector Database (lazy — created on first use)
35
  CHROMA_DB_PATH = str(BASE_DIR / "chroma_db")
36
+ _vector_db = None
 
37
 
38
+ def get_vector_db():
39
+ """Single ChromaDB collection — title + description + tags."""
40
+ global _vector_db
41
+ if _vector_db is None:
 
 
 
 
 
 
 
 
 
 
 
 
42
  from models import get_embedder
43
+ _vector_db = Chroma(
44
+ collection_name='products',
45
  embedding_function=get_embedder(),
46
  persist_directory=CHROMA_DB_PATH
47
  )
48
+ return _vector_db
49
 
 
 
 
50
 
51
  def update_vectordb():
52
 
 
57
  print("Fetching products from Supabase...")
58
  products = supabase.table("products").select("id, title, description, tags").execute().data
59
 
60
+ existing_ids = {m["id"] for m in get_vector_db().get(include=["metadatas"])["metadatas"]}
 
 
61
 
62
+ contents = []
63
+ metadatas = []
64
 
65
  for product in products:
66
  pid = product['id']
67
+ if pid not in existing_ids:
68
+ tags = product.get('tags') or []
69
+ tags_str = ' '.join(tags)
70
+ title = product.get('title') or ''
71
+ description = product.get('description') or ''
72
+
73
+ contents.append(f"{title} {description} {tags_str}")
74
+ metadatas.append({"id": pid, "title": title, "tags": tags_str})
75
+
76
+ if contents:
77
+ get_vector_db().add_texts(texts=contents, metadatas=metadatas)
78
+ get_vector_db().persist()
79
+ print(f"✅ Added {len(contents)} new products to ChromaDB")
 
 
 
 
 
 
 
 
 
80
  else:
81
+ print("✅ No new products to add, ChromaDB is up to date")
 
 
 
 
 
 
 
 
82
 
83
 
84
+ def add_product_to_vectordb(product_id: str):
85
  """
86
+ Add a single product's embedding to ChromaDB.
87
+ Called via API when a new product is created — no need to restart the server.
 
88
  """
89
+ if supabase is None:
90
+ return {"error": "Supabase not configured"}
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ # Check if already indexed
93
+ existing_ids = {m["id"] for m in get_vector_db().get(include=["metadatas"])["metadatas"]}
94
+ if product_id in existing_ids:
95
+ return {"status": "already_indexed", "product_id": product_id}
96
 
97
+ # Fetch product from Supabase
98
+ response = supabase.table("products").select("id, title, description, tags").eq("id", product_id).execute()
99
+ if not response.data:
100
+ return {"error": f"Product {product_id} not found in Supabase"}
101
+
102
+ product = response.data[0]
103
+ tags = product.get('tags') or []
104
+ tags_str = ' '.join(tags)
105
+ title = product.get('title') or ''
106
+ description = product.get('description') or ''
107
+
108
+ content = f"{title} {description} {tags_str}"
109
+ meta = {"id": product_id, "title": title, "tags": tags_str}
 
 
 
 
 
 
 
110
 
111
+ get_vector_db().add_texts(texts=[content], metadatas=[meta])
112
+ get_vector_db().persist()
113
+
114
+ return {"status": "added", "product_id": product_id, "title": title}
115
 
116
 
117
  def get_product_images(product_ids: list) -> dict:
 
148
  """
149
  if not product_ids:
150
  return {}
151
+
152
  try:
153
  response = supabase.table("products").select("id, price").in_("id", list(product_ids)).execute()
154
+
155
  prices_map = {}
156
  for row in response.data:
157
  pid = row.get("id")
158
  price = row.get("price")
159
  if pid:
160
  prices_map[pid] = price
161
+
162
  return prices_map
163
+
164
  except Exception as e:
165
  print(f"Error fetching product prices: {e}")
166
  return {}
 
172
  Returns product info including title, description, price, old_price, sku, stock, store name, etc.
173
  """
174
  try:
 
175
  response = supabase.table("products").select("*").eq("id", product_id).execute()
176
+
177
  if not response.data:
178
  return None
179
+
180
  product = response.data[0]
181
+
182
  # Get product images
183
  images_response = supabase.table("product_images").select("url").eq("product_id", product_id).execute()
184
  images = [img.get("url") for img in images_response.data if img.get("url")]
185
+
186
  # Get store name
187
  store_name = None
188
  store_id = product.get("store_id")
 
190
  store_response = supabase.table("stores").select("name").eq("id", store_id).execute()
191
  if store_response.data:
192
  store_name = store_response.data[0].get("name")
193
+
194
  return {
195
  "id": product.get("id"),
196
  "title": product.get("title"),
 
202
  "sold_by": store_name,
203
  "images": images,
204
  }
205
+
206
  except Exception as e:
207
  print(f"Error fetching product details: {e}")
208
  return None
 
214
  Returns a list of products with id, title, price, and image_url.
215
  """
216
  try:
 
217
  response = supabase.table("products").select("id, title, price").limit(limit).execute()
218
+
219
  if not response.data:
220
  return []
221
+
222
  products = response.data
223
  product_ids = [p.get("id") for p in products]
 
 
224
  images_map = get_product_images(product_ids)
225
+
226
  return [
227
  {
228
  "id": p.get("id"),
 
232
  }
233
  for p in products
234
  ]
235
+
236
  except Exception as e:
237
  print(f"Error fetching random products: {e}")
238
  return []
239
 
240
 
241
+ def load_categories(file_name=None):
242
+ categories_path = BASE_DIR / "smart_search" / "categories.txt"
243
  if file_name is None:
244
+ file_name = str(categories_path)
245
  try:
246
  with open(file_name, 'r') as file:
247
  return [line.strip() for line in file.readlines() if line.strip()]
 
248
  except FileNotFoundError:
249
  print("Categories.txt file is not found")
250
+ return ["Product", "Electronics", "Fashion", "Home"]
251
 
252
 
253
  def load_audio_bytes_ffmpeg(audio_bytes):
 
254
  process = subprocess.Popen(
255
  [
256
  "ffmpeg", "-i", "pipe:0",
 
259
  "-ar", "16000",
260
  "pipe:1"
261
  ],
262
+ stdin=subprocess.PIPE,
263
+ stdout=subprocess.PIPE,
264
+ stderr=subprocess.PIPE
265
  )
 
266
  out, _ = process.communicate(input=audio_bytes)
267
+ return np.frombuffer(out, dtype=np.float32)