Sarisha Das commited on
Commit
2bf862f
·
1 Parent(s): eb58aa0
requirements.txt CHANGED
@@ -9,4 +9,6 @@ faiss-cpu
9
  torch
10
  torchvision
11
  torchaudio
12
- datasets==3.6.0
 
 
 
9
  torch
10
  torchvision
11
  torchaudio
12
+ datasets==3.6.0
13
+ nltk
14
+ rank_bm25
src/streamlit_app.py CHANGED
@@ -7,11 +7,13 @@ import streamlit as st
7
  # ─── Repo root is the working directory on HF Spaces ─────────────────────────
8
  ROOT = Path(__file__).resolve().parent.parent # app.py lives at repo root
9
  sys.path.append(str(ROOT))
 
10
 
11
  os.environ["HF_HOME"] = str(ROOT / ".hf_cache")
12
  os.environ["TRANSFORMERS_CACHE"] = str(ROOT / ".hf_cache" / "transformers")
13
 
14
- from utils.retrieval_helpers import enrich_search_results
 
15
  from utils.semantic import load_vector_store
16
 
17
  import warnings
@@ -50,7 +52,6 @@ VECTOR_STORE_DIR = ROOT / "embeddings" / "semantic_vector_store"
50
 
51
  @st.cache_resource
52
  def load_vector_store_cached():
53
- # Authenticate explicitly — raises a clear error if token is missing
54
  hf_token = os.environ.get("HF_TOKEN")
55
  if not hf_token:
56
  st.error("HF_TOKEN secret is not set. Go to Space Settings → Secrets.")
@@ -60,14 +61,20 @@ def load_vector_store_cached():
60
 
61
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
62
 
63
- snapshot_download(
64
  repo_id="rishadaz/amazon_retriever-storage",
65
  repo_type="dataset",
66
  local_dir=str(VECTOR_STORE_DIR),
67
  token=hf_token,
68
  )
69
 
70
- return load_vector_store(VECTOR_STORE_DIR)
 
 
 
 
 
 
71
 
72
  # ─── Custom CSS ───────────────────────────────────────────────────────────────
73
  st.markdown(
@@ -147,8 +154,7 @@ st.markdown(
147
  # Expected return format — list of dicts with keys:
148
  # asin (str), title (str), text (str), rating (float), score (float)
149
 
150
- DUMMY_RESULTS = {}
151
-
152
 
153
  def bm25_search(query: str, top_k: int = 3) -> list[dict]:
154
  """
@@ -157,7 +163,8 @@ def bm25_search(query: str, top_k: int = 3) -> list[dict]:
157
  return retriever.search(query, top_k=top_k)
158
  Returns top_k review-level results (may include multiple reviews per ASIN).
159
  """
160
- return [r.copy() for r in DUMMY_RESULTS[:top_k]]
 
161
 
162
 
163
  def semantic_search(query: str, top_k: int = 3) -> list[dict]:
@@ -167,7 +174,6 @@ def semantic_search(query: str, top_k: int = 3) -> list[dict]:
167
  return retriever.search(query, top_k=top_k)
168
  Returns top_k review-level results (scores are cosine similarities, 0–1).
169
  """
170
- vector_store = load_vector_store_cached()
171
  results = enrich_search_results(vector_store, query, top_k, HF_DATASET["full"])
172
  return results
173
 
@@ -284,11 +290,6 @@ st.markdown(
284
  unsafe_allow_html=True,
285
  )
286
 
287
- st.markdown(
288
- '<div class="placeholder-badge">⚠️ Placeholder mode — real BM25 / Semantic indices not yet loaded</div>',
289
- unsafe_allow_html=True,
290
- )
291
-
292
  # ─── Search bar ───────────────────────────────────────────────────────────────
293
  query = st.text_input(
294
  "Search for a product or describe what you're looking for",
 
7
  # ─── Repo root is the working directory on HF Spaces ─────────────────────────
8
  ROOT = Path(__file__).resolve().parent.parent # app.py lives at repo root
9
  sys.path.append(str(ROOT))
10
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
11
 
12
  os.environ["HF_HOME"] = str(ROOT / ".hf_cache")
13
  os.environ["TRANSFORMERS_CACHE"] = str(ROOT / ".hf_cache" / "transformers")
14
 
15
+ from utils.retrieval_helpers import enrich_search_results, enrich_bm25_search_results
16
+ from utils.bm25 import load
17
  from utils.semantic import load_vector_store
18
 
19
  import warnings
 
52
 
53
  @st.cache_resource
54
  def load_vector_store_cached():
 
55
  hf_token = os.environ.get("HF_TOKEN")
56
  if not hf_token:
57
  st.error("HF_TOKEN secret is not set. Go to Space Settings → Secrets.")
 
61
 
62
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
63
 
64
+ snapshot_path = snapshot_download(
65
  repo_id="rishadaz/amazon_retriever-storage",
66
  repo_type="dataset",
67
  local_dir=str(VECTOR_STORE_DIR),
68
  token=hf_token,
69
  )
70
 
71
+ mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index_mini.pkl"
72
+ embeddings_dir = Path(snapshot_path) / "embeddings"
73
+
74
+ vector_store = load_vector_store(embeddings_dir)
75
+ bm25_retriever = load(mini_index_path)
76
+
77
+ return vector_store, bm25_retriever
78
 
79
  # ─── Custom CSS ───────────────────────────────────────────────────────────────
80
  st.markdown(
 
154
  # Expected return format — list of dicts with keys:
155
  # asin (str), title (str), text (str), rating (float), score (float)
156
 
157
+ vector_store, bm25_retriever = load_vector_store_cached()
 
158
 
159
  def bm25_search(query: str, top_k: int = 3) -> list[dict]:
160
  """
 
163
  return retriever.search(query, top_k=top_k)
164
  Returns top_k review-level results (may include multiple reviews per ASIN).
165
  """
166
+ results = enrich_bm25_search_results(bm25_retriever, query, top_k, HF_DATASET['full'])
167
+ return results
168
 
169
 
170
  def semantic_search(query: str, top_k: int = 3) -> list[dict]:
 
174
  return retriever.search(query, top_k=top_k)
175
  Returns top_k review-level results (scores are cosine similarities, 0–1).
176
  """
 
177
  results = enrich_search_results(vector_store, query, top_k, HF_DATASET["full"])
178
  return results
179
 
 
290
  unsafe_allow_html=True,
291
  )
292
 
 
 
 
 
 
293
  # ─── Search bar ───────────────────────────────────────────────────────────────
294
  query = st.text_input(
295
  "Search for a product or describe what you're looking for",
utils/bm25.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/bm25.py — BM25 keyword retrieval
3
+ Uses LangChain's BM25Retriever with the custom tokenizer from utils.py.
4
+
5
+ Document schema (one LangChain Document per product):
6
+ page_content : text BM25 scores against =
7
+ title + features + description + categories +
8
+ details (flattened) + store + top-k review titles & texts
9
+ metadata : structured fields for display in app.py
10
+ (parent_asin, title, main_category, price, store,
11
+ categories, features, description, details, top_reviews)
12
+
13
+ Data source expected: HuggingFace Dataset objects as loaded in
14
+ milestone1_exploration.ipynb via load_dataset("McAuley-Lab/Amazon-Reviews-2023", ...)
15
+ OR the saved .jsonl subsets in data/raw/.
16
+ """
17
+
18
+ import json
19
+ import pickle
20
+ from pathlib import Path
21
+ from typing import Any
22
+ import sys
23
+ from datasets import Dataset
24
+ from langchain_community.retrievers import BM25Retriever
25
+ from langchain_core.documents import Document
26
+ ROOT_FOLDER = Path(__file__).resolve().parent.parent
27
+
28
+ sys.path.append(str(ROOT_FOLDER))
29
+ from utils.utils import simple_tokenize
30
+ from utils.eda_helpers import get_best_reviews
31
+
32
+
33
+ # ── field helpers ─────────────────────────────────────────────────────────────
34
+
35
+ def _coerce_str(value: Any) -> str:
36
+ """Safely flatten any metadata field to a plain string."""
37
+ if value is None:
38
+ return ""
39
+ if isinstance(value, list):
40
+ return " ".join(_coerce_str(v) for v in value)
41
+ if isinstance(value, dict):
42
+ return " ".join(f"{k} {_coerce_str(v)}" for k, v in value.items())
43
+ s = str(value)
44
+ # treat the literal string "None" as empty
45
+ return "" if s.strip().lower() == "none" else s
46
+
47
+
48
+ def _parse_details(details: Any) -> dict:
49
+ """
50
+ 'details' in this dataset is stored as a JSON string, e.g.:
51
+ '{"Brand": "Luzianne", "Item Form": "Ground", ...}'
52
+ Parse it safely; return an empty dict on failure.
53
+ """
54
+ if not details:
55
+ return {}
56
+ if isinstance(details, dict):
57
+ return details
58
+ try:
59
+ return json.loads(str(details))
60
+ except (json.JSONDecodeError, TypeError):
61
+ return {}
62
+
63
+
64
+ def _parse_price(price: Any) -> float | None:
65
+ """price can be a float, an int, or the string 'None'."""
66
+ if price is None:
67
+ return None
68
+ try:
69
+ v = float(price)
70
+ return None if v != v else v # NaN guard
71
+ except (ValueError, TypeError):
72
+ return None
73
+
74
+
75
+ # ── review selection ──────────────────────────────────────────────────────────
76
+
77
+ def get_top_reviews(
78
+ reviews_dataset_dict,
79
+ parent_asin: str,
80
+ k: int = 5,
81
+ ) -> list[dict]:
82
+ """
83
+ Select the top-k reviews for a product using get_best_reviews() from
84
+ eda_helpers.py (weighted score: helpful_vote 50%, verified_purchase 30%,
85
+ rating extremity 20%).
86
+
87
+ Parameters
88
+ ----------
89
+ reviews_dataset_dict : the full reviews DatasetDict (raw_reviews) —
90
+ NOT the pre-selected 'full' split, because
91
+ get_best_reviews() selects 'full' internally.
92
+ parent_asin : product identifier
93
+ k : number of reviews to return
94
+
95
+ Returns
96
+ -------
97
+ List of dicts with keys: title, text, rating, helpful_vote
98
+ """
99
+ result = get_best_reviews(reviews_dataset_dict, parent_asin, top_k=k)
100
+
101
+ # get_best_reviews returns (total_count, Dataset) when top_k is set,
102
+ # or a bare Dataset with 0 rows when no reviews are found.
103
+ if isinstance(result, tuple):
104
+ _, matched = result
105
+ else:
106
+ matched = result
107
+
108
+ if len(matched) == 0:
109
+ return []
110
+
111
+ return [
112
+ {
113
+ "title": row.get("title", "") or "",
114
+ "text": row.get("text", "") or "",
115
+ "rating": row.get("rating"),
116
+ "helpful_vote": row.get("helpful_vote", 0),
117
+ }
118
+ for row in matched
119
+ ]
120
+
121
+
122
+ # ── document construction ─────────────────────────────────────────────────────
123
+
124
+ def format_review(review: dict) -> str:
125
+ """Format a single review the same way as in the notebook."""
126
+ return (
127
+ f"Review (Rating: {review['rating']}): "
128
+ f"{review['title']}. "
129
+ f"{review['text']}\n "
130
+ )
131
+
132
+
133
+ def build_page_content(product: dict, top_reviews: list[dict]) -> str:
134
+ """
135
+ Build the page_content string that BM25 will index.
136
+ Mirrors the create_document() structure in milestone1_exploration.ipynb.
137
+ """
138
+ title = _coerce_str(product.get("title"))
139
+ description = " ".join(product.get("description") or [])
140
+ features = "\n".join(product.get("features") or [])
141
+ categories = " > ".join(product.get("categories") or [])
142
+ store = _coerce_str(product.get("store"))
143
+ details = _parse_details(product.get("details"))
144
+ details_str = " ".join(f"{k}: {v}" for k, v in details.items())
145
+
146
+ review_lines = "".join(format_review(r) for r in top_reviews)
147
+ n_reviews = len(top_reviews)
148
+
149
+ return f"""Product: {title}
150
+ Category: {categories}
151
+ Store: {store}
152
+
153
+ Features:
154
+ {features}
155
+
156
+ Description:
157
+ {description}
158
+
159
+ Details:
160
+ {details_str}
161
+
162
+ Top Reviews (showing {n_reviews}):
163
+ {review_lines}"""
164
+
165
+
166
+ def _extract_image_url(images: Any) -> str:
167
+ """
168
+ Extract the best available image URL from the images field.
169
+ The field is a dict with keys: thumb, large, hi_res, variant — each a list.
170
+ Prefers 'large', falls back to 'thumb', then 'hi_res'. Returns "" if none found.
171
+ """
172
+ if not images or not isinstance(images, dict):
173
+ return ""
174
+ for key in ("large", "thumb", "hi_res"):
175
+ urls = images.get(key)
176
+ if isinstance(urls, list) and urls and urls[0]:
177
+ return urls[0]
178
+ return ""
179
+
180
+
181
+ def build_document(product: dict, top_reviews: list[dict]) -> Document | None:
182
+ """
183
+ Build one LangChain Document for a single product row from the metadata Dataset.
184
+ Returns None if there is no indexable text.
185
+ """
186
+ page_content = build_page_content(product, top_reviews)
187
+ if not page_content.strip():
188
+ return None
189
+
190
+ details_dict = _parse_details(product.get("details"))
191
+
192
+ metadata = {
193
+ "parent_asin": product.get("parent_asin", ""),
194
+ "title": _coerce_str(product.get("title")),
195
+ "main_category": _coerce_str(product.get("main_category")),
196
+ "price": _parse_price(product.get("price")),
197
+ "store": _coerce_str(product.get("store")),
198
+ "categories": _coerce_str(product.get("categories")),
199
+ "features": _coerce_str(product.get("features")),
200
+ "description": _coerce_str(product.get("description")),
201
+ "details": details_dict,
202
+ "average_rating": product.get("average_rating"),
203
+ "rating_number": product.get("rating_number"),
204
+ "image_url": _extract_image_url(product.get("images")),
205
+ "top_reviews": top_reviews,
206
+ }
207
+
208
+ return Document(page_content=page_content, metadata=metadata)
209
+
210
+
211
+ def pregroup_reviews(
212
+ reviews_dataset_dict,
213
+ max_reviews_per_product: int = 5,
214
+ ) -> dict:
215
+ """
216
+ Pre-group top-k reviews per product using DuckDB for efficient scoring
217
+ and ranking — never loads all 14M reviews into Python memory at once.
218
+
219
+ Uses a single SQL query with ROW_NUMBER() to rank reviews per product
220
+ by the same weighted score as eda_helpers.get_best_reviews():
221
+ helpful_vote 50% (log-scaled) + verified_purchase 30% + rating extremity 20%
222
+ """
223
+ import duckdb
224
+
225
+ print("Pre-grouping reviews via DuckDB (memory-efficient) ...")
226
+ arrow_table = reviews_dataset_dict["full"].data.table
227
+
228
+ k = max_reviews_per_product
229
+ query = f"""
230
+ WITH scored AS (
231
+ SELECT
232
+ parent_asin,
233
+ title,
234
+ text,
235
+ rating,
236
+ helpful_vote,
237
+ verified_purchase,
238
+ (
239
+ 0.5 * (LN(1 + GREATEST(COALESCE(helpful_vote, 0), 0)))
240
+ + 0.3 * (CASE WHEN verified_purchase THEN 1.0 ELSE 0.0 END)
241
+ + 0.2 * (ABS(COALESCE(rating, 3.0) - 3.0) / 2.0)
242
+ ) AS score
243
+ FROM arrow_table
244
+ WHERE parent_asin IS NOT NULL AND parent_asin != ''
245
+ ),
246
+ ranked AS (
247
+ SELECT *,
248
+ ROW_NUMBER() OVER (
249
+ PARTITION BY parent_asin
250
+ ORDER BY score DESC
251
+ ) AS rn
252
+ FROM scored
253
+ )
254
+ SELECT parent_asin, title, text, rating, helpful_vote
255
+ FROM ranked
256
+ WHERE rn <= {k}
257
+ ORDER BY parent_asin, rn
258
+ """
259
+
260
+ rows = duckdb.query(query).fetchall()
261
+ cols = ["parent_asin", "title", "text", "rating", "helpful_vote"]
262
+
263
+ result = {}
264
+ for row in rows:
265
+ r = dict(zip(cols, row))
266
+ asin = r.pop("parent_asin")
267
+ result.setdefault(asin, []).append(r)
268
+
269
+ print(f" {len(result):,} unique parent_asins grouped")
270
+ print(" pre-grouping done")
271
+ return result
272
+
273
+
274
+ def build_documents(
275
+ metadata_dataset: Dataset,
276
+ reviews_dataset_dict,
277
+ max_products: int | None = None,
278
+ max_reviews_per_product: int = 5,
279
+ reviews_lookup: dict | None = None,
280
+ ) -> list[Document]:
281
+ """
282
+ Build one LangChain Document per product.
283
+
284
+ Pass reviews_lookup (from pregroup_reviews) to skip per-product DuckDB
285
+ queries entirely — much faster for large datasets.
286
+ """
287
+ total = len(metadata_dataset)
288
+ n = min(total, max_products) if max_products else total
289
+ print(f"Building documents for {n} products ...")
290
+
291
+ docs = []
292
+ for i in range(n):
293
+ product = metadata_dataset[i]
294
+ parent_asin = product.get("parent_asin", "")
295
+
296
+ if reviews_lookup is not None:
297
+ top_reviews = reviews_lookup.get(parent_asin, [])[:max_reviews_per_product]
298
+ else:
299
+ top_reviews = get_top_reviews(
300
+ reviews_dataset_dict, parent_asin, k=max_reviews_per_product
301
+ )
302
+
303
+ doc = build_document(product, top_reviews)
304
+ if doc:
305
+ docs.append(doc)
306
+
307
+ if (i + 1) % 500 == 0:
308
+ print(f" ... {i + 1}/{n} products processed")
309
+
310
+ print(f" -> {len(docs)} documents built (skipped {n - len(docs)} empty)")
311
+ return docs
312
+
313
+
314
+ # ── index build & persist ─────────────────────────────────────────────────────
315
+
316
+ def build_and_save(
317
+ documents: list[Document],
318
+ index_path: str | Path = "data/processed/bm25_index.pkl",
319
+ corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
320
+ ) -> BM25Retriever:
321
+ """
322
+ Build a BM25Retriever from documents, then pickle both the
323
+ tokenized corpus and the retriever to disk.
324
+
325
+ Parameters
326
+ ----------
327
+ documents : output of build_documents()
328
+ index_path : e.g. 'data/processed/bm25_index.pkl'
329
+ corpus_path : e.g. 'data/processed/bm25_corpus.pkl'
330
+
331
+ Returns
332
+ -------
333
+ The fitted BM25Retriever instance.
334
+ """
335
+ index_path = Path(index_path)
336
+ corpus_path = Path(corpus_path)
337
+ index_path.parent.mkdir(parents=True, exist_ok=True)
338
+
339
+ print(f"Fitting BM25 index over {len(documents)} documents …")
340
+ retriever = BM25Retriever.from_documents(
341
+ documents,
342
+ preprocess_func=simple_tokenize,
343
+ )
344
+
345
+ # Save tokenized corpus separately — useful for inspection in the notebook
346
+ tokenized_corpus = [simple_tokenize(doc.page_content) for doc in documents]
347
+ with open(corpus_path, "wb") as f:
348
+ pickle.dump(tokenized_corpus, f)
349
+ print(f"Tokenized corpus saved → {corpus_path}")
350
+
351
+ with open(index_path, "wb") as f:
352
+ pickle.dump(retriever, f)
353
+ print(f"BM25 index saved → {index_path}")
354
+
355
+ return retriever
356
+
357
+
358
+ # ── load ──────────────────────────────────────────────────────────────────────
359
+
360
+ def load(index_path: str | Path = "data/processed/bm25_index.pkl") -> BM25Retriever:
361
+ """
362
+ Load a previously saved BM25Retriever from disk.
363
+ Call this in app.py instead of rebuilding every time.
364
+ """
365
+ index_path = Path(index_path)
366
+ if not index_path.exists():
367
+ raise FileNotFoundError(
368
+ f"BM25 index not found at '{index_path}'.\n"
369
+ "Run build_and_save() from your notebook first."
370
+ )
371
+ with open(index_path, "rb") as f:
372
+ retriever = pickle.load(f)
373
+ print(f"BM25 index loaded ← {index_path}")
374
+ return retriever
375
+
376
+
377
+ # ── search ────────────────────────────────────────────────────────────────────
378
+
379
+ def search(
380
+ retriever: BM25Retriever,
381
+ query: str,
382
+ top_k: int = 3,
383
+ ) -> list[dict]:
384
+ """
385
+ Run a BM25 query and return results in the format expected by app.py's
386
+ render_results().
387
+
388
+ Parameters
389
+ ----------
390
+ retriever : loaded or freshly-built BM25Retriever
391
+ query : raw user query string (tokenized internally via simple_tokenize)
392
+ top_k : number of results to return
393
+
394
+ Returns
395
+ -------
396
+ List of dicts with keys:
397
+ asin, title, text, rating, score, top_reviews
398
+ """
399
+ retriever.k = top_k
400
+ docs = retriever.invoke(query)
401
+
402
+ results = []
403
+ for doc in docs:
404
+ m = doc.metadata
405
+ top_reviews = m.get("top_reviews", [])
406
+
407
+ # Average rating across retrieved top reviews
408
+ rated = [r["rating"] for r in top_reviews if r.get("rating") is not None]
409
+ avg_rating = round(sum(rated) / len(rated), 1) if rated else 0.0
410
+
411
+ # Snippet = first review text, falling back to description
412
+ if top_reviews and top_reviews[0].get("text"):
413
+ snippet = top_reviews[0]["text"][:300]
414
+ else:
415
+ snippet = m.get("description", "")[:300]
416
+
417
+ results.append({
418
+ "asin": m.get("parent_asin", ""),
419
+ "title": m.get("title", ""),
420
+ "text": snippet,
421
+ "rating": avg_rating,
422
+ "score": 0.0, # LangChain BM25Retriever does not expose raw scores
423
+ "top_reviews": top_reviews,
424
+ })
425
+
426
+ return results
427
+
428
+
429
+ # ── notebook entry point ──────────────────────────────────────────────────────
430
+
431
+ def build_from_hf_datasets(
432
+ metadata_dataset: Dataset,
433
+ reviews_dataset_dict,
434
+ index_path: str | Path = "data/processed/bm25_index.pkl",
435
+ corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
436
+ max_products: int | None = None,
437
+ max_reviews_per_product: int = 5,
438
+ ) -> BM25Retriever:
439
+ """
440
+ End-to-end helper to call from milestone1_exploration.ipynb.
441
+
442
+ Example usage in the notebook:
443
+ --------------------------------
444
+ from src.bm25 import build_from_hf_datasets, load, search
445
+
446
+ retriever = build_from_hf_datasets(
447
+ metadata_dataset=raw_metadata['full'],
448
+ reviews_dataset_dict=raw_reviews,
449
+ max_products=500,
450
+ )
451
+
452
+ # Later in app.py — just load the saved index:
453
+ # retriever = load("data/processed/bm25_index.pkl")
454
+ # results = search(retriever, "something sweet for a cheese board")
455
+ """
456
+ reviews_lookup = pregroup_reviews(reviews_dataset_dict, max_reviews_per_product)
457
+ docs = build_documents(
458
+ metadata_dataset,
459
+ reviews_dataset_dict,
460
+ max_products=max_products,
461
+ max_reviews_per_product=max_reviews_per_product,
462
+ reviews_lookup=reviews_lookup,
463
+ )
464
+ return build_and_save(docs, index_path=index_path, corpus_path=corpus_path)
465
+
466
+
467
+ def build_from_hf_datasets_batched(
468
+ metadata_dataset: Dataset,
469
+ reviews_dataset_dict,
470
+ index_path: str | Path = "data/processed/bm25_index.pkl",
471
+ corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
472
+ batch_size: int = 2000,
473
+ max_reviews_per_product: int = 5,
474
+ max_products: int | None = None,
475
+ ) -> BM25Retriever:
476
+ """
477
+ Memory-safe version of build_from_hf_datasets — builds documents in
478
+ batches to avoid OOM kernel crashes on large datasets.
479
+
480
+ Checkpoints completed batches to data/processed/checkpoints/ after each
481
+ batch, so if the kernel dies mid-run you can resume from the last
482
+ completed batch instead of starting over.
483
+
484
+ Example usage in the notebook:
485
+ --------------------------------
486
+ retriever = build_from_hf_datasets_batched(
487
+ metadata_dataset=raw_metadata['full'],
488
+ reviews_dataset_dict=raw_reviews,
489
+ batch_size=5000,
490
+ max_reviews_per_product=3,
491
+ max_products=60000, # None = use all
492
+ )
493
+ """
494
+ index_path = Path(index_path)
495
+ corpus_path = Path(corpus_path)
496
+
497
+ # checkpoint folder lives next to the index
498
+ checkpoint_dir = index_path.parent / "checkpoints"
499
+ checkpoint_dir.mkdir(parents=True, exist_ok=True)
500
+
501
+ total = min(len(metadata_dataset), max_products) if max_products else len(metadata_dataset)
502
+
503
+ # find resume point — checkpoints named docs_0.pkl, docs_2000.pkl, ...
504
+ existing = sorted(checkpoint_dir.glob("docs_*.pkl"))
505
+ if existing:
506
+ last_ckpt = existing[-1]
507
+ resume_start = int(last_ckpt.stem.split("_")[1]) + batch_size
508
+ print(f"Resuming from product {resume_start} "
509
+ f"({len(existing)} checkpoint(s) found)")
510
+ all_docs = []
511
+ for ckpt in existing:
512
+ with open(ckpt, "rb") as f:
513
+ all_docs.extend(pickle.load(f))
514
+ print(f" loaded {len(all_docs)} docs from checkpoints")
515
+ else:
516
+ resume_start = 0
517
+ all_docs = []
518
+ print(f"Starting fresh — {total} products to process")
519
+
520
+ # pre-group all reviews once
521
+ reviews_lookup = pregroup_reviews(reviews_dataset_dict, max_reviews_per_product)
522
+
523
+ # batch loop
524
+ for start in range(resume_start, total, batch_size):
525
+ end = min(start + batch_size, total)
526
+ print(f"\nBatch {start}-{end} of {total} ...")
527
+
528
+ batch = metadata_dataset.select(range(start, end))
529
+ batch_docs = build_documents(
530
+ batch,
531
+ reviews_dataset_dict,
532
+ max_products=None,
533
+ max_reviews_per_product=max_reviews_per_product,
534
+ reviews_lookup=reviews_lookup,
535
+ )
536
+ all_docs.extend(batch_docs)
537
+
538
+ # save checkpoint for this batch
539
+ ckpt_path = checkpoint_dir / f"docs_{start}.pkl"
540
+ with open(ckpt_path, "wb") as f:
541
+ pickle.dump(batch_docs, f)
542
+ print(f" checkpoint saved -> {ckpt_path.name}")
543
+ print(f" cumulative docs : {len(all_docs)}")
544
+
545
+ # build final index
546
+ print(f"\nAll batches done - {len(all_docs)} total documents.")
547
+ retriever = build_and_save(all_docs, index_path=index_path, corpus_path=corpus_path)
548
+
549
+ # clean up checkpoints now that final index is safely written
550
+ for ckpt in checkpoint_dir.glob("docs_*.pkl"):
551
+ ckpt.unlink()
552
+ print("Checkpoints cleaned up.")
553
+
554
+ return retriever
utils/retrieval_helpers.py CHANGED
@@ -27,6 +27,28 @@ def decode_ratings(page_content):
27
  return(parsed)
28
  else:
29
  return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
32
  """
@@ -79,5 +101,63 @@ def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
79
 
80
  con.close()
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # 4. Return JSON metadata objects
83
  return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
 
27
  return(parsed)
28
  else:
29
  return {}
30
+
31
+ def decode_bm25_ratings(page_content):
32
+ block_pattern = r'Review \(Rating:\s*\d+\.\d+\):.*'
33
+ matches = re.findall(block_pattern, page_content)
34
+
35
+ if matches:
36
+ pattern = r'Review \(Rating:\s*(\d+\.\d+)\):\s*([^\.]+)\.\s*(.*)'
37
+ parsed = []
38
+
39
+ for r in matches[:3]:
40
+ match = re.match(pattern, r)
41
+ if match:
42
+ rating, title, text = match.groups()
43
+ parsed.append({
44
+ 'rating': float(rating),
45
+ 'title': title.strip(),
46
+ 'text': text.strip()
47
+ })
48
+
49
+ return parsed
50
+ else:
51
+ return {}
52
 
53
  def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
54
  """
 
101
 
102
  con.close()
103
 
104
+ # 4. Return JSON metadata objects
105
+ return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
106
+
107
+ def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
108
+ """
109
+ Perform BM25 search and enrich results with HuggingFace dataset metadata.
110
+
111
+ Args:
112
+ retriever: LangChain BM25Retriever instance
113
+ query: Search query string
114
+ k: Number of results to return
115
+ hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
116
+
117
+ Returns:
118
+ List of enriched metadata objects as dicts
119
+ """
120
+ # Get BM25 scores via underlying rank_bm25 library
121
+ query_tokens = query.split()
122
+ scores = retriever.vectorizer.get_scores(query_tokens) # numpy array
123
+
124
+ top_k_indices = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
125
+ results = [(retriever.docs[i], score) for i, score in top_k_indices]
126
+
127
+ # 1. Extract parent_asins from metadata
128
+ parent_asins = [doc.metadata.get("parent_asin") for doc, score in results]
129
+
130
+ # 2. Query HuggingFace dataset via DuckDB
131
+ con = duckdb.connect()
132
+ arrow_table = hf_dataset.data.table
133
+ con.register("hf_table", arrow_table)
134
+
135
+ asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
136
+ query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
137
+ hf_rows = con.execute(query_sql).fetchdf()
138
+
139
+ # Build lookup: parent_asin -> metadata dict
140
+ asin_to_metadata = {
141
+ row["parent_asin"]: row.to_dict()
142
+ for _, row in hf_rows.iterrows()
143
+ }
144
+
145
+ enriched_results = []
146
+
147
+ for doc, score in results:
148
+ parent_asin = doc.metadata.get("parent_asin")
149
+ total_reviews = doc.metadata.get("total_reviews")
150
+ metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
151
+ metadata_object['score'] = score
152
+ metadata_object['total_reviews'] = total_reviews
153
+
154
+ # 3. Extract reviews from page_content
155
+ page_content = doc.page_content
156
+ metadata_object["reviews"] = decode_ratings(page_content)
157
+
158
+ enriched_results.append(metadata_object)
159
+
160
+ con.close()
161
+
162
  # 4. Return JSON metadata objects
163
  return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
utils/utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+
5
+ # Download stopwords if not already downloaded
6
+ nltk.download('stopwords', quiet=True)
7
+
8
+ # Define a set of English stopwords for filtering out common words
9
+ STOPWORDS = set(stopwords.words('english'))
10
+
11
+ # Tokenizer
12
+ def simple_tokenize(text):
13
+ if not text:
14
+ return []
15
+ text = text.lower()
16
+ text = re.sub(r"-", " ", text)
17
+ text = re.sub(r"[^a-z0-9\s]", "", text)
18
+ tokens = text.split()
19
+ tokens = [t for t in tokens if t not in STOPWORDS]
20
+ return tokens