Spaces:

rishadaz
/

amazon_retriever

Sleeping

App Files Files Community

github-actions[bot] committed on Apr 22

Commit

a8a94d1

1 Parent(s): e51a05a

chore: sync app/ and src/ from GitHub

Browse files

Files changed (10) hide show

app/app/app.py +358 -0
app/app/styles.css +94 -0
src/src/__init__.py +0 -0
src/src/bm25.py +546 -0
src/src/eda_helpers.py +112 -0
src/src/hybrid.py +240 -0
src/src/rag_pipeline.py +304 -0
src/src/retrieval_helpers.py +194 -0
src/src/semantic.py +295 -0
src/src/utils.py +20 -0

app/app/app.py ADDED Viewed

	@@ -0,0 +1,358 @@

+# Imports and import-path setup for the Streamlit front end.
+# The sys.path changes below must run before the `from src...` imports.
+import csv, sys
+from datetime import datetime
+from pathlib import Path
+import streamlit as st
+import markdown
+# Directory above the one containing this file; appended so `src` is importable.
+ROOT_FOLDER = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_FOLDER))
+import sys
+import os
+# Additionally put ../src (relative to this file) at the front of sys.path.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+from src.retrieval_helpers import enrich_search_results,enrich_bm25_search_results
+from src.semantic import load_vector_store
+from src.rag_pipeline import run_rag
+from src.bm25 import load
+from src.hybrid import HybridRetriever
+from dotenv import load_dotenv
+# Populate the environment from a .env file before os.getenv('HF_TOKEN') is read further down.
+load_dotenv()
+import warnings
+# Suppress warnings of category UserWarning for the whole process.
+warnings.filterwarnings("ignore", category=UserWarning)
+# ─── Page config (must be first Streamlit call) ───────────────────────────────
+st.set_page_config(
+    page_title="Groceries & Gourmet Food Search",
+    page_icon="🥕",
+    layout="wide",
+    initial_sidebar_state="collapsed",
+)
+# ─── Paths ────────────────────────────────────────────────────────────────────
+ROOT = Path(__file__).resolve().parent.parent
+FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
+# Create results/ up front so log_feedback() can append without checking.
+FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
+TOP_K = 5
+HF_TOKEN = os.getenv('HF_TOKEN')
+from datasets import load_dataset
+from huggingface_hub import snapshot_download, login
+# ─── Custom CSS ───────────────────────────────────────────────────────────────
+# styles.css ships in the same directory as this file. Resolve it from
+# __file__ instead of the current working directory so the app starts no
+# matter where `streamlit run` is launched from, and read it as UTF-8
+# explicitly rather than with the platform default encoding.
+STYLES_CSS = Path(__file__).resolve().parent / "styles.css"
+css = STYLES_CSS.read_text(encoding="utf-8")
+st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
+@st.cache_resource
+def load_hf_dataset():
+    """
+    Load the "raw_meta_Grocery_and_Gourmet_Food" configuration of the
+    McAuley-Lab/Amazon-Reviews-2023 dataset via datasets.load_dataset().
+    The call is authenticated with HF_TOKEN and allows the dataset's remote
+    loading code to run (trust_remote_code=True). The function is wrapped in
+    st.cache_resource.
+    """
+    repo_id = "McAuley-Lab/Amazon-Reviews-2023"
+    config_name = "raw_meta_Grocery_and_Gourmet_Food"
+    dataset = load_dataset(
+        repo_id,
+        config_name,
+        trust_remote_code=True,
+        token=HF_TOKEN,
+    )
+    return dataset
+VECTOR_STORE_DIR = ROOT / "data" / "processed"
+@st.cache_resource
+def load_vector_store_cached():
+    """
+    Download the retrieval artefacts from the Hugging Face Hub and load them.
+    Snapshots the dataset repo "rishadaz/amazon_retriever-storage" into
+    VECTOR_STORE_DIR, then loads the vector store from its "embeddings"
+    folder and the BM25 retriever from "tokenisation/bm25_index_mini.pkl".
+    Returns
+    -------
+    (vector_store, bm25_retriever)
+    """
+    # Only log in when a token is configured. Without one, login() falls back
+    # to an interactive prompt, which cannot be answered in a headless app.
+    # snapshot_download() receives the token directly either way.
+    if HF_TOKEN:
+        login(token=HF_TOKEN, add_to_git_credential=False)
+    VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
+    snapshot_path = snapshot_download(
+        repo_id="rishadaz/amazon_retriever-storage",
+        repo_type="dataset",
+        local_dir=str(VECTOR_STORE_DIR),
+        token=HF_TOKEN,
+    )
+    snapshot_root   = Path(snapshot_path)
+    mini_index_path = snapshot_root / "tokenisation" / "bm25_index_mini.pkl"
+    embeddings_dir  = snapshot_root / "embeddings"
+    vector_store    = load_vector_store(embeddings_dir)
+    bm25_retriever  = load(mini_index_path)
+    return vector_store, bm25_retriever
+# ─── Get Data ──────────────────────────────────────────────────────────────
+# data_source selects where the BM25 index and the embeddings come from:
+#   "local"  — files under data/processed/ in this checkout (the mini BM25
+#              index and the embeddings folder shipped with the repo)
+#   "remote" — whatever load_vector_store_cached() downloads from the Hub
+# NOTE (from the original authors): the remote corpus and embeddings are the
+# full generated ones; they take long to download and make the app heavy.
+# Prefer "local" during development.
+data_source = "remote"
+HF_DATASET = load_hf_dataset()
+if data_source != 'local':
+    vector_store, retriever = load_vector_store_cached()
+else:
+    MINI_INDEX_PATH  = ROOT / "data" / "processed" / "tokenisation" / "bm25_index_mini.pkl"
+    vector_store = load_vector_store(ROOT_FOLDER / 'data' / 'processed' / 'embeddings')
+    retriever = load(MINI_INDEX_PATH)
+def bm25_search(query: str, top_k: int = 3) -> list[dict]:
+    """
+    Keyword (BM25) search for `query`.
+    Delegates to enrich_bm25_search_results() with the module-level BM25
+    `retriever` and the 'full' split of HF_DATASET, and returns its result
+    unchanged.
+    """
+    return enrich_bm25_search_results(retriever, query, top_k, HF_DATASET['full'])
+def semantic_search(query: str, top_k: int = 3) -> list[dict]:
+    """
+    Embedding-based search for `query`.
+    Delegates to enrich_search_results() with the module-level `vector_store`
+    and the 'full' split of HF_DATASET, and returns its result unchanged.
+    """
+    return enrich_search_results(vector_store, query, top_k, HF_DATASET['full'])
+# One shared hybrid retriever: BM25 and semantic scores weighted equally.
+hybrid_retriever = HybridRetriever(
+    bm25_retriever=retriever,
+    semantic_store=vector_store,
+    k=TOP_K,
+    bm25_weight=0.5,
+    semantic_weight=0.5,
+)
+def llm_retriever(query: str, top_k: int = 5):
+    """
+    Run the RAG pipeline for `query` on top of the shared hybrid retriever.
+    NOTE(review): `top_k` is accepted but not used here — the number of
+    retrieved documents is the `k` the hybrid retriever was built with.
+    Returns
+    -------
+    (answer, docs) as produced by run_rag()
+    """
+    rag_answer, rag_docs = run_rag(
+        hybrid_retriever,
+        query=query,
+        hf_dataset=HF_DATASET['full'],
+    )
+    return rag_answer, rag_docs
+# ─── Helpers ──────────────────────────────────────────────────────────────────
+def stars(rating: float) -> str:
+    """
+    Render a rating as a five-glyph star string (★ full, ½ half, ☆ empty).
+    A fractional part of 0.5 or more becomes one half glyph. Missing or
+    non-numeric ratings (None, NaN, unparsable values) render as zero stars,
+    and out-of-range values are clamped to [0, 5] so the result always has
+    exactly five glyphs.
+    """
+    try:
+        value = float(rating)
+    except (TypeError, ValueError):
+        value = 0.0
+    if value != value:  # NaN is the only float not equal to itself
+        value = 0.0
+    value = min(max(value, 0.0), 5.0)
+    full  = int(value)
+    half  = 1 if (value - full) >= 0.5 else 0
+    empty = 5 - full - half
+    return "★" * full + "½" * half + "☆" * empty
+def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> None:
+    """
+    Append one feedback row to FEEDBACK_CSV.
+    The header row is written first when the file does not exist yet. Each
+    row carries the current local timestamp in ISO format plus the arguments.
+    """
+    columns = ["timestamp", "query", "mode", "asin", "title", "vote"]
+    needs_header = not FEEDBACK_CSV.exists()
+    with open(FEEDBACK_CSV, "a", newline="", encoding="utf-8") as handle:
+        out = csv.DictWriter(handle, fieldnames=columns)
+        if needs_header:
+            out.writeheader()
+        values = [datetime.now().isoformat(), query, mode, asin, title, vote]
+        out.writerow(dict(zip(columns, values)))
+def render_product(ind, item, mode=None, query=None):
+    """
+    Render one product: card header, collapsible reviews and 👍/👎 buttons.
+    Parameters
+    ----------
+    ind   : zero-based rank; shown as "#<ind + 1>" and used in widget keys
+    item  : product dict. Reads "title" and "parent_asin" (required) and
+            "average_rating", "rating_number", "reviews", "images", "price",
+            "score" (optional)
+    mode  : label used in the button keys and written to the feedback log.
+            When omitted, the script-level `mode` is used, as before.
+    query : query text written to the feedback log. When omitted, the
+            script-level `query` is used, as before.
+    """
+    import html
+    # Keep two-argument callers working: fall back to the script globals this
+    # function used to read implicitly.
+    if mode is None:
+        mode = globals().get("mode", "")
+    if query is None:
+        query = globals().get("query", "")
+    reviews       = item.get("reviews") or []
+    title         = item["title"]
+    asin          = item['parent_asin']
+    # Ratings may be missing (None) in the source data; show them as 0.
+    avg_rating    = float(item.get("average_rating") or 0.0)
+    rating_number = item.get('rating_number') or 0
+    n_reviews     = len(reviews)
+    review_word   = "review" if n_reviews == 1 else "reviews"
+    # Titles, review text and URLs come from scraped data and are placed into
+    # raw HTML below (unsafe_allow_html=True), so escape every one of them.
+    safe_title = html.escape(str(title))
+    safe_asin  = html.escape(str(asin))
+    large_images = (item.get('images') or {}).get('large') or []
+    if large_images and large_images[0]:
+        image_html = f'<img src="{html.escape(str(large_images[0]))}" style="width:100%;max-width:200px;border-radius:8px;margin-bottom:8px;" />'
+    else:
+        image_html = ''
+    raw_price = item.get('price')
+    try:
+        price_val = float(str(raw_price).replace('$', '').replace(',', '').strip())
+        price_html = f'<span style="color:#2ecc71;font-weight:600">${price_val:.2f}</span>'
+    except (TypeError, ValueError):
+        # No usable price (None, "None", free text): omit the price element.
+        price_html = ''
+    # ── Product card header ───────────────────────────────────────────
+    score_badge = f'<span class="score-badge">similarity score: {float(item["score"]):.2f}</span>' if 'score' in item else "<span/>"
+    st.markdown(
+        f"""
+        <div class="product-card" id="{safe_asin}">
+            {image_html}
+            <h4>#{ind + 1} &nbsp; {safe_title}</h4>
+            <span class="stars">{stars(avg_rating)}</span>
+            &nbsp;<small style="color:#888">{avg_rating:.1f}/5 avg ({rating_number:,} ratings)</small>
+            &nbsp;&nbsp;
+            {score_badge}
+            {"&nbsp;&nbsp;" + price_html if price_html else ""}
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+    # ── Reviews in collapsible expander ───────────────────────────────
+    expander_label = f"📖 Viewing top {n_reviews} {review_word} "
+    with st.expander(expander_label, expanded=(n_reviews == 1)):
+        for rev in reviews:
+            rev_rating = rev.get('rating')
+            rev_title  = html.escape(str(rev.get('title') or ''))
+            rev_text   = str(rev.get('text') or '')
+            # Truncate first, then escape, so an entity is never cut in half.
+            snippet      = html.escape(rev_text[:300]) + ('…' if len(rev_text) > 300 else '')
+            rating_label = f"{rev_rating}/5" if rev_rating is not None else "unrated"
+            st.markdown(
+                f"""
+                <div class="review-snippet">
+                    <strong>{rev_title}</strong>
+                    &nbsp;·&nbsp;
+                    <span class="stars">{stars(rev_rating or 0)}</span>
+                    <span style="color:#888; font-size:0.8rem"> {rating_label}</span>
+                    &nbsp;·&nbsp;
+                    <br><br>
+                    {snippet}
+                </div>
+                """,
+                unsafe_allow_html=True,
+            )
+    # ── Feedback buttons (per product) ────────────────────────────────
+    col_up, col_dn, _ = st.columns([1, 1, 10])
+    with col_up:
+        if st.button("👍", key=f"up_{mode}_{asin}_{ind}"):
+            log_feedback(query, mode, asin, title, "up")
+            st.toast("Thanks! 👍")
+    with col_dn:
+        if st.button("👎", key=f"dn_{mode}_{asin}_{ind}"):
+            log_feedback(query, mode, asin, title, "down")
+            st.toast("Noted! 👎")
+    st.markdown("<hr style='border:none;border-top:1px solid #e8e0d0;margin:0.5rem 0 1rem'>", unsafe_allow_html=True)
+def render_results(results: list[dict], mode: str, query: str) -> None:
+    """
+    Render every product in `results`, or an info message when there are none.
+    `mode` and `query` are forwarded to render_product() so that widget keys
+    and feedback rows reflect the values the caller passed in rather than
+    whatever the script-level globals currently hold.
+    """
+    if not results:
+        st.info("No results returned.")
+        return
+    for ind, item in enumerate(results):
+        render_product(ind, item, mode=mode, query=query)
+# ─── App layout ───────────────────────────────────────────────────────────────
+# Page banner, styled by the .banner rules in styles.css.
+BANNER_HTML = """
+    <div class="banner">
+        <h1>🥕🧀 Groceries & Gourmet Food Search</h1>
+        <p>Amazon Products & Reviews · Groceries & Gourmet Food </p>
+    </div>
+    """
+st.markdown(BANNER_HTML, unsafe_allow_html=True)
+# ─── Search bar ───────────────────────────────────────────────────────────────
+SEARCH_LABEL = "Search for a product or describe what you're looking for"
+SEARCH_PLACEHOLDER = "e.g. something sweet for a cheese board..."
+# `query` is the current content of the search box; later sections read it.
+query = st.text_input(SEARCH_LABEL, placeholder=SEARCH_PLACEHOLDER)
+# ─── Run searches only when query changes ─────────────────────────────────────
+if query.strip() and query != st.session_state.get("last_query"):
+    with st.spinner("Searching..."):
+        bm25_hits = bm25_search(query, top_k=TOP_K)
+        semantic_hits = semantic_search(query, top_k=TOP_K)
+    # Publish the results and last_query together, and only after both
+    # searches succeeded. If last_query were recorded first and a search
+    # raised, the next rerun would skip searching and the tabs would read
+    # result keys that were never stored.
+    st.session_state.bm25_results = bm25_hits
+    st.session_state.semantic_results = semantic_hits
+    st.session_state.last_query = query
+    with st.spinner("Asking AI..."):
+        try:
+            answer, docs = llm_retriever(query, top_k=TOP_K)
+            st.session_state.llm_result = answer
+            st.session_state.llm_docs = docs
+        except Exception as e:
+            # UI boundary: show the failure in the AI tab instead of taking
+            # down the whole page (the search results above are still valid).
+            st.session_state.llm_result = f"**Error:** {e}"
+            st.session_state.llm_docs = []
+elif not query.strip():
+    # Clear every per-query key (including llm_docs) when the input is emptied
+    for key in ("last_query", "bm25_results", "semantic_results", "llm_result", "llm_docs"):
+        st.session_state.pop(key, None)
+# ─── Tabs ─────────────────────────────────────────────────────────────────────
+tab_search, tab_llm = st.tabs(["🔍 Search", "🤖 AI Assistant"])
+# ─── Search Tab ───────────────────────────────────────────────────────────────
+with tab_search:
+    # `mode` stays a script-level name: it holds the selected radio option.
+    mode = st.radio(
+        "Search mode",
+        options=["BM25", "Semantic"],
+        index=0,
+        horizontal=True,
+        help="BM25 = keyword matching · Semantic = embedding similarity (all-MiniLM-L6-v2 + FAISS)",
+    )
+    if "last_query" in st.session_state:
+        st.markdown(f"#### Top {TOP_K} results — {mode}")
+        # Pick the stored result list that matches the selected mode.
+        results_attr = "bm25_results" if mode == "BM25" else "semantic_results"
+        results = getattr(st.session_state, results_attr)
+        render_results(results, mode=mode.lower(), query=st.session_state.last_query)
+    else:
+        # Nothing searched yet: show a hint instead of results.
+        st.markdown(
+            "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to see results.</p>",
+            unsafe_allow_html=True,
+        )
+# ─── LLM Tab ──────────────────────────────────────────────────────────────────
+with tab_llm:
+    if "llm_result" not in st.session_state:
+        st.markdown(
+            "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to get AI-powered recommendations.</p>",
+            unsafe_allow_html=True,
+        )
+    else:
+        st.markdown(f"#### 🤖 AI Answer — *\"{st.session_state.last_query}\"*")
+        st.caption("⚠️ AI responses may contain errors - please verify before relying on them.")
+        # The answer is markdown text; convert it to HTML so it can be wrapped
+        # in the .llm-response container.
+        html_response = markdown.markdown(
+            st.session_state.llm_result,
+            extensions=["tables", "fenced_code", "nl2br"],
+        )
+        st.markdown(
+            f"<div class='llm-response'>{html_response}</div>",
+            unsafe_allow_html=True,
+        )
+        st.markdown("#### 📦 Retrieved Products")
+        docs = st.session_state.get("llm_docs", [])
+        if docs:
+            # render_product() derives its widget keys and the feedback "mode"
+            # column from the script-level `mode`. The search tab has already
+            # been rendered at this point, so rebind it for this tab: feedback
+            # is then logged as "llm" and the button keys cannot clash with
+            # the search tab's keys for the same product.
+            mode = "llm"
+            # Zero-based index: render_product() displays "#<ind + 1>".
+            # Products are emitted as individual Streamlit elements, so no
+            # wrapping <div> is written (it would only render as an empty box).
+            for i, doc in enumerate(docs):
+                render_product(i, doc)
+        else:
+            st.markdown("<p style='color:#aaa;'>No documents retrieved.</p>", unsafe_allow_html=True)
+# ─── Sidebar: feedback log ────────────────────────────────────────────────────
+with st.sidebar:
+    st.header("📋 Feedback Log")
+    if not FEEDBACK_CSV.exists():
+        st.info("No feedback yet — use 👍/👎 on results.")
+    else:
+        # pandas is only needed once there is a feedback file to show.
+        import pandas as pd
+        feedback_df = pd.read_csv(FEEDBACK_CSV)
+        # Show the 20 most recent rows; the download contains all of them.
+        recent_rows = feedback_df.tail(20)
+        st.dataframe(recent_rows, use_container_width=True)
+        csv_payload = feedback_df.to_csv(index=False)
+        st.download_button(
+            "⬇️ Download feedback.csv",
+            data=csv_payload,
+            file_name="feedback.csv",
+            mime="text/csv",
+        )

app/app/styles.css ADDED Viewed

	@@ -0,0 +1,94 @@

+/* Stylesheet injected by app.py inside a <style> element. */
+/* Web fonts: Playfair Display for headings, Source Sans 3 for body text. */
+@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@600&family=Source+Sans+3:wght@400;600&display=swap');
+/* Body font for the page and for any element whose class contains "css". */
+html, body, [class*="css"] {
+    font-family: 'Source Sans 3', sans-serif;
+}
+h1, h2, h3 { font-family: 'Playfair Display', serif; }
+/* Green gradient banner at the top of the page */
+.banner {
+    background: linear-gradient(135deg, #2d4a22 0%, #4a7c3f 60%, #7aab5c 100%);
+    border-radius: 12px;
+    padding: 2rem 2.5rem;
+    margin-bottom: 1.5rem;
+    color: #f5f0e8;
+}
+.banner h1 { margin: 0; font-size: 2.4rem; color: #f5f0e8; }
+.banner p  { margin: 0.3rem 0 0; font-size: 1.05rem; opacity: 0.85; }
+/* Product card (outer) */
+.product-card {
+    background: #fffdf7;
+    border: 1px solid #e2d9c8;
+    border-left: 4px solid #4a7c3f;
+    border-radius: 8px;
+    padding: 1rem 1.2rem 0.6rem;
+    margin-bottom: 0.4rem;
+    box-shadow: 0 1px 4px rgba(0,0,0,0.06);
+}
+.product-card h4 { margin: 0 0 0.2rem; color: #1e3318; font-size: 1.05rem; }
+/* Review snippet inside expander */
+.review-snippet {
+    background: #f7f4ee;
+    border-radius: 6px;
+    padding: 0.6rem 0.9rem;
+    margin-bottom: 0.5rem;
+    font-size: 0.87rem;
+    color: #444;
+    line-height: 1.55;
+}
+/* Pill-shaped badge showing the similarity score on a product card */
+.score-badge {
+    display: inline-block;
+    background: #eaf3e6;
+    color: #2d5a20;
+    border-radius: 20px;
+    padding: 2px 10px;
+    font-size: 0.78rem;
+    font-weight: 600;
+    margin-right: 6px;
+}
+/* Colour of the star-rating glyphs */
+.stars { color: #e6a817; }
+/* Yellow notice badge. NOTE(review): no use of this class is visible in app.py — confirm before removing. */
+.placeholder-badge {
+    background: #fff3cd;
+    border: 1px solid #ffc107;
+    border-radius: 6px;
+    padding: 0.4rem 0.8rem;
+    font-size: 0.82rem;
+    color: #7a5800;
+    display: inline-block;
+    margin-bottom: 1rem;
+}
+/* Scrollable container, capped at 600px height */
+.doc-sidebar {
+    max-height: 600px;
+    overflow-y: auto;
+    padding-right: 4px;
+}
+/* Dark document-card styles (.doc-card and the .doc-* rules below). */
+/* NOTE(review): no use of these classes is visible in app.py — confirm whether they are still needed. */
+.doc-card {
+    background: #1e1e2e;
+    border: 1px solid #333;
+    border-radius: 8px;
+    padding: 0.75rem;
+    margin-bottom: 0.65rem;
+}
+.doc-title {
+    font-weight: 600;
+    font-size: 0.85rem;
+    margin-bottom: 0.3rem;
+    color: #f0f0f0;
+    line-height: 1.3;
+}
+.doc-meta {
+    font-size: 0.78rem;
+    margin-bottom: 0.3rem;
+    display: flex;
+    gap: 0.5rem;
+}
+.doc-rating { color: #f5c518; }
+.doc-price  { color: #5cb85c; }
+.doc-snippet {
+    font-size: 0.75rem;
+    color: #999;
+    line-height: 1.4;
+}

src/src/__init__.py ADDED Viewed

File without changes

src/src/bm25.py ADDED Viewed

	@@ -0,0 +1,546 @@

+"""
+src/bm25.py  —  BM25 keyword retrieval
+Uses LangChain's BM25Retriever with the custom tokenizer from utils.py.
+Document schema (one LangChain Document per product):
+  page_content : text BM25 scores against =
+                   title + features + description + categories +
+                   details (flattened) + store + top-k review titles & texts
+  metadata     : structured fields for display in app.py
+                   (parent_asin, title, main_category, price, store,
+                    categories, features, description, details, top_reviews)
+Data source expected: HuggingFace Dataset objects as loaded in
+  milestone1_exploration.ipynb via load_dataset("McAuley-Lab/Amazon-Reviews-2023", ...)
+  OR the saved .jsonl subsets in data/raw/.
+"""
+import json
+import pickle
+from pathlib import Path
+from typing import Any
+import sys
+from datasets import Dataset
+from langchain_community.retrievers import BM25Retriever
+from langchain_core.documents import Document
+ROOT_FOLDER = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_FOLDER))
+from src.utils import simple_tokenize
+from src.eda_helpers import get_best_reviews
+# ── field helpers ─────────────────────────────────────────────────────────────
+def _coerce_str(value: Any) -> str:
+    """
+    Flatten a metadata value into one plain string.
+    None becomes "", list items are flattened recursively and joined with
+    spaces, dict entries become "key value" pairs joined with spaces, and
+    anything else goes through str(). The literal text "None" (any case,
+    surrounding whitespace ignored) is treated as empty.
+    """
+    if value is None:
+        return ""
+    if isinstance(value, dict):
+        pairs = (f"{key} {_coerce_str(val)}" for key, val in value.items())
+        return " ".join(pairs)
+    if isinstance(value, list):
+        return " ".join(map(_coerce_str, value))
+    text = str(value)
+    if text.strip().lower() == "none":
+        return ""
+    return text
+def _parse_details(details: Any) -> dict:
+    """
+    Return the product 'details' field as a dict.
+    'details' in this dataset is stored as a JSON string, e.g.:
+        '{"Brand": "Luzianne", "Item Form": "Ground", ...}'
+    A dict is returned as-is. Empty values, text that is not valid JSON, and
+    JSON that does not decode to an object (a list, number, string, ...) all
+    yield an empty dict, so callers can always use `.items()` on the result.
+    """
+    if not details:
+        return {}
+    if isinstance(details, dict):
+        return details
+    try:
+        parsed = json.loads(str(details))
+    except (json.JSONDecodeError, TypeError):
+        return {}
+    # Valid JSON is not necessarily an object; only a dict honours the contract.
+    return parsed if isinstance(parsed, dict) else {}
+def _parse_price(price: Any) -> float | None:
+    """
+    Convert a raw price to float.
+    Returns None for None, for values float() cannot convert (such as the
+    string 'None'), and for NaN.
+    """
+    if price is None:
+        return None
+    try:
+        amount = float(price)
+    except (ValueError, TypeError):
+        return None
+    # NaN is the only float that compares unequal to itself.
+    if amount != amount:
+        return None
+    return amount
+# ── review selection ──────────────────────────────────────────────────────────
+def get_top_reviews(
+    reviews_dataset_dict,
+    parent_asin: str,
+    k: int = 5,
+) -> list[dict]:
+    """
+    Return up to k reviews for one product, as chosen by
+    eda_helpers.get_best_reviews().
+    Parameters
+    ----------
+    reviews_dataset_dict : the full reviews DatasetDict (raw_reviews), not a
+                           pre-selected split — per the original note,
+                           get_best_reviews() selects 'full' itself
+    parent_asin          : product identifier
+    k                    : number of reviews to return
+    Returns
+    -------
+    List of dicts with keys: title, text, rating, helpful_vote
+    (empty list when nothing matched)
+    """
+    outcome = get_best_reviews(reviews_dataset_dict, parent_asin, top_k=k)
+    # Two shapes are handled: a (total_count, Dataset) tuple, or a bare
+    # Dataset (the original note says that is the "no reviews found" case).
+    if isinstance(outcome, tuple):
+        _total, matched = outcome
+    else:
+        matched = outcome
+    if len(matched) == 0:
+        return []
+    top_reviews = []
+    for row in matched:
+        top_reviews.append({
+            "title":        row.get("title", "") or "",
+            "text":         row.get("text", "") or "",
+            "rating":       row.get("rating"),
+            "helpful_vote": row.get("helpful_vote", 0),
+        })
+    return top_reviews
+# ── document construction ─────────────────────────────────────────────────────
+def format_review(review: dict) -> str:
+    """
+    Format one review as "Review (Rating: R): TITLE. TEXT" followed by a
+    newline and four spaces (the indent of the next line in the document).
+    """
+    rating = review['rating']
+    title = review['title']
+    text = review['text']
+    return f"Review (Rating: {rating}): {title}. {text}\n    "
+def build_page_content(product: dict, top_reviews: list[dict]) -> str:
+    """
+    Assemble the text that BM25 indexes for one product.
+    The result is a fixed sequence of labelled sections (Product, Category,
+    Store, Features, Description, Details, Top Reviews); the original note
+    says it mirrors create_document() in milestone1_exploration.ipynb.
+    """
+    title       = _coerce_str(product.get("title"))
+    description = " ".join(product.get("description") or [])
+    features    = "\n".join(product.get("features") or [])
+    categories  = " > ".join(product.get("categories") or [])
+    store       = _coerce_str(product.get("store"))
+    details     = _parse_details(product.get("details"))
+    details_str = " ".join(f"{name}: {val}" for name, val in details.items())
+    review_lines = "".join(map(format_review, top_reviews))
+    n_reviews    = len(top_reviews)
+    # One entry per output line; section bodies are indented by four spaces.
+    sections = [
+        f"Product: {title}",
+        f"Category: {categories}",
+        f"Store: {store}",
+        "Features:",
+        f"    {features}",
+        "Description:",
+        f"    {description}",
+        "Details:",
+        f"    {details_str}",
+        f"Top Reviews (showing {n_reviews}):",
+        f"    {review_lines}",
+    ]
+    return "\n".join(sections)
+def _extract_image_url(images: Any) -> str:
+    """
+    Pick one image URL from the product 'images' field.
+    `images` is expected to be a dict of lists; the keys 'large', 'thumb' and
+    'hi_res' are tried in that order and the first entry of the first
+    non-empty list with a truthy first element is returned. Anything else
+    (None, non-dict, no usable URL) gives "".
+    """
+    if not isinstance(images, dict) or not images:
+        return ""
+    preferred_keys = ("large", "thumb", "hi_res")
+    candidates = (images.get(key) for key in preferred_keys)
+    usable = (
+        urls[0]
+        for urls in candidates
+        if isinstance(urls, list) and urls and urls[0]
+    )
+    return next(usable, "")
+def _has_indexable_text(product: dict, top_reviews: list[dict]) -> bool:
+    """Return True when the product or its reviews contribute any text to index."""
+    text_fields = ("title", "description", "features", "categories", "store")
+    if any(_coerce_str(product.get(name)).strip() for name in text_fields):
+        return True
+    if _parse_details(product.get("details")):
+        return True
+    return any(r.get("title") or r.get("text") for r in top_reviews)
+def build_document(product: dict, top_reviews: list[dict]) -> Document | None:
+    """
+    Build one LangChain Document for a single product row from the metadata Dataset.
+    Returns None if there is no indexable text. The check looks at the source
+    fields rather than at the rendered page_content: the rendered text always
+    contains the section labels ("Product:", "Category:", ...) and is
+    therefore never blank, even for a product with no content at all.
+    Parameters
+    ----------
+    product     : one metadata row (dict-like)
+    top_reviews : review dicts to embed in the document and its metadata
+    """
+    if not _has_indexable_text(product, top_reviews):
+        return None
+    page_content = build_page_content(product, top_reviews)
+    details_dict = _parse_details(product.get("details"))
+    # Structured fields kept alongside the text for display in app.py.
+    metadata = {
+        "parent_asin":    product.get("parent_asin", ""),
+        "title":          _coerce_str(product.get("title")),
+        "main_category":  _coerce_str(product.get("main_category")),
+        "price":          _parse_price(product.get("price")),
+        "store":          _coerce_str(product.get("store")),
+        "categories":     _coerce_str(product.get("categories")),
+        "features":       _coerce_str(product.get("features")),
+        "description":    _coerce_str(product.get("description")),
+        "details":        details_dict,
+        "average_rating": product.get("average_rating"),
+        "rating_number":  product.get("rating_number"),
+        "image_url":      _extract_image_url(product.get("images")),
+        "top_reviews":    top_reviews,
+    }
+    return Document(page_content=page_content, metadata=metadata)
+def pregroup_reviews(
+    reviews_dataset_dict,
+    max_reviews_per_product: int = 5,
+) -> dict:
+    """
+    Pre-group the top-k reviews per product with a single DuckDB query.
+    The query scores every review of the 'full' split, ranks reviews within
+    each parent_asin with ROW_NUMBER(), and keeps rows with rank <= k. Only
+    those kept rows are fetched into Python.
+    Score used for ranking (as written in the SQL below):
+        0.5 * ln(1 + helpful_vote) + 0.3 * verified_purchase
+        + 0.2 * |rating - 3| / 2
+    The original note says this matches eda_helpers.get_best_reviews().
+    Parameters
+    ----------
+    reviews_dataset_dict    : reviews DatasetDict; its 'full' split is read
+    max_reviews_per_product : k, the number of reviews kept per product
+    Returns
+    -------
+    dict mapping parent_asin -> list of dicts with keys
+    title, text, rating, helpful_vote (best-ranked first)
+    """
+    import duckdb
+    print("Pre-grouping reviews via DuckDB (memory-efficient) ...")
+    # NOTE: the SQL below refers to this local variable by name
+    # ("FROM arrow_table") — presumably resolved by DuckDB's lookup of Python
+    # variables in the calling scope. Do not rename it without updating the SQL.
+    arrow_table = reviews_dataset_dict["full"].data.table
+    k = max_reviews_per_product
+    query = f"""
+        WITH scored AS (
+            SELECT
+                parent_asin,
+                title,
+                text,
+                rating,
+                helpful_vote,
+                verified_purchase,
+                (
+                    0.5 * (LN(1 + GREATEST(COALESCE(helpful_vote, 0), 0)))
+                    + 0.3 * (CASE WHEN verified_purchase THEN 1.0 ELSE 0.0 END)
+                    + 0.2 * (ABS(COALESCE(rating, 3.0) - 3.0) / 2.0)
+                ) AS score
+            FROM arrow_table
+            WHERE parent_asin IS NOT NULL AND parent_asin != ''
+        ),
+        ranked AS (
+            SELECT *,
+                ROW_NUMBER() OVER (
+                    PARTITION BY parent_asin
+                    ORDER BY score DESC
+                ) AS rn
+            FROM scored
+        )
+        SELECT parent_asin, title, text, rating, helpful_vote
+        FROM ranked
+        WHERE rn <= {k}
+        ORDER BY parent_asin, rn
+    """
+    rows = duckdb.query(query).fetchall()
+    # Column order of the final SELECT above.
+    cols = ["parent_asin", "title", "text", "rating", "helpful_vote"]
+    result = {}
+    for row in rows:
+        r           = dict(zip(cols, row))
+        # parent_asin becomes the grouping key and is removed from the review dict.
+        asin        = r.pop("parent_asin")
+        result.setdefault(asin, []).append(r)
+    print(f"  {len(result):,} unique parent_asins grouped")
+    print("  pre-grouping done")
+    return result
+def build_documents(
+    metadata_dataset: Dataset,
+    reviews_dataset_dict,
+    max_products: int | None = None,
+    max_reviews_per_product: int = 5,
+    reviews_lookup: dict | None = None,
+) -> list[Document]:
+    """
+    Build one LangChain Document per product.
+    Parameters
+    ----------
+    metadata_dataset        : product metadata rows, indexable by position
+    reviews_dataset_dict    : reviews DatasetDict; only used when
+                              reviews_lookup is not given
+    max_products            : process at most this many rows from the start;
+                              None means all rows, 0 means none
+    max_reviews_per_product : number of reviews attached to each product
+    reviews_lookup          : optional parent_asin -> reviews mapping (from
+                              pregroup_reviews); when given, no per-product
+                              review query is made
+    Returns
+    -------
+    The documents that build_document() produced (rows for which it returned
+    None are skipped).
+    """
+    total = len(metadata_dataset)
+    # Compare against None explicitly so that max_products=0 means "no
+    # products" instead of silently falling through to "all products".
+    n = total if max_products is None else max(0, min(total, max_products))
+    print(f"Building documents for {n} products ...")
+    docs = []
+    # Rows are fetched by position so that only the first n are ever read.
+    for i in range(n):
+        product     = metadata_dataset[i]
+        parent_asin = product.get("parent_asin", "")
+        if reviews_lookup is not None:
+            top_reviews = reviews_lookup.get(parent_asin, [])[:max_reviews_per_product]
+        else:
+            top_reviews = get_top_reviews(
+                reviews_dataset_dict, parent_asin, k=max_reviews_per_product
+            )
+        doc = build_document(product, top_reviews)
+        if doc:
+            docs.append(doc)
+        if (i + 1) % 500 == 0:
+            print(f"  ... {i + 1}/{n} products processed")
+    print(f"  -> {len(docs)} documents built (skipped {n - len(docs)} empty)")
+    return docs
+# ── index build & persist ─────────────────────────────────────────────────────
+def build_and_save(
+    documents: list[Document],
+    index_path: str | Path = "data/processed/bm25_index.pkl",
+    corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
+) -> BM25Retriever:
+    """
+    Build a BM25Retriever from documents, then pickle both the
+    tokenized corpus and the retriever to disk.
+    Parameters
+    ----------
+    documents   : output of build_documents()
+    index_path  : e.g. 'data/processed/bm25_index.pkl'
+    corpus_path : e.g. 'data/processed/bm25_corpus.pkl'
+    Returns
+    -------
+    The fitted BM25Retriever instance.
+    """
+    index_path  = Path(index_path)
+    corpus_path = Path(corpus_path)
+    # The two files may live in different folders; make sure both exist
+    # before any (potentially long) work starts.
+    index_path.parent.mkdir(parents=True, exist_ok=True)
+    corpus_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Fitting BM25 index over {len(documents)} documents …")
+    retriever = BM25Retriever.from_documents(
+        documents,
+        preprocess_func=simple_tokenize,
+    )
+    # Save tokenized corpus separately — useful for inspection in the notebook
+    tokenized_corpus = [simple_tokenize(doc.page_content) for doc in documents]
+    with open(corpus_path, "wb") as f:
+        pickle.dump(tokenized_corpus, f)
+    print(f"Tokenized corpus saved → {corpus_path}")
+    with open(index_path, "wb") as f:
+        pickle.dump(retriever, f)
+    print(f"BM25 index saved       → {index_path}")
+    return retriever
+# ── load ──────────────────────────────────────────────────────────────────────
+def load(index_path: str | Path = "data/processed/bm25_index.pkl") -> BM25Retriever:
+    """
+    Unpickle a BM25Retriever previously written by build_and_save().
+    Intended for app.py, so the index is not rebuilt on every start.
+    Raises FileNotFoundError when `index_path` does not exist.
+    SECURITY NOTE: unpickling executes code embedded in the file — only load
+    index files from a source you trust.
+    """
+    index_file = Path(index_path)
+    if not index_file.exists():
+        message = (
+            f"BM25 index not found at '{index_file}'.\n"
+            "Run build_and_save() from your notebook first."
+        )
+        raise FileNotFoundError(message)
+    with open(index_file, "rb") as handle:
+        loaded = pickle.load(handle)
+    print(f"BM25 index loaded ← {index_file}")
+    return loaded
+# ── search ────────────────────────────────────────────────────────────────────
+def search(
+    retriever: BM25Retriever,
+    query: str,
+    top_k: int = 3,
+) -> list[dict]:
+    """
+    Score every indexed document against `query` and return the best top_k.
+    Parameters
+    ----------
+    retriever : fitted BM25Retriever; its `k` attribute is set to top_k
+    query     : free-text query, tokenized with simple_tokenize()
+    top_k     : number of results; zero or a negative value gives []
+    Returns
+    -------
+    List of dicts (best first) with keys: asin, title, text, rating, score,
+    top_reviews. "rating" is the mean rating of the stored top reviews (0.0
+    when none are rated); "text" is the first 300 characters of the first
+    review, or of the description when that review has no text.
+    """
+    import heapq
+    retriever.k = top_k
+    # Tokenize query the same way the index was built
+    tokenized_query = simple_tokenize(query)
+    # Raw BM25 scores, one per indexed document
+    scores = retriever.vectorizer.get_scores(tokenized_query)
+    # Select the top_k indices without sorting the whole corpus. The result
+    # order is the same as sorted(..., reverse=True)[:top_k].
+    top_indices = heapq.nlargest(top_k, range(len(scores)), key=lambda i: scores[i])
+    results = []
+    for idx in top_indices:
+        doc = retriever.docs[idx]          # retriever.docs holds the original Document list
+        m   = doc.metadata
+        top_reviews = m.get("top_reviews", [])
+        rated = [r["rating"] for r in top_reviews if r.get("rating") is not None]
+        avg_rating = round(sum(rated) / len(rated), 1) if rated else 0.0
+        if top_reviews and top_reviews[0].get("text"):
+            snippet = top_reviews[0]["text"][:300]
+        else:
+            # `or ""` guards against a description stored as None.
+            snippet = (m.get("description") or "")[:300]
+        results.append({
+            "asin":        m.get("parent_asin", ""),
+            "title":       m.get("title", ""),
+            "text":        snippet,
+            "rating":      avg_rating,
+            "score":       float(scores[idx]),
+            "top_reviews": top_reviews,
+        })
+    return results
+# ── notebook entry point ──────────────────────────────────────────────────────
+def build_from_hf_datasets(
+    metadata_dataset: Dataset,
+    reviews_dataset_dict,
+    index_path: str | Path = "data/processed/tokenisation/bm25_index.pkl",
+    corpus_path: str | Path = "data/processed/tokenisation/bm25_corpus.pkl",
+    max_products: int | None = None,
+    max_reviews_per_product: int = 5,
+) -> BM25Retriever:
+    """
+    One-call pipeline: pre-group reviews, build the documents, then fit and
+    save the BM25 index. Meant to be called from milestone1_exploration.ipynb.
+    Example usage in the notebook:
+    --------------------------------
+    from src.bm25 import build_from_hf_datasets, load, search
+    retriever = build_from_hf_datasets(
+        metadata_dataset=raw_metadata['full'],
+        reviews_dataset_dict=raw_reviews,
+        max_products=500,
+    )
+    # Later in app.py — just load the saved index:
+    # retriever = load("data/processed/bm25_index.pkl")
+    # results   = search(retriever, "something sweet for a cheese board")
+    """
+    # Step 1: rank and group reviews once for all products.
+    grouped_reviews = pregroup_reviews(reviews_dataset_dict, max_reviews_per_product)
+    # Step 2: one Document per product, using the pre-grouped reviews.
+    documents = build_documents(
+        metadata_dataset,
+        reviews_dataset_dict,
+        max_products=max_products,
+        max_reviews_per_product=max_reviews_per_product,
+        reviews_lookup=grouped_reviews,
+    )
+    # Step 3: fit the index and write both pickle files.
+    fitted = build_and_save(documents, index_path=index_path, corpus_path=corpus_path)
+    return fitted
+def build_from_hf_datasets_batched(
+    metadata_dataset: Dataset,
+    reviews_dataset_dict,
+    index_path: str | Path = "data/processed/tokenisation/bm25_index.pkl",
+    corpus_path: str | Path = "data/processed/tokenisation/bm25_corpus.pkl",
+    batch_size: int = 2000,
+    max_reviews_per_product: int = 5,
+    max_products: int | None = None,
+) -> BM25Retriever:
+    """
+    Memory-safe version of build_from_hf_datasets — builds documents in
+    batches to avoid OOM kernel crashes on large datasets.
+    Checkpoints completed batches to data/processed/checkpoints/ after each
+    batch, so if the kernel dies mid-run you can resume from the last
+    completed batch instead of starting over.
+    Example usage in the notebook:
+    --------------------------------
+    retriever = build_from_hf_datasets_batched(
+        metadata_dataset=raw_metadata['full'],
+        reviews_dataset_dict=raw_reviews,
+        batch_size=5000,
+        max_reviews_per_product=3,
+        max_products=60000,   # None = use all
+    )
+    """
+    index_path  = Path(index_path)
+    corpus_path = Path(corpus_path)
+    # checkpoint folder lives next to the index
+    checkpoint_dir = index_path.parent / "checkpoints"
+    checkpoint_dir.mkdir(parents=True, exist_ok=True)
+    total = min(len(metadata_dataset), max_products) if max_products else len(metadata_dataset)
+    # find resume point — checkpoints named docs_0.pkl, docs_2000.pkl, ...
+    existing = sorted(checkpoint_dir.glob("docs_*.pkl"))
+    if existing:
+        last_ckpt    = existing[-1]
+        resume_start = int(last_ckpt.stem.split("_")[1]) + batch_size
+        print(f"Resuming from product {resume_start} "
+              f"({len(existing)} checkpoint(s) found)")
+        all_docs = []
+        for ckpt in existing:
+            with open(ckpt, "rb") as f:
+                all_docs.extend(pickle.load(f))
+        print(f"  loaded {len(all_docs)} docs from checkpoints")
+    else:
+        resume_start = 0
+        all_docs     = []
+        print(f"Starting fresh — {total} products to process")
+    # pre-group all reviews once
+    reviews_lookup = pregroup_reviews(reviews_dataset_dict, max_reviews_per_product)
+    # batch loop
+    for start in range(resume_start, total, batch_size):
+        end   = min(start + batch_size, total)
+        print(f"\nBatch {start}-{end} of {total} ...")
+        batch      = metadata_dataset.select(range(start, end))
+        batch_docs = build_documents(
+            batch,
+            reviews_dataset_dict,
+            max_products=None,
+            max_reviews_per_product=max_reviews_per_product,
+            reviews_lookup=reviews_lookup,
+        )
+        all_docs.extend(batch_docs)
+        # save checkpoint for this batch
+        ckpt_path = checkpoint_dir / f"docs_{start}.pkl"
+        with open(ckpt_path, "wb") as f:
+            pickle.dump(batch_docs, f)
+        print(f"  checkpoint saved -> {ckpt_path.name}")
+        print(f"  cumulative docs  : {len(all_docs)}")
+    # build final index
+    print(f"\nAll batches done - {len(all_docs)} total documents.")
+    retriever = build_and_save(all_docs, index_path=index_path, corpus_path=corpus_path)
+    # clean up checkpoints now that final index is safely written
+    for ckpt in checkpoint_dir.glob("docs_*.pkl"):
+        ckpt.unlink()
+    print("Checkpoints cleaned up.")
+    return retriever

src/src/eda_helpers.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from datasets import Dataset
+import duckdb
+def dataset_overview(dataset_dict) -> None:
+    """Print a concise overview of a DatasetDict: splits, features, row counts."""
+    print(f"\n{'='*60}")
+    print(f"  Overview")
+    print(f"{'='*60}")
+    for split, ds in dataset_dict.items():
+        print(f"\n  Split : {split!r}  ({ds.num_rows:,} rows)")
+        print(f"  {'Field':<30} {'dtype'}")
+        print(f"  {'-'*45}")
+        for feat, ftype in ds.features.items():
+            print(f"  {feat:<30} {ftype}")
+    print()
+def get_reviews_by_asin(
+    reviews_dataset,
+    parent_asin: str,
+):
+    """
+    Retrieve all reviews matching a given parent_asin.
+    Parameters
+    ----------
+    reviews_dataset : DatasetDict (the full reviews DatasetDict)
+    parent_asin     : the product ASIN to filter by
+    split           : which split to search in (default: "full")
+    Returns
+    -------
+    HuggingFace Dataset containing only rows matching the given parent_asin
+    """
+    if not parent_asin or not isinstance(parent_asin,str):
+        raise TypeError("Invalid parent_asin passed")
+    ds = reviews_dataset["full"]
+    arrow_table = ds.data.table
+    matched_arrow = duckdb.query(
+        f"SELECT * FROM arrow_table WHERE parent_asin = '{parent_asin}'"
+    ).fetch_arrow_table()
+    return Dataset(matched_arrow)
+def get_best_reviews(
+    reviews_dataset,
+    parent_asin: str,
+    top_k: int = None,
+):
+    """
+    Retrieve reviews matching a given parent_asin, optionally returning
+    only the top-k highest quality reviews.
+    Ranking score (all components normalized to [0, 1]):
+        - helpful_vote      : 50% weight  (log-scaled to reduce outlier dominance)
+        - verified_purchase : 30% weight  (bool → 1.0 or 0.0)
+        - rating            : 20% weight  (how extreme the rating is — 1 or 5
+                                           are more informative than a neutral 3)
+    Parameters
+    ----------
+    reviews_dataset : DatasetDict
+    parent_asin     : product ASIN to filter by
+    top_k           : number of top reviews to return (None = return all, sorted)
+    split           : which split to use
+    Returns
+    -------
+    HuggingFace Dataset
+    """
+    import math
+    matched = get_reviews_by_asin(reviews_dataset,parent_asin)
+    tot=matched.num_rows
+    if tot == 0:
+        return 0, matched
+    if top_k is None:
+        return 0, matched
+    # Step 2: compute scores
+    helpful_votes = matched["helpful_vote"]
+    verified      = matched["verified_purchase"]
+    ratings       = matched["rating"]
+    # Log-scale helpful votes: log(1 + x), then normalize to [0, 1]
+    log_votes = [math.log1p(v if v is not None else 0) for v in helpful_votes]
+    max_log   = max(log_votes) if max(log_votes) > 0 else 1.0
+    norm_votes = [v / max_log for v in log_votes]
+    # Verified purchase: 1.0 if True, 0.0 otherwise
+    norm_verified = [1.0 if v else 0.0 for v in verified]
+    # Rating extremity: reviews at 1 or 5 are more informative than 3
+    # score = 1 - |rating - 3| / 2  →  inverted so extreme ratings score higher
+    norm_rating = [abs((r if r is not None else 3.0) - 3.0) / 2.0 for r in ratings]
+    # Weighted sum
+    scores = [
+        0.50 * nv + 0.30 * ver + 0.20 * nr
+        for nv, ver, nr in zip(norm_votes, norm_verified, norm_rating)
+    ]
+    # Step 3: select top-k indices by score
+    k = min(top_k, matched.num_rows)
+    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
+    top_indices_sorted = sorted(top_indices)  # preserve original row order
+    return tot, matched.select(top_indices_sorted)

src/src/hybrid.py ADDED Viewed

	@@ -0,0 +1,240 @@

+"""
+src/hybrid.py
+-------------
+Hybrid retriever combining BM25 keyword search and FAISS semantic search,
+fused with Reciprocal Rank Fusion (RRF).
+Designed to plug into the existing run_rag() pipeline in rag_pipeline.py
+as a drop-in replacement for the semantic retriever:
+    hybrid_retriever = load_hybrid_retriever(
+        bm25_index_path="data/processed/tokenisation/bm25_index_mini.pkl",
+        faiss_store_path="data/processed/embeddings",
+        k=5,
+    )
+    answer, docs = run_rag(hybrid_retriever, "Best coffee beans for espresso")
+The HybridRetriever class extends LangChain's BaseRetriever so it is fully
+compatible with the | (pipe) operator used in rag_pipeline.py:
+    rag_chain = (
+        {
+            "context": hybrid_retriever | RunnableLambda(build_context),
+            "question": RunnablePassthrough(),
+        }
+        | prompt_template
+        | llm
+        | StrOutputParser()
+    )
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+from langchain_community.retrievers import BM25Retriever
+from langchain_community.vectorstores import FAISS
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from pydantic import Field
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# HybridRetriever
+# ---------------------------------------------------------------------------
+class HybridRetriever(BaseRetriever):
+    """
+    Combines BM25 keyword retrieval and FAISS semantic retrieval using
+    Reciprocal Rank Fusion (RRF) to produce a unified ranked document list.
+    RRF score for document d across retriever r:
+        score(d) = weight_r * (1 / (rrf_c + rank(d, r)))
+    Documents appearing in both retrievers accumulate scores from both,
+    naturally promoting results that are relevant by both keyword and meaning.
+    Parameters
+    ----------
+    bm25_retriever   : Fitted LangChain BM25Retriever (from bm25.load())
+    semantic_store   : Loaded FAISS vectorstore (from semantic.load_vector_store())
+    k                : Number of final documents to return
+    rrf_c            : RRF constant — dampens the impact of rank differences.
+                       Standard value is 60; lower = top ranks matter more.
+    bm25_weight      : RRF weight for BM25 results (keyword signal)
+    semantic_weight  : RRF weight for semantic results (meaning signal)
+    fetch_multiplier : Fetch this multiple of k from each retriever before fusing.
+                       More candidates = better fusion quality. Default: 3.
+    """
+    bm25_retriever: Any = Field(...)
+    semantic_store: Any = Field(...)
+    k: int = Field(default=5)
+    rrf_c: int = Field(default=60)
+    bm25_weight: float = Field(default=0.5)
+    semantic_weight: float = Field(default=0.5)
+    fetch_multiplier: int = Field(default=3)
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun,
+    ) -> list[Document]:
+        """
+        Core retrieval logic called by LangChain when the retriever is invoked.
+        Steps
+        -----
+        1. Fetch candidates from BM25 and FAISS independently
+        2. Assign RRF scores weighted by retriever confidence
+        3. Deduplicate by parent_asin, accumulating scores for shared hits
+        4. Sort by fused RRF score and return top-k Documents
+        """
+        fetch_k = self.k * self.fetch_multiplier
+        # ── 1. BM25 retrieval ────────────────────────────────────────────────
+        self.bm25_retriever.k = fetch_k
+        try:
+            bm25_docs: list[Document] = self.bm25_retriever.invoke(query)
+            logger.debug("BM25 returned %d docs for query: %r", len(bm25_docs), query)
+        except Exception as exc:
+            logger.warning("BM25 retrieval failed: %s — using empty list.", exc)
+            bm25_docs = []
+        # ── 2. Semantic retrieval ────────────────────────────────────────────
+        # similarity_search returns list[Document] (no scores needed — rank is enough for RRF)
+        try:
+            semantic_docs: list[Document] = self.semantic_store.similarity_search(
+                query, k=fetch_k
+            )
+            logger.debug(
+                "Semantic returned %d docs for query: %r", len(semantic_docs), query
+            )
+        except Exception as exc:
+            logger.warning("Semantic retrieval failed: %s — using empty list.", exc)
+            semantic_docs = []
+        # ── 3. RRF fusion ────────────────────────────────────────────────────
+        rrf_scores: dict[str, float] = {}
+        doc_map: dict[str, Document] = {}
+        def _asin_key(doc: Document, fallback: str) -> str:
+            """Use parent_asin as the dedup key; fall back to a content prefix."""
+            return doc.metadata.get("parent_asin") or fallback
+        for rank, doc in enumerate(bm25_docs):
+            key = _asin_key(doc, f"bm25_{rank}")
+            score = self.bm25_weight / (self.rrf_c + rank + 1)
+            rrf_scores[key] = rrf_scores.get(key, 0.0) + score
+            doc_map[key] = doc                  # BM25 docs have richer metadata (top_reviews etc.)
+        for rank, doc in enumerate(semantic_docs):
+            key = _asin_key(doc, f"sem_{rank}")
+            score = self.semantic_weight / (self.rrf_c + rank + 1)
+            rrf_scores[key] = rrf_scores.get(key, 0.0) + score
+            # Only add to doc_map if BM25 didn't already supply this product
+            # (BM25 metadata is richer — has top_reviews, image_url, etc.)
+            if key not in doc_map:
+                doc_map[key] = doc
+        # ── 4. Sort and truncate ─────────────────────────────────────────────
+        ranked_keys = sorted(rrf_scores, key=lambda k: rrf_scores[k], reverse=True)
+        top_docs = [doc_map[key] for key in ranked_keys[: self.k]]
+        # Attach fused score to metadata — useful for app display
+        for key, doc in zip(ranked_keys, top_docs):
+            doc.metadata["hybrid_score"] = round(rrf_scores[key], 6)
+            # Record which retriever(s) contributed to this result
+            in_bm25 = any(
+                _asin_key(d, f"bm25_{i}") == key for i, d in enumerate(bm25_docs)
+            )
+            in_sem = any(
+                _asin_key(d, f"sem_{i}") == key for i, d in enumerate(semantic_docs)
+            )
+            if in_bm25 and in_sem:
+                doc.metadata["retrieval_source"] = "hybrid"
+            elif in_bm25:
+                doc.metadata["retrieval_source"] = "bm25"
+            else:
+                doc.metadata["retrieval_source"] = "semantic"
+        logger.info(
+            "HybridRetriever: BM25=%d, Semantic=%d → fused=%d (returning top %d)",
+            len(bm25_docs), len(semantic_docs), len(rrf_scores), len(top_docs),
+        )
+        return top_docs
+# ---------------------------------------------------------------------------
+# Convenience loader
+# ---------------------------------------------------------------------------
+def load_hybrid_retriever(
+    bm25_index_path: str = "data/processed/tokenisation/bm25_index_mini.pkl",
+    faiss_store_path: str = "data/processed/embeddings",
+    k: int = 5,
+    bm25_weight: float = 0.5,
+    semantic_weight: float = 0.5,
+    rrf_c: int = 60,
+    fetch_multiplier: int = 3,
+) -> HybridRetriever:
+    """
+    Load both indexes from disk and return a ready-to-use HybridRetriever.
+    Call this once in your notebook or app.py, then pass the result to run_rag().
+    Parameters
+    ----------
+    bm25_index_path  : Path to the pickled BM25Retriever (from bm25.build_and_save())
+    faiss_store_path : Directory containing index.faiss + index.pkl
+                       (from semantic.build_and_save_vector_store())
+    k                : Number of documents to return per query
+    bm25_weight      : RRF weight for BM25 (keyword signal). Default 0.5.
+    semantic_weight  : RRF weight for semantic (meaning signal). Default 0.5.
+                       Weights don't need to sum to 1 but relative scale matters.
+    rrf_c            : RRF rank-dampening constant. Default 60 (standard).
+    fetch_multiplier : Candidates to fetch per retriever = k * fetch_multiplier.
+    Returns
+    -------
+    HybridRetriever
+        A LangChain-compatible retriever pipeable with |.
+    Example
+    -------
+    >>> from src.hybrid import load_hybrid_retriever
+    >>> from src.rag_pipeline import run_rag
+    >>>
+    >>> hybrid = load_hybrid_retriever(k=5)
+    >>> answer = run_rag(hybrid, "Best coffee beans for a French press")
+    >>> print(answer)
+    """
+    # Import here to avoid circular imports when used from rag_pipeline.py
+    from src.bm25 import load as load_bm25
+    from src.semantic import load_vector_store
+    print(f"Loading BM25 index from: {bm25_index_path}")
+    bm25_ret: BM25Retriever = load_bm25(bm25_index_path)
+    print(f"Loading FAISS store from: {faiss_store_path}")
+    faiss_store: FAISS = load_vector_store(faiss_store_path)
+    retriever = HybridRetriever(
+        bm25_retriever=bm25_ret,
+        semantic_store=faiss_store,
+        k=k,
+        bm25_weight=bm25_weight,
+        semantic_weight=semantic_weight,
+        rrf_c=rrf_c,
+        fetch_multiplier=fetch_multiplier,
+    )
+    print(
+        f"HybridRetriever ready — k={k}, "
+        f"BM25 weight={bm25_weight}, Semantic weight={semantic_weight}, RRF c={rrf_c}"
+    )
+    return retriever

src/src/rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""
+rag_pipeline.py
+---------------
+Amazon product RAG (Retrieval-Augmented Generation) pipeline using
+LangChain + HuggingFace Inference Endpoints.
+Typical usage
+-------------
+>>> from src.rag_pipeline import run_rag
+>>> answer, docs = run_rag(retriever, "Moisturizing shampoo for thick curly hair")
+>>> print(answer)
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+from langchain_core.documents import Document
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
+from src.retrieval_helpers import _format_docs
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+DEFAULT_REPO_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+DEFAULT_MAX_NEW_TOKENS = 512
+DEFAULT_TOP_K = 5
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a helpful Amazon grocery shopping assistant.\n\n"
+    "You will receive a grocery query and a list of related Amazon products (including reviews and metadata).\n\n"
+    "Your response must follow this exact structure:\n\n"
+    "---\n\n"
+    "## 🛒 Recommended Products\n"
+    "For each product, write a numbered list entry, mentioning products by title "
+    "followed by 1–2 sentences describing the product and why it suits the query.\n\n"
+    "## 💡 Tips & Recipe Ideas\n"
+    "A bullet-point list of practical tips, storage advice, and brief recipe ideas related to the products above "
+    "(do NOT write out full recipes — keep each idea to 1–2 sentences)."
+    "Add food emojis if relevant.\n\n"
+    "---\n\n"
+    "Rules:\n"
+    "- Do not invent products. Only recommend products from the provided list.\n"
+    "- Keep descriptions factual and grounded in the provided reviews and metadata.\n"
+    "- Recipe ideas should be suggestions or ideas only, not step-by-step instructions.\n"
+    "- Format the entire response in Markdown.\n"
+    "- IMPORTANT: Whenever citing the product title: add the parent_asin in the following format [title](#parent_asin)"
+)
+# ---------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------
+import logging
+from langchain_core.runnables import RunnableLambda
+logger = logging.getLogger(__name__)
+def _make_verbose_tap(label: str, verbose: bool):
+    """
+    Returns a passthrough RunnableLambda that logs *value* when verbose=True.
+    Works for any chain step — docs, prompt messages, or raw strings.
+    """
+    def _tap(value):
+        if verbose:
+            if hasattr(value, "messages"):          # ChatPromptValue
+                rendered = "\n".join(
+                    f"[{m.type.upper()}]: {m.content}"
+                    for m in value.messages
+                )
+            elif isinstance(value, list):            # list of Documents
+                rendered = "\n".join(str(d) for d in value)
+            else:
+                rendered = str(value)
+            print(f"\n{'='*60}\n{label}\n{'='*60}\n{rendered}\n")
+            logger.debug("%s\n%s", label, rendered)
+        return value
+    return RunnableLambda(_tap)
+def build_context(docs: list[Document]) -> str:
+    """
+    Concatenate a list of retrieved LangChain Documents into a single
+    context string that the LLM can reason over.
+    Each entry includes the product's ``parent_asin`` (falling back to its
+    position index), its page content, and its full metadata dict.
+    Parameters
+    ----------
+    docs:
+        List of ``langchain_core.documents.Document`` objects returned by
+        the retriever.
+    Returns
+    -------
+    str
+        A newline-separated block of product descriptions ready for prompt
+        injection. Returns an empty string when *docs* is empty.
+    Raises
+    ------
+    TypeError
+        If *docs* is not a list, or any element is not a ``Document``.
+    """
+    if not isinstance(docs, list):
+        raise TypeError(
+            f"'docs' must be a list of Document objects, got {type(docs).__name__}."
+        )
+    for i, doc in enumerate(docs):
+        if not isinstance(doc, Document):
+            raise TypeError(
+                f"Element at index {i} is not a Document; got {type(doc).__name__}."
+            )
+    if not docs:
+        logger.warning("build_context received an empty document list.")
+        return ""
+    return "\n\n".join(
+        f"ASIN {doc.metadata.get('parent_asin', n)} Description: {doc.page_content}\n"
+        f"Metadata: {doc.metadata}"
+        for n, doc in enumerate(docs)
+    )
+def _build_llm(
+    repo_id: str,
+    max_new_tokens: int,
+    provider: str,
+) -> ChatHuggingFace:
+    """
+    Instantiate and return a ``ChatHuggingFace`` model backed by a
+    HuggingFace Inference Endpoint.
+    Parameters
+    ----------
+    repo_id:
+        HuggingFace Hub model identifier (e.g.
+        ``"meta-llama/Meta-Llama-3-8B-Instruct"``).
+    max_new_tokens:
+        Maximum number of tokens the model may generate per call.
+    provider:
+        Inference provider passed to ``HuggingFaceEndpoint``
+        (``"auto"``, ``"novita"``, etc.).
+    Returns
+    -------
+    ChatHuggingFace
+        A chat-compatible wrapper around the endpoint.
+    """
+    endpoint = HuggingFaceEndpoint(
+        repo_id=repo_id,
+        task="text-generation",
+        max_new_tokens=max_new_tokens,
+        provider=provider,
+    )
+    return ChatHuggingFace(llm=endpoint)
+def _build_prompt_template(system_prompt: str) -> ChatPromptTemplate:
+    """
+    Create a ``ChatPromptTemplate`` with a system message and a human
+    turn that injects ``{context}`` and ``{question}`` placeholders.
+    Parameters
+    ----------
+    system_prompt:
+        The system-level instruction string.
+    Returns
+    -------
+    ChatPromptTemplate
+    """
+    return ChatPromptTemplate.from_messages([
+        ("system", system_prompt),
+        (
+            "human",
+            "context:\n{context}\n\nquestion:\n{question}\n\n"
+            "Answer based on the Amazon datasets:",
+        ),
+    ])
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def run_rag(
+    retriever: Any,
+    query: str,
+    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
+    repo_id: str = DEFAULT_REPO_ID,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    provider: str = "auto",
+    verbose: bool = False,
+    hf_dataset = None
+) -> str:
+    """
+    Execute a full RAG pipeline and return the model's answer.
+    The pipeline follows the steps below:
+    1. **Retrieve** - *retriever* fetches the *k* most relevant documents
+       for *query*.
+    2. **Format context** - :func:`build_context` serialises the documents
+       into a single string.
+    3. **Prompt** - the context and query are injected into the chat prompt
+       template.
+    4. **Generate** - the LLM produces an answer grounded in the context.
+    5. **Parse** - the raw chat message is unwrapped to a plain string.
+    Parameters
+    ----------
+    retriever:
+        A LangChain-compatible retriever (must expose ``.invoke()`` and be
+        pipeable with ``|``).  Typically created via
+        ``vectorstore.as_retriever(...)``.
+    query:
+        Natural-language question to answer (non-empty string).
+    system_prompt:
+        System-level instruction for the assistant.  Defaults to
+        :data:`DEFAULT_SYSTEM_PROMPT`.
+    repo_id:
+        HuggingFace Hub model identifier.  Defaults to
+        ``"meta-llama/Meta-Llama-3-8B-Instruct"``.
+    max_new_tokens:
+        Upper bound on generated tokens.  Must be a positive integer.
+        Defaults to ``100``.
+    provider:
+        HuggingFace inference provider (e.g. ``"auto"``, ``"novita"``).
+        Defaults to ``"auto"``.
+    Returns
+    -------
+    str
+        The model's answer as a plain string.
+    Raises
+    ------
+    TypeError
+        If *retriever* is ``None``, *query* is not a string, or
+        *system_prompt* is not a string.
+    ValueError
+        If *query* is blank, *max_new_tokens* is not a positive integer,
+        or *repo_id* / *provider* are blank strings.
+    Examples
+    --------
+    >>> answer = run_rag(retriever, "Best waterproof mascara under $20")
+    >>> print(answer)
+    """
+    # ------------------------------------------------------------------
+    # Build chain components
+    # ------------------------------------------------------------------
+    logger.info("Initialising LLM endpoint: %s", repo_id)
+    llm = _build_llm(repo_id, max_new_tokens, provider)
+    prompt_template = _build_prompt_template(system_prompt)
+    retrieved_docs: list[Document] = []   # ← capture target
+    def _retrieve_and_capture(query: str) -> list[Document]:
+        """Invoke the retriever and snapshot the results for the caller."""
+        docs = retriever.invoke(query)
+        retrieved_docs.extend(docs)       # ← populate closure variable
+        return docs                       # ← pass through to build_context
+    rag_chain = (
+        {
+            "context": RunnableLambda(_retrieve_and_capture)
+                       | RunnableLambda(build_context)
+                       | _make_verbose_tap("RETRIEVED CONTEXT", verbose),
+            "question": RunnablePassthrough(),
+        }
+        | _make_verbose_tap("PROMPT INPUTS (context + question)", verbose)
+        | prompt_template
+        | _make_verbose_tap("RENDERED PROMPT SENT TO LLM", verbose)   # ← shows exact prompt
+        | llm
+        | StrOutputParser()
+    )
+    # ------------------------------------------------------------------
+    # Run
+    # ------------------------------------------------------------------
+    logger.info("Invoking RAG chain for query: %r", query)
+    answer: str = rag_chain.invoke(query)
+    logger.debug("RAG answer: %s", answer)
+    if hf_dataset:
+        docs = _format_docs(retrieved_docs, hf_dataset)
+    else:
+        docs = retrieved_docs
+    return answer, docs

src/src/retrieval_helpers.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import duckdb
+import json, sys
+import re
+from pathlib import Path
+ROOT_FOLDER = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_FOLDER))
+from src.semantic import semantic_search
+def decode_ratings(page_content):
+    block_pattern = r'\[\d\.0★\].*'
+    matches = re.findall(block_pattern, page_content)
+    if matches:
+        pattern = r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)'
+        parsed = []
+        for r in matches[:3]:
+            match = re.match(pattern, r)
+            if match:
+                rating, title, text = match.groups()
+                parsed.append({
+                    'rating': float(rating),
+                    'title': title.strip(),
+                    'text': text.strip()
+                })
+        return(parsed)
+    else:
+        return {}
+def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
+    """
+    Perform similarity search and enrich results with HuggingFace dataset metadata.
+    Args:
+        vector_store: LangChain vector store instance
+        query: Search query string
+        k: Number of results to return
+        filter: Filter dict for similarity search
+        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
+    Returns:
+        List of enriched metadata objects as dicts
+    """
+    results = semantic_search(query, vector_store, k=k)
+    # 1. Extract parent_asins from metadata
+    parent_asins = [doc.metadata.get("parent_asin") for doc, score in results]
+    # 2. Query HuggingFace dataset via DuckDB
+    con = duckdb.connect()
+    arrow_table = hf_dataset.data.table  # Get underlying PyArrow table
+    con.register("hf_table", arrow_table)
+    asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
+    query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
+    hf_rows = con.execute(query_sql).fetchdf()
+    # Build lookup: parent_asin -> metadata dict
+    asin_to_metadata = {
+        row["parent_asin"]: row.to_dict()
+        for _, row in hf_rows.iterrows()
+    }
+    enriched_results = []
+    for doc, score in results:
+        parent_asin = doc.metadata.get("parent_asin")
+        total_reviews = doc.metadata.get("total_reviews")
+        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
+        metadata_object['score'] = score
+        metadata_object['total_reviews'] = total_reviews
+        # 3. Extract 3 lines after "Top Reviews\n" from page_content
+        page_content = doc.page_content
+        metadata_object["reviews"] = decode_ratings(page_content)
+        enriched_results.append(metadata_object)
+    con.close()
+    # 4. Return JSON metadata objects
+    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
+def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
+    """
+    Perform BM25 search and enrich results with HuggingFace dataset metadata.
+    Args:
+        retriever: LangChain BM25Retriever instance
+        query: Search query string
+        k: Number of results to return
+        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
+    Returns:
+        List of enriched metadata objects as dicts
+    """
+    # Get BM25 scores via underlying rank_bm25 library
+    query_tokens = query.split()
+    scores = retriever.vectorizer.get_scores(query_tokens)  # numpy array
+    top_k_indices = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
+    results = [(retriever.docs[i], score) for i, score in top_k_indices]
+    # 1. Extract parent_asins from metadata
+    parent_asins = [doc.metadata.get("parent_asin") for doc, score in results]
+    # 2. Query HuggingFace dataset via DuckDB
+    con = duckdb.connect()
+    arrow_table = hf_dataset.data.table
+    con.register("hf_table", arrow_table)
+    asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
+    query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
+    hf_rows = con.execute(query_sql).fetchdf()
+    # Build lookup: parent_asin -> metadata dict
+    asin_to_metadata = {
+        row["parent_asin"]: row.to_dict()
+        for _, row in hf_rows.iterrows()
+    }
+    enriched_results = []
+    for doc, score in results:
+        parent_asin = doc.metadata.get("parent_asin")
+        metadata_object = {
+            **doc.metadata,
+            **asin_to_metadata.get(parent_asin, {}),
+            "score": score,
+        }
+        metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
+        enriched_results.append(metadata_object)
+    con.close()
+    # 4. Return JSON metadata objects
+    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
+def _format_docs(results, hf_dataset):
+    """
+    Enrich already-retrieved documents with HuggingFace dataset metadata.
+    Args:
+        results: Sequence of retrieved documents; each must expose
+            ``metadata`` (a dict) and ``page_content``.
+        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset) that has a
+            ``parent_asin`` column.
+    Returns:
+        List of JSON-safe dicts, one per input document and in the same
+        order. Each dict holds the dataset row whose ``parent_asin`` matches
+        the document (nothing when there is no match), plus ``total_reviews``
+        taken from the document metadata and ``reviews`` produced by
+        ``decode_ratings`` from the document's page content.
+    """
+    # 1. Collect the distinct, non-empty parent_asins (first-seen order)
+    parent_asins = list(dict.fromkeys(
+        doc.metadata.get("parent_asin")
+        for doc in results
+        if doc.metadata.get("parent_asin")
+    ))
+    asin_to_metadata = {}
+    # 2. Query HuggingFace dataset via DuckDB. Skipped when there is nothing
+    #    to look up, because "IN ()" is not valid SQL.
+    if parent_asins:
+        con = duckdb.connect()
+        try:
+            con.register("hf_table", hf_dataset.data.table)  # underlying PyArrow table
+            # Bind the ASINs as parameters rather than splicing them into the
+            # SQL text, so values containing quotes cannot break the query.
+            placeholders = ", ".join("?" for _ in parent_asins)
+            hf_rows = con.execute(
+                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
+                parent_asins,
+            ).fetchdf()
+        finally:
+            # Always release the connection, even if the query raises
+            con.close()
+        # Build lookup: parent_asin -> metadata dict
+        asin_to_metadata = {
+            row["parent_asin"]: row.to_dict()
+            for _, row in hf_rows.iterrows()
+        }
+    enriched_results = []
+    for doc in results:
+        metadata_object = dict(asin_to_metadata.get(doc.metadata.get("parent_asin"), {}))
+        metadata_object["total_reviews"] = doc.metadata.get("total_reviews")
+        # 3. Reviews are recovered from the embedded text by decode_ratings
+        metadata_object["reviews"] = decode_ratings(doc.page_content)
+        enriched_results.append(metadata_object)
+    # 4. Round-trip through JSON; default=str turns any value json cannot
+    #    encode natively into its string form
+    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]

src/src/semantic.py ADDED Viewed

	@@ -0,0 +1,295 @@

+"""
+semantic.py
+-----------
+Semantic search over an Amazon product catalogue using FAISS + HuggingFace embeddings.
+Expected inputs
+---------------
+- metadata_dataset : datasets.Dataset  — one row per product (raw_metadata["full"])
+- reviews_dataset  : datasets.Dataset  — passed to get_best_reviews(reviews, asin, k)
+Typical usage
+-------------
+    docs  = [create_document(row, raw_reviews) for row in raw_metadata["full"]]
+    docs  = [d for d in docs if d is not None]
+    store = build_vector_store(docs)
+    results = semantic_search("noise cancelling headphones", store, k=5)  # (Document, score) pairs
+"""
+import logging
+from typing import Any
+import torch
+import json, os, sys
+from pathlib import Path
+import faiss
+from datasets import Dataset
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+ROOT_FOLDER = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_FOLDER))
+from src.eda_helpers import get_best_reviews
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+# Default number of reviews requested per product in _build_reviews_block
+DEFAULT_TOP_REVIEWS = 5
+# Default number of hits requested in semantic_search
+DEFAULT_TOP_K = 5
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Shared embedding model. It is instantiated once, at import time, and reused
+# by build_vector_store, build_and_save_vector_store and load_vector_store.
+# NOTE(review): float16 weights are requested even when DEVICE is "cpu" —
+# confirm that this is intended.
+EMBEDDINGS = HuggingFaceEmbeddings(
+    model_name=DEFAULT_EMBEDDING_MODEL,
+    model_kwargs={
+        "device": DEVICE,
+        "model_kwargs": {"torch_dtype": torch.float16},
+    },
+    encode_kwargs={
+        # Smaller batches on CPU, larger ones on GPU
+        "batch_size": 128 if DEVICE == 'cpu' else 512,
+        "normalize_embeddings": True,
+    },
+)
+# ---------------------------------------------------------------------------
+# Document construction
+# ---------------------------------------------------------------------------
+def _format_review(review) -> str:
+    """Render one review as ``[<rating>★] <title> — <text>``."""
+    # A missing rating is shown as "?"; a missing or None title/text becomes ""
+    stars = review.get("rating", "?")
+    headline, body = (
+        (review.get(field) or "").strip() for field in ("title", "text")
+    )
+    return f"[{stars}★] {headline} — {body}"
+def _build_reviews_block(
+    reviews: Dataset,
+    parent_asin: str,
+    k: int = DEFAULT_TOP_REVIEWS,
+) -> tuple[int, str]:
+    """
+    Fetch top-k reviews for *parent_asin* and return a formatted text block.
+    Args:
+        reviews:     The reviews Dataset, forwarded to ``get_best_reviews``.
+        parent_asin: Product identifier whose reviews are wanted.
+        k:           Maximum number of reviews to include.
+    Returns:
+        A ``(total, block)`` tuple. ``total`` is the first value returned by
+        ``get_best_reviews`` (presumably the product's review count — verify
+        against that helper) and ``block`` holds one formatted review per
+        line. ``(0, "")`` is returned when no reviews are found.
+    """
+    total, product_reviews = get_best_reviews(reviews, parent_asin, k)
+    if not product_reviews:
+        return 0, ""
+    # Continuation lines carry a four-space indent so the block lines up
+    # under the "Top Reviews:" heading written by _build_page_content.
+    return total, "\n    ".join(_format_review(r) for r in product_reviews)
+def _build_page_content(product, review_block: str) -> str:
+    """
+    Assemble the text that will be embedded. Empty sections are omitted.
+    Args:
+        product:      Dict-like product row; reads ``title``, ``main_category``,
+                      ``categories``, ``features``, ``description`` and
+                      ``details``. Missing or ``None`` values count as empty.
+        review_block: Pre-formatted reviews text, or ``""`` for none.
+    Returns:
+        Newline-joined sections. The ``Product:`` line is always present;
+        every other section appears only when it has content.
+    """
+    title         = (product.get("title") or "").strip()
+    main_category = (product.get("main_category") or "").strip()
+    sub_path      = " > ".join(product.get("categories") or [])
+    # Join only the halves that exist, so a product with neither yields an
+    # empty string (and no section) instead of a dangling " >> " separator.
+    categories    = " >> ".join(part for part in (main_category, sub_path) if part)
+    features      = "\n    ".join(product.get("features") or [])
+    description   = " ".join(product.get("description") or [])
+    # details is not guaranteed to be a string; coerce it before stripping
+    details       = str(product.get("details") or "").strip()
+    parts = [f"Product: {title}"]
+    if categories:
+        parts.append(f"Category Path: {categories}")
+    if features:
+        parts.append(f"Features:\n    {features}")
+    if description:
+        parts.append(f"Description:\n    {description}")
+    if review_block:
+        parts.append(f"Top Reviews:\n    {review_block}")
+    if details:
+        parts.append(f"Details:\n    {details}")
+    return "\n".join(parts)
+def create_document(product, reviews: Dataset) -> Document | None:
+    """
+    Turn one product row into a :class:`~langchain_core.documents.Document`.
+    Args:
+        product: A single row from a HuggingFace metadata Dataset (dict-like).
+        reviews: The full reviews Dataset, forwarded to ``get_best_reviews``
+                 through ``_build_reviews_block``.
+    Returns:
+        The Document, or ``None`` (after logging a warning) when the row has
+        no ``parent_asin``.
+    Notes:
+        *page_content* is the text that gets embedded. *metadata* carries the
+        identifier plus the fields used for filtering and display after
+        retrieval.
+    """
+    asin = product.get("parent_asin")
+    if not asin:
+        logger.warning("Skipping product with missing parent_asin: %s", product.get("title"))
+        return None
+    review_total, review_text = _build_reviews_block(reviews, asin)
+    return Document(
+        page_content=_build_page_content(product, review_text),
+        metadata={
+            # identifier
+            "parent_asin":    asin,
+            # numeric fields
+            "price":          product.get("price"),
+            "average_rating": product.get("average_rating"),
+            "rating_number":  product.get("rating_number"),
+            # categorical fields
+            "main_category":  product.get("main_category", ""),
+            "categories":     product.get("categories") or [],
+            # free-form field, always stored as a string
+            "details":        str(product.get("details") or ""),
+            # first value returned by _build_reviews_block
+            "total_reviews":  review_total,
+        },
+    )
+# ---------------------------------------------------------------------------
+# Vector store
+# ---------------------------------------------------------------------------
+# Case when we want to create embeddings at once
+def build_vector_store(
+    docs: list[Document],
+    existing_store: FAISS | None = None,
+) -> FAISS:
+    """
+    Embed *docs* into a FAISS vector store and return that store.
+    Args:
+        docs:           Documents to index; each needs ``metadata["parent_asin"]``,
+                        which is used as its document ID.
+        existing_store: Store to extend. When ``None`` a fresh store backed by
+                        a flat L2 index is created.
+    Returns:
+        The store that received the documents (``existing_store`` itself when
+        one was supplied).
+    Raises:
+        ValueError: If *docs* is empty.
+    """
+    if not docs:
+        raise ValueError("Cannot build a vector store from an empty document list.")
+    logger.info("Embedding on %s", DEVICE)
+    store = existing_store
+    if store is None:
+        # Embed a throw-away query only to learn the embedding dimension
+        probe_vector = EMBEDDINGS.embed_query("probe")
+        store = FAISS(
+            embedding_function=EMBEDDINGS,
+            index=faiss.IndexFlatL2(len(probe_vector)),
+            docstore=InMemoryDocstore(),
+            index_to_docstore_id={},
+        )
+    asin_ids = [entry.metadata["parent_asin"] for entry in docs]
+    store.add_documents(documents=docs, ids=asin_ids)
+    logger.info("Indexed %d documents into FAISS.", len(docs))
+    return store
+# Running the above function in batches and saving
+def build_and_save_vector_store(
+    metadata_dataset: Dataset,
+    reviews: Dataset,
+    save_path: str,
+    batch_size: int = 500,
+) -> FAISS | None:
+    """
+    Index *metadata_dataset* into FAISS in batches, saving after every batch.
+    The run is resumable: an existing ``index.faiss`` under *save_path* is
+    loaded and its document IDs are skipped, and ``progress.json`` (if
+    present) supplies the row to restart from. ``progress.json`` is removed
+    once the loop completes.
+    Args:
+        metadata_dataset: One row per product.
+        reviews:          Reviews Dataset, forwarded to ``create_document``.
+        save_path:        Directory holding the index and the progress file.
+        batch_size:       Number of rows processed per batch.
+    Returns:
+        The populated store, or ``None`` when no document was indexed and no
+        saved index existed.
+    """
+    # --- Resume / initialize ---
+    if os.path.exists(os.path.join(save_path, "index.faiss")):
+        # NOTE(review): allow_dangerous_deserialization opts in to loading
+        # pickled data; only point save_path at trusted files.
+        vector_store = FAISS.load_local(
+            save_path, EMBEDDINGS, allow_dangerous_deserialization=True
+        )
+        already_indexed = set(vector_store.index_to_docstore_id.values())
+        print(f"Resuming — {len(already_indexed)} docs already indexed.")
+    else:
+        os.makedirs(save_path, exist_ok=True)
+        vector_store = None  # let helper create it
+        already_indexed = set()
+    progress_file = os.path.join(save_path, "progress.json")
+    # --- Resume progress ---
+    if os.path.exists(progress_file):
+        with open(progress_file) as f:
+            resume_start = json.load(f).get("next_start", 0)
+        print(f"Resuming from row {resume_start}.")
+    else:
+        resume_start = 0
+    total = len(metadata_dataset)
+    for start in range(resume_start, total, batch_size):
+        stop = min(start + batch_size, total)
+        batch = metadata_dataset.select(range(start, stop))
+        docs = []
+        for row in batch:
+            doc = create_document(row, reviews)
+            if doc is None:
+                continue
+            asin = doc.metadata["parent_asin"]
+            if asin in already_indexed:
+                continue
+            # Record the ID immediately so a second row with the same
+            # parent_asin in this batch is not submitted as a duplicate ID.
+            already_indexed.add(asin)
+            docs.append(doc)
+        if docs:
+            vector_store = build_vector_store(
+                docs=docs,
+                existing_store=vector_store,
+            )
+        # --- Save after each batch ---
+        # The store stays None until the first document is indexed, so there
+        # may be nothing to save yet.
+        if vector_store is not None:
+            vector_store.save_local(save_path)
+        with open(progress_file, "w") as f:
+            json.dump({"next_start": stop}, f)
+        print(f"Indexed {stop} / {total} rows")
+    if os.path.exists(progress_file):
+        os.remove(progress_file)
+    return vector_store
+# ---------------------------------------------------------------------------
+# Search
+# ---------------------------------------------------------------------------
+def semantic_search(
+    query: str,
+    vector_store: FAISS,
+    k: int = DEFAULT_TOP_K,
+    filter = None,
+) -> list[tuple[Document, float]]:
+    """
+    Run a semantic similarity search against a pre-built *vector_store*.
+    Args:
+        query:        Natural-language search query.
+        vector_store: A FAISS store built with :func:`build_vector_store`.
+        k:            Number of results to return.
+        filter:       Optional metadata filter dict, e.g.
+                      ``{"main_category": "Electronics"}``.
+    Returns:
+        The ``(Document, score)`` pairs exactly as produced by
+        ``vector_store.similarity_search_with_score``.
+        NOTE(review): stores created by :func:`build_vector_store` use a flat
+        L2 index, so the score is presumably a distance (lower = closer) —
+        confirm before sorting or thresholding on it.
+    """
+    results = vector_store.similarity_search_with_score(query, k=k, filter=filter)
+    logger.info("'%s' -> %d results", query, len(results))
+    return results
+# ---------------------------------------------------------------------------
+# Read existing vector store
+# ---------------------------------------------------------------------------
+def load_vector_store(
+    load_path: str,
+) -> FAISS:
+    """
+    Load a previously saved FAISS store from *load_path*.
+    The module-level ``EMBEDDINGS`` object is attached as the store's
+    embedding function.
+    NOTE(review): ``allow_dangerous_deserialization=True`` is passed, so
+    *load_path* should only ever point at trusted files.
+    """
+    store = FAISS.load_local(
+        load_path,
+        embeddings=EMBEDDINGS,
+        allow_dangerous_deserialization=True,
+    )
+    return store

src/src/utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import re
+import nltk
+from nltk.corpus import stopwords
+# Download stopwords if not already downloaded
+# NOTE: this call sits at module level, so it runs whenever the module is imported.
+nltk.download('stopwords', quiet=True)
+# Define a set of English stopwords for filtering out common words
+# (consulted by simple_tokenize for every token)
+STOPWORDS = set(stopwords.words('english'))
+# Tokenizer
+def simple_tokenize(text):
+    """
+    Split *text* into lower-case tokens with stopwords removed.
+    Hyphens are turned into spaces, every character other than ``a-z``,
+    ``0-9`` and whitespace is dropped, and the remainder is split on
+    whitespace. Falsy input (``None``, ``""``) yields an empty list.
+    """
+    if not text:
+        return []
+    # Replace hyphens first so hyphenated words split instead of fusing
+    spaced = re.sub(r"-", " ", text.lower())
+    cleaned = re.sub(r"[^a-z0-9\s]", "", spaced)
+    return [word for word in cleaned.split() if word not in STOPWORDS]