Spaces:

rishadaz
/

amazon_retriever

Sleeping

App Files Files Community

Sarisha Das committed on Apr 18

Commit

681ec3c

1 Parent(s): 0bcbce0

update for rag

Browse files

Files changed (6) hide show

src/streamlit_app.py +176 -159
src/styles.css +94 -0
utils/bm25.py +18 -24
utils/hybrid.py +240 -0
utils/rag_pipeline.py +304 -0
utils/retrieval_helpers.py +53 -0

src/streamlit_app.py CHANGED Viewed

@@ -15,6 +15,11 @@ os.environ["TRANSFORMERS_CACHE"] = str(ROOT / ".hf_cache" / "transformers")
 from utils.retrieval_helpers import enrich_search_results, enrich_bm25_search_results
 from utils.bm25 import load
 from utils.semantic import load_vector_store
 from dotenv import load_dotenv
 load_dotenv()
@@ -31,83 +36,22 @@ st.set_page_config(
 )
 # ─── Paths ────────────────────────────────────────────────────────────────────
 FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
 FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
 HF_TOKEN = os.getenv('HF_TOKEN')
 from datasets import load_dataset
 from huggingface_hub import snapshot_download, login
 # ─── Custom CSS ───────────────────────────────────────────────────────────────
-st.markdown(
-    """
-    <style>
-    @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@600&family=Source+Sans+3:wght@400;600&display=swap');
-    html, body, [class*="css"] {
-        font-family: 'Source Sans 3', sans-serif;
-    }
-    h1, h2, h3 { font-family: 'Playfair Display', serif; }
-    .banner {
-        background: linear-gradient(135deg, #2d4a22 0%, #4a7c3f 60%, #7aab5c 100%);
-        border-radius: 12px;
-        padding: 2rem 2.5rem;
-        margin-bottom: 1.5rem;
-        color: #f5f0e8;
-    }
-    .banner h1 { margin: 0; font-size: 2.4rem; color: #f5f0e8; }
-    .banner p  { margin: 0.3rem 0 0; font-size: 1.05rem; opacity: 0.85; }
-    /* Product card (outer) */
-    .product-card {
-        background: #fffdf7;
-        border: 1px solid #e2d9c8;
-        border-left: 4px solid #4a7c3f;
-        border-radius: 8px;
-        padding: 1rem 1.2rem 0.6rem;
-        margin-bottom: 0.4rem;
-        box-shadow: 0 1px 4px rgba(0,0,0,0.06);
-    }
-    .product-card h4 { margin: 0 0 0.2rem; color: #1e3318; font-size: 1.05rem; }
-    /* Review snippet inside expander */
-    .review-snippet {
-        background: #f7f4ee;
-        border-radius: 6px;
-        padding: 0.6rem 0.9rem;
-        margin-bottom: 0.5rem;
-        font-size: 0.87rem;
-        color: #444;
-        line-height: 1.55;
-    }
-    .score-badge {
-        display: inline-block;
-        background: #eaf3e6;
-        color: #2d5a20;
-        border-radius: 20px;
-        padding: 2px 10px;
-        font-size: 0.78rem;
-        font-weight: 600;
-        margin-right: 6px;
-    }
-    .stars { color: #e6a817; }
-    .placeholder-badge {
-        background: #fff3cd;
-        border: 1px solid #ffc107;
-        border-radius: 6px;
-        padding: 0.4rem 0.8rem;
-        font-size: 0.82rem;
-        color: #7a5800;
-        display: inline-block;
-        margin-bottom: 1rem;
-    }
-    </style>
-    """,
-    unsafe_allow_html=True,
-)
 @st.cache_resource
 def load_hf_dataset():
@@ -183,9 +127,23 @@ def semantic_search(query: str, top_k: int = 3) -> list[dict]:
         return retriever.search(query, top_k=top_k)
     Returns top_k review-level results (scores are cosine similarities, 0–1).
     """
-    results = enrich_search_results(vector_store, query, top_k, HF_DATASET["full"])
     return results
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def stars(rating: float) -> str:
@@ -212,82 +170,85 @@ def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> Non
             "vote":      vote,
         })
 def render_results(results: list[dict], mode: str, query: str) -> None:
     if not results:
         st.info("No results returned.")
         return
-    grouped = results
-    for ind, item in enumerate(grouped):
-        reviews     = item["reviews"]
-        title       = item["title"]
-        avg_rating  = item["average_rating"]
-        n_reviews   = len(reviews)
-        # total_reviews = item.get('total_reviews', n_reviews)
-        rating_number = item.get('rating_number', 0)
-        asin        = item['parent_asin']
-        review_word = "review" if n_reviews == 1 else "reviews"
-        large_images = item.get('images', {}).get('large', [])
-        image_html = f'<img src="{large_images[0]}" style="width:100%;max-width:200px;border-radius:8px;margin-bottom:8px;" />' if large_images else ''
-        raw_price = item.get('price')
-        try:
-            price_val = float(str(raw_price).replace('$', '').replace(',', '').strip())
-            price_html = f'<span style="color:#2ecc71;font-weight:600">${price_val:.2f}</span>'
-        except (TypeError, ValueError):
-            price_html = ''
-        # ── Product card header ───────────────────────────────────────────
-        st.markdown(
-            f"""
-            <div class="product-card">
-                {image_html}
-                <h4>#{ind + 1} &nbsp; {title}</h4>
-                <span class="stars">{stars(avg_rating)}</span>
-                &nbsp;<small style="color:#888">{avg_rating:.1f}/5 avg ({rating_number:,} ratings)</small>
-                &nbsp;&nbsp;
-                <span class="score-badge">similarity score: {item['score']}</span>
-                {"&nbsp;&nbsp;" + price_html if price_html else ""}
-            </div>
-            """,
-            unsafe_allow_html=True,
-        )
-        # ── Reviews in collapsible expander ───────────────────────────────
-        expander_label = f"📖 Viewing top {n_reviews} {review_word} "
-        with st.expander(expander_label, expanded=(n_reviews == 1)):
-            for j, rev in enumerate(reviews):
-                st.markdown(
-                    f"""
-                    <div class="review-snippet">
-                        <strong>{rev['title']}</strong>
-                        &nbsp;·&nbsp;
-                        <span class="stars">{stars(rev['rating'])}</span>
-                        <span style="color:#888; font-size:0.8rem"> {rev['rating']}/5</span>
-                        &nbsp;·&nbsp;
-                        <br><br>
-                        {rev['text'][:300]}{'…' if len(rev['text']) > 300 else ''}
-                    </div>
-                    """,
-                    unsafe_allow_html=True,
-                )
-        # ── Feedback buttons (per product) ────────────────────────────────
-        col_up, col_dn, _ = st.columns([1, 1, 10])
-        with col_up:
-            if st.button("👍", key=f"up_{mode}_{asin}_{ind}"):
-                log_feedback(query, mode, asin, title, "up")
-                st.toast("Thanks! 👍")
-        with col_dn:
-            if st.button("👎", key=f"dn_{mode}_{asin}_{ind}"):
-                log_feedback(query, mode, asin, title, "down")
-                st.toast("Noted! 👎")
-        st.markdown("<hr style='border:none;border-top:1px solid #e8e0d0;margin:0.5rem 0 1rem'>", unsafe_allow_html=True)
 # ─── App layout ───────────────────────────────────────────────────────────────
 st.markdown(
     """
@@ -304,29 +265,85 @@ query = st.text_input(
     "Search for a product or describe what you're looking for",
     placeholder="e.g. something sweet for a cheese board...",
 )
-# ─── Mode radio ───────────────────────────────────────────────────────────────
-mode = st.radio(
-    "Search mode",
-    options=["BM25", "Semantic"],
-    index=0,            # BM25 shown by default
-    horizontal=True,
-    help="BM25 = keyword matching · Semantic = embedding similarity (all-MiniLM-L6-v2 + FAISS)",
-)
-# ─── Run & render ─────────────────────────────────────────────────────────────
-TOP_K = 5  # fixed per milestone requirement
-if query.strip():
-    st.markdown(f"#### Top {TOP_K} results — {mode}")
-    results = bm25_search(query, top_k=TOP_K) if mode == "BM25" else semantic_search(query, top_k=TOP_K)
-    render_results(results, mode=mode.lower(), query=query)
-else:
-    st.markdown(
-        "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to see results.</p>",
-        unsafe_allow_html=True,
-    )
 # ─── Sidebar: feedback log ────────────────────────────────────────────────────
 with st.sidebar:

 from utils.retrieval_helpers import enrich_search_results, enrich_bm25_search_results
 from utils.bm25 import load
 from utils.semantic import load_vector_store
+from utils.rag_pipeline import run_rag
+from utils.bm25 import load
+from utils.hybrid import HybridRetriever
+import markdown
 from dotenv import load_dotenv
 load_dotenv()
 )
 # ─── Paths ────────────────────────────────────────────────────────────────────
+ROOT = Path(__file__).resolve().parent.parent
 FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
 FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
+TOP_K = 5
 HF_TOKEN = os.getenv('HF_TOKEN')
 from datasets import load_dataset
 from huggingface_hub import snapshot_download, login
 # ─── Custom CSS ───────────────────────────────────────────────────────────────
+# Resolve the stylesheet next to this script (src/styles.css) so loading it
+# does not depend on the process working directory, and read it as UTF-8
+# regardless of the platform default encoding.
+css = (Path(__file__).resolve().parent / "styles.css").read_text(encoding="utf-8")
+st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
 @st.cache_resource
 def load_hf_dataset():
         return retriever.search(query, top_k=top_k)
     Returns top_k review-level results (scores are cosine similarities, 0–1).
     """
+    results = enrich_search_results(vector_store, query, top_k, HF_DATASET['full'])
     return results
+# Hybrid (BM25 + semantic) retriever shared by the AI Assistant tab.
+hybrid_retriever = HybridRetriever(
+    bm25_retriever=retriever,
+    semantic_store=vector_store,
+    k=TOP_K,
+    bm25_weight=0.5,
+    semantic_weight=0.5,
+)
+def llm_retriever(query: str, top_k: int = 5):
+    """
+    Answer *query* with the RAG pipeline on top of the hybrid retriever.
+    Parameters
+    ----------
+    query : User question passed to run_rag().
+    top_k : Number of documents the hybrid retriever should return.
+    Returns
+    -------
+    (answer, docs) exactly as produced by run_rag().
+    """
+    # Previously top_k was accepted but ignored; apply it to the retriever so
+    # the argument actually controls how many documents are fetched.
+    if hybrid_retriever.k != top_k:
+        hybrid_retriever.k = top_k
+    answer, docs = run_rag(hybrid_retriever, query=query, hf_dataset=HF_DATASET['full'])
+    return answer, docs
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def stars(rating: float) -> str:
             "vote":      vote,
         })
+def render_product(ind, item, mode=None, query=None):
+    """
+    Render one product card, its review expander and the feedback buttons.
+    Parameters
+    ----------
+    ind   : Zero-based position in the result list (displayed as ind + 1).
+    item  : Product dict. Reads "title", "average_rating", "parent_asin" and
+            the optional "reviews", "rating_number", "images", "price" and
+            "score" keys.
+    mode  : Label used in the widget keys and in the feedback log. When
+            omitted, the script-level ``mode`` is used (previous behaviour).
+            Pass a distinct label per tab to keep widget keys unique.
+    query : Query string written to the feedback log. When omitted, the
+            script-level ``query`` is used (previous behaviour).
+    """
+    # Call sites that pass only (ind, item) keep relying on the script globals.
+    if mode is None:
+        mode = globals().get("mode", "")
+    if query is None:
+        query = globals().get("query", "")
+    # `or` fallbacks also cover keys that are present but hold None, which
+    # would otherwise crash len() / the numeric format specs below.
+    reviews       = item.get("reviews") or []
+    title         = item["title"]
+    avg_rating    = item.get("average_rating") or 0.0
+    n_reviews     = len(reviews)
+    rating_number = item.get('rating_number') or 0
+    asin          = item['parent_asin']
+    review_word   = "review" if n_reviews == 1 else "reviews"
+    large_images  = (item.get('images') or {}).get('large') or []
+    image_html = f'<img src="{large_images[0]}" style="width:100%;max-width:200px;border-radius:8px;margin-bottom:8px;" />' if large_images else ''
+    raw_price = item.get('price')
+    try:
+        price_val = float(str(raw_price).replace('$', '').replace(',', '').strip())
+        price_html = f'<span style="color:#2ecc71;font-weight:600">${price_val:.2f}</span>'
+    except (TypeError, ValueError):
+        # Missing or non-numeric price: show no price at all
+        price_html = ''
+    # ── Product card header ───────────────────────────────────────────
+    # NOTE(review): title and review text are interpolated into raw HTML
+    # (unsafe_allow_html=True) without escaping — confirm the dataset is trusted
+    # or escape these fields.
+    score_badge = f'<span class="score-badge">similarity score: {float(item["score"]):.2f}</span>' if 'score' in item else "<span/>"
+    st.markdown(
+        f"""
+        <div class="product-card" id="{asin}">
+            {image_html}
+            <h4>#{ind + 1} &nbsp; {title}</h4>
+            <span class="stars">{stars(avg_rating)}</span>
+            &nbsp;<small style="color:#888">{avg_rating:.1f}/5 avg ({rating_number:,} ratings)</small>
+            &nbsp;&nbsp;
+            {score_badge}
+            {"&nbsp;&nbsp;" + price_html if price_html else ""}
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+    # ── Reviews in collapsible expander ───────────────────────────────
+    expander_label = f"📖 Viewing top {n_reviews} {review_word} "
+    with st.expander(expander_label, expanded=(n_reviews == 1)):
+        for rev in reviews:
+            st.markdown(
+                f"""
+                <div class="review-snippet">
+                    <strong>{rev['title']}</strong>
+                    &nbsp;·&nbsp;
+                    <span class="stars">{stars(rev['rating'])}</span>
+                    <span style="color:#888; font-size:0.8rem"> {rev['rating']}/5</span>
+                    &nbsp;·&nbsp;
+                    <br><br>
+                    {rev['text'][:300]}{'…' if len(rev['text']) > 300 else ''}
+                </div>
+                """,
+                unsafe_allow_html=True,
+            )
+    # ── Feedback buttons (per product) ────────────────────────────────
+    col_up, col_dn, _ = st.columns([1, 1, 10])
+    with col_up:
+        if st.button("👍", key=f"up_{mode}_{asin}_{ind}"):
+            log_feedback(query, mode, asin, title, "up")
+            st.toast("Thanks! 👍")
+    with col_dn:
+        if st.button("👎", key=f"dn_{mode}_{asin}_{ind}"):
+            log_feedback(query, mode, asin, title, "down")
+            st.toast("Noted! 👎")
+    st.markdown("<hr style='border:none;border-top:1px solid #e8e0d0;margin:0.5rem 0 1rem'>", unsafe_allow_html=True)
 def render_results(results: list[dict], mode: str, query: str) -> None:
+    """Render every product in *results*, or an info message when there are none."""
     if not results:
         st.info("No results returned.")
         return
+    for ind, item in enumerate(results):
+        # Forward mode/query so widget keys and feedback rows use the values
+        # this function was called with instead of the script globals.
+        render_product(ind, item, mode=mode, query=query)
 # ─── App layout ───────────────────────────────────────────────────────────────
 st.markdown(
     """
     "Search for a product or describe what you're looking for",
     placeholder="e.g. something sweet for a cheese board...",
 )
+# ─── Run searches only when query changes ─────────────────────────────────────
+if query.strip() and query != st.session_state.get("last_query"):
+    with st.spinner("Searching..."):
+        st.session_state.bm25_results = bm25_search(query, top_k=TOP_K)
+        st.session_state.semantic_results = semantic_search(query, top_k=TOP_K)
+    # Record the query only after both result sets are stored: if a search
+    # raises, the next rerun retries instead of rendering from missing state.
+    st.session_state.last_query = query
+    with st.spinner("Asking AI..."):
+        try:
+            answer, docs = llm_retriever(query, top_k=TOP_K)
+            st.session_state.llm_result = answer
+            st.session_state.llm_docs = docs
+        except Exception as e:
+            # UI boundary: show the failure in the AI tab, keep the Search tab usable
+            st.session_state.llm_result = f"**Error:** {e}"
+            st.session_state.llm_docs = []
+elif not query.strip():
+    # Clear every cached result (including the retrieved docs) when input is emptied
+    for key in ("last_query", "bm25_results", "semantic_results", "llm_result", "llm_docs"):
+        st.session_state.pop(key, None)
+# ─── Tabs ─────────────────────────────────────────────────────────────────────
+tab_search, tab_llm = st.tabs(["🔍 Search", "🤖 AI Assistant"])
+# ─── Search Tab ───────────────────────────────────────────────────────────────
+with tab_search:
+    # The radio only selects which cached result set is shown; it never re-runs a search
+    mode = st.radio(
+        "Search mode",
+        options=["BM25", "Semantic"],
+        index=0,
+        horizontal=True,
+        help="BM25 = keyword matching · Semantic = embedding similarity (all-MiniLM-L6-v2 + FAISS)",
+    )
+    if "last_query" in st.session_state:
+        st.markdown(f"#### Top {TOP_K} results — {mode}")
+        if mode == "BM25":
+            results = st.session_state.bm25_results
+        else:
+            results = st.session_state.semantic_results
+        render_results(results, mode=mode.lower(), query=st.session_state.last_query)
+    else:
+        # Nothing searched yet: show the placeholder hint
+        st.markdown(
+            "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to see results.</p>",
+            unsafe_allow_html=True,
+        )
+# ─── LLM Tab ──────────────────────────────────────────────────────────────────
+with tab_llm:
+    if "llm_result" not in st.session_state:
+        st.markdown(
+            "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to get AI-powered recommendations.</p>",
+            unsafe_allow_html=True,
+        )
+    else:
+        st.markdown(f"#### 🤖 AI Answer — *\"{st.session_state.last_query}\"*")
+        st.caption("⚠️ AI responses may contain errors - please verify before relying on them.")
+        # The answer is Markdown; convert it so it can sit inside the styled wrapper div
+        html_response = markdown.markdown(
+            st.session_state.llm_result,
+            extensions=["tables", "fenced_code", "nl2br"],
+        )
+        st.markdown(
+            f"<div class='llm-response'>{html_response}</div>",
+            unsafe_allow_html=True,
+        )
+        st.markdown("#### 📦 Retrieved Products")
+        docs = st.session_state.get("llm_docs", [])
+        if docs:
+            # Each card is rendered by its own Streamlit calls, so the cards
+            # cannot be wrapped in one HTML block; the former 'doc-sidebar'
+            # wrapper only emitted an empty <div> after them and was removed.
+            # NOTE(review): render_product() displays ind + 1, so starting at 1
+            # labels the first card "#2". Starting at 0 would fix the labels but
+            # make the feedback-button keys (mode, asin, ind) equal to the Search
+            # tab's keys whenever the same product has the same rank in both
+            # tabs — fix together with a per-tab key prefix.
+            for i, doc in enumerate(docs, 1):
+                render_product(i, doc)
+        else:
+            st.markdown("<p style='color:#aaa;'>No documents retrieved.</p>", unsafe_allow_html=True)
 # ─── Sidebar: feedback log ────────────────────────────────────────────────────
 with st.sidebar:

src/styles.css ADDED Viewed

	@@ -0,0 +1,94 @@

+@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@600&family=Source+Sans+3:wght@400;600&display=swap');
+html, body, [class*="css"] {
+    font-family: 'Source Sans 3', sans-serif;
+}
+h1, h2, h3 { font-family: 'Playfair Display', serif; }
+.banner {
+    background: linear-gradient(135deg, #2d4a22 0%, #4a7c3f 60%, #7aab5c 100%);
+    border-radius: 12px;
+    padding: 2rem 2.5rem;
+    margin-bottom: 1.5rem;
+    color: #f5f0e8;
+}
+.banner h1 { margin: 0; font-size: 2.4rem; color: #f5f0e8; }
+.banner p  { margin: 0.3rem 0 0; font-size: 1.05rem; opacity: 0.85; }
+/* Product card (outer) */
+.product-card {
+    background: #fffdf7;
+    border: 1px solid #e2d9c8;
+    border-left: 4px solid #4a7c3f;
+    border-radius: 8px;
+    padding: 1rem 1.2rem 0.6rem;
+    margin-bottom: 0.4rem;
+    box-shadow: 0 1px 4px rgba(0,0,0,0.06);
+}
+.product-card h4 { margin: 0 0 0.2rem; color: #1e3318; font-size: 1.05rem; }
+/* Review snippet inside expander */
+.review-snippet {
+    background: #f7f4ee;
+    border-radius: 6px;
+    padding: 0.6rem 0.9rem;
+    margin-bottom: 0.5rem;
+    font-size: 0.87rem;
+    color: #444;
+    line-height: 1.55;
+}
+.score-badge {
+    display: inline-block;
+    background: #eaf3e6;
+    color: #2d5a20;
+    border-radius: 20px;
+    padding: 2px 10px;
+    font-size: 0.78rem;
+    font-weight: 600;
+    margin-right: 6px;
+}
+.stars { color: #e6a817; }
+.placeholder-badge {
+    background: #fff3cd;
+    border: 1px solid #ffc107;
+    border-radius: 6px;
+    padding: 0.4rem 0.8rem;
+    font-size: 0.82rem;
+    color: #7a5800;
+    display: inline-block;
+    margin-bottom: 1rem;
+}
+.doc-sidebar {
+    max-height: 600px;
+    overflow-y: auto;
+    padding-right: 4px;
+}
+.doc-card {
+    background: #1e1e2e;
+    border: 1px solid #333;
+    border-radius: 8px;
+    padding: 0.75rem;
+    margin-bottom: 0.65rem;
+}
+.doc-title {
+    font-weight: 600;
+    font-size: 0.85rem;
+    margin-bottom: 0.3rem;
+    color: #f0f0f0;
+    line-height: 1.3;
+}
+.doc-meta {
+    font-size: 0.78rem;
+    margin-bottom: 0.3rem;
+    display: flex;
+    gap: 0.5rem;
+}
+.doc-rating { color: #f5c518; }
+.doc-price  { color: #5cb85c; }
+.doc-snippet {
+    font-size: 0.75rem;
+    color: #999;
+    line-height: 1.4;
+}

utils/bm25.py CHANGED Viewed

@@ -368,21 +368,8 @@ def load(index_path: str | Path = "data/processed/bm25_index.pkl") -> BM25Retrie
             f"BM25 index not found at '{index_path}'.\n"
             "Run build_and_save() from your notebook first."
         )
-    # Patch: pickle saved simple_tokenize under 'utils' top-level namespace,
-    # but it now lives in utils.bm25 — register it where pickle expects it
-    import sys
-    import types
-    from utils import bm25 as bm25_module
-    if "utils" not in sys.modules or not hasattr(sys.modules["utils"], "simple_tokenize"):
-        fake_utils = types.ModuleType("utils")
-        fake_utils.simple_tokenize = bm25_module.simple_tokenize
-        sys.modules["utils"] = fake_utils
     with open(index_path, "rb") as f:
         retriever = pickle.load(f)
     print(f"BM25 index loaded ← {index_path}")
     return retriever
@@ -410,18 +397,25 @@ def search(
         asin, title, text, rating, score, top_reviews
     """
     retriever.k = top_k
-    docs = retriever.invoke(query)
     results = []
-    for doc in docs:
-        m           = doc.metadata
         top_reviews = m.get("top_reviews", [])
-        # Average rating across retrieved top reviews
         rated = [r["rating"] for r in top_reviews if r.get("rating") is not None]
         avg_rating = round(sum(rated) / len(rated), 1) if rated else 0.0
-        # Snippet = first review text, falling back to description
         if top_reviews and top_reviews[0].get("text"):
             snippet = top_reviews[0]["text"][:300]
         else:
@@ -432,7 +426,7 @@ def search(
             "title":       m.get("title", ""),
             "text":        snippet,
             "rating":      avg_rating,
-            "score":       0.0,   # LangChain BM25Retriever does not expose raw scores
             "top_reviews": top_reviews,
         })
@@ -444,8 +438,8 @@ def search(
 def build_from_hf_datasets(
     metadata_dataset: Dataset,
     reviews_dataset_dict,
-    index_path: str | Path = "data/processed/bm25_index.pkl",
-    corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
     max_products: int | None = None,
     max_reviews_per_product: int = 5,
 ) -> BM25Retriever:
@@ -480,8 +474,8 @@ def build_from_hf_datasets(
 def build_from_hf_datasets_batched(
     metadata_dataset: Dataset,
     reviews_dataset_dict,
-    index_path: str | Path = "data/processed/bm25_index.pkl",
-    corpus_path: str | Path = "data/processed/bm25_corpus.pkl",
     batch_size: int = 2000,
     max_reviews_per_product: int = 5,
     max_products: int | None = None,

             f"BM25 index not found at '{index_path}'.\n"
             "Run build_and_save() from your notebook first."
         )
     with open(index_path, "rb") as f:
         retriever = pickle.load(f)
     print(f"BM25 index loaded ← {index_path}")
     return retriever
         asin, title, text, rating, score, top_reviews
     """
     retriever.k = top_k
+    # Tokenize query the same way the index was built
+    tokenized_query = simple_tokenize(query)
+    # Get raw BM25 scores for ALL documents
+    scores = retriever.vectorizer.get_scores(tokenized_query)  # np.ndarray, len = n_docs
+    # Get top-k doc indices by score
+    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
     results = []
+    for idx in top_indices:
+        doc = retriever.docs[idx]          # retriever.docs holds the original Document list
+        m   = doc.metadata
         top_reviews = m.get("top_reviews", [])
         rated = [r["rating"] for r in top_reviews if r.get("rating") is not None]
         avg_rating = round(sum(rated) / len(rated), 1) if rated else 0.0
         if top_reviews and top_reviews[0].get("text"):
             snippet = top_reviews[0]["text"][:300]
         else:
             "title":       m.get("title", ""),
             "text":        snippet,
             "rating":      avg_rating,
+            "score":       float(scores[idx]),
             "top_reviews": top_reviews,
         })
 def build_from_hf_datasets(
     metadata_dataset: Dataset,
     reviews_dataset_dict,
+    index_path: str | Path = "data/processed/tokenisation/bm25_index.pkl",
+    corpus_path: str | Path = "data/processed/tokenisation/bm25_corpus.pkl",
     max_products: int | None = None,
     max_reviews_per_product: int = 5,
 ) -> BM25Retriever:
 def build_from_hf_datasets_batched(
     metadata_dataset: Dataset,
     reviews_dataset_dict,
+    index_path: str | Path = "data/processed/tokenisation/bm25_index.pkl",
+    corpus_path: str | Path = "data/processed/tokenisation/bm25_corpus.pkl",
     batch_size: int = 2000,
     max_reviews_per_product: int = 5,
     max_products: int | None = None,

utils/hybrid.py ADDED Viewed

	@@ -0,0 +1,240 @@

+"""
+utils/hybrid.py
+---------------
+Hybrid retriever combining BM25 keyword search and FAISS semantic search,
+fused with Reciprocal Rank Fusion (RRF).
+Designed to plug into the existing run_rag() pipeline in rag_pipeline.py
+as a drop-in replacement for the semantic retriever:
+    hybrid_retriever = load_hybrid_retriever(
+        bm25_index_path="data/processed/tokenisation/bm25_index_mini.pkl",
+        faiss_store_path="data/processed/embeddings",
+        k=5,
+    )
+    answer = run_rag(hybrid_retriever, "Best coffee beans for espresso")
+The HybridRetriever class extends LangChain's BaseRetriever so it is fully
+compatible with the | (pipe) operator used in rag_pipeline.py:
+    rag_chain = (
+        {
+            "context": hybrid_retriever | RunnableLambda(build_context),
+            "question": RunnablePassthrough(),
+        }
+        | prompt_template
+        | llm
+        | StrOutputParser()
+    )
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+from langchain_community.retrievers import BM25Retriever
+from langchain_community.vectorstores import FAISS
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from pydantic import Field
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# HybridRetriever
+# ---------------------------------------------------------------------------
+class HybridRetriever(BaseRetriever):
+    """
+    Combines BM25 keyword retrieval and FAISS semantic retrieval using
+    Reciprocal Rank Fusion (RRF) to produce a unified ranked document list.
+    Contribution of retriever r to document d (rank is 1-based):
+        score_r(d) = weight_r / (rrf_c + rank(d, r))
+    A document's fused score is the sum of its contributions, so results
+    returned by both retrievers are promoted. Documents are identified by their
+    ``parent_asin`` metadata, and each retriever contributes at most once per
+    product, using the product's best rank in that retriever's list.
+    Parameters
+    ----------
+    bm25_retriever   : Fitted LangChain BM25Retriever (from bm25.load())
+    semantic_store   : Loaded FAISS vectorstore (from semantic.load_vector_store())
+    k                : Number of final documents to return
+    rrf_c            : RRF constant — dampens the impact of rank differences.
+                       Standard value is 60; lower = top ranks matter more.
+    bm25_weight      : RRF weight for BM25 results (keyword signal)
+    semantic_weight  : RRF weight for semantic results (meaning signal)
+    fetch_multiplier : Fetch this multiple of k from each retriever before fusing.
+                       More candidates = better fusion quality. Default: 3.
+    """
+    bm25_retriever: Any = Field(...)
+    semantic_store: Any = Field(...)
+    k: int = Field(default=5)
+    rrf_c: int = Field(default=60)
+    bm25_weight: float = Field(default=0.5)
+    semantic_weight: float = Field(default=0.5)
+    fetch_multiplier: int = Field(default=3)
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun,
+    ) -> list[Document]:
+        """
+        Core retrieval logic called by LangChain when the retriever is invoked.
+        Steps
+        -----
+        1. Fetch k * fetch_multiplier candidates from BM25 and FAISS independently
+        2. Give every product one weighted RRF contribution per retriever
+        3. Sum the contributions of products found by both retrievers
+        4. Sort by fused score and return the top-k Documents
+        A retriever that raises is logged and treated as having returned nothing.
+        The returned Documents get two metadata keys written in place:
+        "hybrid_score" (fused score, 6 decimals) and "retrieval_source"
+        ("hybrid", "bm25" or "semantic").
+        """
+        fetch_k = self.k * self.fetch_multiplier
+        # ── 1. BM25 retrieval ────────────────────────────────────────────────
+        self.bm25_retriever.k = fetch_k
+        try:
+            bm25_docs: list[Document] = self.bm25_retriever.invoke(query)
+            logger.debug("BM25 returned %d docs for query: %r", len(bm25_docs), query)
+        except Exception as exc:
+            logger.warning("BM25 retrieval failed: %s — using empty list.", exc)
+            bm25_docs = []
+        # ── 2. Semantic retrieval ────────────────────────────────────────────
+        # similarity_search returns list[Document] (no scores needed — rank is enough for RRF)
+        try:
+            semantic_docs: list[Document] = self.semantic_store.similarity_search(
+                query, k=fetch_k
+            )
+            logger.debug(
+                "Semantic returned %d docs for query: %r", len(semantic_docs), query
+            )
+        except Exception as exc:
+            logger.warning("Semantic retrieval failed: %s — using empty list.", exc)
+            semantic_docs = []
+        # ── 3. RRF fusion ────────────────────────────────────────────────────
+        rrf_scores: dict[str, float] = {}
+        doc_map: dict[str, Document] = {}
+        sources: dict[str, set[str]] = {}
+        # BM25 goes first so that, for products found by both retrievers, the
+        # BM25 Document (richer metadata: top_reviews etc.) is the one returned.
+        ranked_lists = (
+            ("bm25", "bm25", bm25_docs, self.bm25_weight),
+            ("semantic", "sem", semantic_docs, self.semantic_weight),
+        )
+        for source, key_prefix, docs, weight in ranked_lists:
+            seen_here: set[str] = set()
+            for rank, doc in enumerate(docs):
+                # Dedup key is parent_asin; without one, a rank-based key keeps
+                # the document distinct from every other result.
+                key = doc.metadata.get("parent_asin") or f"{key_prefix}_{rank}"
+                if key in seen_here:
+                    # Same product again in the same list (e.g. several review
+                    # chunks): only its best rank counts, otherwise products
+                    # with many chunks would be inflated.
+                    continue
+                seen_here.add(key)
+                rrf_scores[key] = rrf_scores.get(key, 0.0) + weight / (self.rrf_c + rank + 1)
+                doc_map.setdefault(key, doc)
+                sources.setdefault(key, set()).add(source)
+        # ── 4. Sort and truncate ─────────────────────────────────────────────
+        # sorted() is stable, so ties keep first-seen order (BM25 before semantic)
+        ranked_keys = sorted(rrf_scores, key=rrf_scores.get, reverse=True)[: self.k]
+        top_docs = [doc_map[key] for key in ranked_keys]
+        # Attach fused score and provenance to metadata — useful for app display
+        for key, doc in zip(ranked_keys, top_docs):
+            doc.metadata["hybrid_score"] = round(rrf_scores[key], 6)
+            found_by = sources[key]
+            if len(found_by) == 2:
+                doc.metadata["retrieval_source"] = "hybrid"
+            elif "bm25" in found_by:
+                doc.metadata["retrieval_source"] = "bm25"
+            else:
+                doc.metadata["retrieval_source"] = "semantic"
+        logger.info(
+            "HybridRetriever: BM25=%d, Semantic=%d → fused=%d (returning top %d)",
+            len(bm25_docs), len(semantic_docs), len(rrf_scores), len(top_docs),
+        )
+        return top_docs
+# ---------------------------------------------------------------------------
+# Convenience loader
+# ---------------------------------------------------------------------------
+def load_hybrid_retriever(
+    bm25_index_path: str = "data/processed/tokenisation/bm25_index_mini.pkl",
+    faiss_store_path: str = "data/processed/embeddings",
+    k: int = 5,
+    bm25_weight: float = 0.5,
+    semantic_weight: float = 0.5,
+    rrf_c: int = 60,
+    fetch_multiplier: int = 3,
+) -> HybridRetriever:
+    """
+    Build a HybridRetriever from the BM25 index and FAISS store saved on disk.
+    Intended to be called once (notebook or app start-up); the returned object
+    is then handed to the RAG pipeline.
+    Parameters
+    ----------
+    bm25_index_path  : Location of the pickled BM25Retriever, passed to
+                       utils.bm25.load().
+    faiss_store_path : Location of the saved FAISS store, passed to
+                       utils.semantic.load_vector_store().
+    k                : How many documents the retriever returns per query.
+    bm25_weight      : RRF weight of the keyword (BM25) ranking.
+    semantic_weight  : RRF weight of the semantic ranking. The two weights
+                       need not sum to 1; only their ratio matters.
+    rrf_c            : RRF rank-dampening constant.
+    fetch_multiplier : Each retriever is asked for k * fetch_multiplier candidates.
+    Returns
+    -------
+    HybridRetriever
+        Configured with the two loaded indexes and the settings above.
+    Example
+    -------
+    >>> from utils.hybrid import load_hybrid_retriever
+    >>> hybrid = load_hybrid_retriever(k=5)
+    """
+    # Deferred imports: avoids circular imports when used from rag_pipeline.py
+    from utils.bm25 import load as load_bm25
+    from utils.semantic import load_vector_store
+    print(f"Loading BM25 index from: {bm25_index_path}")
+    keyword_index: BM25Retriever = load_bm25(bm25_index_path)
+    print(f"Loading FAISS store from: {faiss_store_path}")
+    vector_index: FAISS = load_vector_store(faiss_store_path)
+    fusion_settings = {
+        "k": k,
+        "bm25_weight": bm25_weight,
+        "semantic_weight": semantic_weight,
+        "rrf_c": rrf_c,
+        "fetch_multiplier": fetch_multiplier,
+    }
+    hybrid = HybridRetriever(
+        bm25_retriever=keyword_index,
+        semantic_store=vector_index,
+        **fusion_settings,
+    )
+    summary = (
+        f"HybridRetriever ready — k={k}, "
+        f"BM25 weight={bm25_weight}, Semantic weight={semantic_weight}, RRF c={rrf_c}"
+    )
+    print(summary)
+    return hybrid

utils/rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""
+rag_pipeline.py
+---------------
+Amazon product RAG (Retrieval-Augmented Generation) pipeline using
+LangChain + HuggingFace Inference Endpoints.
+Typical usage
+-------------
+>>> from utils.rag_pipeline import run_rag
+>>> answer, docs = run_rag(retriever, "Moisturizing shampoo for thick curly hair")
+>>> print(answer)
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+from langchain_core.documents import Document
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
+from utils.retrieval_helpers import _format_docs
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+DEFAULT_REPO_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+DEFAULT_MAX_NEW_TOKENS = 512
+DEFAULT_TOP_K = 5
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a helpful Amazon grocery shopping assistant.\n\n"
+    "You will receive a grocery query and a list of related Amazon products (including reviews and metadata).\n\n"
+    "Your response must follow this exact structure:\n\n"
+    "---\n\n"
+    "## 🛒 Recommended Products\n"
+    "For each product, write a numbered list entry, mentioning products by title "
+    "followed by 1–2 sentences describing the product and why it suits the query.\n\n"
+    "## 💡 Tips & Recipe Ideas\n"
+    "A bullet-point list of practical tips, storage advice, and brief recipe ideas related to the products above "
+    "(do NOT write out full recipes — keep each idea to 1–2 sentences)."
+    "Add food emojis if relevant.\n\n"
+    "---\n\n"
+    "Rules:\n"
+    "- Do not invent products. Only recommend products from the provided list.\n"
+    "- Keep descriptions factual and grounded in the provided reviews and metadata.\n"
+    "- Recipe ideas should be suggestions or ideas only, not step-by-step instructions.\n"
+    "- Format the entire response in Markdown.\n"
+    "- IMPORTANT: Whenever citing the product title: add the parent_asin in the following format [title](#parent_asin)"
+)
+# ---------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------
+import logging
+from langchain_core.runnables import RunnableLambda
+logger = logging.getLogger(__name__)
+def _make_verbose_tap(label: str, verbose: bool):
+    """
+    Returns a passthrough RunnableLambda that logs *value* when verbose=True.
+    Works for any chain step — docs, prompt messages, or raw strings.
+    """
+    def _tap(value):
+        if verbose:
+            if hasattr(value, "messages"):          # ChatPromptValue
+                rendered = "\n".join(
+                    f"[{m.type.upper()}]: {m.content}"
+                    for m in value.messages
+                )
+            elif isinstance(value, list):            # list of Documents
+                rendered = "\n".join(str(d) for d in value)
+            else:
+                rendered = str(value)
+            print(f"\n{'='*60}\n{label}\n{'='*60}\n{rendered}\n")
+            logger.debug("%s\n%s", label, rendered)
+        return value
+    return RunnableLambda(_tap)
+def build_context(docs: list[Document]) -> str:
+    """
+    Concatenate a list of retrieved LangChain Documents into a single
+    context string that the LLM can reason over.
+    Each entry includes the product's ``parent_asin`` (falling back to its
+    position index), its page content, and its full metadata dict.
+    Parameters
+    ----------
+    docs:
+        List of ``langchain_core.documents.Document`` objects returned by
+        the retriever.
+    Returns
+    -------
+    str
+        A newline-separated block of product descriptions ready for prompt
+        injection. Returns an empty string when *docs* is empty.
+    Raises
+    ------
+    TypeError
+        If *docs* is not a list, or any element is not a ``Document``.
+    """
+    if not isinstance(docs, list):
+        raise TypeError(
+            f"'docs' must be a list of Document objects, got {type(docs).__name__}."
+        )
+    for i, doc in enumerate(docs):
+        if not isinstance(doc, Document):
+            raise TypeError(
+                f"Element at index {i} is not a Document; got {type(doc).__name__}."
+            )
+    if not docs:
+        logger.warning("build_context received an empty document list.")
+        return ""
+    return "\n\n".join(
+        f"ASIN {doc.metadata.get('parent_asin', n)} Description: {doc.page_content}\n"
+        f"Metadata: {doc.metadata}"
+        for n, doc in enumerate(docs)
+    )
+def _build_llm(
+    repo_id: str,
+    max_new_tokens: int,
+    provider: str,
+) -> ChatHuggingFace:
+    """
+    Instantiate and return a ``ChatHuggingFace`` model backed by a
+    HuggingFace Inference Endpoint.
+    Parameters
+    ----------
+    repo_id:
+        HuggingFace Hub model identifier (e.g.
+        ``"meta-llama/Meta-Llama-3-8B-Instruct"``).
+    max_new_tokens:
+        Maximum number of tokens the model may generate per call.
+    provider:
+        Inference provider passed to ``HuggingFaceEndpoint``
+        (``"auto"``, ``"novita"``, etc.).
+    Returns
+    -------
+    ChatHuggingFace
+        A chat-compatible wrapper around the endpoint.
+    """
+    endpoint = HuggingFaceEndpoint(
+        repo_id=repo_id,
+        task="text-generation",
+        max_new_tokens=max_new_tokens,
+        provider=provider,
+    )
+    return ChatHuggingFace(llm=endpoint)
+def _build_prompt_template(system_prompt: str) -> ChatPromptTemplate:
+    """
+    Create a ``ChatPromptTemplate`` with a system message and a human
+    turn that injects ``{context}`` and ``{question}`` placeholders.
+    Parameters
+    ----------
+    system_prompt:
+        The system-level instruction string.
+    Returns
+    -------
+    ChatPromptTemplate
+    """
+    return ChatPromptTemplate.from_messages([
+        ("system", system_prompt),
+        (
+            "human",
+            "context:\n{context}\n\nquestion:\n{question}\n\n"
+            "Answer based on the Amazon datasets:",
+        ),
+    ])
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def run_rag(
+    retriever: Any,
+    query: str,
+    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
+    repo_id: str = DEFAULT_REPO_ID,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    provider: str = "auto",
+    verbose: bool = False,
+    hf_dataset = None
+) -> str:
+    """
+    Execute a full RAG pipeline and return the model's answer.
+    The pipeline follows the steps below:
+    1. **Retrieve** - *retriever* fetches the *k* most relevant documents
+       for *query*.
+    2. **Format context** - :func:`build_context` serialises the documents
+       into a single string.
+    3. **Prompt** - the context and query are injected into the chat prompt
+       template.
+    4. **Generate** - the LLM produces an answer grounded in the context.
+    5. **Parse** - the raw chat message is unwrapped to a plain string.
+    Parameters
+    ----------
+    retriever:
+        A LangChain-compatible retriever (must expose ``.invoke()`` and be
+        pipeable with ``|``).  Typically created via
+        ``vectorstore.as_retriever(...)``.
+    query:
+        Natural-language question to answer (non-empty string).
+    system_prompt:
+        System-level instruction for the assistant.  Defaults to
+        :data:`DEFAULT_SYSTEM_PROMPT`.
+    repo_id:
+        HuggingFace Hub model identifier.  Defaults to
+        ``"meta-llama/Meta-Llama-3-8B-Instruct"``.
+    max_new_tokens:
+        Upper bound on generated tokens.  Must be a positive integer.
+        Defaults to ``100``.
+    provider:
+        HuggingFace inference provider (e.g. ``"auto"``, ``"novita"``).
+        Defaults to ``"auto"``.
+    Returns
+    -------
+    str
+        The model's answer as a plain string.
+    Raises
+    ------
+    TypeError
+        If *retriever* is ``None``, *query* is not a string, or
+        *system_prompt* is not a string.
+    ValueError
+        If *query* is blank, *max_new_tokens* is not a positive integer,
+        or *repo_id* / *provider* are blank strings.
+    Examples
+    --------
+    >>> answer = run_rag(retriever, "Best waterproof mascara under $20")
+    >>> print(answer)
+    """
+    # ------------------------------------------------------------------
+    # Build chain components
+    # ------------------------------------------------------------------
+    logger.info("Initialising LLM endpoint: %s", repo_id)
+    llm = _build_llm(repo_id, max_new_tokens, provider)
+    prompt_template = _build_prompt_template(system_prompt)
+    retrieved_docs: list[Document] = []   # ← capture target
+    def _retrieve_and_capture(query: str) -> list[Document]:
+        """Invoke the retriever and snapshot the results for the caller."""
+        docs = retriever.invoke(query)
+        retrieved_docs.extend(docs)       # ← populate closure variable
+        return docs                       # ← pass through to build_context
+    rag_chain = (
+        {
+            "context": RunnableLambda(_retrieve_and_capture)
+                       | RunnableLambda(build_context)
+                       | _make_verbose_tap("RETRIEVED CONTEXT", verbose),
+            "question": RunnablePassthrough(),
+        }
+        | _make_verbose_tap("PROMPT INPUTS (context + question)", verbose)
+        | prompt_template
+        | _make_verbose_tap("RENDERED PROMPT SENT TO LLM", verbose)   # ← shows exact prompt
+        | llm
+        | StrOutputParser()
+    )
+    # ------------------------------------------------------------------
+    # Run
+    # ------------------------------------------------------------------
+    logger.info("Invoking RAG chain for query: %r", query)
+    answer: str = rag_chain.invoke(query)
+    logger.debug("RAG answer: %s", answer)
+    if hf_dataset:
+        docs = _format_docs(retrieved_docs, hf_dataset)
+    else:
+        docs = retrieved_docs
+    return answer, docs

utils/retrieval_helpers.py CHANGED Viewed

@@ -82,6 +82,7 @@ def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
     # 4. Return JSON metadata objects
     return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
 def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
     """
     Perform BM25 search and enrich results with HuggingFace dataset metadata.
@@ -137,5 +138,57 @@ def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
     con.close()
     # 4. Return JSON metadata objects
     return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]

     # 4. Return JSON metadata objects
     return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
 def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
     """
     Perform BM25 search and enrich results with HuggingFace dataset metadata.
     con.close()
+    # 4. Return JSON metadata objects
+    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
+def _format_docs(results, hf_dataset):
+    """
+    Perform similarity search and enrich results with HuggingFace dataset metadata.
+    Args:
+        vector_store: LangChain vector store instance
+        query: Search query string
+        k: Number of results to return
+        filter: Filter dict for similarity search
+        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
+    Returns:
+        List of enriched metadata objects as dicts
+    """
+    # 1. Extract parent_asins from metadata
+    parent_asins = [doc.metadata.get("parent_asin") for doc in results]
+    # 2. Query HuggingFace dataset via DuckDB
+    con = duckdb.connect()
+    arrow_table = hf_dataset.data.table  # Get underlying PyArrow table
+    con.register("hf_table", arrow_table)
+    asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
+    query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
+    hf_rows = con.execute(query_sql).fetchdf()
+    # Build lookup: parent_asin -> metadata dict
+    asin_to_metadata = {
+        row["parent_asin"]: row.to_dict()
+        for _, row in hf_rows.iterrows()
+    }
+    enriched_results = []
+    for doc in results:
+        parent_asin = doc.metadata.get("parent_asin")
+        total_reviews = doc.metadata.get("total_reviews")
+        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
+        metadata_object['total_reviews'] = total_reviews
+        # 3. Extract 3 lines after "Top Reviews\n" from page_content
+        page_content = doc.page_content
+        metadata_object["reviews"] = decode_ratings(page_content)
+        enriched_results.append(metadata_object)
+    con.close()
     # 4. Return JSON metadata objects
     return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]