Spaces:

rishadaz
/

amazon_retriever

Sleeping

App Files Files Community

rishadaz committed on Apr 12

Commit

a80240d

verified ·

1 Parent(s): 08b4e10

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +314 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,316 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import csv, sys, os
+from datetime import datetime
+from pathlib import Path
 import streamlit as st
+# ─── Base directory for imports and data paths ───────────────────────────────
+# ROOT is the directory that contains this script.
+# NOTE(review): the commit header lists this file as src/streamlit_app.py, in
+# which case ROOT is the src/ folder rather than the repository root — confirm
+# that the `src.*` imports below and the paths built from ROOT resolve as
+# intended.
+ROOT = Path(__file__).resolve().parent
+sys.path.append(str(ROOT))  # make modules located under ROOT importable
+from src.retrieval_helpers import enrich_search_results
+from src.semantic import load_vector_store
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+# ─── Page config (must be first Streamlit call) ───────────────────────────────
+# Sets the browser-tab title and icon, uses the wide layout, and starts with
+# the sidebar (which holds the feedback log) collapsed.
+st.set_page_config(
+    page_title="Groceries & Gourmet Food Search",
+    page_icon="🥕",
+    layout="wide",
+    initial_sidebar_state="collapsed",
+)
+# ─── Paths ────────────────────────────────────────────────────────────────────
+# CSV file that log_feedback() appends votes to and that the sidebar reads back.
+FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
+# Create the results/ folder up front so the first append cannot fail on a
+# missing directory.
+FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
+# ─── Load HF dataset (cached so it only runs once) ───────────────────────────
+from datasets import load_dataset
+@st.cache_resource
+def load_hf_dataset():
+    """Load the Grocery & Gourmet Food metadata config of Amazon-Reviews-2023.
+    Decorated with ``st.cache_resource`` so script reruns reuse the object
+    returned by the first call instead of loading it again.
+    """
+    repo_id = "McAuley-Lab/Amazon-Reviews-2023"
+    config_name = "raw_meta_Grocery_and_Gourmet_Food"
+    dataset = load_dataset(repo_id, config_name, trust_remote_code=True)
+    return dataset
+# Loaded while the script runs; semantic_search() reads its "full" entry.
+HF_DATASET = load_hf_dataset()
+# ─── Locate the semantic vector store ────────────────────────────────────────
+from huggingface_hub import hf_hub_download, snapshot_download
+# The store is read from the /data path. The copy under ROOT is used only when
+# the /data directory is missing and the ROOT copy exists; previously both
+# paths were assigned back to back, so the ROOT-relative one was a dead store.
+# NOTE(review): nothing in this file downloads the store — hf_hub_download and
+# snapshot_download are imported but never called, so the directory must
+# already be populated.
+_PERSISTENT_VECTOR_STORE_DIR = Path("/data/embeddings/semantic_vector_store")
+_BUNDLED_VECTOR_STORE_DIR = ROOT / "embeddings" / "semantic_vector_store"
+VECTOR_STORE_DIR = (
+    _BUNDLED_VECTOR_STORE_DIR
+    if not _PERSISTENT_VECTOR_STORE_DIR.exists() and _BUNDLED_VECTOR_STORE_DIR.exists()
+    else _PERSISTENT_VECTOR_STORE_DIR
+)
+@st.cache_resource
+def load_vector_store_cached():
+    """Return the vector store loaded from ``VECTOR_STORE_DIR``.
+    ``st.cache_resource`` keeps the returned object, so ``load_vector_store``
+    runs once rather than on every search.
+    """
+    return load_vector_store(VECTOR_STORE_DIR)
+# ─── Custom CSS ───────────────────────────────────────────────────────────────
+# Injects one <style> block for the whole page: fonts, the green banner, the
+# product card, the review snippet shown inside each expander, the score badge,
+# the star colour, and the yellow placeholder notice. unsafe_allow_html is
+# required because the payload is raw HTML rather than Markdown.
+st.markdown(
+    """
+    <style>
+    @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@600&family=Source+Sans+3:wght@400;600&display=swap');
+    html, body, [class*="css"] {
+        font-family: 'Source Sans 3', sans-serif;
+    }
+    h1, h2, h3 { font-family: 'Playfair Display', serif; }
+    .banner {
+        background: linear-gradient(135deg, #2d4a22 0%, #4a7c3f 60%, #7aab5c 100%);
+        border-radius: 12px;
+        padding: 2rem 2.5rem;
+        margin-bottom: 1.5rem;
+        color: #f5f0e8;
+    }
+    .banner h1 { margin: 0; font-size: 2.4rem; color: #f5f0e8; }
+    .banner p  { margin: 0.3rem 0 0; font-size: 1.05rem; opacity: 0.85; }
+    /* Product card (outer) */
+    .product-card {
+        background: #fffdf7;
+        border: 1px solid #e2d9c8;
+        border-left: 4px solid #4a7c3f;
+        border-radius: 8px;
+        padding: 1rem 1.2rem 0.6rem;
+        margin-bottom: 0.4rem;
+        box-shadow: 0 1px 4px rgba(0,0,0,0.06);
+    }
+    .product-card h4 { margin: 0 0 0.2rem; color: #1e3318; font-size: 1.05rem; }
+    /* Review snippet inside expander */
+    .review-snippet {
+        background: #f7f4ee;
+        border-radius: 6px;
+        padding: 0.6rem 0.9rem;
+        margin-bottom: 0.5rem;
+        font-size: 0.87rem;
+        color: #444;
+        line-height: 1.55;
+    }
+    .score-badge {
+        display: inline-block;
+        background: #eaf3e6;
+        color: #2d5a20;
+        border-radius: 20px;
+        padding: 2px 10px;
+        font-size: 0.78rem;
+        font-weight: 600;
+        margin-right: 6px;
+    }
+    .stars { color: #e6a817; }
+    .placeholder-badge {
+        background: #fff3cd;
+        border: 1px solid #ffc107;
+        border-radius: 6px;
+        padding: 0.4rem 0.8rem;
+        font-size: 0.82rem;
+        color: #7a5800;
+        display: inline-block;
+        margin-bottom: 1rem;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+# ─── Placeholder retrieval functions ──────────────────────────────────────────
+# TODO: Replace the BM25 placeholder with the real retriever once src/bm25.py
+# is ready (semantic search is already served through src.semantic):
+#   from src.bm25 import BM25Retriever
+#   from src.semantic import SemanticRetriever
+#
+# Expected return format — list of dicts with keys:
+#   asin (str), title (str), text (str), rating (float), score (float)
+# NOTE(review): render_results() reads "parent_asin", "average_rating" and
+# "reviews" instead — reconcile the two schemas before filling this list.
+#
+# Kept as a list (it used to be a dict, which cannot be sliced, so every BM25
+# query raised before any result was rendered).
+DUMMY_RESULTS: list[dict] = []
+def bm25_search(query: str, top_k: int = 3) -> list[dict]:
+    """
+    PLACEHOLDER — swap with real BM25Retriever call, e.g.:
+        retriever = BM25Retriever.load('data/processed/bm25_index.pkl')
+        return retriever.search(query, top_k=top_k)
+    Args:
+        query: search text; ignored by the placeholder.
+        top_k: maximum number of entries to return; values below zero are
+            treated as zero.
+    Returns:
+        Shallow copies of the first ``top_k`` entries of ``DUMMY_RESULTS``, so
+        callers can edit a result without touching the shared list. Empty
+        while ``DUMMY_RESULTS`` is empty.
+    """
+    limit = max(top_k, 0)
+    return [r.copy() for r in DUMMY_RESULTS[:limit]]
+def semantic_search(query: str, top_k: int = 3) -> list[dict]:
+    """
+    Run the embedding-based search for ``query``.
+    Fetches the cached vector store and passes it, together with the query,
+    ``top_k`` and the "full" entry of ``HF_DATASET``, to
+    ``enrich_search_results``; whatever that helper returns is handed back
+    unchanged.
+    NOTE(review): the result schema is defined by enrich_search_results —
+    presumably the product-level dicts that render_results() consumes; confirm.
+    """
+    return enrich_search_results(
+        load_vector_store_cached(), query, top_k, HF_DATASET["full"]
+    )
+# ─── Helpers ──────────────────────────────────────────────────────────────────
+def stars(rating: float) -> str:
+    """Render ``rating`` as a five-glyph star string.
+    Args:
+        rating: score on a 0–5 scale. ``None``, NaN and values that cannot be
+            converted to a number count as 0; values outside 0–5 are clamped.
+    Returns:
+        Exactly five characters: one "★" per whole point, a "½" when the
+        fractional part is at least 0.5, and "☆" for the remainder.
+    """
+    try:
+        value = float(rating)
+    except (TypeError, ValueError):
+        value = 0.0
+    if value != value:
+        # NaN is the only value unequal to itself; int(NaN) would raise.
+        value = 0.0
+    # Clamp so the glyph counts can never go negative or exceed five.
+    value = min(max(value, 0.0), 5.0)
+    full = int(value)
+    half = 1 if (value - full) >= 0.5 else 0
+    empty = 5 - full - half
+    return "★" * full + "½" * half + "☆" * empty
+def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> None:
+    """Append one feedback vote to ``FEEDBACK_CSV``.
+    Args:
+        query: the search text the user entered.
+        mode: the search mode the result came from.
+        asin: identifier of the product that was voted on.
+        title: title of that product.
+        vote: the vote value (callers in this file pass "up" or "down").
+    The row also gets a ``timestamp`` column holding the local time in ISO
+    format. A header row is written first whenever the file is missing or
+    empty.
+    """
+    fieldnames = ["timestamp", "query", "mode", "asin", "title", "vote"]
+    # Checking only for existence would leave an existing-but-empty file
+    # without a header, so the size is checked as well.
+    needs_header = not FEEDBACK_CSV.exists() or FEEDBACK_CSV.stat().st_size == 0
+    with open(FEEDBACK_CSV, "a", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        if needs_header:
+            writer.writeheader()
+        writer.writerow({
+            "timestamp": datetime.now().isoformat(),
+            "query":     query,
+            "mode":      mode,
+            "asin":      asin,
+            "title":     title,
+            "vote":      vote,
+        })
+def _price_html(raw_price) -> str:
+    """Return a green price ``<span>`` for ``raw_price``, or ``''`` when it is not a usable number."""
+    from math import isfinite
+    try:
+        price_val = float(str(raw_price).replace('$', '').replace(',', '').strip())
+    except (TypeError, ValueError):
+        return ''
+    if not isfinite(price_val):
+        # float() also parses "nan" and "inf"; those would be shown as "$nan".
+        return ''
+    return f'<span style="color:#2ecc71;font-weight:600">${price_val:.2f}</span>'
+def _review_snippet_html(rev: dict) -> str:
+    """Return the HTML for one review, with its title and text HTML-escaped."""
+    from html import escape
+    rating = rev.get('rating')
+    if rating is None:
+        rating = 0
+    raw_text = str(rev.get('text') or '')
+    suffix = '…' if len(raw_text) > 300 else ''
+    # Truncate before escaping so an entity such as "&amp;" is never cut in
+    # half, then turn newlines into <br>: a blank line inside the snippet
+    # would otherwise end the raw-HTML block in Markdown.
+    body = escape(raw_text[:300])
+    body = body.replace('\r\n', '<br>').replace('\n', '<br>').replace('\r', '<br>')
+    parts = [
+        '<div class="review-snippet">',
+        f"<strong>{escape(str(rev.get('title') or ''))}</strong>",
+        '&nbsp;·&nbsp;',
+        f'<span class="stars">{stars(rating)}</span>',
+        f'<span style="color:#888; font-size:0.8rem"> {rating}/5</span>',
+        '&nbsp;·&nbsp;',
+        '<br><br>',
+        f'{body}{suffix}',
+        '</div>',
+    ]
+    return "\n".join(parts)
+def render_results(results: list[dict], mode: str, query: str) -> None:
+    """Render one product card per result, with its reviews and vote buttons.
+    Args:
+        results: product-level dicts. Keys read here: "reviews" (list of dicts
+            with "title", "rating", "text"), "title", "average_rating",
+            "total_reviews", "rating_number", "parent_asin", "images" (dict
+            with a "large" list), "price" and "score".
+        mode: search-mode label; used in the button keys and written to the
+            feedback log.
+        query: the search text; written to the feedback log.
+    Text taken from ``results`` is HTML-escaped before it is placed into the
+    ``unsafe_allow_html`` markup, and missing or ``None`` fields fall back to
+    neutral values instead of raising.
+    """
+    from html import escape
+    if not results:
+        st.info("No results returned.")
+        return
+    for ind, item in enumerate(results):
+        reviews       = item.get("reviews") or []
+        title         = str(item.get("title") or "")
+        try:
+            avg_rating = float(item.get("average_rating") or 0)
+        except (TypeError, ValueError):
+            avg_rating = 0.0
+        if avg_rating != avg_rating:
+            # NaN would be printed as "nan/5"; show it as unrated instead.
+            avg_rating = 0.0
+        n_reviews     = len(reviews)
+        total_reviews = item.get('total_reviews', n_reviews)
+        rating_number = item.get('rating_number') or 0
+        asin          = item.get('parent_asin') or ''
+        review_word   = "review" if n_reviews == 1 else "reviews"
+        images        = item.get('images')
+        large_images  = (images.get('large') or []) if isinstance(images, dict) else []
+        image_html = ''
+        if large_images:
+            image_src = escape(str(large_images[0]), quote=True)
+            image_html = f'<img src="{image_src}" style="width:100%;max-width:200px;border-radius:8px;margin-bottom:8px;" />'
+        price_html = _price_html(item.get('price'))
+        score_html = escape(str(item.get('score', '')))
+        # ── Product card header ───────────────────────────────────────────
+        # Empty parts are dropped and the rest joined without gaps, because a
+        # blank line would end the raw-HTML block in Markdown.
+        card_parts = [
+            '<div class="product-card">',
+            image_html,
+            f'<h4>#{ind + 1} &nbsp; {escape(title)}</h4>',
+            f'<span class="stars">{stars(avg_rating)}</span>',
+            f'&nbsp;<small style="color:#888">{avg_rating:.1f}/5 avg ({rating_number:,} ratings)</small>',
+            '&nbsp;&nbsp;',
+            f'<span class="score-badge">similarity score: {score_html}</span>',
+            "&nbsp;&nbsp;" + price_html if price_html else "",
+            '</div>',
+        ]
+        st.markdown(
+            "\n".join(part for part in card_parts if part),
+            unsafe_allow_html=True,
+        )
+        # ── Reviews in collapsible expander ───────────────────────────────
+        expander_label = f"📖 View {n_reviews} of total {total_reviews} {review_word} "
+        with st.expander(expander_label, expanded=(n_reviews == 1)):
+            for rev in reviews:
+                st.markdown(_review_snippet_html(rev), unsafe_allow_html=True)
+        # ── Feedback buttons (per product) ────────────────────────────────
+        # The position index keeps widget keys unique when an ASIN repeats.
+        col_up, col_dn, _ = st.columns([1, 1, 10])
+        with col_up:
+            if st.button("👍", key=f"up_{mode}_{asin}_{ind}"):
+                log_feedback(query, mode, asin, title, "up")
+                st.toast("Thanks! 👍")
+        with col_dn:
+            if st.button("👎", key=f"dn_{mode}_{asin}_{ind}"):
+                log_feedback(query, mode, asin, title, "down")
+                st.toast("Noted! 👎")
+        st.markdown("<hr style='border:none;border-top:1px solid #e8e0d0;margin:0.5rem 0 1rem'>", unsafe_allow_html=True)
+# ─── App layout ───────────────────────────────────────────────────────────────
+# Page banner, styled by the .banner rule in the custom CSS.
+st.markdown(
+    """
+    <div class="banner">
+        <h1>🥕🧀 Groceries & Gourmet Food Search</h1>
+        <p>Amazon Products & Reviews · Groceries & Gourmet Food </p>
+    </div>
+    """,
+    unsafe_allow_html=True,
+)
+# Notice shown under the banner on every run.
+# NOTE(review): the text says neither index is loaded, yet semantic_search()
+# already queries the real vector store — only BM25 is still a placeholder.
+# Consider rewording the notice.
+st.markdown(
+    '<div class="placeholder-badge">⚠️ Placeholder mode — real BM25 / Semantic indices not yet loaded</div>',
+    unsafe_allow_html=True,
+)
+# ─── Search bar ───────────────────────────────────────────────────────────────
+# Free-text query; results are shown only when it contains non-whitespace text.
+query = st.text_input(
+    "Search for a product or describe what you're looking for",
+    placeholder="e.g. something sweet for a cheese board...",
+)
+# ─── Mode radio ───────────────────────────────────────────────────────────────
+# Picks which retrieval function handles the query. The option strings are
+# compared against "BM25" when the search runs, so keep them in sync.
+mode = st.radio(
+    "Search mode",
+    options=["BM25", "Semantic"],
+    index=0,            # BM25 shown by default
+    horizontal=True,
+    help="BM25 = keyword matching · Semantic = embedding similarity (all-MiniLM-L6-v2 + FAISS)",
+)
+# ─── Run & render ─────────────────────────────────────────────────────────────
+TOP_K = 5  # fixed per milestone requirement
+if not query.strip():
+    # Nothing typed yet (or whitespace only): show a hint instead of results.
+    st.markdown(
+        "<p style='color:#aaa; margin-top:1rem;'>Enter a query above to see results.</p>",
+        unsafe_allow_html=True,
+    )
+else:
+    st.markdown(f"#### Top {TOP_K} results — {mode}")
+    # "BM25" selects the keyword retriever; any other mode value goes to
+    # semantic search.
+    search_fn = bm25_search if mode == "BM25" else semantic_search
+    results = search_fn(query, top_k=TOP_K)
+    render_results(results, mode=mode.lower(), query=query)
+# ─── Sidebar: feedback log ────────────────────────────────────────────────────
+# Shows the last 20 logged votes and offers the full log as a download.
+with st.sidebar:
+    st.header("📋 Feedback Log")
+    if not FEEDBACK_CSV.exists():
+        st.info("No feedback yet — use 👍/👎 on results.")
+    else:
+        import pandas as pd
+        feedback_df = pd.read_csv(FEEDBACK_CSV)
+        recent_rows = feedback_df.tail(20)
+        st.dataframe(recent_rows, use_container_width=True)
+        csv_payload = feedback_df.to_csv(index=False)
+        st.download_button(
+            "⬇️ Download feedback.csv",
+            data=csv_payload,
+            file_name="feedback.csv",
+            mime="text/csv",
+        )