Spaces:

rishadaz
/

amazon_retriever

Running

App Files Community

github-actions[bot] committed on Apr 23

Commit

251d75e

1 Parent(s): 845adc6

chore: sync app/ and src/ from GitHub

Browse files

Files changed (5) hide show

app/app.py +58 -8
src/bm25.py +7 -0
src/rag_pipeline.py +8 -1
src/semantic.py +6 -0
src/utils.py +2 -0

app/app.py CHANGED Viewed

@@ -52,6 +52,14 @@ VECTOR_STORE_DIR = ROOT / "data" / "processed"
 @st.cache_resource
 def load_vector_store_cached():
     login(token=HF_TOKEN, add_to_git_credential=False)
     VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
@@ -97,10 +105,17 @@ else:
 def bm25_search(query: str, top_k: int = 3) -> list[dict]:
     """
-    PLACEHOLDER — swap with real BM25Retriever call, e.g.:
-        retriever = BM25Retriever.load('data/processed/bm25_index.pkl')
-        return retriever.search(query, top_k=top_k)
-    Returns top_k review-level results (may include multiple reviews per ASIN).
     """
     results = search(retriever, query, top_k)
@@ -109,10 +124,17 @@ def bm25_search(query: str, top_k: int = 3) -> list[dict]:
 def semantic_search(query: str, top_k: int = 3) -> list[dict]:
     """
-    PLACEHOLDER — swap with real SemanticRetriever call, e.g.:
-        retriever = SemanticRetriever.load('data/processed/faiss_index')
-        return retriever.search(query, top_k=top_k)
-    Returns top_k review-level results (scores are cosine similarities, 0–1).
     """
     results = enrich_search_results(vector_store, query, top_k)
@@ -128,12 +150,37 @@ hybrid_retriever = HybridRetriever(
 def llm_retriever(query: str, top_k: int = 5):
     answer, docs, web_sources = run_rag(hybrid_retriever, query=query)
     return answer, docs, web_sources
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def stars(rating: float) -> str:
     full  = int(rating)
     half  = 1 if (rating - full) >= 0.5 else 0
     empty = 5 - full - half
@@ -141,6 +188,7 @@ def stars(rating: float) -> str:
 def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> None:
     file_exists = FEEDBACK_CSV.exists()
     with open(FEEDBACK_CSV, "a", newline="", encoding="utf-8") as f:
         writer = csv.DictWriter(
@@ -158,6 +206,7 @@ def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> Non
         })
 def render_product(ind, item, mode):
     item = dict(item)
     if "reviews" in item.keys():
         reviews     = item.get("reviews",{})
@@ -240,6 +289,7 @@ def render_product(ind, item, mode):
 def render_results(results: list[dict], mode: str) -> None:
     if not results:
         st.info("No results returned.")
         return

 @st.cache_resource
 def load_vector_store_cached():
+    """
+    Load vector store and BM25 index from Hugging Face or local cache.
+    Returns
+    -------
+    tuple
+        (vector_store, bm25_retriever)
+    """
     login(token=HF_TOKEN, add_to_git_credential=False)
     VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
 def bm25_search(query: str, top_k: int = 3) -> list[dict]:
     """
+    Run BM25 keyword search.
+    Parameters
+    ----------
+    query : str
+    top_k : int
+    Returns
+    -------
+    list[dict]
+        Top-k retrieved results.
     """
     results = search(retriever, query, top_k)
 def semantic_search(query: str, top_k: int = 3) -> list[dict]:
     """
+    Run semantic (embedding-based) search.
+    Parameters
+    ----------
+    query : str
+    top_k : int
+    Returns
+    -------
+    list[dict]
+        Top-k retrieved results with scores.
     """
     results = enrich_search_results(vector_store, query, top_k)
 def llm_retriever(query: str, top_k: int = 5):
+    """
+    Run RAG pipeline using hybrid retriever.
+    Parameters
+    ----------
+    query : str
+    top_k : int
+    Returns
+    -------
+    tuple
+        (answer, retrieved_docs, web_sources)
+    """
     answer, docs, web_sources = run_rag(hybrid_retriever, query=query)
     return answer, docs, web_sources
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def stars(rating: float) -> str:
+    """
+    Convert numeric rating into star string.
+    Parameters
+    ----------
+    rating : float
+    Returns
+    -------
+    str
+        Star representation (e.g., ★★★★½).
+    """
     full  = int(rating)
     half  = 1 if (rating - full) >= 0.5 else 0
     empty = 5 - full - half
 def log_feedback(query: str, mode: str, asin: str, title: str, vote: str) -> None:
+    """Append user feedback to CSV log."""
     file_exists = FEEDBACK_CSV.exists()
     with open(FEEDBACK_CSV, "a", newline="", encoding="utf-8") as f:
         writer = csv.DictWriter(
         })
 def render_product(ind, item, mode):
+    """Render a single product card with reviews and feedback buttons."""
     item = dict(item)
     if "reviews" in item.keys():
         reviews     = item.get("reviews",{})
 def render_results(results: list[dict], mode: str) -> None:
+    """Render a list of product results."""
     if not results:
         st.info("No results returned.")
         return

src/bm25.py CHANGED Viewed

@@ -365,6 +365,13 @@ def search(
     query: str,
     top_k: int = 3,
 ) -> list[dict]:
     retriever.k = top_k
     # Tokenize query the same way the index was built

     query: str,
     top_k: int = 3,
 ) -> list[dict]:
+    """
+    Search the BM25Retriever for a query, returning metadata of top-k results.
+    Performs a BM25 keyword search on the indexed documents. Tokenizes the query
+    using the same tokenizer as the index, computes BM25 scores for all documents,
+    and returns structured metadata (including score) for the top-k matches.
+    """
     retriever.k = top_k
     # Tokenize query the same way the index was built

src/rag_pipeline.py CHANGED Viewed

@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
-DEFAULT_REPO_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 DEFAULT_MAX_NEW_TOKENS = 512
 DEFAULT_TOP_K = 5
@@ -97,7 +97,9 @@ def _maybe_web_search(query: str) -> tuple[str, list[dict]]:
 def _make_verbose_tap(label: str, verbose: bool):
     def _tap(value):
         if verbose:
             if hasattr(value, "messages"):
                 rendered = "\n".join(
@@ -115,6 +117,7 @@ def _make_verbose_tap(label: str, verbose: bool):
 def build_context(docs: list[Document]) -> str:
     if not isinstance(docs, list):
         raise TypeError(
             f"'docs' must be a list of Document objects, got {type(docs).__name__}."
@@ -139,6 +142,7 @@ def _build_llm(
     max_new_tokens: int,
     provider: str,
 ) -> ChatHuggingFace:
     endpoint = HuggingFaceEndpoint(
         repo_id=repo_id,
         task="text-generation",
@@ -149,6 +153,7 @@ def _build_llm(
 def _build_prompt_template(system_prompt: str) -> ChatPromptTemplate:
     return ChatPromptTemplate.from_messages([
         ("system", system_prompt),
         (
@@ -172,6 +177,7 @@ def run_rag(
     provider: str = "auto",
     verbose: bool = False,
 ) -> tuple[str, list[Document]]:
     # ------------------------------------------------------------------
     # Build chain components
     # ------------------------------------------------------------------
@@ -184,6 +190,7 @@ def run_rag(
     retrieved_docs: list[Document] = []
     def _retrieve_and_capture(query: str) -> list[Document]:
         docs = retriever.invoke(query)
         retrieved_docs.extend(docs)
         return docs

 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
+DEFAULT_REPO_ID = "Qwen/Qwen2.5-7B-Instruct"
 DEFAULT_MAX_NEW_TOKENS = 512
 DEFAULT_TOP_K = 5
 def _make_verbose_tap(label: str, verbose: bool):
+    """Returns a Runnable that prints the value with a label if verbose=True, then passes it through unchanged."""
     def _tap(value):
+        """Prints the value with a label if verbose=True, then returns it unchanged."""
         if verbose:
             if hasattr(value, "messages"):
                 rendered = "\n".join(
 def build_context(docs: list[Document]) -> str:
+    """Converts a list of Documents into a single string context for the LLM."""
     if not isinstance(docs, list):
         raise TypeError(
             f"'docs' must be a list of Document objects, got {type(docs).__name__}."
     max_new_tokens: int,
     provider: str,
 ) -> ChatHuggingFace:
+    """Initializes a HuggingFaceEndpoint and wraps it in a ChatHuggingFace LLM."""
     endpoint = HuggingFaceEndpoint(
         repo_id=repo_id,
         task="text-generation",
 def _build_prompt_template(system_prompt: str) -> ChatPromptTemplate:
+    """Constructs a ChatPromptTemplate with the given system prompt and a fixed human prompt."""
     return ChatPromptTemplate.from_messages([
         ("system", system_prompt),
         (
     provider: str = "auto",
     verbose: bool = False,
 ) -> tuple[str, list[Document]]:
+    """Runs a Retrieval-Augmented Generation (RAG) chain for a grocery query."""
     # ------------------------------------------------------------------
     # Build chain components
     # ------------------------------------------------------------------
     retrieved_docs: list[Document] = []
     def _retrieve_and_capture(query: str) -> list[Document]:
+        """Invokes the retriever and captures the retrieved documents for later use."""
         docs = retriever.invoke(query)
         retrieved_docs.extend(docs)
         return docs

src/semantic.py CHANGED Viewed

@@ -48,6 +48,7 @@ DEFAULT_TOP_K = 5
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 @st.cache_resource(show_spinner=False)
 def get_embeddings():
     return HuggingFaceEmbeddings(
         model_name=DEFAULT_EMBEDDING_MODEL,
         model_kwargs={
@@ -207,6 +208,10 @@ def build_and_save_vector_store(
     save_path: str,
     batch_size: int = 500,
 ) -> FAISS:
     # --- Resume / initialize ---
     if os.path.exists(os.path.join(save_path, "index.faiss")):
@@ -297,6 +302,7 @@ def enrich_search_results(vector_store, query: str, k: int, filter=None):
 def load_vector_store(
     load_path: str,
 ) -> FAISS:
     return FAISS.load_local(
         load_path,

 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 @st.cache_resource(show_spinner=False)
 def get_embeddings():
+    """Initializes and returns a HuggingFaceEmbeddings instance with the specified model and device settings."""
     return HuggingFaceEmbeddings(
         model_name=DEFAULT_EMBEDDING_MODEL,
         model_kwargs={
     save_path: str,
     batch_size: int = 500,
 ) -> FAISS:
+    """
+    Build a FAISS vector store from a metadata Dataset, processing in batches and saving progress.
+    This function processes the metadata dataset in batches, creating Documents and embedding them into a FAISS vector store.
+    """
     # --- Resume / initialize ---
     if os.path.exists(os.path.join(save_path, "index.faiss")):
 def load_vector_store(
     load_path: str,
 ) -> FAISS:
+    """Load a FAISS vector store from disk."""
     return FAISS.load_local(
         load_path,

src/utils.py CHANGED Viewed

@@ -11,6 +11,7 @@ STOPWORDS = set(stopwords.words('english'))
 # Tokenizer
 def simple_tokenize(text):
     if not text:
         return []
     text = text.lower()
@@ -49,6 +50,7 @@ def extract_image(row):
     return None
 def decode_ratings(page_content):
     block_pattern = r'\[\d\.0★\].*'
     matches = re.findall(block_pattern, page_content)
     if matches:

 # Tokenizer
 def simple_tokenize(text):
+    """A simple tokenizer that lowercases text, removes punctuation, and filters out stopwords."""
     if not text:
         return []
     text = text.lower()
     return None
 def decode_ratings(page_content):
+    """Extracts up to 3 ratings from the page content string, returning a list of dicts with rating, title, and text."""
     block_pattern = r'\[\d\.0★\].*'
     matches = re.findall(block_pattern, page_content)
     if matches: