Spaces:
Sleeping
Sleeping
update changes to hybrid_retriever_tool.py file
Browse files
tools/hybrid_retriever_tool.py
CHANGED
|
@@ -75,7 +75,7 @@ class HybridRetrieverTool(RagTool):
|
|
| 75 |
|
| 76 |
return text, urls
|
| 77 |
|
| 78 |
-
def _build_corpus(self, topic: str):
|
| 79 |
"""Fetch up-to-date search results."""
|
| 80 |
results = self._tavily.search(query=topic, max_results=50)
|
| 81 |
raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
|
|
@@ -87,11 +87,11 @@ class HybridRetrieverTool(RagTool):
|
|
| 87 |
all_urls.extend(urls)
|
| 88 |
|
| 89 |
# Deduplicate and keep top unique URLs
|
| 90 |
-
all_urls = list(dict.fromkeys(all_urls))[:8]
|
| 91 |
return corpus, all_urls
|
| 92 |
|
| 93 |
# LLM reranker
|
| 94 |
-
def _rerank(self, query: str, passages: list[str], top_n: int = 8) -> list[str]:
|
| 95 |
"""
|
| 96 |
Use an LLM to re-rank retrieved passages for contextual relevance to the query.
|
| 97 |
"""
|
|
@@ -145,7 +145,7 @@ Instructions:
|
|
| 145 |
"""
|
| 146 |
Run hybrid search: BM25 + semantic similarity.
|
| 147 |
"""
|
| 148 |
-
corpus, urls = self._build_corpus(query)
|
| 149 |
if not corpus:
|
| 150 |
return "No relevant content found."
|
| 151 |
|
|
@@ -171,10 +171,10 @@ Instructions:
|
|
| 171 |
top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
|
| 172 |
|
| 173 |
top_passages = [corpus[i] for i in top_indices]
|
| 174 |
-
reranked = self._rerank(query, top_passages)
|
| 175 |
return "\n\n".join(reranked)
|
| 176 |
|
| 177 |
-
def summarize_passages(self, topic: str, passages):
|
| 178 |
"""Summarize retrieved content into a coherent short digest, keeping citations."""
|
| 179 |
if isinstance(passages, str):
|
| 180 |
passages = [passages]
|
|
@@ -197,7 +197,7 @@ Instructions:
|
|
| 197 |
text_block = re.sub(r"\s{2,}", " ", text_block).strip()
|
| 198 |
text_block = text_block[:5000] # safety limit for token size
|
| 199 |
|
| 200 |
-
unique_urls = list(dict.fromkeys(urls))[:8]
|
| 201 |
|
| 202 |
# --- Structured summarization ---
|
| 203 |
prompt = f"""
|
|
|
|
| 75 |
|
| 76 |
return text, urls
|
| 77 |
|
| 78 |
+
def _build_corpus(self, topic: str, top_k: int = 8):
|
| 79 |
"""Fetch up-to-date search results."""
|
| 80 |
results = self._tavily.search(query=topic, max_results=50)
|
| 81 |
raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
|
|
|
|
| 87 |
all_urls.extend(urls)
|
| 88 |
|
| 89 |
# Deduplicate and keep top unique URLs
|
| 90 |
+
all_urls = list(dict.fromkeys(all_urls))[:top_k]
|
| 91 |
return corpus, all_urls
|
| 92 |
|
| 93 |
# LLM reranker
|
| 94 |
+
def _rerank(self, query: str, passages: list[str], top_n: int) -> list[str]:
|
| 95 |
"""
|
| 96 |
Use an LLM to re-rank retrieved passages for contextual relevance to the query.
|
| 97 |
"""
|
|
|
|
| 145 |
"""
|
| 146 |
Run hybrid search: BM25 + semantic similarity.
|
| 147 |
"""
|
| 148 |
+
corpus, urls = self._build_corpus(query, top_k=top_k)
|
| 149 |
if not corpus:
|
| 150 |
return "No relevant content found."
|
| 151 |
|
|
|
|
| 171 |
top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
|
| 172 |
|
| 173 |
top_passages = [corpus[i] for i in top_indices]
|
| 174 |
+
reranked = self._rerank(query, top_passages, top_n=top_k)
|
| 175 |
return "\n\n".join(reranked)
|
| 176 |
|
| 177 |
+
def summarize_passages(self, topic: str, passages, top_k: int = 8):
|
| 178 |
"""Summarize retrieved content into a coherent short digest, keeping citations."""
|
| 179 |
if isinstance(passages, str):
|
| 180 |
passages = [passages]
|
|
|
|
| 197 |
text_block = re.sub(r"\s{2,}", " ", text_block).strip()
|
| 198 |
text_block = text_block[:5000] # safety limit for token size
|
| 199 |
|
| 200 |
+
unique_urls = list(dict.fromkeys(urls))[:top_k]
|
| 201 |
|
| 202 |
# --- Structured summarization ---
|
| 203 |
prompt = f"""
|