Spaces:
Sleeping
Sleeping
update changes to hybrid_retriever_tool.py file
Browse files
tools/hybrid_retriever_tool.py
CHANGED
|
@@ -75,7 +75,7 @@ class HybridRetrieverTool(RagTool):
|
|
| 75 |
|
| 76 |
return text, urls
|
| 77 |
|
| 78 |
-
def _build_corpus(self, topic: str):
|
| 79 |
"""Fetch up-to-date search results."""
|
| 80 |
results = self._tavily.search(query=topic, max_results=50)
|
| 81 |
raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
|
|
@@ -87,11 +87,11 @@ class HybridRetrieverTool(RagTool):
|
|
| 87 |
all_urls.extend(urls)
|
| 88 |
|
| 89 |
# Deduplicate and keep top unique URLs
|
| 90 |
-
all_urls = list(dict.fromkeys(all_urls))[:8]
|
| 91 |
return corpus, all_urls
|
| 92 |
|
| 93 |
# LLM reranker
|
| 94 |
-
def _rerank(self, query: str, passages: list[str], top_n: int = 8) -> list[str]:
|
| 95 |
"""
|
| 96 |
Use an LLM to re-rank retrieved passages for contextual relevance to the query.
|
| 97 |
"""
|
|
@@ -145,7 +145,7 @@ Instructions:
|
|
| 145 |
"""
|
| 146 |
Run hybrid search: BM25 + semantic similarity.
|
| 147 |
"""
|
| 148 |
-
corpus, urls = self._build_corpus(query)
|
| 149 |
if not corpus:
|
| 150 |
return "No relevant content found."
|
| 151 |
|
|
@@ -171,10 +171,10 @@ Instructions:
|
|
| 171 |
top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
|
| 172 |
|
| 173 |
top_passages = [corpus[i] for i in top_indices]
|
| 174 |
-
reranked = self._rerank(query, top_passages)
|
| 175 |
return "\n\n".join(reranked)
|
| 176 |
|
| 177 |
-
def summarize_passages(self, topic: str, passages):
|
| 178 |
"""Summarize retrieved content into a coherent short digest, keeping citations."""
|
| 179 |
if isinstance(passages, str):
|
| 180 |
passages = [passages]
|
|
@@ -197,7 +197,7 @@ Instructions:
|
|
| 197 |
text_block = re.sub(r"\s{2,}", " ", text_block).strip()
|
| 198 |
text_block = text_block[:5000] # safety limit for token size
|
| 199 |
|
| 200 |
-
unique_urls = list(dict.fromkeys(urls))[:8]
|
| 201 |
|
| 202 |
# --- Structured summarization ---
|
| 203 |
prompt = f"""
|
|
|
|
| 75 |
|
| 76 |
return text, urls
|
| 77 |
|
| 78 |
+
def _build_corpus(self, topic: str, top_k: int = 8):
|
| 79 |
"""Fetch up-to-date search results."""
|
| 80 |
results = self._tavily.search(query=topic, max_results=50)
|
| 81 |
raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
|
|
|
|
| 87 |
all_urls.extend(urls)
|
| 88 |
|
| 89 |
# Deduplicate and keep top unique URLs
|
| 90 |
+
all_urls = list(dict.fromkeys(all_urls))[:top_k]
|
| 91 |
return corpus, all_urls
|
| 92 |
|
| 93 |
# LLM reranker
|
| 94 |
+
def _rerank(self, query: str, passages: list[str], top_n: int) -> list[str]:
|
| 95 |
"""
|
| 96 |
Use an LLM to re-rank retrieved passages for contextual relevance to the query.
|
| 97 |
"""
|
|
|
|
| 145 |
"""
|
| 146 |
Run hybrid search: BM25 + semantic similarity.
|
| 147 |
"""
|
| 148 |
+
corpus, urls = self._build_corpus(query, top_k=top_k)
|
| 149 |
if not corpus:
|
| 150 |
return "No relevant content found."
|
| 151 |
|
|
|
|
| 171 |
top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
|
| 172 |
|
| 173 |
top_passages = [corpus[i] for i in top_indices]
|
| 174 |
+
reranked = self._rerank(query, top_passages, top_n=top_k)
|
| 175 |
return "\n\n".join(reranked)
|
| 176 |
|
| 177 |
+
def summarize_passages(self, topic: str, passages, top_k: int = 8):
|
| 178 |
"""Summarize retrieved content into a coherent short digest, keeping citations."""
|
| 179 |
if isinstance(passages, str):
|
| 180 |
passages = [passages]
|
|
|
|
| 197 |
text_block = re.sub(r"\s{2,}", " ", text_block).strip()
|
| 198 |
text_block = text_block[:5000] # safety limit for token size
|
| 199 |
|
| 200 |
+
unique_urls = list(dict.fromkeys(urls))[:top_k]
|
| 201 |
|
| 202 |
# --- Structured summarization ---
|
| 203 |
prompt = f"""
|