cicboy committed on
Commit
90f40ea
·
1 Parent(s): 42d47d3

Update hybrid_retriever_tool.py: replace the hard-coded limit of 5 with a configurable top_k for URL deduplication and reranking

Browse files
Files changed (1) hide show
  1. tools/hybrid_retriever_tool.py +7 -7
tools/hybrid_retriever_tool.py CHANGED
@@ -75,7 +75,7 @@ class HybridRetrieverTool(RagTool):
75
 
76
  return text, urls
77
 
78
- def _build_corpus(self, topic: str):
79
  """Fetch up-to-date search results."""
80
  results = self._tavily.search(query=topic, max_results=50)
81
  raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
@@ -87,11 +87,11 @@ class HybridRetrieverTool(RagTool):
87
  all_urls.extend(urls)
88
 
89
  #Deduplicate and keep top unique URLs
90
- all_urls = list(dict.fromkeys(all_urls))[:5]
91
  return corpus, all_urls
92
 
93
  # LLM reranker
94
- def _rerank(self, query: str, passages: list[str], top_n: int = 5) -> list[str]:
95
  """
96
  Use an LLM to re-rank retrieved passages for contextual relevance to the query.
97
  """
@@ -145,7 +145,7 @@ Instructions:
145
  """
146
  Run hybrid search: BM25 + semantic similarity.
147
  """
148
- corpus, urls = self._build_corpus(query)
149
  if not corpus:
150
  return "No relevant content found."
151
 
@@ -171,10 +171,10 @@ Instructions:
171
  top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
172
 
173
  top_passages = [corpus[i] for i in top_indices]
174
- reranked = self._rerank(query, top_passages)
175
  return "\n\n".join(reranked)
176
 
177
- def summarize_passages(self, topic: str, passages):
178
  """Summarize retrieved content into a coherent short digest, keeping citations."""
179
  if isinstance(passages, str):
180
  passages = [passages]
@@ -197,7 +197,7 @@ Instructions:
197
  text_block = re.sub(r"\s{2,}", " ", text_block).strip()
198
  text_block = text_block[:5000] # safety limit for token size
199
 
200
- unique_urls = list(dict.fromkeys(urls))[:5]
201
 
202
  # --- Structured summarization ---
203
  prompt = f"""
 
75
 
76
  return text, urls
77
 
78
+ def _build_corpus(self, topic: str, top_k: int = 8):
79
  """Fetch up-to-date search results."""
80
  results = self._tavily.search(query=topic, max_results=50)
81
  raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
 
87
  all_urls.extend(urls)
88
 
89
  #Deduplicate and keep top unique URLs
90
+ all_urls = list(dict.fromkeys(all_urls))[:top_k]
91
  return corpus, all_urls
92
 
93
  # LLM reranker
94
+ def _rerank(self, query: str, passages: list[str], top_n: int) -> list[str]:
95
  """
96
  Use an LLM to re-rank retrieved passages for contextual relevance to the query.
97
  """
 
145
  """
146
  Run hybrid search: BM25 + semantic similarity.
147
  """
148
+ corpus, urls = self._build_corpus(query, top_k=top_k)
149
  if not corpus:
150
  return "No relevant content found."
151
 
 
171
  top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
172
 
173
  top_passages = [corpus[i] for i in top_indices]
174
+ reranked = self._rerank(query, top_passages, top_n=top_k)
175
  return "\n\n".join(reranked)
176
 
177
+ def summarize_passages(self, topic: str, passages, top_k: int = 8):
178
  """Summarize retrieved content into a coherent short digest, keeping citations."""
179
  if isinstance(passages, str):
180
  passages = [passages]
 
197
  text_block = re.sub(r"\s{2,}", " ", text_block).strip()
198
  text_block = text_block[:5000] # safety limit for token size
199
 
200
+ unique_urls = list(dict.fromkeys(urls))[:top_k]
201
 
202
  # --- Structured summarization ---
203
  prompt = f"""