Spaces:

cicboy
/

AI_Blog_Writer

Sleeping

App Files Files Community

cicboy committed on Oct 15, 2025

Commit

42d47d3

1 Parent(s): b60e7d3

Update hybrid_retriever_tool.py: add LLM reranker, raise search and summary limits, link sources

Browse files

Files changed (1) hide show

tools/hybrid_retriever_tool.py +60 -5

tools/hybrid_retriever_tool.py CHANGED Viewed

@@ -8,6 +8,7 @@ from pydantic import Field, PrivateAttr
 import os
 from html import unescape
 import re
 class HybridRetrieverTool(RagTool):
     name: str = "Hybrid Retriever Tool"
@@ -76,7 +77,7 @@ class HybridRetrieverTool(RagTool):
     def _build_corpus(self, topic: str):
         """Fetch up-to-date search results."""
-        results = self._tavily.search(query=topic, max_results=30)
         raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
         corpus, all_urls = [], []
         for t in raw_texts:
@@ -87,7 +88,58 @@ class HybridRetrieverTool(RagTool):
         #Deduplicate and keep top unique URLs
         all_urls = list(dict.fromkeys(all_urls))[:5]
-        return corpus, all_urls
     def _run(self, query: str, top_k: int = 8) -> str:
         """
@@ -119,7 +171,8 @@ class HybridRetrieverTool(RagTool):
         top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
         top_passages = [corpus[i] for i in top_indices]
-        return "\n\n".join(top_passages)
     def summarize_passages(self, topic: str, passages):
         """Summarize retrieved content into a coherent short digest, keeping citations."""
@@ -142,7 +195,7 @@ class HybridRetrieverTool(RagTool):
         unique_texts = list(dict.fromkeys(main_text))[:5]  # prevent duplication
         text_block = " ".join(unique_texts)
         text_block = re.sub(r"\s{2,}", " ", text_block).strip()
-        text_block = text_block[:4000]  # safety limit for token size
         unique_urls = list(dict.fromkeys(urls))[:5]
@@ -178,7 +231,9 @@ Return output in Markdown format.
             summary = response.choices[0].message.content.strip()
             if unique_urls:
-                summary += "\n\n**Sources:**\n" + "\n".join(f"- {u}" for u in unique_urls) + "\n"
             return summary

 import os
 from html import unescape
 import re
+import logging
 class HybridRetrieverTool(RagTool):
     name: str = "Hybrid Retriever Tool"
     def _build_corpus(self, topic: str):
         """Fetch up-to-date search results."""
+        results = self._tavily.search(query=topic, max_results=50)
         raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
         corpus, all_urls = [], []
         for t in raw_texts:
         #Deduplicate and keep top unique URLs
         all_urls = list(dict.fromkeys(all_urls))[:5]
+        return corpus, all_urls
+    # LLM reranker
+    def _rerank(self, query: str, passages: list[str], top_n: int = 5) -> list[str]:
+        """
+        Use an LLM to re-rank retrieved passages for contextual relevance to the query.
+
+        Args:
+            query: The search query the passages are ranked against.
+            passages: Candidate passages, in their current retrieval order.
+            top_n: Maximum number of passages to return.
+
+        Returns:
+            Up to ``top_n`` passages parsed from the model's reply. Falls back to
+            the first ``top_n`` input passages when the reply yields no usable
+            text or when the request raises. Returns an empty list when there
+            are no passages or ``top_n`` is not positive.
+        """
+        # Nothing to rank, or nothing requested: skip the API call entirely.
+        # A negative top_n would otherwise make the slice fallbacks below
+        # return "all but the last n" passages.
+        if not passages or top_n <= 0:
+            return []
+        try:
+            formatted_passages = "\n\n".join(
+                f"Passage {i}:\n{p}" for i, p in enumerate(passages, start=1)
+            )
+            prompt = f"""
+You are a precise research assistant that ranks text passages for relevance.
+Query:
+"{query}"
+Passages:
+{formatted_passages}
+Instructions:
+- Rank passages by how directly and substantively they address the query.
+- Ignore repetitive, boilerplate, or promotional content.
+- Return ONLY the top {top_n} most relevant passages, in their original text form.
+- Keep each returned passage's "Passage N:" header so the reply can be split back into passages.
+"""
+            response = self._client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are an expert LLM reranker for information retrieval."},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0,
+            )
+            ranked_text = response.choices[0].message.content.strip()
+            # Split the reply on the "Passage N:" headers requested above.
+            # NOTE: the resulting strings are whatever the model echoed back;
+            # they are not checked against the input passages.
+            chunks = re.split(r"Passage\s*\d+:", ranked_text)
+            # Drop fragments of 20 characters or fewer (e.g. text before the
+            # first header, stray separators).
+            reranked = [c.strip() for c in chunks if len(c.strip()) > 20]
+            if not reranked:
+                # Report through logging, like the exception path below.
+                logging.warning("⚠️ Reranker returned no valid text, using original order.")
+                return passages[:top_n]
+            return reranked[:top_n]
+        except Exception as e:
+            # Best-effort step: any failure (network, API, unexpected reply
+            # shape) degrades to the pre-rerank ordering instead of raising.
+            logging.warning("Reranker failed: %s", e)
+            return passages[:top_n]
     def _run(self, query: str, top_k: int = 8) -> str:
         """
         top_indices= np.argsort(hybrid_scores)[::-1][:top_k]
         top_passages = [corpus[i] for i in top_indices]
+        reranked = self._rerank(query, top_passages)
+        return "\n\n".join(reranked)
     def summarize_passages(self, topic: str, passages):
         """Summarize retrieved content into a coherent short digest, keeping citations."""
         unique_texts = list(dict.fromkeys(main_text))[:5]  # prevent duplication
         text_block = " ".join(unique_texts)
         text_block = re.sub(r"\s{2,}", " ", text_block).strip()
+        text_block = text_block[:5000]  # safety limit for token size
         unique_urls = list(dict.fromkeys(urls))[:5]
             summary = response.choices[0].message.content.strip()
             if unique_urls:
+                if unique_urls:
+                    summary += "\n\n**Sources:**\n" + "\n".join(f"- [{u}]({u})" for u in unique_urls)
             return summary