Spaces:

cicboy
/

AI_Blog_Writer

Sleeping

App Files Files Community

cicboy committed on Oct 14, 2025

Commit

a370528

1 Parent(s): 8ec15d6

update hybrid_retriever_tool file

Browse files

Files changed (1) hide show

tools/hybrid_retriever_tool.py +50 -12

tools/hybrid_retriever_tool.py CHANGED Viewed

@@ -6,6 +6,7 @@ from openai import OpenAI
 from crewai_tools import RagTool
 from pydantic import Field, PrivateAttr
 import os
 import re
 class HybridRetrieverTool(RagTool):
@@ -24,17 +25,43 @@ class HybridRetrieverTool(RagTool):
         self._tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
         self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
     def _build_corpus(self, topic: str):
         """Fetch up-to-date search results."""
         results = self._tavily.search(query=topic, max_results=30)
-        corpus = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
-        return corpus
     def _run(self, query: str, top_k: int = 8) -> str:
         """
         Run hybrid search: BM25 + semantic similarity.
         """
-        corpus = self._build_corpus(query)
         if not corpus:
             return "No relevant content found."
@@ -48,7 +75,11 @@ class HybridRetrieverTool(RagTool):
         sem_scores = np.dot(emb_corpus, emb_query)
         # Normalize scores
-        bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
         sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
         # Weighted fusion
@@ -59,17 +90,21 @@ class HybridRetrieverTool(RagTool):
         return "\n\n".join(top_passages)
     def summarize_passages(self, topic: str, passages):
         if isinstance(passages, str):
             passages = [passages]
         # 🧹 Clean each passage (remove links, HTML tags, redundant whitespace)
-        clean_passages = []
         for p in passages:
-            p = re.sub(r"http\S+", "", p)                  # remove URLs
-            p = re.sub(r"\s+", " ", p).strip()             # normalize spaces
-            p = re.sub(r"[^A-Za-z0-9.,!?;:()\-\s]", "", p) # strip stray symbols
-            clean_passages.append(p)
         # Build condensed input (limit total tokens)
-        text_block = " ".join(clean_passages[:5])[:4000]
         try:
             response = self._client.chat.completions.create(
                 model="gpt-4o-mini",
@@ -80,13 +115,16 @@ class HybridRetrieverTool(RagTool):
                             "You are a concise research summarizer. "
                             "Produce a 1–2 paragraph overview that highlights key facts, "
                             "themes, and findings relevant to the topic. "
-                            "Exclude URLs, lists, HTML remnants, or boilerplate text."
                         ),
                     },
                     {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
                 ],
                 temperature=0.3
             )
-            return response.choices[0].message.content.strip()
         except Exception as e:
             return f"Summarization failed: {e}"

 from crewai_tools import RagTool
 from pydantic import Field, PrivateAttr
 import os
+from html import unescape
 import re
 class HybridRetrieverTool(RagTool):
         self._tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
         self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+    # 🧹 Text Cleaning
+    def _clean_text(self, text: str):
+        """Remove HTML, images, boilerplate; keep valuable text & extract URLs for citation."""
+        urls = re.findall(r'https?://\S+', text)
+        text = unescape(text)
+        text = re.sub(r"<[^>]+>", " ", text)                   # Remove HTML tags
+        text = re.sub(r"!\[.*?\]\(.*?\)", " ", text)           # Remove Markdown images
+        text = re.sub(r"\[.*?\]\(.*?\)", " ", text)            # Remove Markdown links
+        text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.IGNORECASE)
+        text = re.sub(r"http\S+", " ", text)                   # Remove URLs inline
+        text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More)+", " ", text, flags=re.IGNORECASE)
+        text = re.sub(r"\s+", " ", text).strip()               # Normalize spaces
+        text = re.sub(r"(Education Weekly Update.*?)+", "", text, flags=re.IGNORECASE)
+        if len(text.split()) < 10:
+            return None, []
+        return text, urls
     def _build_corpus(self, topic: str, max_results: int = 30, max_urls: int = 5):
         """Fetch fresh search results for *topic* and return cleaned passages with sources.

         Args:
             topic: Query string sent to the Tavily client.
             max_results: Number of search hits to request.
             max_urls: Maximum number of de-duplicated source URLs to return.

         Returns:
             A ``(corpus, urls)`` tuple: the passages that survived
             ``_clean_text`` and up to ``max_urls`` unique URLs in first-seen
             order.
         """
         results = self._tavily.search(query=topic, max_results=max_results)
         corpus, all_urls = [], []
         for hit in results.get("results", []):
             content = (hit.get("content") or "").strip()
             if not content:
                 continue
             clean_text, inline_urls = self._clean_text(content)
             if not clean_text:
                 continue
             corpus.append(clean_text)
             # Cite the hit's own page address first; URLs that happen to appear
             # inside the snippet text are only a secondary source of citations.
             source_url = hit.get("url")
             if source_url:
                 all_urls.append(source_url)
             all_urls.extend(inline_urls)
         # dict.fromkeys de-duplicates while preserving first-seen order.
         all_urls = list(dict.fromkeys(all_urls))[:max_urls]
         return corpus, all_urls
     def _run(self, query: str, top_k: int = 8) -> str:
         """
         Run hybrid search: BM25 + semantic similarity.
         """
+        corpus, urls = self._build_corpus(query)
         if not corpus:
             return "No relevant content found."
         sem_scores = np.dot(emb_corpus, emb_query)
         # Normalize scores
+        if np.ptp(bm25_scores) == 0:
+            bm25_norm = np.zeros_like(bm25_scores) #ensure BM25 works even if only one doc
+        else:
+            bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
         sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
         # Weighted fusion
         return "\n\n".join(top_passages)
     def summarize_passages(self, topic: str, passages):
+        """Summarize the retrieved content while retaining citations"""
         if isinstance(passages, str):
             passages = [passages]
         # 🧹 Clean each passage (remove links, HTML tags, redundant whitespace)
+        main_text = []
+        urls = []
         for p in passages:
+            text, found_urls = self._clean_text(p)
+            if text:
+                main_text.append(text)
+                urls.extend(found_urls)
         # Build condensed input (limit total tokens)
+        text_block = " ".join(main_text[:5])[:4000]
+        unique_urls = list(dict.fromkeys(urls))[:5]
         try:
             response = self._client.chat.completions.create(
                 model="gpt-4o-mini",
                             "You are a concise research summarizer. "
                             "Produce a 1–2 paragraph overview that highlights key facts, "
                             "themes, and findings relevant to the topic. "
+                            "Exclude URLs or boilerplate text, but clearly label 'Sources' at the end."
                         ),
                     },
                     {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
                 ],
                 temperature=0.3
             )
+            summary = response.choices[0].message.content.strip()
+            if unique_urls:
+                summary += "\n\n**Sources**\n" + "\n".join(unique_urls)
+            return summary
         except Exception as e:
             return f"Summarization failed: {e}"