cicboy committed on
Commit
a370528
·
1 Parent(s): 8ec15d6

Update hybrid_retriever_tool: add text cleaning and source citations

Browse files
Files changed (1) hide show
  1. tools/hybrid_retriever_tool.py +50 -12
tools/hybrid_retriever_tool.py CHANGED
@@ -6,6 +6,7 @@ from openai import OpenAI
6
  from crewai_tools import RagTool
7
  from pydantic import Field, PrivateAttr
8
  import os
 
9
  import re
10
 
11
  class HybridRetrieverTool(RagTool):
@@ -24,17 +25,43 @@ class HybridRetrieverTool(RagTool):
24
  self._tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
25
  self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def _build_corpus(self, topic: str):
28
  """Fetch up-to-date search results."""
29
  results = self._tavily.search(query=topic, max_results=30)
30
- corpus = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
31
- return corpus
 
 
 
 
 
 
 
 
 
32
 
33
  def _run(self, query: str, top_k: int = 8) -> str:
34
  """
35
  Run hybrid search: BM25 + semantic similarity.
36
  """
37
- corpus = self._build_corpus(query)
38
  if not corpus:
39
  return "No relevant content found."
40
 
@@ -48,7 +75,11 @@ class HybridRetrieverTool(RagTool):
48
  sem_scores = np.dot(emb_corpus, emb_query)
49
 
50
  # Normalize scores
51
- bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
 
 
 
 
52
  sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
53
 
54
  # Weighted fusion
@@ -59,17 +90,21 @@ class HybridRetrieverTool(RagTool):
59
  return "\n\n".join(top_passages)
60
 
61
  def summarize_passages(self, topic: str, passages):
 
62
  if isinstance(passages, str):
63
  passages = [passages]
64
  # 🧹 Clean each passage (remove links, HTML tags, redundant whitespace)
65
- clean_passages = []
 
66
  for p in passages:
67
- p = re.sub(r"http\S+", "", p) # remove URLs
68
- p = re.sub(r"\s+", " ", p).strip() # normalize spaces
69
- p = re.sub(r"[^A-Za-z0-9.,!?;:()\-\s]", "", p) # strip stray symbols
70
- clean_passages.append(p)
71
  # Build condensed input (limit total tokens)
72
- text_block = " ".join(clean_passages[:5])[:4000]
 
 
73
  try:
74
  response = self._client.chat.completions.create(
75
  model="gpt-4o-mini",
@@ -80,13 +115,16 @@ class HybridRetrieverTool(RagTool):
80
  "You are a concise research summarizer. "
81
  "Produce a 1–2 paragraph overview that highlights key facts, "
82
  "themes, and findings relevant to the topic. "
83
- "Exclude URLs, lists, HTML remnants, or boilerplate text."
84
  ),
85
  },
86
  {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
87
  ],
88
  temperature=0.3
89
  )
90
- return response.choices[0].message.content.strip()
 
 
 
91
  except Exception as e:
92
  return f"Summarization failed: {e}"
 
6
  from crewai_tools import RagTool
7
  from pydantic import Field, PrivateAttr
8
  import os
9
+ from html import unescape
10
  import re
11
 
12
  class HybridRetrieverTool(RagTool):
 
25
  self._tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
26
  self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
27
 
28
+ # 🧹 Text Cleaning
29
+ def _clean_text(self, text: str):
30
+ """Remove HTML, images, boilerplate; keep valuable text & extract URLs for citation."""
31
+ urls = re.findall(r'https?://\S+', text)
32
+ text = unescape(text)
33
+ text = re.sub(r"<[^>]+>", " ", text) # Remove HTML tags
34
+ text = re.sub(r"!\[.*?\]\(.*?\)", " ", text) # Remove Markdown images
35
+ text = re.sub(r"\[.*?\]\(.*?\)", " ", text) # Remove Markdown links
36
+ text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.IGNORECASE)
37
+ text = re.sub(r"http\S+", " ", text) # Remove URLs inline
38
+ text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More)+", " ", text, flags=re.IGNORECASE)
39
+ text = re.sub(r"\s+", " ", text).strip() # Normalize spaces
40
+ text = re.sub(r"(Education Weekly Update.*?)+", "", text, flags=re.IGNORECASE)
41
+ if len(text.split()) < 10:
42
+ return None, []
43
+ return text, urls
44
+
45
  def _build_corpus(self, topic: str):
46
  """Fetch up-to-date search results."""
47
  results = self._tavily.search(query=topic, max_results=30)
48
+ raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
49
+ corpus, all_urls = [], []
50
+ for t in raw_texts:
51
+ clean_text, urls = self._clean_text(t)
52
+ if clean_text:
53
+ corpus.append(clean_text)
54
+ all_urls.extend(urls)
55
+
56
+ #Deduplicate and keep top unique URLs
57
+ all_urls = list(dict.fromkeys(all_urls))[:5]
58
+ return corpus, all_urls
59
 
60
  def _run(self, query: str, top_k: int = 8) -> str:
61
  """
62
  Run hybrid search: BM25 + semantic similarity.
63
  """
64
+ corpus, urls = self._build_corpus(query)
65
  if not corpus:
66
  return "No relevant content found."
67
 
 
75
  sem_scores = np.dot(emb_corpus, emb_query)
76
 
77
  # Normalize scores
78
+ if np.ptp(bm25_scores) == 0:
79
+ bm25_norm = np.zeros_like(bm25_scores) #ensure BM25 works even if only one doc
80
+ else:
81
+ bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
82
+
83
  sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
84
 
85
  # Weighted fusion
 
90
  return "\n\n".join(top_passages)
91
 
92
  def summarize_passages(self, topic: str, passages):
93
+ """Summarize the retrieved content while retaining citations"""
94
  if isinstance(passages, str):
95
  passages = [passages]
96
  # 🧹 Clean each passage (remove links, HTML tags, redundant whitespace)
97
+ main_text = []
98
+ urls = []
99
  for p in passages:
100
+ text, found_urls = self._clean_text(p)
101
+ if text:
102
+ main_text.append(text)
103
+ urls.extend(found_urls)
104
  # Build condensed input (limit total tokens)
105
+ text_block = " ".join(main_text[:5])[:4000]
106
+ unique_urls = list(dict.fromkeys(urls))[:5]
107
+
108
  try:
109
  response = self._client.chat.completions.create(
110
  model="gpt-4o-mini",
 
115
  "You are a concise research summarizer. "
116
  "Produce a 1–2 paragraph overview that highlights key facts, "
117
  "themes, and findings relevant to the topic. "
118
+ "Exclude URLs or boilerplate text, but clearly label 'Sources' at the end."
119
  ),
120
  },
121
  {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
122
  ],
123
  temperature=0.3
124
  )
125
+ summary = response.choices[0].message.content.strip()
126
+ if unique_urls:
127
+ summary += "\n\n**Sources**\n" + "\n".join(unique_urls)
128
+ return summary
129
  except Exception as e:
130
  return f"Summarization failed: {e}"