cicboy committed on
Commit
9ab1b3b
·
1 Parent(s): f1e22c1

update hybrid_retriever_tool file

Browse files
Files changed (1) hide show
  1. tools/hybrid_retriever_tool.py +40 -16
tools/hybrid_retriever_tool.py CHANGED
@@ -122,10 +122,11 @@ class HybridRetrieverTool(RagTool):
122
  return "\n\n".join(top_passages)
123
 
124
  def summarize_passages(self, topic: str, passages):
125
- """Summarize the retrieved content while retaining citations"""
126
  if isinstance(passages, str):
127
  passages = [passages]
128
- # 🧹 Clean each passage (remove links, HTML tags, redundant whitespace)
 
129
  main_text = []
130
  urls = []
131
  for p in passages:
@@ -133,30 +134,53 @@ class HybridRetrieverTool(RagTool):
133
  if text:
134
  main_text.append(text)
135
  urls.extend(found_urls)
136
- # Build condensed input (limit total tokens)
137
- text_block = " ".join(main_text[:5])[:4000]
 
 
 
 
 
 
 
 
138
  unique_urls = list(dict.fromkeys(urls))[:5]
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  try:
141
  response = self._client.chat.completions.create(
142
  model="gpt-4o-mini",
143
  messages=[
144
- {
145
- "role": "system",
146
- "content": (
147
- "You are a concise research summarizer. "
148
- "Produce a 1–2 paragraph overview that highlights key facts, "
149
- "themes, and findings relevant to the topic. "
150
- "Exclude URLs or boilerplate text, but clearly label 'Sources' at the end."
151
- ),
152
- },
153
- {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
154
  ],
155
  temperature=0.3
156
  )
 
157
  summary = response.choices[0].message.content.strip()
 
158
  if unique_urls:
159
- summary += "\n\n**Sources**\n" + "\n".join(unique_urls)
 
160
  return summary
 
161
  except Exception as e:
162
- return f"Summarization failed: {e}"
 
122
  return "\n\n".join(top_passages)
123
 
124
  def summarize_passages(self, topic: str, passages):
125
+ """Summarize retrieved content into a coherent short digest, keeping citations."""
126
  if isinstance(passages, str):
127
  passages = [passages]
128
+
129
+ # Clean and compress passages
130
  main_text = []
131
  urls = []
132
  for p in passages:
 
134
  if text:
135
  main_text.append(text)
136
  urls.extend(found_urls)
137
+
138
+ if not main_text:
139
+ return "No meaningful content found to summarize."
140
+
141
+ # --- Limit and re-rank by diversity ---
142
+ unique_texts = list(dict.fromkeys(main_text))[:5] # prevent duplication
143
+ text_block = " ".join(unique_texts)
144
+ text_block = re.sub(r"\s{2,}", " ", text_block).strip()
145
+ text_block = text_block[:4000] # safety limit for token size
146
+
147
  unique_urls = list(dict.fromkeys(urls))[:5]
148
 
149
+ # --- Structured summarization ---
150
+ prompt = f"""
151
+ You are a research assistant creating a clean, readable summary.
152
+
153
+ Topic: {topic}
154
+
155
+ Condense the following information into **2–3 coherent paragraphs** that:
156
+ 1. Focus on factual insights and trends, not raw data or footnotes.
157
+ 2. Remove list items, footers, or numeric citations (like (1), (2)).
158
+ 3. Retain key facts, organizations, or findings.
159
+ 4. Avoid repeating words or phrases.
160
+ 5. Conclude with a single “Sources” section listing the most relevant URLs.
161
+
162
+ Text to summarize:
163
+ {text_block}
164
+
165
+ Return output in Markdown format.
166
+ """
167
+
168
  try:
169
  response = self._client.chat.completions.create(
170
  model="gpt-4o-mini",
171
  messages=[
172
+ {"role": "system", "content": "You are a concise, professional summarizer."},
173
+ {"role": "user", "content": prompt},
 
 
 
 
 
 
 
 
174
  ],
175
  temperature=0.3
176
  )
177
+
178
  summary = response.choices[0].message.content.strip()
179
+
180
  if unique_urls:
181
+ summary += "\n\n**Sources:**\n" + "\n".join(f"- {u}" for u in unique_urls)
182
+
183
  return summary
184
+
185
  except Exception as e:
186
+ return f"Summarization failed: {e}"