Spaces:
Sleeping
Sleeping
update hybrid_retriever_tool file
Browse files
- tools/hybrid_retriever_tool.py +40 -16
tools/hybrid_retriever_tool.py
CHANGED
|
@@ -122,10 +122,11 @@ class HybridRetrieverTool(RagTool):
|
|
| 122 |
return "\n\n".join(top_passages)
|
| 123 |
|
| 124 |
def summarize_passages(self, topic: str, passages):
|
| 125 |
-
"""Summarize
|
| 126 |
if isinstance(passages, str):
|
| 127 |
passages = [passages]
|
| 128 |
-
|
|
|
|
| 129 |
main_text = []
|
| 130 |
urls = []
|
| 131 |
for p in passages:
|
|
@@ -133,30 +134,53 @@ class HybridRetrieverTool(RagTool):
|
|
| 133 |
if text:
|
| 134 |
main_text.append(text)
|
| 135 |
urls.extend(found_urls)
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
unique_urls = list(dict.fromkeys(urls))[:5]
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
try:
|
| 141 |
response = self._client.chat.completions.create(
|
| 142 |
model="gpt-4o-mini",
|
| 143 |
messages=[
|
| 144 |
-
{
|
| 145 |
-
|
| 146 |
-
"content": (
|
| 147 |
-
"You are a concise research summarizer. "
|
| 148 |
-
"Produce a 1–2 paragraph overview that highlights key facts, "
|
| 149 |
-
"themes, and findings relevant to the topic. "
|
| 150 |
-
"Exclude URLs or boilerplate text, but clearly label 'Sources' at the end."
|
| 151 |
-
),
|
| 152 |
-
},
|
| 153 |
-
{"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
|
| 154 |
],
|
| 155 |
temperature=0.3
|
| 156 |
)
|
|
|
|
| 157 |
summary = response.choices[0].message.content.strip()
|
|
|
|
| 158 |
if unique_urls:
|
| 159 |
-
summary += "\n\n**Sources
|
|
|
|
| 160 |
return summary
|
|
|
|
| 161 |
except Exception as e:
|
| 162 |
-
return f"Summarization failed: {e}"
|
|
|
|
| 122 |
return "\n\n".join(top_passages)
|
| 123 |
|
| 124 |
def summarize_passages(self, topic: str, passages):
|
| 125 |
+
"""Summarize retrieved content into a coherent short digest, keeping citations."""
|
| 126 |
if isinstance(passages, str):
|
| 127 |
passages = [passages]
|
| 128 |
+
|
| 129 |
+
# Clean and compress passages
|
| 130 |
main_text = []
|
| 131 |
urls = []
|
| 132 |
for p in passages:
|
|
|
|
| 134 |
if text:
|
| 135 |
main_text.append(text)
|
| 136 |
urls.extend(found_urls)
|
| 137 |
+
|
| 138 |
+
if not main_text:
|
| 139 |
+
return "No meaningful content found to summarize."
|
| 140 |
+
|
| 141 |
+
# --- Limit and re-rank by diversity ---
|
| 142 |
+
unique_texts = list(dict.fromkeys(main_text))[:5] # prevent duplication
|
| 143 |
+
text_block = " ".join(unique_texts)
|
| 144 |
+
text_block = re.sub(r"\s{2,}", " ", text_block).strip()
|
| 145 |
+
text_block = text_block[:4000] # safety limit for token size
|
| 146 |
+
|
| 147 |
unique_urls = list(dict.fromkeys(urls))[:5]
|
| 148 |
|
| 149 |
+
# --- Structured summarization ---
|
| 150 |
+
prompt = f"""
|
| 151 |
+
You are a research assistant creating a clean, readable summary.
|
| 152 |
+
|
| 153 |
+
Topic: {topic}
|
| 154 |
+
|
| 155 |
+
Condense the following information into **2–3 coherent paragraphs** that:
|
| 156 |
+
1. Focus on factual insights and trends, not raw data or footnotes.
|
| 157 |
+
2. Remove list items, footers, or numeric citations (like (1), (2)).
|
| 158 |
+
3. Retain key facts, organizations, or findings.
|
| 159 |
+
4. Avoid repeating words or phrases.
|
| 160 |
+
5. Conclude with a single “Sources” section listing the most relevant URLs.
|
| 161 |
+
|
| 162 |
+
Text to summarize:
|
| 163 |
+
{text_block}
|
| 164 |
+
|
| 165 |
+
Return output in Markdown format.
|
| 166 |
+
"""
|
| 167 |
+
|
| 168 |
try:
|
| 169 |
response = self._client.chat.completions.create(
|
| 170 |
model="gpt-4o-mini",
|
| 171 |
messages=[
|
| 172 |
+
{"role": "system", "content": "You are a concise, professional summarizer."},
|
| 173 |
+
{"role": "user", "content": prompt},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
],
|
| 175 |
temperature=0.3
|
| 176 |
)
|
| 177 |
+
|
| 178 |
summary = response.choices[0].message.content.strip()
|
| 179 |
+
|
| 180 |
if unique_urls:
|
| 181 |
+
summary += "\n\n**Sources:**\n" + "\n".join(f"- {u}" for u in unique_urls)
|
| 182 |
+
|
| 183 |
return summary
|
| 184 |
+
|
| 185 |
except Exception as e:
|
| 186 |
+
return f"Summarization failed: {e}"
|