Spaces:
Sleeping
Sleeping
File size: 9,280 Bytes
baf9e10 f7a28da d49e2bf baf9e10 a370528 8ec15d6 42d47d3 baf9e10 f7a28da 5f02fec bb696dc d49e2bf bb696dc d49e2bf baf9e10 a370528 f1e22c1 a370528 f1e22c1 a370528 f1e22c1 a370528 f1e22c1 a370528 90f40ea baf9e10 42d47d3 a370528 90f40ea 42d47d3 90f40ea 42d47d3 2237728 bb696dc baf9e10 90f40ea baf9e10 5f02fec baf9e10 bb696dc baf9e10 5f02fec 1596fb6 baf9e10 a370528 c659851 baf9e10 90f40ea 42d47d3 baf9e10 90f40ea 9ab1b3b baf9e10 9ab1b3b a370528 8ec15d6 a370528 9ab1b3b 42d47d3 9ab1b3b 90f40ea a370528 9ab1b3b baf9e10 1596fb6 baf9e10 9ab1b3b baf9e10 9ab1b3b a370528 9ab1b3b a370528 42d47d3 9ab1b3b a370528 9ab1b3b baf9e10 9ab1b3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from tavily import TavilyClient
from openai import OpenAI
from crewai_tools import RagTool
from pydantic import Field, PrivateAttr
import os
from html import unescape
import re
import logging
class HybridRetrieverTool(RagTool):
    """Hybrid web retriever fusing lexical (BM25) and semantic relevance.

    Pipeline:
      1. ``_build_corpus`` fetches fresh results from Tavily web search and
         cleans them into plain-text passages plus citation URLs.
      2. ``_run`` scores each passage with BM25 (keyword overlap) and dense
         cosine similarity, fuses the two with weight ``alpha``, and
         re-ranks the top hits with an LLM.
      3. ``summarize_passages`` condenses retrieved passages into a short
         Markdown digest ending in a Sources list.
    """

    name: str = "Hybrid Retriever Tool"
    description: str = "Combines BM25 keyword scoring with semantic similarity for hybrid retrieval"
    alpha: float = Field(default=0.6, description="Weight between semantic and lexical scores")

    # Private attributes (excluded from the pydantic field schema).
    _embedder: SentenceTransformer = PrivateAttr()
    _tavily: TavilyClient = PrivateAttr()
    _client: OpenAI = PrivateAttr()

    def __init__(self, **data):
        """Create the sentence embedder and the Tavily / OpenAI clients.

        Reads TAVILY_API_KEY and OPENAI_API_KEY from the environment; a
        missing key surfaces on the first API call rather than here.
        """
        super().__init__(**data)
        self._embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self._tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # 🧹 Text Cleaning
    def _clean_text(self, text: str):
        """Clean raw search-result content into citable plain text.

        Removes HTML, Markdown artifacts, URLs, bullets, and common
        boilerplate while preserving high-value prose, and extracts source
        URLs for citation before they are scrubbed from the body.

        Returns:
            ``(clean_text, urls)`` on success, or ``(None, [])`` when the
            input is too short or is recognized boilerplate.
        """
        if not text or len(text.strip()) < 10:
            return None, []
        # Capture URLs for citation before cleaning strips them.
        urls = re.findall(r'https?://\S+', text)
        # Decode HTML entities, then strip markup.
        text = unescape(text)
        text = re.sub(r"<[^>]+>", " ", text)  # strip HTML tags
        text = re.sub(r"!\[.*?\]\(.*?\)", " ", text)  # remove Markdown images
        text = re.sub(r"\[.*?\]\(.*?\)", " ", text)  # remove Markdown links
        text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.I)
        text = re.sub(r"http\S+", " ", text)  # remove URLs inline
        # Remove layout and boilerplate junk.
        text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More|Click Here)+", " ", text, flags=re.I)
        text = re.sub(r"(Education Weekly Update.*?)+", " ", text, flags=re.I)
        text = re.sub(r"(\bAI\s*\+\s*){2,}", "AI ", text)  # collapse 'AI + AI + AI'
        text = re.sub(r"[•·●○◦‣⁃∙▪]+", " ", text)  # remove bullet symbols
        text = re.sub(r"(?m)^\s*#.*$", " ", text)  # remove markdown headers
        text = re.sub(r"\b[A-Z]{2,}\b( [A-Z]{2,}\b)+", " ", text)  # collapse ALLCAPS runs
        text = text.replace("\xa0", " ")  # remove non-breaking spaces
        text = re.sub(r"\s{2,}", " ", text).strip()  # normalize whitespace
        # Filter out boilerplate / short junk sections.
        if any(kw in text.lower() for kw in [
            "education weekly update",
            "copyright",
            "terms of use",
            "cookie policy",
            "advertisement",
            "site map",
        ]):
            return None, []
        # Fewer than 30 words is not a useful passage.
        if len(text.split()) < 30:
            return None, []
        # Capitalize the first character for readability.
        text = text[0].upper() + text[1:] if len(text) > 1 else text
        return text, urls

    def _build_corpus(self, topic: str, top_k: int = 8):
        """Fetch up-to-date search results and clean them into a corpus.

        Returns:
            ``(corpus, urls)`` — cleaned passages and up to ``top_k``
            deduplicated citation URLs (insertion order preserved).
        """
        results = self._tavily.search(query=topic, max_results=50)
        raw_texts = [r.get("content", "").strip() for r in results.get("results", []) if r.get("content")]
        corpus, all_urls = [], []
        for t in raw_texts:
            clean_text, urls = self._clean_text(t)
            if clean_text:
                corpus.append(clean_text)
                all_urls.extend(urls)
        # Deduplicate (dict.fromkeys keeps order) and keep top unique URLs.
        all_urls = list(dict.fromkeys(all_urls))[:top_k]
        return corpus, all_urls

    # LLM reranker
    def _rerank(self, query: str, passages: list[str], top_n: int) -> list[str]:
        """Use an LLM to re-rank retrieved passages for contextual relevance.

        Falls back to the original ordering (truncated to ``top_n``) when
        the model call fails or returns unusable output.
        """
        if not passages:
            return []
        try:
            formatted_passages = "\n\n".join(
                [f"Passage {i+1}:\n{p}" for i, p in enumerate(passages)]
            )
            prompt = f"""
You are a precise research assistant that ranks text passages for relevance.
Query:
"{query}"
Passages:
{formatted_passages}
Instructions:
- Rank passages by how directly and substantively they address the query.
- Ignore repetitive, boilerplate, or promotional content.
- Return ONLY the top {top_n} most relevant passages, in their original text form.
"""
            response = self._client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are an expert LLM reranker for information retrieval."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0,  # deterministic ranking
            )
            ranked_text = response.choices[0].message.content.strip()
            # Split the model's output back into individual passages.
            reranked = re.split(r"Passage\s*\d+:", ranked_text)
            reranked = [p.strip() for p in reranked if len(p.strip()) > 20]
            if not reranked:
                # Fix: was a bare print(); use logging so the fallback is
                # visible in application logs like the except branch below.
                logging.warning("Reranker returned no valid text, using original order.")
                return passages[:top_n]
            return reranked[:top_n]
        except Exception as e:
            logging.warning(f"Reranker failed: {e}")
            return passages[:top_n]

    def _run(self, query: str, top_k: int = 8) -> str:
        """Run hybrid search: BM25 + semantic similarity, then LLM rerank."""
        corpus, urls = self._build_corpus(query, top_k=top_k)
        if not corpus:
            return "No relevant content found."
        # Lexical relevance. Fix: lowercase tokens on both sides so BM25
        # keyword matching is case-insensitive ("AI" now matches "ai").
        bm25 = BM25Okapi([doc.lower().split() for doc in corpus])
        bm25_scores = np.array(bm25.get_scores(query.lower().split()))
        # Semantic relevance: dot product of L2-normalized embeddings
        # is cosine similarity.
        emb_corpus = self._embedder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)
        emb_query = self._embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True)
        sem_scores = np.dot(emb_corpus, emb_query)
        # Min-max normalize each score vector; a constant BM25 vector
        # (e.g. single-document corpus) maps to zeros instead of NaN.
        if np.ptp(bm25_scores) == 0:
            bm25_norm = np.zeros_like(bm25_scores)
        else:
            bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
        sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
        # Weighted fusion: alpha weights semantic, (1 - alpha) lexical.
        hybrid_scores = self.alpha * sem_norm + (1 - self.alpha) * bm25_norm
        top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
        top_passages = [corpus[i] for i in top_indices]
        reranked = self._rerank(query, top_passages, top_n=top_k)
        return "\n\n".join(reranked)

    def summarize_passages(self, topic: str, passages, top_k: int = 8):
        """Summarize retrieved content into a coherent short digest, keeping citations."""
        if isinstance(passages, str):
            passages = [passages]
        # Clean and compress passages, collecting citation URLs as we go.
        main_text = []
        urls = []
        for p in passages:
            text, found_urls = self._clean_text(p)
            if text:
                main_text.append(text)
                urls.extend(found_urls)
        if not main_text:
            return "No meaningful content found to summarize."
        # --- Limit and deduplicate (order-preserving) ---
        unique_texts = list(dict.fromkeys(main_text))[:5]  # prevent duplication
        text_block = " ".join(unique_texts)
        text_block = re.sub(r"\s{2,}", " ", text_block).strip()
        text_block = text_block[:5000]  # safety limit for token size
        unique_urls = list(dict.fromkeys(urls))[:top_k]
        # --- Structured summarization ---
        # Fix: the prompt previously asked the model to emit its own
        # "Sources" section while the code below appends one too, yielding
        # two Sources lists; the prompt now defers sourcing to the code.
        prompt = f"""
You are a research assistant creating a clean, readable summary.
Topic: {topic}
Condense the following information into **2–3 coherent paragraphs** that:
1. Focus on factual insights and trends, not raw data or footnotes.
2. Remove list items, footers, or numeric citations (like (1), (2)).
3. Retain key facts, organizations, or findings.
4. Avoid repeating words or phrases.
5. Do NOT add a "Sources" section; it is appended automatically.
Text to summarize:
{text_block}
Return output in Markdown format.
"""
        try:
            response = self._client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a concise, professional summarizer."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
            )
            summary = response.choices[0].message.content.strip()
            # Fix: removed a duplicated nested `if unique_urls:` guard.
            if unique_urls:
                summary += "\n\n**Sources:**\n" + "\n".join(f"- [{u}]({u})" for u in unique_urls)
            return summary
        except Exception as e:
            return f"Summarization failed: {e}"
|