Spaces:
Sleeping
Sleeping
update hybrid_retriever_tool file
Browse files
tools/hybrid_retriever_tool.py
CHANGED
|
@@ -7,10 +7,11 @@ from crewai_tools import RagTool
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
class HybridRetrieverTool(RagTool):
|
| 10 |
-
name = "Hybrid Retriever Tool"
|
| 11 |
-
description = "Combines BM25 keyword scoring with semantic similarity for hybrid retrieval"
|
| 12 |
|
| 13 |
def __init__(self, alpha=0.6):
|
|
|
|
| 14 |
self.alpha = alpha
|
| 15 |
self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
| 16 |
self.tavily = TavilyClient(api_key=os.getenv("TAVILITY_API_KEY"))
|
|
@@ -26,17 +27,19 @@ class HybridRetrieverTool(RagTool):
|
|
| 26 |
corpus.append(content)
|
| 27 |
return corpus
|
| 28 |
|
| 29 |
-
def _run(self, query, top_k=8):
|
| 30 |
"""
|
| 31 |
Run hybrid search: BM25 + semantic similarity.
|
| 32 |
"""
|
| 33 |
corpus = self._build_corpus(query)
|
| 34 |
if not corpus:
|
| 35 |
return "No relevant content found."
|
| 36 |
-
|
|
|
|
| 37 |
bm25 = BM25Okapi([doc.split() for doc in corpus])
|
| 38 |
bm25_scores = np.array(bm25.get(query.split()))
|
| 39 |
|
|
|
|
| 40 |
emb_corpus = self.embedder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)
|
| 41 |
emb_query = self.embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True)
|
| 42 |
sem_scores = np.dot(emb_corpus, emb_query)
|
|
@@ -61,7 +64,7 @@ class HybridRetrieverTool(RagTool):
|
|
| 61 |
model="gpt-4o-mini",
|
| 62 |
messages=[
|
| 63 |
{"role": "system", "content": "You are an expert summarizer."},
|
| 64 |
-
{"role": "user", "content": f"Summarize these passages about {topic}"}
|
| 65 |
],
|
| 66 |
temperature=0.3
|
| 67 |
)
|
|
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
class HybridRetrieverTool(RagTool):
|
| 10 |
+
name: str = "Hybrid Retriever Tool"
|
| 11 |
+
description: str = "Combines BM25 keyword scoring with semantic similarity for hybrid retrieval"
|
| 12 |
|
| 13 |
def __init__(self, alpha=0.6):
|
| 14 |
+
super().__init__()
|
| 15 |
self.alpha = alpha
|
| 16 |
self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
| 17 |
self.tavily = TavilyClient(api_key=os.getenv("TAVILITY_API_KEY"))
|
|
|
|
| 27 |
corpus.append(content)
|
| 28 |
return corpus
|
| 29 |
|
| 30 |
+
def _run(self, query: str, top_k=8) -> str:
|
| 31 |
"""
|
| 32 |
Run hybrid search: BM25 + semantic similarity.
|
| 33 |
"""
|
| 34 |
corpus = self._build_corpus(query)
|
| 35 |
if not corpus:
|
| 36 |
return "No relevant content found."
|
| 37 |
+
|
| 38 |
+
# Lexical relevance
|
| 39 |
bm25 = BM25Okapi([doc.split() for doc in corpus])
|
| 40 |
bm25_scores = np.array(bm25.get(query.split()))
|
| 41 |
|
| 42 |
+
# Semantic relevance
|
| 43 |
emb_corpus = self.embedder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)
|
| 44 |
emb_query = self.embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True)
|
| 45 |
sem_scores = np.dot(emb_corpus, emb_query)
|
|
|
|
| 64 |
model="gpt-4o-mini",
|
| 65 |
messages=[
|
| 66 |
{"role": "system", "content": "You are an expert summarizer."},
|
| 67 |
+
{"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
|
| 68 |
],
|
| 69 |
temperature=0.3
|
| 70 |
)
|