Spaces:
Runtime error
Runtime error
File size: 3,496 Bytes
import re
from collections import defaultdict
from typing import List
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
# Matches identifier- and path-like runs: a letter or underscore followed by
# letters, digits and the separators commonly found in code references.
TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")

# Separators TOKEN_RE allows inside a token; meaningless when they trail one
# (e.g. the full stop that ends a sentence or the colon after a label).
_TRAILING_PUNCTUATION = "./:-"


def tokenize(text: str) -> List[str]:
    """Split *text* into lowercase identifier/path-like tokens.

    Tokens are the matches of ``TOKEN_RE``. Trailing ``.``, ``/``, ``:`` and
    ``-`` characters are stripped so that a word followed by punctuation
    ("utils.py." or "this:") yields the same token as the bare word; without
    this, such tokens could never match their counterpart in the other text.
    Separators inside a token ("pkg.module", "src/app.py:42") are preserved.

    Args:
        text: Arbitrary text, e.g. a user query or a chunk's searchable text.

    Returns:
        The tokens in order of appearance. Runs that do not start with a
        letter or underscore (such as bare numbers) produce no token.
    """
    # Every match starts with a letter or underscore, which rstrip never
    # removes, so stripping cannot produce an empty token.
    return [
        token.rstrip(_TRAILING_PUNCTUATION).lower()
        for token in TOKEN_RE.findall(text)
    ]
class HybridSearchEngine:
    """Hybrid retrieval helpers: BM25 search, rank fusion and reranking.

    Results are plain dicts. Every method returns new dicts and leaves the
    dicts it was given untouched.
    """

    def __init__(self, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        """Create the cross-encoder used by :meth:`rerank`.

        Args:
            reranker_model: Model identifier handed to ``CrossEncoder``.
        """
        self.reranker = CrossEncoder(reranker_model)

    def build_for_repository(self, repo_id: int, chunks: List[dict]):
        """Do nothing and return ``None``.

        No per-repository state is kept by this class; :meth:`bm25_search`
        builds its index from the chunks passed to each call.
        """
        return None

    def remove_repository(self, repo_id: int):
        """Do nothing and return ``None``; there is no stored state to drop."""
        return None

    def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
        """Rank *chunks* against *query* with BM25 and return the best *top_k*.

        Args:
            chunks: Candidate chunks; each must have a ``"searchable_text"``
                key holding the text to index.
            query: Free-text query.
            top_k: Maximum number of results to return.

        Returns:
            Copies of the highest-scoring chunks, best first, each extended
            with ``"bm25_score"`` (float) and ``"bm25_rank"`` (1-based).
            Empty when there are no chunks, the query yields no tokens, or
            no chunk yields any token.

        Raises:
            KeyError: If a chunk has no ``"searchable_text"`` key.
        """
        if not chunks:
            return []

        query_tokens = tokenize(query)
        if not query_tokens:
            return []

        corpus_tokens = [tokenize(chunk["searchable_text"]) for chunk in chunks]
        # With an empty vocabulary BM25Okapi divides by zero while averaging
        # IDF values, and nothing could match anyway, so stop here.
        if not any(corpus_tokens):
            return []

        scores = BM25Okapi(corpus_tokens).get_scores(query_tokens)
        ranked = sorted(
            zip(chunks, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_k]

        return [
            {**chunk, "bm25_score": float(score), "bm25_rank": rank}
            for rank, (chunk, score) in enumerate(ranked, start=1)
        ]

    def reciprocal_rank_fusion(
        self,
        lexical_results: List[dict],
        semantic_results: List[dict],
        top_k: int = 10,
        k: int = 60,
    ) -> List[dict]:
        """Merge two ranked lists with reciprocal rank fusion.

        Each item contributes ``1 / (k + rank)`` (rank is 1-based within its
        list) to the score of its ``"id"``. Fields of items sharing an id are
        merged, with semantic fields overriding lexical ones on conflict.

        Args:
            lexical_results: Ranked results, best first; each needs ``"id"``.
            semantic_results: Ranked results, best first; each needs ``"id"``.
            top_k: Maximum number of fused results to return.
            k: Smoothing constant added to every rank.

        Returns:
            Merged dicts sorted by descending ``"rrf_score"``; ties keep
            first-seen order. At most *top_k* entries.

        Raises:
            KeyError: If an item has no ``"id"`` key.
        """
        fused = {}
        scores = {}
        for results in (lexical_results, semantic_results):
            for rank, item in enumerate(results, start=1):
                item_id = item["id"]
                scores[item_id] = scores.get(item_id, 0.0) + 1.0 / (k + rank)
                fused.setdefault(item_id, {"rrf_score": 0.0}).update(item)

        # Write the scores last: an input item may itself carry an
        # "rrf_score" key (e.g. output of an earlier fusion), and merging its
        # fields must not overwrite the score computed here.
        for item_id, entry in fused.items():
            entry["rrf_score"] = scores[item_id]

        merged = sorted(fused.values(), key=lambda entry: entry["rrf_score"], reverse=True)
        return merged[:top_k]

    def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
        """Score *candidates* against *query* with the cross-encoder.

        Callers should pass a small final *top_k* (4-6), NOT the retrieval
        search depth (which was up to 120): reranking 120 items and then
        putting all of them into the LLM context was the main cause of
        unfaithful answers.

        Args:
            query: Free-text query.
            candidates: Items to score; each needs ``"file_path"`` and
                ``"content"`` and may have ``"signature"``.
            top_k: Maximum number of results to return.

        Returns:
            Copies of the best-scoring candidates, best first, each extended
            with ``"rerank_score"`` (float). Empty when *candidates* is empty.

        Raises:
            KeyError: If a candidate lacks ``"file_path"`` or ``"content"``.
        """
        if not candidates:
            return []

        # The text scored for each candidate is path, signature (blank when
        # missing or None) and content, one per line.
        pairs = [
            [query, f'{item["file_path"]}\n{item.get("signature") or ""}\n{item["content"]}']
            for item in candidates
        ]
        scores = self.reranker.predict(pairs)

        reranked = [
            {**item, "rerank_score": float(score)}
            for item, score in zip(candidates, scores)
        ]
        reranked.sort(key=lambda item: item["rerank_score"], reverse=True)
        return reranked[:top_k]

    @staticmethod
    def normalize_semantic_results(results: List[dict]) -> List[dict]:
        """Attach a 1-based rank and a float score to semantic results.

        Args:
            results: Semantic results in ranked order, best first.

        Returns:
            Copies of the inputs, in the same order, each with
            ``"semantic_rank"`` set to its position and ``"semantic_score"``
            coerced to float. A missing or ``None`` score becomes ``0.0``.
        """
        normalized = []
        for rank, item in enumerate(results, start=1):
            raw_score = item.get("semantic_score")
            enriched = dict(item)
            enriched["semantic_rank"] = rank
            enriched["semantic_score"] = 0.0 if raw_score is None else float(raw_score)
            normalized.append(enriched)
        return normalized