Spaces:
Runtime error
Runtime error
| """ | |
| retriever.py | |
| BM25-based retrieval over the support corpus. | |
| No LLM needed — pure keyword matching, fast and reliable. | |
| """ | |
| import re | |
| from rank_bm25 import BM25Okapi | |
| def _tokenize(text: str) -> list[str]: | |
| return re.findall(r"[a-z0-9]+", text.lower()) | |
class DomainRetriever:
    """BM25 keyword retriever over the documents of a single domain.

    Every document is a dict that must provide ``"title"`` and ``"text"``
    entries; the two are tokenized together to build the index.
    """

    def __init__(self, docs: list[dict]):
        """Tokenize *docs* and build the BM25 index.

        Args:
            docs: Documents to index. Each one needs ``"title"`` and
                ``"text"`` keys (a missing key raises ``KeyError``).
        """
        self.docs = docs
        tokenized = [_tokenize(f"{d['title']} {d['text']}") for d in docs]
        # Build the index only when at least one document produced a token.
        # An empty list is not the only degenerate corpus: documents whose
        # title and text contain no ASCII letters/digits tokenize to [].
        # NOTE(review): rank_bm25 appears to divide by the vocabulary size
        # when averaging IDF, so an all-empty corpus would raise
        # ZeroDivisionError -- confirm against the installed version.
        self.bm25 = BM25Okapi(tokenized) if any(tokenized) else None

    def retrieve(self, query: str, top_k: int = 4) -> list[dict]:
        """Return the best-scoring documents for *query*.

        Args:
            query: Free-text query; tokenized the same way as the documents.
            top_k: Maximum number of documents to return. Zero or a
                negative value yields an empty list.

        Returns:
            Shallow copies of the matching documents in descending score
            order. Each copy gains a ``"score"`` entry (the BM25 score
            rounded to 3 decimals) and a ``"snippet"`` entry (the first
            1000 characters of ``"text"``). Documents whose score is not
            strictly positive are left out, so fewer than *top_k* results
            may come back.
        """
        # Check the cheap, index-free conditions first. A negative top_k
        # would otherwise slice off only the lowest-ranked document.
        if top_k <= 0 or not self.docs or self.bm25 is None:
            return []

        query_tokens = _tokenize(query)
        if not query_tokens:
            # With no query terms every score is zero and would be filtered
            # out below anyway; skip the scoring pass.
            return []

        scores = self.bm25.get_scores(query_tokens)
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

        results = []
        for idx in ranked[:top_k]:
            score = float(scores[idx])
            if score > 0:
                # Copy so the stored corpus document is never mutated.
                doc = dict(self.docs[idx])
                doc["score"] = round(score, 3)
                doc["snippet"] = doc["text"][:1000]
                results.append(doc)
        return results
class MultiDomainRetriever:
    """Holds one ``DomainRetriever`` per domain and queries one or all of them."""

    def __init__(self, corpus: dict[str, list[dict]]):
        """Build a retriever for every domain in *corpus*.

        Args:
            corpus: Mapping of domain name to that domain's document list.
        """
        self.retrievers = {}
        for name, documents in corpus.items():
            self.retrievers[name] = DomainRetriever(documents)

    def retrieve_for_domain(self, domain: str, query: str, top_k: int = 4) -> list[dict]:
        """Search a single domain.

        Args:
            domain: Name of the domain to search.
            query: Free-text query.
            top_k: Maximum number of hits to request from that domain.

        Returns:
            The hits from that domain's retriever, or an empty list when
            *domain* has no retriever.
        """
        retriever = self.retrievers.get(domain)
        if not retriever:
            return []
        return retriever.retrieve(query, top_k)

    def retrieve_all(self, query: str, top_k_per_domain: int = 2) -> list[dict]:
        """Search every domain and merge the hits.

        Args:
            query: Free-text query.
            top_k_per_domain: Maximum number of hits requested per domain.

        Returns:
            All hits from all domains, each tagged with a ``"domain"``
            entry naming where it came from, ordered by ``"score"`` from
            highest to lowest.
        """
        merged = []
        for name, retriever in self.retrievers.items():
            hits = retriever.retrieve(query, top_k_per_domain)
            for hit in hits:
                hit["domain"] = name
            merged.extend(hits)
        # sorted() is stable, so equal scores keep domain insertion order.
        return sorted(merged, key=lambda hit: hit["score"], reverse=True)