Spaces:
Sleeping
Sleeping
| """Sparse retrieval via BM25Okapi over the persisted token corpus.""" | |
| from __future__ import annotations | |
| from functools import lru_cache | |
| from rank_bm25 import BM25Okapi | |
| from config import RETRIEVAL_CONFIG | |
| from ingestion.indexer import load_bm25_corpus, tokenize | |
| from retrieval.dense import Hit | |
@lru_cache(maxsize=1)
def _bm25_state():
    """Load the persisted corpus and build the BM25 index over it.

    Returns:
        A ``(bm25, corpus)`` tuple, where ``corpus`` is whatever
        ``load_bm25_corpus()`` returns and ``bm25`` is a ``BM25Okapi``
        built from ``corpus["tokenized"]``.

    The result is memoized: the corpus is loaded and the index is built
    once per process instead of on every query. Callers share the same
    objects, so treat them as read-only. After the persisted corpus has
    been rebuilt, call ``_bm25_state.cache_clear()`` so the next call
    reloads it.
    """
    corpus = load_bm25_corpus()
    bm25 = BM25Okapi(corpus["tokenized"])
    return bm25, corpus
def sparse_search(query: str, k: int | None = None) -> list[Hit]:
    """Return the best BM25 matches for *query* as ``Hit`` objects.

    Args:
        query: Raw query text; it is passed through ``tokenize`` first.
        k: Maximum number of candidates to consider. A falsy value
            (``None`` or ``0``) falls back to
            ``RETRIEVAL_CONFIG["sparse_k"]``.

    Returns:
        Hits in descending score order. Each hit's ``score`` is its BM25
        score divided by the top candidate's score, and ``rank`` is its
        position among the top candidates. Candidates whose raw score is
        zero or negative are left out. An empty list is returned when the
        query yields no tokens.
    """
    limit = k or RETRIEVAL_CONFIG["sparse_k"]
    bm25, corpus = _bm25_state()

    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    scores = bm25.get_scores(query_tokens)

    # Stable descending sort of document positions by score, then keep
    # only the leading `limit` candidates.
    order = list(range(len(scores)))
    order.sort(key=scores.__getitem__, reverse=True)
    top = order[:limit]
    if not top:
        return []

    # The first candidate carries the highest score; it is the
    # normalisation denominator for every hit.
    best = float(scores[top[0]])

    results: list[Hit] = []
    for position, doc_index in enumerate(top):
        raw = float(scores[doc_index])
        # Written as a negated `<=` so the comparison is exactly the
        # complement of the skip condition `raw <= 0`.
        if not raw <= 0:
            if best > 0:
                normalised = raw / best
            else:
                normalised = 0.0
            results.append(
                Hit(
                    chunk_id=corpus["ids"][doc_index],
                    text=corpus["docs"][doc_index],
                    # Copy so callers cannot mutate the stored metadata.
                    metadata=dict(corpus["metas"][doc_index]),
                    score=normalised,
                    rank=position,
                )
            )
    return results