Spaces:
Running
Running
| """ | |
| BM25 keyword retriever, tuned for code. | |
| Same interface as the StudyMate version, but with a code-aware tokenizer: | |
| prose `.split()` turns "jwt.encode(token)" into the single junk token | |
| "jwt.encode(token)", so a search for "jwt.encode" never matches. Splitting on | |
| non-identifier characters instead yields ["jwt", "encode", "token"], so exact | |
| symbol searches -- BM25's whole strength on code -- actually work. | |
| """ | |
| import re | |
| from rank_bm25 import BM25Okapi | |
# Match maximal runs of word characters (letters, digits, underscore);
# everything else acts as a separator. `\w` on a str pattern is Unicode-aware,
# so identifiers and comments containing non-ASCII letters stay whole instead
# of being cut at every accented character.
_TOKEN_RE = re.compile(r"\w+")


def tokenize_code(text):
    """Split *text* into lowercase identifier-like tokens.

    The text is lowercased first, then every maximal run of word characters
    becomes one token; punctuation, operators and whitespace are dropped.
    For example ``"jwt.encode(token)"`` yields ``["jwt", "encode", "token"]``.

    Args:
        text: Source code or a search query, as a string.

    Returns:
        A list of lowercase tokens in order of appearance; empty when the
        text contains no word characters.
    """
    return _TOKEN_RE.findall(text.lower())
class BM25Retriever:
    """BM25 keyword index over a list of chunk dicts.

    Each chunk's text is read from its ``"chunk_text"`` key and tokenized with
    ``tokenize_code``; the chunk dicts themselves are handed back unchanged in
    search results.
    """

    def __init__(self, chunks):
        """Tokenize every chunk and build the BM25 index.

        Args:
            chunks: Indexable collection of dicts, each holding its text
                under ``"chunk_text"``. May be empty.
        """
        self.chunks = chunks
        documents = [tokenize_code(chunk["chunk_text"]) for chunk in chunks]
        # An empty corpus cannot be indexed by BM25Okapi (it divides by the
        # corpus size), so keep no index and let search() return nothing.
        self.bm25 = BM25Okapi(documents) if documents else None

    def search(self, query, k=5):
        """Return up to *k* chunks ranked by BM25 score, best first.

        Args:
            query: Search string; tokenized the same way as the chunks.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of ``{"score": float, "document": chunk}`` dicts in
            descending score order. Chunks are not filtered by score, so
            low-scoring chunks can appear when few chunks match the query.
        """
        # Guard first: `[-k:]` with k == 0 would select *every* index, and a
        # negative k would slice from the front instead of the back.
        if k <= 0 or self.bm25 is None:
            return []

        scores = self.bm25.get_scores(tokenize_code(query))
        # argsort is ascending: take the last k indexes, then reverse them so
        # the highest score comes first.
        top_indexes = scores.argsort()[-k:][::-1]
        return [
            {"score": float(scores[idx]), "document": self.chunks[idx]}
            for idx in top_indexes
        ]