""" BM25 keyword retriever, tuned for code. Same interface as the StudyMate version, but with a code-aware tokenizer: prose `.split()` turns "jwt.encode(token)" into the single junk token "jwt.encode(token)", so a search for "jwt.encode" never matches. Splitting on non-identifier characters instead yields ["jwt", "encode", "token"], so exact symbol searches -- BM25's whole strength on code -- actually work. """ import re from rank_bm25 import BM25Okapi # Split on anything that isn't a letter, digit, or underscore. _TOKEN_RE = re.compile(r"[A-Za-z0-9_]+") def tokenize_code(text): return _TOKEN_RE.findall(text.lower()) class BM25Retriever: def __init__(self, chunks): self.chunks = chunks documents = [tokenize_code(c["chunk_text"]) for c in chunks] self.bm25 = BM25Okapi(documents) def search(self, query, k=5): tokens = tokenize_code(query) scores = self.bm25.get_scores(tokens) results = [] top_indexes = scores.argsort()[-k:][::-1] for idx in top_indexes: results.append({ "score": float(scores[idx]), "document": self.chunks[idx], }) return results