codebase-agent / src /rag /bm25_search.py
AishaSurve's picture
Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval
8e72e1f
Raw
History Blame Contribute Delete
1.21 kB
"""
BM25 keyword retriever, tuned for code.
Same interface as the StudyMate version, but with a code-aware tokenizer:
prose `.split()` turns "jwt.encode(token)" into the single junk token
"jwt.encode(token)", so a search for "jwt.encode" never matches. Splitting on
non-identifier characters instead yields ["jwt", "encode", "token"], so exact
symbol searches -- BM25's whole strength on code -- actually work.
"""
import re
from rank_bm25 import BM25Okapi
# A token is a maximal run of ASCII letters, digits, or underscores; every
# other character (dots, parentheses, whitespace, ...) acts as a separator.
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


def tokenize_code(text):
    """Lowercase ``text`` and return its identifier-like tokens in order.

    Underscores are part of a token, so ``snake_case`` names stay whole.

    >>> tokenize_code("jwt.encode(token)")
    ['jwt', 'encode', 'token']
    """
    lowered = text.lower()
    return [match.group() for match in _TOKEN_RE.finditer(lowered)]
class BM25Retriever:
    """Keyword retriever that ranks chunks against a query with BM25.

    Each chunk is a mapping with a ``"chunk_text"`` entry. That text is
    tokenized with ``tokenize_code`` and indexed with ``BM25Okapi``; queries
    go through the same tokenizer so symbols match on both sides.
    """

    def __init__(self, chunks):
        """Build the BM25 index over ``chunks``.

        Args:
            chunks: Indexable sequence of mappings, each with a
                ``"chunk_text"`` string. The sequence itself is stored (not
                copied) so hits can hand back the original chunk objects.
        """
        self.chunks = chunks
        documents = [tokenize_code(c["chunk_text"]) for c in chunks]
        # rank_bm25 averages over the corpus size and the vocabulary size
        # while building its statistics, which raises ZeroDivisionError when
        # there are no chunks or no chunk produced a single token. In that
        # case there is nothing to match, so skip the index entirely.
        self.bm25 = BM25Okapi(documents) if any(documents) else None

    def search(self, query, k=5):
        """Return up to ``k`` chunks for ``query``, best score first.

        Args:
            query: Query string; tokenized the same way as the chunks.
            k: Maximum number of hits to return. Values <= 0 yield no hits.

        Returns:
            A list of ``{"score": float, "document": chunk}`` dicts in
            descending score order. No minimum score is applied. The list is
            empty when ``k <= 0`` or when nothing could be indexed.
        """
        # Guard before slicing: ``[-k:]`` with k == 0 is ``[0:]`` (every
        # chunk), and a negative k would slice from the front instead.
        if k <= 0 or self.bm25 is None:
            return []
        scores = self.bm25.get_scores(tokenize_code(query))
        # argsort is ascending: keep the last k positions, then reverse so
        # the highest score comes first.
        top_indexes = scores.argsort()[-k:][::-1]
        return [
            {"score": float(scores[idx]), "document": self.chunks[idx]}
            for idx in top_indexes
        ]