Spaces:

AishaSurve
/

codebase-agent

Running

codebase-agent / src /rag /bm25_search.py

Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval

8e72e1f 8 days ago

1.21 kB

	"""
	BM25 keyword retriever, tuned for code.

	Same interface as the StudyMate version, but with a code-aware tokenizer:
	prose `.split()` turns "jwt.encode(token)" into the single junk token
	"jwt.encode(token)", so a search for "jwt.encode" never matches. Splitting on
	non-identifier characters instead yields ["jwt", "encode", "token"], so exact
	symbol searches -- BM25's whole strength on code -- actually work.
	"""
	import re

	from rank_bm25 import BM25Okapi

	# A token is a maximal run of ASCII letters, digits, or underscores; every
	# other character acts as a separator and is discarded.
	_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


	def tokenize_code(text):
	    """Return the lower-cased identifier-like tokens found in ``text``.

	    The text is lower-cased first, then each run matched by ``_TOKEN_RE`` is
	    emitted in order of appearance, e.g. ``"jwt.encode(token)"`` becomes
	    ``["jwt", "encode", "token"]``. Text with no such run yields ``[]``.
	    """
	    lowered = text.lower()
	    return [match.group(0) for match in _TOKEN_RE.finditer(lowered)]


	class BM25Retriever:
	    """Rank code chunks against a query with BM25 (``BM25Okapi``).

	    Each chunk is a mapping whose ``"chunk_text"`` entry is tokenized with
	    ``tokenize_code`` and indexed. The chunk objects themselves are kept so
	    search results can hand them back unchanged.
	    """

	    def __init__(self, chunks):
	        """Build the BM25 index over ``chunks``.

	        Args:
	            chunks: Sequence of mappings, each with a ``"chunk_text"``
	                string. May be empty, in which case no index is built and
	                every search returns ``[]``.
	        """
	        self.chunks = chunks
	        documents = [tokenize_code(c["chunk_text"]) for c in chunks]
	        # rank_bm25 divides by the corpus size while computing the average
	        # document length, so an empty corpus cannot be indexed. Keep None
	        # as the "nothing to search" marker instead of failing here.
	        self.bm25 = BM25Okapi(documents) if documents else None

	    def search(self, query, k=5):
	        """Return up to ``k`` chunks, best BM25 score first.

	        Args:
	            query: Search text; tokenized with ``tokenize_code``.
	            k: Maximum number of results. Values ``<= 0`` yield ``[]``.

	        Returns:
	            A list of ``{"score": float, "document": chunk}`` dicts in
	            descending score order. No score threshold is applied, so
	            chunks that share no token with the query can still appear
	            when fewer than ``k`` chunks match.
	        """
	        # Guard before slicing: ``[-0:]`` is the whole array, so k=0 would
	        # otherwise return every chunk, and a negative k would slice an
	        # arbitrary tail.
	        if self.bm25 is None or k <= 0:
	            return []

	        scores = self.bm25.get_scores(tokenize_code(query))
	        # argsort is ascending: take the last k positions, then reverse.
	        top_indexes = scores.argsort()[-k:][::-1]
	        return [
	            {
	                "score": float(scores[idx]),
	                "document": self.chunks[idx],
	            }
	            for idx in top_indexes
	        ]