"""
BM25 keyword retriever, tuned for code.

Same interface as the StudyMate version, but with a code-aware tokenizer:
prose `.split()` turns "jwt.encode(token)" into the single junk token
"jwt.encode(token)", so a search for "jwt.encode" never matches. Splitting on
non-identifier characters instead yields ["jwt", "encode", "token"], so exact
symbol searches -- BM25's whole strength on code -- actually work.
"""
import re

from rank_bm25 import BM25Okapi

# A token is a maximal run of ASCII letters, digits, or underscores; every
# other character acts as a separator.
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


def tokenize_code(text):
    """Lower-case ``text`` and return its identifier-like tokens in order.

    Underscores count as token characters, so ``snake_case`` names stay
    whole, while punctuation such as ``.``, ``(`` and ``)`` separates tokens.
    Returns an empty list when ``text`` contains no token characters.
    """
    lowered = text.lower()
    return [match.group() for match in _TOKEN_RE.finditer(lowered)]


class BM25Retriever:
    """Keyword retriever that ranks chunks against a query with BM25.

    Each chunk is a mapping whose ``"chunk_text"`` entry holds the text to
    index. Chunk text and queries are both run through ``tokenize_code`` so
    they are compared on the same tokens.
    """

    def __init__(self, chunks):
        """Tokenize every chunk and build the BM25 index.

        Args:
            chunks: Indexable sequence of mappings, each with a
                ``"chunk_text"`` key. Kept as-is on ``self.chunks`` so search
                results can hand back the original chunk objects.
        """
        self.chunks = chunks
        documents = [tokenize_code(c["chunk_text"]) for c in chunks]
        # rank_bm25 divides by the corpus size while computing the average
        # document length, so an empty corpus would raise ZeroDivisionError.
        # Leave the index unset instead; search() then returns no results.
        self.bm25 = BM25Okapi(documents) if documents else None

    def search(self, query, k=5):
        """Return up to ``k`` chunks ranked by BM25 score, best first.

        Args:
            query: Query string; tokenized with ``tokenize_code``.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of ``{"score": float, "document": chunk}`` dicts in
            descending score order. Chunks that share no token with the query
            are not filtered out; they simply score lowest.
        """
        # Guard first: with k == 0 the slice below would be ``[-0:]``, i.e. the
        # whole array, and a negative k would select an arbitrary tail.
        if k <= 0 or self.bm25 is None:
            return []

        scores = self.bm25.get_scores(tokenize_code(query))
        # argsort is ascending: keep the last k positions, then reverse them
        # so the highest score comes first.
        top_indexes = scores.argsort()[-k:][::-1]
        return [
            {"score": float(scores[idx]), "document": self.chunks[idx]}
            for idx in top_indexes
        ]