import re

# Matches one or more non-word characters: everything except Unicode
# letters, digits, and the underscore. Runs of these act as token
# delimiters. (re.UNICODE is the default for str patterns in Python 3,
# so no explicit flag is needed.)
_WS = re.compile(r"\W+")


def tokenize(s: str) -> list[str]:
    """
    Lowercase *s* and split it on runs of non-word characters.

    Returns [] for None or empty input. Note that underscores count as
    word characters under \\W, so "foo_bar" remains a single token.

    Used by BM25 to build the tokenized corpus and query.
    """
    if not s:
        return []
    # Filter out empty strings that re.split yields when the input
    # starts or ends with a delimiter (e.g. "  hi!" -> ["", "hi", ""]).
    return [t for t in _WS.split(s.lower()) if t]