| import re | |
# Separator pattern: one or more consecutive non-word characters.
# Letters, digits and "_" count as word characters, so whitespace and
# punctuation other than the underscore act as token boundaries.
_WS = re.compile(r"\W+", re.UNICODE)


def tokenize(s: str) -> list[str]:
    """Split text into lowercase word tokens.

    The input is lowercased and then cut at every run of non-word
    characters. Underscores are word characters, so ``foo_bar`` stays a
    single token. Empty fragments produced by leading or trailing
    separators are discarded.

    Used by BM25 to build the tokenized corpus and query.

    Args:
        s: Text to tokenize. Any falsy value (``None``, ``""``) is accepted
            and yields an empty list.

    Returns:
        The tokens in order of appearance; ``[]`` when there are none.
    """
    if s:
        pieces = _WS.split(s.lower())
        # filter(None, ...) drops the empty strings that re.split emits at
        # the edges when the text starts or ends with a separator.
        return list(filter(None, pieces))
    return []