File size: 416 Bytes
b7f3196 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
import re
# Token separator: one or more non-word characters. For a str pattern, \W is
# Unicode-aware by default in Python 3 (no re.UNICODE flag needed), so
# letters, digits and "_" from any script stay inside tokens, while
# punctuation and whitespace act as delimiters.
_WS = re.compile(r"\W+")


def tokenize(s: str) -> list[str]:
    """Lowercase ``s`` and split it into word tokens.

    The text is lowercased, then split on runs of non-word characters
    (punctuation, whitespace, symbols). Underscores count as word
    characters, so ``snake_case`` stays a single token.

    Used by BM25 to build the tokenized corpus and query.

    Args:
        s: Text to tokenize. Any falsy value (``None`` or ``""``) is
            tolerated and yields an empty list.

    Returns:
        The non-empty lowercase tokens in their original order; ``[]`` when
        the input is falsy or contains no word characters.
    """
    if not s:
        return []
    # Splitting produces empty strings when the text starts or ends with a
    # separator; drop them so callers only ever see real tokens.
    return [t for t in _WS.split(s.lower()) if t]
|