taraky's picture
Upload folder using huggingface_hub
b7f3196 verified
raw
history blame contribute delete
416 Bytes
import re
from typing import Optional
# Separator pattern: one or more consecutive non-word characters. Letters,
# digits and the underscore are word characters, so they stay inside tokens;
# punctuation and whitespace act as delimiters.
_WS = re.compile(r"\W+", flags=re.UNICODE)


def tokenize(s: Optional[str]) -> list[str]:
    """Lowercase ``s`` and split it into tokens on runs of non-word characters.

    Used by BM25 to build the tokenized corpus and query.

    Args:
        s: Text to tokenize. ``None`` and the empty string are accepted.

    Returns:
        The lowercased tokens in their original order, with empty pieces
        removed. An empty list when ``s`` is ``None`` or empty, or when it
        contains only separator characters.
    """
    if not s:
        return []
    # A separator at the start or end of the text makes split() emit empty
    # strings; filter them out so only real tokens are returned.
    return [t for t in _WS.split(s.lower()) if t]