File size: 416 Bytes
b7f3196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import re

# Delimiter pattern: one or more consecutive non-word characters (punctuation,
# whitespace, ...). Letters, digits and underscores are word characters and
# therefore survive inside tokens.
_WS = re.compile(r"\W+", re.UNICODE)


def tokenize(s: str) -> list[str]:
    """Split *s* into lowercase word tokens.

    The text is lowercased and then cut at every run of non-word
    characters; empty pieces (produced by leading or trailing delimiters)
    are discarded. A falsy input (``None`` or ``""``) yields ``[]``.

    Used by BM25 to build the tokenized corpus and query.
    """
    if s:
        pieces = _WS.split(s.lower())
        # filter(None, ...) drops the empty strings left by edge delimiters.
        return list(filter(None, pieces))
    return []