Spaces:

taraky
/

Medical_Document_Retrieval

Running

File size: 416 Bytes

b7f3196

import re

# Token delimiter: any run of one or more non-word characters (whitespace,
# punctuation, symbols). Letters, digits and underscores count as word
# characters, so they remain inside tokens.
_WS = re.compile(r"\W+", re.UNICODE)


def tokenize(s: str) -> list[str]:
    """Break text into lowercase word tokens.

    The input is lowercased, then cut at every run of non-word characters.
    Empty fragments (produced when the text starts or ends with a
    delimiter) are discarded. A falsy input such as ``None`` or ``""``
    yields an empty list.

    Intended for BM25, which tokenizes both the corpus and the query
    with this function.
    """
    if s:
        pieces = _WS.split(s.lower())
        # filter(None, ...) drops the empty strings left at the edges.
        return list(filter(None, pieces))
    return []