"""Tiny shared text helper.

Lives in its own module so the offline BM25 index (build_index.py) and the live query
tokenisation (app.py) use the EXACT same tokeniser. If these drift, BM25 scoring becomes
inconsistent between corpus and query.
"""

import re

# Matches one maximal run of "word" characters. For a str pattern, \w covers
# Unicode letters, digits and the underscore.
_WORD_RE = re.compile(r"\w+")


def tokenize(text: str) -> list[str]:
    """Return the lowercased word tokens of *text*, in order of appearance.

    The text is lowercased first, then every maximal run of ``\\w`` characters
    (letters, digits, underscore) becomes one token. Anything else --
    whitespace, punctuation, apostrophes, hyphens -- acts as a separator and
    is discarded, so ``"don't"`` yields ``["don", "t"]``.

    Returns an empty list when *text* contains no word characters.
    """
    lowered = text.lower()
    tokens = _WORD_RE.findall(lowered)
    return tokens