"""Tiny shared text helper. Lives in its own module so the offline BM25 index (build_index.py) and the live query tokenisation (app.py) use the EXACT same tokeniser. If these drift, BM25 scoring becomes inconsistent between corpus and query. """ import re _WORD_RE = re.compile(r"\w+") def tokenize(text: str) -> list[str]: """Lowercase + split on word characters. Simple and good enough for a teaching demo.""" return _WORD_RE.findall(text.lower())