from __future__ import annotations

import re

# Matches one or more consecutive characters that are NOT word characters.
# For ``str`` patterns ``\w`` is already Unicode-aware (letters, digits and
# the underscore); ``re.UNICODE`` is kept explicit to document that intent.
_SPLIT_RE = re.compile(r"[^\w]+", re.UNICODE)


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    The input is lowercased with ``str.lower`` and then split on every run
    of non-word characters (anything ``\\w`` does not match). Underscores
    count as word characters, so ``snake_case`` stays a single token.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens in order of appearance. Empty strings produced by
        leading, trailing or doubled separators are dropped, so an empty
        or separator-only input yields ``[]``.

    Examples:
        >>> tokenize("Hello, World!")
        ['hello', 'world']
        >>> tokenize("")
        []
    """
    # ``split`` emits "" at the edges when the text starts or ends with a
    # separator; the truthiness filter removes those.
    return [token for token in _SPLIT_RE.split(text.lower()) if token]