| import re | |
| from typing import List | |
def clean_text(text: str) -> str:
    """
    Minimal preprocessing: remove control chars, normalize whitespace.

    Non-whitespace ASCII control characters (``\\x00-\\x08`` and
    ``\\x0e-\\x1b``) are deleted. Every run of whitespace -- including the
    whitespace control characters such as tab, newline and ``\\x1c-\\x1f`` --
    is collapsed to a single space, and leading/trailing whitespace is
    removed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with no leading, trailing or repeated spaces.
    """
    # Delete control characters *before* collapsing whitespace. Doing it
    # afterwards re-introduces double/leading/trailing spaces whenever a
    # control character sat between or next to whitespace ("a \x00 b").
    # Whitespace controls (\t \n \v \f \r and \x1c-\x1f) are deliberately
    # excluded here so they still act as separators in the next step.
    text = re.sub(r"[\x00-\x08\x0e-\x1b]+", "", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: after collapsing, any edge whitespace is a single space.
    return text.strip()
def tokenize(text: str) -> List[str]:
    """
    Split text into tokens on whitespace.

    The input is first passed through ``clean_text``; the result is then
    split with ``str.split()`` (no separator argument), so an empty or
    whitespace-only result yields an empty list.

    Args:
        text: The raw input string.

    Returns:
        The list of whitespace-delimited tokens, in order of appearance.
    """
    return clean_text(text).split()