File size: 420 Bytes
4f4965d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import re
from typing import List
def clean_text(text: str) -> str:
    """
    Minimal preprocessing: remove control chars, normalize whitespace.

    Control characters in the range ``\\x00-\\x1f`` that are not whitespace
    are deleted. Every run of whitespace (including tabs and newlines) is
    then collapsed to a single space, and leading/trailing whitespace is
    stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive spaces.
    """
    # Delete non-whitespace control characters *before* touching whitespace.
    # Doing it afterwards can leave double spaces ("a \x00 b" -> "a  b") or
    # edge spaces ("\x00 a" -> " a"). The (?!\s) lookahead keeps whitespace
    # control characters such as \t and \n so they still become a space below.
    text = re.sub(r"(?:(?!\s)[\x00-\x1f])+", "", text)
    text = re.sub(r"\s+", " ", text)
    # Collapsing can leave at most one space at either end; trim it last.
    return text.strip()
def tokenize(text: str) -> List[str]:
    """
    Split text into tokens on whitespace.

    The input is first passed through ``clean_text``; the result is then
    split on runs of whitespace. Input with no tokens yields an empty list.
    """
    normalized = clean_text(text)
    tokens = normalized.split()
    return tokens
|