File size: 420 Bytes
4f4965d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import re
from typing import List


def clean_text(text: str) -> str:
    """
    Minimal preprocessing: remove control chars, normalize whitespace.

    Non-whitespace ASCII control characters are deleted, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive spaces.
    """
    # Delete control characters *before* normalizing whitespace; doing it
    # afterwards can leave double spaces ("a \x00 b") or stray edge spaces
    # ("\x00 hi"). The class deliberately skips \t \n \v \f \r and
    # \x1c-\x1f: those match \s for str patterns and must become a
    # separator below rather than being deleted (which would glue words).
    text = re.sub(r"[\x00-\x08\x0e-\x1b]+", "", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last so whitespace exposed by the removals above is trimmed too.
    return text.strip()


def tokenize(text: str) -> List[str]:
    """
    Split text into tokens on whitespace.

    The input is first passed through clean_text; the cleaned string is
    then split with str.split() using its default (whitespace) separator.
    """
    return clean_text(text).split()