import re

# NOTE(review): this pattern text looks truncated/garbled in the visible
# source -- "(?+" is not a valid regex extension, so this string would fail
# to compile as-is. It is reproduced verbatim here; restore the original
# pattern from version control before relying on it.
non_terminal_periods = (
    r"(?+"
    r"|[\u4e00-\u9fff]"  # Chinese characters are not space delimited
    r")"
)


def naive_tokenize(text: str):
    """Split *text* into tokens using ``naive_tokenize_pattern``.

    The text is split with the module-level ``naive_tokenize_pattern``; empty
    pieces and pieces that begin with a space or a tab are discarded, and the
    remaining pieces are returned in order as a list.

    NOTE(review): ``naive_tokenize_pattern`` is not defined in the visible
    part of this file -- confirm it is defined elsewhere in the module.
    """
    return [
        piece
        for piece in naive_tokenize_pattern.split(text)
        if piece != "" and not piece.startswith((" ", "\t"))
    ]