| import re | |
# Abbreviation stems, written as regex fragments WITHOUT their final period.
# A period that directly follows one of these is not a sentence terminator.
_ABBREVIATION_STEMS = (
    "Apt",
    "Blvd",
    "Capt",
    "Dr",
    "Jr",
    "Mr",
    "Mrs",
    "Ms",
    r"Ph\.D",
    "Rd",
    "Sr",
    "St",
    r"e\.g",
    "etc",
    r"i\.e",
    "lit",
)

# Regex prefix made of negative lookbehinds; place it directly in front of a
# `\.` to refuse matching periods that belong to an abbreviation or initial.
#
# Each stem is guarded twice because Python lookbehinds must be fixed-width:
#   (?<!\sStem) - stem preceded by whitespace (anywhere inside the text)
#   (?<!^Stem)  - stem at the very start of the text, where there is no
#                 preceding whitespace character for `\s` to match
non_terminal_periods = "".join(
    [
        r"(?<!\s" + stem + r")" + r"(?<!^" + stem + r")"
        for stem in _ABBREVIATION_STEMS
    ]
    + [
        r"(?<!\s[A-Z])",  # single capital letter after whitespace, e.g. an initial
        r"(?<!\(r)",  # the two characters "(r"
        r"(?<!^[a-zA-Z0-9])",  # single letter/digit at the very start of the text
    ]
)
# Matches the text that ends a sentence. The whole match is one capturing
# group made of three alternatives, tried in this order.
naive_sentence_end_pattern = re.compile(
    "("
    + "|".join(
        (
            # Option 1: a run of consecutive newlines / carriage returns.
            r"[\n\r]+",
            # Option 2: one or more "!" or "?", optionally followed by a
            # double quote, and then whitespace or end-of-string.
            r"[!?]+\"?(?=\s|$)",
            # Option 3: one or more ".", optionally followed by a double
            # quote, and then whitespace or end-of-string - unless the
            # lookbehinds in non_terminal_periods reject what precedes it.
            non_terminal_periods + r"\.+\"?(?=\s|$)",
        )
    )
    + ")"
)
# Delimiter pattern for naive_tokenize(). Everything is wrapped in a single
# capturing group so that re.split() returns the delimiters as well as the
# text between them. Alternatives are tried left to right at each position,
# so their order matters.
naive_tokenize_pattern = re.compile(
    r"("
    r"\s+"  # whitespace runs
    r"|-+(?=\s|$)"  # hyphens directly before whitespace / end of text
    r"|(?<=\s)-+"  # hyphens directly after whitespace
    r"|-{2,}"  # two or more hyphens anywhere
    r"|–+"  # en dashes
    r"|—+"  # em dashes
    r"|(?<=[a-z])n’t(?=\s|$)"  # contraction "n’t" (curly apostrophe)
    r"|(?<=[a-z])n't(?=\s|$)"  # contraction "n't" (straight apostrophe)
    # Apostrophe followed by lowercase letters other than "t", ending a word
    # (matches suffixes such as 's, 're, 'll). The class used to be written
    # [a-s,u-z]; commas do not separate ranges inside a character class, so
    # that also matched a literal "," and glued "'," into a single token.
    # NOTE(review): "t" is presumably excluded to leave 't endings to the
    # n't rules above - confirm.
    r"|’[a-su-z]+(?=\s|$)"
    r"|'[a-su-z]+(?=\s|$)"
    r"|’+"  # any remaining curly apostrophes / closing single quotes
    r"|'+"  # any remaining straight apostrophes / single quotes
    r"|\"+"  # double quotes
    r"|`+"  # backticks
    r"|,+(?=\"|\s|$)"  # commas before a double quote, whitespace or end of text
    # Periods before a double quote, whitespace or end of text, unless the
    # lookbehinds in non_terminal_periods reject what precedes them.
    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
    r"|:+"
    r"|;+"
    r"|[?!]+(?=\"|\s|$)"  # ?/! before a double quote, whitespace or end of text
    r"|\(+"
    r"|\)+"
    r"|\[+"
    r"|]+"
    r"|\{+"
    r"|}+"
    r"|<+"
    r"|>+"
    r"|[\u4e00-\u9fff]"  # For Chinese characters, which are not space delimited
    r")"
)
def naive_tokenize(text: str):
    """Split *text* into a list of tokens using naive_tokenize_pattern.

    The pattern is applied with ``split``; because it is wrapped in a
    capturing group, the matched delimiters are returned alongside the text
    between them. Empty strings and pieces that begin with a space or a tab
    are discarded. Pieces that begin with any other whitespace character
    (for example a newline) are kept.
    """
    tokens = []
    for piece in naive_tokenize_pattern.split(text):
        if piece == "":
            continue
        if piece.startswith((" ", "\t")):
            continue
        tokens.append(piece)
    return tokens