File size: 1,755 Bytes
5a7f6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
051eb53
5a7f6ac
051eb53
5a7f6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
051eb53
5a7f6ac
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re

# Abbreviations (already regex-escaped) whose trailing period does NOT end a
# sentence.  Each one is turned into a negative lookbehind requiring leading
# whitespace, so "Mr." mid-sentence is not treated as a sentence boundary.
# The final entry, "[A-Z]", covers single-letter initials like "J.".
_NON_TERMINAL_ABBREVS = (
    "Apt", "Blvd", "Capt", "Dr", "Jr", "Mr", "Mrs", "Ms", r"Ph\.D",
    "Rd", "Sr", "St", r"e\.g", "etc", r"i\.e", "lit", "[A-Z]",
)

# Chain of fixed-width negative lookbehinds, spliced into the sentence-end
# and tokenize patterns below just before their `\.+` alternatives.
# Two extra cases beyond the abbreviation list:
#   (?<!\(r)          — period preceded by "(r"; presumably list/footnote
#                       markers like "(r." — TODO confirm intent
#   (?<!^[a-zA-Z0-9]) — a single alphanumeric at the very start of the input
non_terminal_periods = (
    "".join(rf"(?<!\s{abbrev})" for abbrev in _NON_TERMINAL_ABBREVS)
    + r"(?<!\(r)"
    + r"(?<!^[a-zA-Z0-9])"
)

# Naive sentence-boundary pattern.  The whole match is captured so that
# re.split() callers keep the terminator.  Three alternatives:
#   1. [\n\r]+                    — runs of newlines / carriage returns
#   2. [!?]+\"?(?=\s|$)           — ! or ? runs, optionally followed by a
#                                   closing double quote, then whitespace
#                                   or end-of-string
#   3. <lookbehinds>\.+\"?(?=\s|$) — period runs (same optional quote and
#                                   trailing context) NOT preceded by one of
#                                   the non-terminal abbreviations above
naive_sentence_end_pattern = re.compile(
    r"([\n\r]+"
    r"|[!?]+\"?(?=\s|$)"
    r"|"
    + non_terminal_periods +
    r"\.+\"?(?=\s|$))"
)

# Token-boundary pattern for naive_tokenize().  Every alternative is inside
# one capturing group, so re.split() keeps each delimiter as its own token;
# whitespace tokens are filtered out afterwards by naive_tokenize().
#
# Fix: the contraction-suffix classes below were "[a-s,u-z]", whose stray
# comma made the branch also match literal commas after an apostrophe
# (e.g. "’,,").  The intended class is a-z minus "t", so "n’t"/"n't" is
# handled exclusively by its own branches above.
naive_tokenize_pattern = re.compile(
    r"("
    r"\s+"                     # whitespace runs (dropped by naive_tokenize)
    r"|-+(?=\s|$)"             # hyphen run attached to the end of a word
    r"|(?<=\s)-+"              # hyphen run attached to the start of a word
    r"|-{2,}"                  # mid-word multi-hyphen (dash-like) runs
    r"|–+"                     # en dashes
    r"|—+"                     # em dashes
    r"|(?<=[a-z])n’t(?=\s|$)"  # contraction "n’t" (curly apostrophe)
    r"|(?<=[a-z])n't(?=\s|$)"  # contraction "n't" (straight apostrophe)
    r"|’[a-su-z]+(?=\s|$)"     # other contraction suffixes ('s, 've, 'll, …)
    r"|'[a-su-z]+(?=\s|$)"     # same, straight apostrophe
    r"|’+"                     # bare curly apostrophes / quotes
    r"|'+"                     # bare straight apostrophes
    r"|\"+"                    # double quotes
    r"|`+"                     # backticks
    r"|,+(?=\"|\s|$)"          # commas at a word boundary
    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"  # sentence-ending periods
    r"|:+"                     # colons
    r"|;+"                     # semicolons
    r"|[?!]+(?=\"|\s|$)"       # ! / ? at a word boundary
    r"|\(+"
    r"|\)+"
    r"|\[+"
    r"|]+"
    r"|\{+"
    r"|}+"
    r"|<+"
    r"|>+"
    r"|[\u4e00-\u9fff]"  # For Chinese characters, which are not space delimited
    r")"
)


def naive_tokenize(text: str):
    """Split *text* into tokens using naive_tokenize_pattern.

    Delimiters are kept as tokens (the pattern is one big capturing group);
    empty strings and whitespace runs beginning with a space or tab are
    discarded.  Tokens beginning with other whitespace (e.g. newlines) are
    kept, matching the original behavior.
    """
    pieces = naive_tokenize_pattern.split(text)
    return [tok for tok in pieces if tok and not tok.startswith((" ", "\t"))]