# ner-explorer / util.py
# veryfansome's picture
# feat: functional CLI editor
# 051eb53
import re
# Abbreviations (already regex-escaped) whose trailing period must not be
# treated as a sentence end or split off as its own token.
_NON_TERMINAL_ABBREVIATIONS = (
    "Apt",
    "Blvd",
    "Capt",
    "Dr",
    "Jr",
    "Mr",
    "Mrs",
    "Ms",
    r"Ph\.D",
    "Rd",
    "Sr",
    "St",
    r"e\.g",
    "etc",
    r"i\.e",
    "lit",
)

# A chain of negative lookbehinds meant to be placed directly in front of
# ``\.``: the period only matches when none of them applies.
non_terminal_periods = (
    "".join(
        # Python lookbehinds must be fixed-width, so "abbreviation after
        # whitespace" and "abbreviation at the very start of the text" need
        # two separate assertions. Without the ``^`` variant a text that
        # begins with e.g. "Dr. Smith" would be split right after "Dr".
        rf"(?<!\s{abbr})(?<!^{abbr})"
        for abbr in _NON_TERMINAL_ABBREVIATIONS
    )
    # A single capital letter after whitespace, e.g. the initial in "J. Doe".
    + r"(?<!\s[A-Z])"
    # An "r" right after an opening parenthesis, i.e. "(r."
    # NOTE(review): presumably the "reigned" abbreviation -- confirm.
    + r"(?<!\(r)"
    # A single letter or digit at the very start of the text ("A.", "1.").
    + r"(?<!^[a-zA-Z0-9])"
)

# Matches the text that ends a sentence. Three alternatives, in order:
#   1. [\n\r]+            - a run of newlines / carriage returns
#   2. [!?]+\"?(?=\s|$)   - one or more ! or ?, an optional closing double
#                           quote, then whitespace or end-of-string
#   3. non_terminal_periods + \.+\"?(?=\s|$)
#                         - one or more periods not preceded by a
#                           non-terminal abbreviation, an optional closing
#                           double quote, then whitespace or end-of-string
# The whole pattern is one capturing group, so ``split`` keeps the matched
# terminators in its result.
naive_sentence_end_pattern = re.compile(
    r"("
    r"[\n\r]+"
    r"|[!?]+\"?(?=\s|$)"
    r"|" + non_terminal_periods + r"\.+\"?(?=\s|$)"
    r")"
)

# Matches every piece of text that should become its own token (or, for
# whitespace, a separator). Alternatives are tried in the order listed, and
# the whole pattern is one capturing group so ``split`` returns the matched
# pieces alongside the text between them.
naive_tokenize_pattern = re.compile(
    "("
    + "|".join((
        # Whitespace runs.
        r"\s+",
        # Hyphens: trailing, leading (after whitespace), or two-or-more
        # anywhere; single hyphens inside a word are left alone.
        r"-+(?=\s|$)",
        r"(?<=\s)-+",
        r"-{2,}",
        # En and em dashes.
        r"–+",
        r"—+",
        # The negation clitic n't (curly and straight apostrophe) at the end
        # of a word, e.g. "don't" -> "do" + "n't".
        r"(?<=[a-z])n’t(?=\s|$)",
        r"(?<=[a-z])n't(?=\s|$)",
        # Other apostrophe clitics ('s, 're, 'll, ...): an apostrophe plus
        # lowercase letters other than "t". The clitic may be followed by a
        # comma, which is left for the comma alternative below, so
        # "John's, and" yields "'s" and "," as separate tokens.
        r"’[a-su-z]+(?=,|\s|$)",
        r"'[a-su-z]+(?=,|\s|$)",
        # Any remaining apostrophes / quotes / backticks.
        r"’+",
        r"'+",
        r"\"+",
        r"`+",
        # Commas followed by a double quote, whitespace or end-of-string.
        r",+(?=\"|\s|$)",
        # Periods that are not part of a non-terminal abbreviation.
        non_terminal_periods + r"\.+(?=\"|\s|$)",
        r":+",
        r";+",
        r"[?!]+(?=\"|\s|$)",
        # Brackets of every kind.
        r"\(+",
        r"\)+",
        r"\[+",
        r"]+",
        r"\{+",
        r"}+",
        r"<+",
        r">+",
        # For Chinese characters, which are not space delimited.
        r"[\u4e00-\u9fff]",
    ))
    + ")"
)
def naive_tokenize(text: str):
    """Split *text* into a list of token strings.

    The text is split with ``naive_tokenize_pattern``; because that pattern
    is a single capturing group, the matched pieces (punctuation, clitics,
    ...) are returned together with the text found between them.

    Empty strings and any piece that starts with a space or a tab are
    discarded.

    NOTE(review): pieces that start with other whitespace, such as a
    newline, are not filtered out -- confirm that this is intended.
    """
    pieces = naive_tokenize_pattern.split(text)
    return [
        piece
        for piece in pieces
        # ``startswith`` with a tuple checks both prefixes in one call.
        if piece != "" and not piece.startswith((" ", "\t"))
    ]