File size: 1,755 Bytes
5a7f6ac 051eb53 5a7f6ac 051eb53 5a7f6ac 051eb53 5a7f6ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import re
# Regex fragment made of negative lookbehinds. Prepend it to a pattern that
# matches "." so that periods belonging to common abbreviations are not
# treated as terminal punctuation.
#
# Python lookbehinds must be fixed-width, so "start of text OR whitespace"
# cannot be written as a single alternation. Each abbreviation therefore gets
# two guards: one for "preceded by whitespace" and one for "at the very start
# of the text". Previously only the whitespace guard existed, which meant
# that e.g. "Dr. Smith ..." at the beginning of a text was split after "Dr".
# Note that "^" only anchors at the start of the whole text unless the final
# pattern is compiled with re.MULTILINE.
non_terminal_periods = (
    "".join(
        r"(?<!\s{abbr})(?<!^{abbr})".format(abbr=abbr)
        for abbr in (
            "Apt",
            "Blvd",
            "Capt",
            "Dr",
            "Jr",
            "Mr",
            "Mrs",
            "Ms",
            r"Ph\.D",
            "Rd",
            "Sr",
            "St",
            r"e\.g",
            "etc",
            r"i\.e",
            "lit",
        )
    )
    # A single capital letter after whitespace (an initial such as "J.").
    + r"(?<!\s[A-Z])"
    # An "r" directly after an opening parenthesis.
    + r"(?<!\(r)"
    # A single letter or digit at the very start of the text ("1.", "A.").
    + r"(?<!^[a-zA-Z0-9])"
)
# Matches the text that ends a sentence. The whole alternation sits in one
# capturing group and consists of three options:
#
#   1. [\n\r]+
#        One or more consecutive newline / carriage-return characters.
#   2. [!?]+\"?(?=\s|$)
#        One or more "!" or "?", optionally followed by a double quote, and
#        then whitespace or end-of-string (lookahead, not consumed).
#   3. non_terminal_periods \.+\"?(?=\s|$)
#        One or more ".", optionally followed by a double quote, and then
#        whitespace or end-of-string, provided the period is not preceded by
#        one of the non-terminal prefixes (abbreviations, initials, ...).
naive_sentence_end_pattern = re.compile(
    "("
    + "|".join(
        (
            r"[\n\r]+",
            r"[!?]+\"?(?=\s|$)",
            non_terminal_periods + r"\.+\"?(?=\s|$)",
        )
    )
    + ")"
)
# Separator pattern used to split text into tokens. The whole alternation is
# wrapped in a single capturing group so that re.split() also returns the
# matched separators (punctuation, contractions, ...) as list items.
# Alternatives are tried in order at each position, so the more specific
# forms (e.g. contractions) must come before the generic ones (bare quotes).
naive_tokenize_pattern = re.compile(
    r"("
    r"\s+"  # runs of whitespace
    r"|-+(?=\s|$)"  # hyphens followed by whitespace / end-of-string
    r"|(?<=\s)-+"  # hyphens preceded by whitespace
    r"|-{2,}"  # two or more hyphens anywhere
    r"|–+"  # en dashes
    r"|—+"  # em dashes
    r"|(?<=[a-z])n’t(?=\s|$)"  # n’t contraction (curly apostrophe)
    r"|(?<=[a-z])n't(?=\s|$)"  # n't contraction (straight apostrophe)
    # Apostrophe + lowercase suffix at the end of a word ('s, 're, 'll, ...).
    # Suffixes starting with "t" are excluded. The class used to be
    # [a-s,u-z], whose literal comma made "'," (closing quote followed by a
    # comma) come out as one token instead of "'" and ",".
    r"|’[a-su-z]+(?=\s|$)"
    r"|'[a-su-z]+(?=\s|$)"
    r"|’+"  # remaining curly apostrophes / quotes
    r"|'+"  # remaining straight apostrophes / quotes
    r"|\"+"  # double quotes
    r"|`+"  # backticks
    r"|,+(?=\"|\s|$)"  # commas followed by a quote, whitespace or end
    # Periods followed by a quote, whitespace or end, unless they belong to
    # an abbreviation or initial.
    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
    r"|:+"
    r"|;+"
    r"|[?!]+(?=\"|\s|$)"  # ?/! followed by a quote, whitespace or end
    r"|\(+"
    r"|\)+"
    r"|\[+"
    r"|]+"
    r"|\{+"
    r"|}+"
    r"|<+"
    r"|>+"
    r"|[\u4e00-\u9fff]"  # For Chinese characters, which are not space delimited
    r")"
)
def naive_tokenize(text: str):
    """Split *text* into a list of token strings.

    The text is split with ``naive_tokenize_pattern``; because that pattern
    captures its separators, punctuation matched by it is kept as tokens.
    Empty strings and pieces that begin with a space or a tab are dropped.

    NOTE(review): only a leading space/tab is checked, so a whitespace run
    that begins with another character (e.g. a newline) is kept as a token.
    Confirm whether that is intended.
    """
    tokens = []
    for piece in naive_tokenize_pattern.split(text):
        # Skip the empty strings re.split() yields between adjacent
        # separators, and whitespace runs led by a space or a tab.
        if piece == "" or piece.startswith((" ", "\t")):
            continue
        tokens.append(piece)
    return tokens