File size: 1,755 Bytes
5a7f6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
051eb53
5a7f6ac
051eb53
5a7f6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
051eb53
5a7f6ac
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re

# Abbreviations (already regex-escaped) whose trailing period does NOT end a
# sentence.  Each one is turned into a negative lookbehind requiring leading
# whitespace, so "Mr." mid-sentence is not treated as a sentence boundary.
# The final entry, "[A-Z]", covers single-letter initials like "J.".
_NON_TERMINAL_ABBREVS = (
    "Apt", "Blvd", "Capt", "Dr", "Jr", "Mr", "Mrs", "Ms", r"Ph\.D",
    "Rd", "Sr", "St", r"e\.g", "etc", r"i\.e", "lit", "[A-Z]",
)

# Chain of fixed-width negative lookbehinds, spliced into the sentence-end
# and tokenize patterns below just before their `\.+` alternatives.
# Two extra cases beyond the abbreviation list:
#   (?<!\(r)          — period preceded by "(r"; presumably list/footnote
#                       markers like "(r." — TODO confirm intent
#   (?<!^[a-zA-Z0-9]) — a single alphanumeric at the very start of the input
non_terminal_periods = (
    "".join(rf"(?<!\s{abbrev})" for abbrev in _NON_TERMINAL_ABBREVS)
    + r"(?<!\(r)"
    + r"(?<!^[a-zA-Z0-9])"
)

# Naive sentence-boundary pattern.  The whole match is captured so that
# re.split() callers keep the terminator.  Three alternatives:
#   1. [\n\r]+                    — runs of newlines / carriage returns
#   2. [!?]+\"?(?=\s|$)           — ! or ? runs, optionally followed by a
#                                   closing double quote, then whitespace
#                                   or end-of-string
#   3. <lookbehinds>\.+\"?(?=\s|$) — period runs (same optional quote and
#                                   trailing context) NOT preceded by one of
#                                   the non-terminal abbreviations above
naive_sentence_end_pattern = re.compile(
    r"([\n\r]+"
    r"|[!?]+\"?(?=\s|$)"
    r"|"
    + non_terminal_periods +
    r"\.+\"?(?=\s|$))"
)

# Token-boundary pattern for naive_tokenize().  Every alternative is inside
# one capturing group, so re.split() keeps each delimiter as its own token;
# whitespace tokens are filtered out afterwards by naive_tokenize().
#
# Fix: the contraction-suffix classes below were "[a-s,u-z]", whose stray
# comma made the branch also match literal commas after an apostrophe
# (e.g. "’,,").  The intended class is a-z minus "t", so "n’t"/"n't" is
# handled exclusively by its own branches above.
naive_tokenize_pattern = re.compile(
    r"("
    r"\s+"                     # whitespace runs (dropped by naive_tokenize)
    r"|-+(?=\s|$)"             # hyphen run attached to the end of a word
    r"|(?<=\s)-+"              # hyphen run attached to the start of a word
    r"|-{2,}"                  # mid-word multi-hyphen (dash-like) runs
    r"|–+"                     # en dashes
    r"|—+"                     # em dashes
    r"|(?<=[a-z])n’t(?=\s|$)"  # contraction "n’t" (curly apostrophe)
    r"|(?<=[a-z])n't(?=\s|$)"  # contraction "n't" (straight apostrophe)
    r"|’[a-su-z]+(?=\s|$)"     # other contraction suffixes ('s, 've, 'll, …)
    r"|'[a-su-z]+(?=\s|$)"     # same, straight apostrophe
    r"|’+"                     # bare curly apostrophes / quotes
    r"|'+"                     # bare straight apostrophes
    r"|\"+"                    # double quotes
    r"|`+"                     # backticks
    r"|,+(?=\"|\s|$)"          # commas at a word boundary
    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"  # sentence-ending periods
    r"|:+"                     # colons
    r"|;+"                     # semicolons
    r"|[?!]+(?=\"|\s|$)"       # ! / ? at a word boundary
    r"|\(+"
    r"|\)+"
    r"|\[+"
    r"|]+"
    r"|\{+"
    r"|}+"
    r"|<+"
    r"|>+"
    r"|[\u4e00-\u9fff]"  # For Chinese characters, which are not space delimited
    r")"
)


def naive_tokenize(text: str):
    """Split *text* into tokens using naive_tokenize_pattern.

    Delimiters are kept as tokens (the pattern is one big capturing group);
    empty strings and whitespace runs beginning with a space or tab are
    discarded.  Tokens beginning with other whitespace (e.g. newlines) are
    kept, matching the original behavior.
    """
    pieces = naive_tokenize_pattern.split(text)
    return [tok for tok in pieces if tok and not tok.startswith((" ", "\t"))]