ethaneng committed on
Commit
97fb329
·
verified ·
1 Parent(s): f05971a

Upload ner_tokenization.txt

Browse files
Files changed (1) hide show
  1. ner_tokenization.txt +93 -0
ner_tokenization.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ """Basic Tokenization to separate non-word characters from tokens"""
4
+ class NerTokenization:
5
+ def __init__(self):
6
+ self.tok_final = {".", ",", "-", "/", "\\", ";", ":", '"', "'", ")", "]"}
7
+ self.tok_initial = {'"', "'", "(","["}
8
+ self.internal_split = {"/", ";", ":", "\\"}
9
+ self.split_regex = '|'.join(map(re.escape, self.internal_split)) # characters concatenated on logical or ("|")
10
+
11
+
12
+ def right_strip(self, tok):
13
+
14
+ retokenized = []
15
+
16
+ if all(c in self.tok_final for c in tok):
17
+ retokenized.append(tok)
18
+
19
+ else:
20
+ while True:
21
+ if tok[-1] in self.tok_final:
22
+ tok, punc = tok[:-1], tok[-1]
23
+ retokenized.insert(0, punc)
24
+ else:
25
+ retokenized.insert(0, tok)
26
+ break
27
+
28
+
29
+ return retokenized
30
+
31
+ def left_strip(self, token_list):
32
+
33
+ retokenized = []
34
+
35
+ for tok in token_list:
36
+
37
+ if all(c in self.tok_final for c in tok):
38
+ retokenized.append(tok)
39
+
40
+ else:
41
+ while True:
42
+ try:
43
+ if tok[0] in self.tok_initial:
44
+ punc, tok = tok[0], tok[1:]
45
+ retokenized.append(punc)
46
+ else:
47
+ retokenized.append(tok)
48
+ break
49
+ except:
50
+ break
51
+
52
+ return retokenized
53
+
54
+ def tok_split(self, tok):
55
+
56
+ if any(c in self.internal_split for c in tok[:-1]):
57
+
58
+ result = re.split(f'({self.split_regex})', tok)
59
+ tok_split = [s for s in result if s]
60
+
61
+ else:
62
+ tok_split = [tok]
63
+
64
+ return tok_split
65
+
66
+ def tokenize_string(self, string_to_tokenize):
67
+
68
+ tokenized_string = []
69
+
70
+ toks = string_to_tokenize.split()
71
+
72
+ for this_tok in toks:
73
+ split_toks = self.tok_split(this_tok) # check if splits
74
+
75
+ for tok in split_toks:
76
+ r_stripped_toks = self.right_strip(tok)
77
+ l_stripped_toks = self.left_strip(r_stripped_toks)
78
+ tokenized_string.extend(l_stripped_toks)
79
+
80
+ return " ".join(tokenized_string)
81
+
82
+ @staticmethod
83
+ def convert_to_training_format(conll_file_path):
84
+
85
+ pass
86
+
87
if __name__ == "__main__":
    # Demo: tokenize a sample sentence and print the space-joined result.
    sample_text = 'The "dog." is (here): to stay, I think. 47-52'
    print(NerTokenization().tokenize_string(sample_text))