"""Basic tokenization to separate non-word characters from tokens."""
import re

class NerTokenization:
    def __init__(self):
        self.tok_final = {".", ",", "-", "/", "\\", ";", ":", '"', "'", ")", "]"}
        self.tok_initial = {'"', "'", "(", "["}
        self.internal_split = {"/", ";", ":", "\\"}
        # alternation pattern: the escaped split characters joined on "|"
        self.split_regex = "|".join(map(re.escape, self.internal_split))
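        # e.g. split_regex becomes r"/|;|:|\\" (set order may vary), so
        # re.split can capture any internal separator as its own token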

    def right_strip(self, tok):
        """Peel trailing punctuation off a token, one character at a time."""
        retokenized = []
        if all(c in self.tok_final for c in tok):
            # the token is pure punctuation; keep it whole
            retokenized.append(tok)
        else:
            while True:
                if tok[-1] in self.tok_final:
                    tok, punc = tok[:-1], tok[-1]
                    retokenized.insert(0, punc)
                else:
                    retokenized.insert(0, tok)
                    break
        return retokenized
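    # e.g. right_strip('"dog."') -> ['"dog', '.', '"']; leading punctuation
    # is left in place for left_strip to handle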

    def left_strip(self, token_list):
        """Peel leading punctuation off each token in token_list."""
        retokenized = []
        for tok in token_list:
            if all(c in self.tok_final for c in tok):
                # pure punctuation already split off by right_strip; keep whole
                retokenized.append(tok)
            else:
                while tok:  # stop once the token is fully consumed
                    if tok[0] in self.tok_initial:
                        punc, tok = tok[0], tok[1:]
                        retokenized.append(punc)
                    else:
                        retokenized.append(tok)
                        break
        return retokenized
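    # e.g. left_strip(['"dog', '.', '"']) -> ['"', 'dog', '.', '"']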

    def tok_split(self, tok):
        """Split a token on internal separators, keeping each separator."""
        if any(c in self.internal_split for c in tok[:-1]):
            result = re.split(f"({self.split_regex})", tok)
            parts = [s for s in result if s]  # drop empty strings from the split
        else:
            parts = [tok]
        return parts
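    # e.g. tok_split("47:52") -> ["47", ":", "52"]; a trailing separator
    # (as in "52:") is left intact for right_strip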

    def tokenize_string(self, string_to_tokenize):
        """Tokenize a whitespace-split string and return the tokens space-joined."""
        tokenized_string = []
        toks = string_to_tokenize.split()
        for this_tok in toks:
            split_toks = self.tok_split(this_tok)  # split on internal separators first
            for tok in split_toks:
                r_stripped_toks = self.right_strip(tok)
                l_stripped_toks = self.left_strip(r_stripped_toks)
                tokenized_string.extend(l_stripped_toks)
        return " ".join(tokenized_string)

    @staticmethod
    def convert_to_training_format(conll_file_path):
        pass


if __name__ == "__main__":
    s = 'The "dog." is (here): to stay, I think. 47-52'
    tokenizer = NerTokenization()
    s2 = tokenizer.tokenize_string(s)
    print(s2)
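    # expected output: The " dog . " is ( here ) : to stay , I think . 47-52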