|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Code from |
|
|
https://github.com/NVIDIA/DeepLearningExamples/blob/ |
|
|
master/PyTorch/Translation/Transformer/fairseq/tokenizer.py |
|
|
""" |
|
|
|
|
|
import re |
|
|
import sys |
|
|
import unicodedata |
|
|
from collections import defaultdict |
|
|
|
|
|
# Public API: the category-table builder and the English tokenizer.
__all__ = ['get_unicode_categories', 'tokenize_en']
|
|
|
|
|
|
|
|
def get_unicode_categories():
    """Build a table mapping each Unicode general category to its characters.

    Scans every code point up to ``sys.maxunicode`` and groups the
    characters by ``unicodedata.category`` (e.g. ``'Lu'``, ``'No'``).

    Returns:
        ``defaultdict(list)`` mapping category name -> list of characters;
        looking up an unknown category yields an empty list.
    """
    categories = defaultdict(list)
    for codepoint in range(sys.maxunicode + 1):
        character = chr(codepoint)
        categories[unicodedata.category(character)].append(character)
    return categories
|
|
|
|
|
|
|
|
# All characters in Unicode category 'No' (Number, other: superscripts,
# fractions, ...), joined into one string for use inside a regex character
# class in tokenize_en. NOTE: building this scans the full Unicode range
# once at import time.
NUMERICS = ''.join(get_unicode_categories()['No'])
|
|
|
|
|
|
|
|
def tokenize_en(line):
    """Tokenize an English sentence, Moses-tokenizer style.

    Normalizes whitespace, splits punctuation off words (intra-word
    hyphens become ``@-@``), protects multi-dot runs ("...") while
    periods are split, and finally HTML-escapes the Moses special
    characters ``& | < > ' " [ ]``.

    Args:
        line: raw input sentence (str).

    Returns:
        The tokenized sentence as a single space-separated string.
    """
    line = line.strip()
    # Pad the ends so edge characters are handled like mid-string ones.
    line = ' ' + line + ' '

    # Collapse whitespace and drop ASCII control characters.
    line = re.sub(r'\s+', ' ', line)
    line = re.sub(r'[\x00-\x1F]', '', line)

    line = re.sub(r'\ +', ' ', line)
    line = re.sub('^ ', '', line)
    line = re.sub(' $', '', line)

    # Space out every symbol that is not whitespace, word char, or one of
    # . ' ` , -  ; also '_' and the "other number" characters (NUMERICS).
    line = re.sub(r'([^\s\.\'\`\,\-\w]|[_' + NUMERICS + '])', r' \g<1> ', line)
    # Aggressive hyphen splitting: intra-word '-' becomes ' @-@ '.
    line = re.sub(r'(\w)\-(?=\w)', r'\g<1> @-@ ', line)

    # Protect runs of dots as DOT...MULTI placeholders so the period
    # rules below cannot tear "..." apart; restored near the end.
    line = re.sub(r'\.([\.]+)', r' DOTMULTI\g<1>', line)
    while re.search(r'DOTMULTI\.', line):
        line = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \g<1>', line)
        line = re.sub(r'DOTMULTI\.', r'DOTDOTMULTI', line)

    # Separate commas unless both neighbours are digits ("10,000" stays).
    line = re.sub(r'([\D])[,]', r'\g<1> , ', line)
    line = re.sub(r'[,]([\D])', r' , \g<1>', line)

    # A comma after a digit at end of line is still split off.
    line = re.sub(r'(\d)[,]$', r'\g<1> ,', line)

    # English apostrophe handling: quote-like apostrophes are spaced out,
    # contraction apostrophes stay glued to the following letters.
    line = re.sub(r'([\W\d])[\']([\W\d])', r'\g<1> \' \g<2>', line)
    line = re.sub(r'(\W)[\']([\w\D])', r'\g<1> \' \g<2>', line)
    line = re.sub(r'([\w\D])[\']([\W\d])', r'\g<1> \' \g<2>', line)
    line = re.sub(r'([\w\D])[\']([\w\D])', r'\g<1> \'\g<2>', line)

    # Keep 's attached when the apostrophe follows a non-word char or
    # digit (e.g. possessives after numbers).
    line = re.sub(r'([\W\d])[\']([s])', r'\g<1> \'\g<2>', line)

    # Split a word-final period from its token ("home." -> "home .").
    # NOTE(review): the upstream Moses tokenizer exempts non-breaking
    # prefixes ("Mr.", "etc.") at this point; this port splits
    # unconditionally — the original code's if/else here had two
    # identical branches, which is why it is collapsed below.
    tokens = []
    for word in line.split():
        match = re.search(r'^(\S+)\.$', word)
        if match:
            word = match.group(1) + ' .'
        tokens.append(word)
    line = ' '.join(tokens)

    # Clean up whitespace introduced by the substitutions above.
    line = re.sub(' +', ' ', line)
    line = re.sub('^ ', '', line)
    line = re.sub(' $', '', line)

    # Sentence-final ".'" gets the period and quote split apart.
    line = re.sub(r'\.\' ?$', ' . \' ', line)

    # Restore the protected dot runs ("DOTDOTMULTI" -> "DOTMULTI." -> ".").
    while re.search('DOTDOTMULTI', line):
        line = re.sub('DOTDOTMULTI', 'DOTMULTI.', line)

    line = re.sub('DOTMULTI', '.', line)

    # HTML-escape Moses special characters; '&' must go first so the
    # other entities are not double-escaped.
    line = re.sub(r'\&', r'&amp;', line)
    line = re.sub(r'\|', r'&#124;', line)
    line = re.sub(r'\<', r'&lt;', line)
    line = re.sub(r'\>', r'&gt;', line)
    line = re.sub(r'\'', r'&apos;', line)
    line = re.sub(r'\"', r'&quot;', line)
    line = re.sub(r'\[', r'&#91;', line)
    line = re.sub(r'\]', r'&#93;', line)

    return line
|
|
|