import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.
        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages):
            # pad the listed punctuation characters with spaces
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
        ]

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # `split()` also collapses the whitespace runs introduced by the
        # padding rules above, so no separate normalization pass is needed
        return line.split()
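# Example (a sketch, not part of the original module): calling the regexp
# tokenizer directly shows the padding rules in action, e.g.
#     TokenizerRegexp()("Hello, world!")  ->  ['Hello', ',', 'world', '!']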


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is, however, equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        # language-independent part: strip markup and rejoin broken lines
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        # restore XML-escaped characters before tokenizing
        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        return self._post_tokenizer(f" {line} ")
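

# Minimal usage sketch (not part of the original module): run this file
# directly to see the mteval-v13a-style tokenization in action.
if __name__ == "__main__":
    tokenizer = Tokenizer13a()
    print(tokenizer('The &quot;quick&quot; brown fox, 3.5 feet away.'))
    # -> ['The', '"', 'quick', '"', 'brown', 'fox', ',', '3.5', 'feet', 'away', '.']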