Spaces:

lsy641
/

distinct

Runtime error

App Files Files Community

lsy641 committed on Jul 8, 2023

Commit

00ff4e2

1 Parent(s): 1bb2612

Upload tokenizer_13a.py

Browse files

Files changed (1) hide show

tokenizer_13a.py +100 -0

tokenizer_13a.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
+# Copyright 2020 SacreBLEU Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from functools import lru_cache
class BaseTokenizer:
    """No-op tokenizer that serves as the parent of the concrete tokenizers."""

    def __call__(self, line):
        """
        Apply the tokenizer to a single segment.

        The base implementation performs no tokenization at all.

        :param line: a segment to tokenize
        :return: ``line`` itself, unchanged
        """
        untouched = line
        return untouched

    def signature(self):
        """
        Identify the tokenizer by a short name.

        :return: the signature string, ``"none"`` for this base class
        """
        name = "none"
        return name
class TokenizerRegexp(BaseTokenizer):
    """Regexp-based tokenizer used as the common post-processing step.

    A fixed sequence of substitutions inserts spaces around punctuation,
    after which the line is split on whitespace into individual tokens.
    """

    def signature(self):
        """
        Returns a signature for the tokenizer.
        :return: signature string
        """
        return "re"

    def __init__(self):
        # (pattern, replacement) pairs, applied in this order by `_tokenize`.
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # Collapsing runs of whitespace is not done with a regexp here;
            # the `str.split()` call in `_tokenize` takes care of it.
        ]

    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Apply every substitution to ``line`` and split the result.

        The result is memoised, so it is returned as an immutable tuple:
        a cached value handed out to several callers must not be mutable.

        NOTE: the cache key is ``(self, line)``, so the cache holds a
        reference to each tokenizer instance that has been called.

        :param line: a segment to tokenize
        :return: tuple of tokens
        """
        for pattern, repl in self._re:
            line = pattern.sub(repl, line)
        # split() drops leading/trailing whitespace and collapses inner runs.
        return tuple(line.split())

    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        Unlike the upstream tokenizer, which returns ``' '.join(line.split())``,
        this version returns the individual tokens.

        :param line: a segment to tokenize
        :return: a new list of tokens on every call, safe for the caller to
            mutate without affecting the cached result
        """
        return list(self._tokenize(line))
class Tokenizer13a(BaseTokenizer):
    """Tokenizer that normalises a line and then delegates to `TokenizerRegexp`."""

    def signature(self):
        """
        Returns a signature for the tokenizer.
        :return: signature string
        """
        return "13a"

    def __init__(self):
        # Regexp tokenizer applied after the language-independent clean-up.
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Normalise ``line`` and run the post-tokenizer on it.

        The result is memoised, so it is stored as an immutable tuple:
        a cached value handed out to several callers must not be mutable.

        NOTE: the cache key is ``(self, line)``, so the cache holds a
        reference to each tokenizer instance that has been called.

        :param line: a segment to tokenize
        :return: tuple of tokens
        """
        # language-independent part:
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")
        # Only attempt entity replacement when an ampersand is present.
        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")
        return tuple(self._post_tokenizer(f" {line} "))

    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: a new list of tokens on every call, safe for the caller to
            mutate without affecting the cached result
        """
        return list(self._tokenize(line))