distinct
Browse files
- __pycache__/distinct.cpython-38.pyc +0 -0
- distinct.py +7 -5
- tokenizer_13a.py +4 -0
__pycache__/distinct.cpython-38.pyc
CHANGED
|
Binary files a/__pycache__/distinct.cpython-38.pyc and b/__pycache__/distinct.cpython-38.pyc differ
|
|
|
distinct.py
CHANGED
|
@@ -115,8 +115,9 @@ class distinct(evaluate.Measurement):
|
|
| 115 |
def _download_and_prepare(self, dl_manager):
|
| 116 |
"""Optional: download external resources useful to compute the scores"""
|
| 117 |
|
| 118 |
-
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=
|
| 119 |
from nltk.util import ngrams
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
|
|
@@ -135,10 +136,10 @@ class distinct(evaluate.Measurement):
|
|
| 135 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
| 136 |
vocab = set()
|
| 137 |
for sentence in dataForVocabCal:
|
| 138 |
-
if tokenizer == "white_space":
|
| 139 |
-
|
| 140 |
-
else:
|
| 141 |
-
|
| 142 |
vocab_size = len(vocab)
|
| 143 |
else:
|
| 144 |
raise TypeError("Argument dataForVocabCal should be a list of strings")
|
|
@@ -152,6 +153,7 @@ class distinct(evaluate.Measurement):
|
|
| 152 |
for prediction in predictions:
|
| 153 |
try:
|
| 154 |
tokens = list(tokenizer.tokenize(prediction))
|
|
|
|
| 155 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 156 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 157 |
except Exception as e:
|
|
|
|
| 115 |
def _download_and_prepare(self, dl_manager):
|
| 116 |
"""Optional: download external resources useful to compute the scores"""
|
| 117 |
|
| 118 |
+
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space", mode="Expectation-Adjusted-Distinct"):
|
| 119 |
from nltk.util import ngrams
|
| 120 |
+
from nltk.tokenize import WhitespaceTokenizer
|
| 121 |
|
| 122 |
|
| 123 |
|
|
|
|
| 136 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
| 137 |
vocab = set()
|
| 138 |
for sentence in dataForVocabCal:
|
| 139 |
+
# if tokenizer == "white_space":
|
| 140 |
+
# vocab = vocab | set(sentence.split(" "))
|
| 141 |
+
# else:
|
| 142 |
+
vocab = vocab | set(tokenizer.tokenize(sentence))
|
| 143 |
vocab_size = len(vocab)
|
| 144 |
else:
|
| 145 |
raise TypeError("Argument dataForVocabCal should be a list of strings")
|
|
|
|
| 153 |
for prediction in predictions:
|
| 154 |
try:
|
| 155 |
tokens = list(tokenizer.tokenize(prediction))
|
| 156 |
+
print(tokens)
|
| 157 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 158 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 159 |
except Exception as e:
|
tokenizer_13a.py
CHANGED
|
@@ -98,3 +98,7 @@ class Tokenizer13a(BaseTokenizer):
|
|
| 98 |
line = line.replace("&gt;", ">")
|
| 99 |
|
| 100 |
return self._post_tokenizer(f" {line} ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
line = line.replace("&gt;", ">")
|
| 99 |
|
| 100 |
return self._post_tokenizer(f" {line} ")
|
| 101 |
+
|
| 102 |
+
@lru_cache(maxsize=2**16)
def tokenize(self, line):
    """Tokenize *line* by delegating to this tokenizer's ``__call__``.

    Results are memoized so repeated sentences are only tokenized once.

    NOTE(review): ``lru_cache`` on an instance method keys on ``self`` and
    keeps every tokenizer instance alive for the cache's lifetime — fine if
    Tokenizer13a is effectively a singleton here; confirm against callers.

    Args:
        line (str): raw sentence to tokenize.

    Returns:
        The tokenized form produced by ``self.__call__(line)``.
    """
    # Bug fix: the original was `self.__call__(self, line)`, which passes
    # `self` twice (TypeError: too many positional arguments) and also
    # discarded the result, so the method always returned None.
    return self.__call__(line)
|