Spaces:

lsy641
/

distinct

Runtime error

App Files Files Community

lsy641 committed on Jul 8, 2023

Commit

6563183

1 Parent(s): 00ff4e2

distinct

Browse files

Files changed (5) hide show

README.md +1 -1
__pycache__/distinct.cpython-38.pyc +0 -0
distinct.py +14 -15
requirements.txt +2 -1
tests.py +3 -6

README.md CHANGED Viewed

@@ -53,7 +53,7 @@ Downloading builder script: 100%|███████████████
 - **mode** *(string): 'Expectation-Adjusted-Distinct' or 'Distinct' for diversity calculation. If 'Expectation-Adjusted-Distinct', the scores for both modes will be returned. The default value is 'Expectation-Adjusted-Distinct'*
 - **vocab_size** *(int): For calculating 'Expectation-Adjusted-Distinct', either vocab_size or  dataForVocabCal should not be None. Default value is None*
 - **dataForVocabCal** *(list of string): dataForVocabCal for calculating the vocab_size for 'Expectation-Adjusted-Distinct'. Typically, it should be a list of sentences constituting the task dataset. For calculating 'Expectation-Adjusted-Distinct', either vocab_size or dataForVocabCal should not be None. Default value is None*
-- **tokenizer** *(string or tokenizer class): tokenizer for splitting sentences into words. Default value is "white_space". NLTK tokenizer is available.*
 ### Output Values

 - **mode** *(string): 'Expectation-Adjusted-Distinct' or 'Distinct' for diversity calculation. If 'Expectation-Adjusted-Distinct', the scores for both modes will be returned. The default value is 'Expectation-Adjusted-Distinct'*
 - **vocab_size** *(int): For calculating 'Expectation-Adjusted-Distinct', either vocab_size or  dataForVocabCal should not be None. Default value is None*
 - **dataForVocabCal** *(list of string): dataForVocabCal for calculating the vocab_size for 'Expectation-Adjusted-Distinct'. Typically, it should be a list of sentences constituting the task dataset. For calculating 'Expectation-Adjusted-Distinct', either vocab_size or dataForVocabCal should not be None. Default value is None*
+- **tokenizer** *(string or tokenizer class): tokenizer for splitting sentences into words. Default value is Tokenizer13a(). NLTK tokenizer is available.*
 ### Output Values

__pycache__/distinct.cpython-38.pyc ADDED Viewed

Binary file (6.1 kB). View file

distinct.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import evaluate
 import datasets
@@ -113,11 +114,11 @@ class distinct(evaluate.Measurement):
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
-        pass
-    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space", mode="Expectation-Adjusted-Distinct"):
         from nltk.util import ngrams
         """Returns the scores"""
         if mode == "Expectation-Adjusted-Distinct" and vocab_size is None and dataForVocabCal is None:
@@ -127,6 +128,9 @@ class distinct(evaluate.Measurement):
         elif mode == "Distinct":
             pass
         if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
             if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
                 vocab = set()
@@ -145,18 +149,13 @@ class distinct(evaluate.Measurement):
         total_tokens_2grams = []
         total_tokens_3grams = []
-        for prediction in predictions:
-            if tokenizer == "white_space":
-                tokens = prediction.split(" ")
-                tokens_2grams = list(ngrams(prediction.split(" "), 2, pad_left=True, left_pad_symbol='<s>'))
-                tokens_3grams = list(ngrams(prediction.split(" "), 3, pad_left=True, left_pad_symbol='<s>'))
-            else:
-                try:
-                    tokens = list(tokenizer.tokenize(prediction))
-                    tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
-                    tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
-                except Exception as e:
-                    raise e
             distinct_tokens = distinct_tokens | set(tokens)
             distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)

 import evaluate
 import datasets
+from .tokenizer_13a import Tokenizer13a
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
+    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
         from nltk.util import ngrams
         """Returns the scores"""
         if mode == "Expectation-Adjusted-Distinct" and vocab_size is None and dataForVocabCal is None:
         elif mode == "Distinct":
             pass
+        if tokenizer == "white_space":
+            tokenizer = WhitespaceTokenizer()
         if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
             if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
                 vocab = set()
         total_tokens_2grams = []
         total_tokens_3grams = []
+        for prediction in predictions:
+            try:
+                tokens = list(tokenizer.tokenize(prediction))
+                tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
+                tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
+            except Exception as e:
+                raise e
             distinct_tokens = distinct_tokens | set(tokens)
             distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- git+https://github.com/huggingface/evaluate@main


1	+ git+https://github.com/huggingface/evaluate@main
2	+ nltk

tests.py CHANGED Viewed

@@ -1,17 +1,14 @@
 test_cases = [
     {
-        "predictions": [0, 0],
-        "references": [1, 1],
         "result": {"metric_score": 0}
     },
     {
-        "predictions": [1, 1],
-        "references": [1, 1],
         "result": {"metric_score": 1}
     },
     {
-        "predictions": [1, 0],
-        "references": [1, 1],
         "result": {"metric_score": 0.5}
     }
 ]

 test_cases = [
     {
+        "predictions": ["Hi.", "I'm sorry to hear that", "I don't know"],
         "result": {"metric_score": 0}
     },
     {
+        "predictions": ["Hi.", "I'm sorry to hear that", "I don't know"],
         "result": {"metric_score": 1}
     },
     {
+        "predictions": ["Hi.", "I'm sorry to hear that", "I don't know"],
         "result": {"metric_score": 0.5}
     }
 ]