distinct
Browse files - distinct.py +1 -7
distinct.py
CHANGED
|
@@ -117,7 +117,6 @@ class distinct(evaluate.Measurement):
|
|
| 117 |
|
| 118 |
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
|
| 119 |
from nltk.util import ngrams
|
| 120 |
-
from nltk.tokenize import WhitespaceTokenizer
|
| 121 |
|
| 122 |
|
| 123 |
|
|
@@ -128,9 +127,7 @@ class distinct(evaluate.Measurement):
|
|
| 128 |
raise Warning("We've detected that both vocab_size and dataForVocabCal are specified. We will use dataForVocabCal.")
|
| 129 |
elif mode == "Distinct":
|
| 130 |
pass
|
| 131 |
-
|
| 132 |
-
if tokenizer == "white_space":
|
| 133 |
-
tokenizer = WhitespaceTokenizer()
|
| 134 |
|
| 135 |
if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
|
| 136 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
|
@@ -152,10 +149,7 @@ class distinct(evaluate.Measurement):
|
|
| 152 |
|
| 153 |
for prediction in predictions:
|
| 154 |
try:
|
| 155 |
-
print(prediction)
|
| 156 |
-
print(tokenizer.tokenize(prediction))
|
| 157 |
tokens = list(tokenizer.tokenize(prediction))
|
| 158 |
-
print(tokens)
|
| 159 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 160 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 161 |
except Exception as e:
|
|
|
|
| 117 |
|
| 118 |
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
|
| 119 |
from nltk.util import ngrams
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
|
|
|
|
| 127 |
raise Warning("We've detected that both vocab_size and dataForVocabCal are specified. We will use dataForVocabCal.")
|
| 128 |
elif mode == "Distinct":
|
| 129 |
pass
|
| 130 |
+
|
|
|
|
|
|
|
| 131 |
|
| 132 |
if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
|
| 133 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
|
|
|
| 149 |
|
| 150 |
for prediction in predictions:
|
| 151 |
try:
|
|
|
|
|
|
|
| 152 |
tokens = list(tokenizer.tokenize(prediction))
|
|
|
|
| 153 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 154 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 155 |
except Exception as e:
|