distinct
Browse files
- __pycache__/distinct.cpython-38.pyc +0 -0
- distinct.py +7 -5
- tokenizer_13a.py +4 -0
__pycache__/distinct.cpython-38.pyc
CHANGED
|
Binary files a/__pycache__/distinct.cpython-38.pyc and b/__pycache__/distinct.cpython-38.pyc differ
|
|
|
distinct.py
CHANGED
|
@@ -115,8 +115,9 @@ class distinct(evaluate.Measurement):
|
|
| 115 |
def _download_and_prepare(self, dl_manager):
|
| 116 |
"""Optional: download external resources useful to compute the scores"""
|
| 117 |
|
| 118 |
-
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=
|
| 119 |
from nltk.util import ngrams
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
|
|
@@ -135,10 +136,10 @@ class distinct(evaluate.Measurement):
|
|
| 135 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
| 136 |
vocab = set()
|
| 137 |
for sentence in dataForVocabCal:
|
| 138 |
-
if tokenizer == "white_space":
|
| 139 |
-
|
| 140 |
-
else:
|
| 141 |
-
|
| 142 |
vocab_size = len(vocab)
|
| 143 |
else:
|
| 144 |
raise TypeError("Argument dataForVocabCal should be a list of strings")
|
|
@@ -152,6 +153,7 @@ class distinct(evaluate.Measurement):
|
|
| 152 |
for prediction in predictions:
|
| 153 |
try:
|
| 154 |
tokens = list(tokenizer.tokenize(prediction))
|
|
|
|
| 155 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 156 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 157 |
except Exception as e:
|
|
|
|
| 115 |
def _download_and_prepare(self, dl_manager):
|
| 116 |
"""Optional: download external resources useful to compute the scores"""
|
| 117 |
|
| 118 |
+
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space", mode="Expectation-Adjusted-Distinct"):
|
| 119 |
from nltk.util import ngrams
|
| 120 |
+
from nltk.tokenize import WhitespaceTokenizer
|
| 121 |
|
| 122 |
|
| 123 |
|
|
|
|
| 136 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
| 137 |
vocab = set()
|
| 138 |
for sentence in dataForVocabCal:
|
| 139 |
+
# if tokenizer == "white_space":
|
| 140 |
+
# vocab = vocab | set(sentence.split(" "))
|
| 141 |
+
# else:
|
| 142 |
+
vocab = vocab | set(tokenizer.tokenize(sentence))
|
| 143 |
vocab_size = len(vocab)
|
| 144 |
else:
|
| 145 |
raise TypeError("Argument dataForVocabCal should be a list of strings")
|
|
|
|
| 153 |
for prediction in predictions:
|
| 154 |
try:
|
| 155 |
tokens = list(tokenizer.tokenize(prediction))
|
| 156 |
+
print(tokens)
|
| 157 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
| 158 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
| 159 |
except Exception as e:
|
tokenizer_13a.py
CHANGED
|
@@ -98,3 +98,7 @@ class Tokenizer13a(BaseTokenizer):
|
|
| 98 |
line = line.replace("&gt;", ">")
|
| 99 |
|
| 100 |
return self._post_tokenizer(f" {line} ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
line = line.replace("&gt;", ">")
|
| 99 |
|
| 100 |
return self._post_tokenizer(f" {line} ")
|
| 101 |
+
|
| 102 |
+
@lru_cache(maxsize=2**16)
def tokenize(self, line):
    """Tokenize *line* by delegating to this tokenizer's ``__call__``.

    Results are memoized so repeated sentences are only tokenized once.

    NOTE(review): ``lru_cache`` on an instance method keys on ``self`` and
    keeps every tokenizer instance alive for the cache's lifetime — fine if
    Tokenizer13a is effectively a singleton here; confirm against callers.

    Args:
        line (str): raw sentence to tokenize.

    Returns:
        The tokenized form produced by ``self.__call__(line)``.
    """
    # Bug fix: the original was `self.__call__(self, line)`, which passes
    # `self` twice (TypeError: too many positional arguments) and also
    # discarded the result, so the method always returned None.
    return self.__call__(line)
|